Commit f21c7122 authored by delanoe

[FIX] ISIDORE/SPARQL: fix duplicates in scan and corpus building. Improve the query by ordering results by date (descending).

parent dce1e317
......@@ -49,7 +49,7 @@ def isidore(query, count=False, offset=None, limit=None):
for r in results:
doc = dict()
doc_values = dict()
doc["url"], doc["id"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
doc["url"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
for k in doc.keys():
doc_values[k] = doc[k].value
......
......@@ -31,6 +31,8 @@ class IsidoreParser(Parser):
, "source" : "source"
}
uniq_id = set()
for doc in json_docs:
hyperdata = {}
......@@ -38,27 +40,31 @@ class IsidoreParser(Parser):
for key, path in hyperdata_path.items():
hyperdata[key] = doc.get(path, "")
# Source is the Journal Name
hyperdata["source"] = doc.get("journal", "ISIDORE Database")
# Working on the date
maybeDate = doc.get("date" , None)
if hyperdata["url"] not in uniq_id:
# Removing the duplicates implicitly
uniq_id.add(hyperdata["url"])
# Source is the Journal Name
hyperdata["source"] = doc.get("journal", "ISIDORE Database")
# Working on the date
maybeDate = doc.get("date" , None)
if maybeDate is None:
date = datetime.now()
else:
try :
# Model of date: 1958-01-01T00:00:00
date = datetime.strptime(maybeDate, '%Y-%m-%dT%H:%M:%S')
except :
print("FIX DATE ISIDORE please >%s<" % maybeDate)
if maybeDate is None:
date = datetime.now()
else:
try :
# Model of date: 1958-01-01T00:00:00
date = datetime.strptime(maybeDate, '%Y-%m-%dT%H:%M:%S')
except :
print("FIX DATE ISIDORE please >%s<" % maybeDate)
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment