Commit f21c7122 authored by delanoe

[FIX] ISIDORE/SPARQL: fix duplicates in scan and corpus building; improve the query with descending date order.

parent dce1e317
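The query change the commit message calls "date desc" is not visible in the hunks below. As a rough, non-authoritative sketch of what ordering an ISIDORE SPARQL query by descending date can look like, here is a hypothetical query template in Python; the variable names mirror the five fields unpacked in isidore(), and the build_query helper is an assumption, not part of this commit.

# Hypothetical sketch only: the real ISIDORE query is not shown in this commit.
# ORDER BY DESC(?date) returns the newest documents first; OFFSET/LIMIT page the
# results, matching the offset/limit parameters of isidore().
QUERY_TEMPLATE = """
SELECT DISTINCT ?url ?title ?date ?abstract ?source
WHERE {
  # ... graph patterns built from the search terms go here ...
}
ORDER BY DESC(?date)
OFFSET %(offset)s
LIMIT  %(limit)s
"""

def build_query(offset=0, limit=100):
    # Fill in the paging parameters (assumed helper).
    return QUERY_TEMPLATE % {"offset": offset, "limit": limit}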
@@ -49,7 +49,7 @@ def isidore(query, count=False, offset=None, limit=None):
     for r in results:
         doc = dict()
         doc_values = dict()
-        doc["url"], doc["id"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
+        doc["url"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
         for k in doc.keys():
             doc_values[k] = doc[k].value
...
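For context on the crawler side: the row unpacking now expects exactly five bindings, and dropping doc["id"] is presumably what removes the duplicate rows seen during the scan (an extra ?id binding multiplies rows in SPARQL results). A minimal standalone sketch of the same conversion, assuming each binding object exposes a .value attribute as in the existing code:

def rows_to_docs(results):
    # Turn SPARQL result rows into plain dicts of strings.
    docs = []
    for r in results:
        doc = dict()
        # Each row is expected to carry exactly five bindings, in this order.
        doc["url"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
        # Bindings are RDF terms; .value gives the plain string.
        docs.append({k: v.value for k, v in doc.items()})
    return docs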
@@ -31,6 +31,8 @@ class IsidoreParser(Parser):
                          , "source" : "source"
                          }
+        uniq_id = set()
+
         for doc in json_docs:
             hyperdata = {}
@@ -38,6 +40,10 @@ class IsidoreParser(Parser):
             for key, path in hyperdata_path.items():
                 hyperdata[key] = doc.get(path, "")
+            if hyperdata["url"] not in uniq_id:
+                # Removing the duplicates implicitly
+                uniq_id.add(hyperdata["url"])
+
                 # Source is the Journal Name
                 hyperdata["source"] = doc.get("journal", "ISIDORE Database")
...
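The parser-side fix keeps a set of URLs already seen and only builds hyperdata for a document the first time its URL appears. The same technique in isolation, with hypothetical input dicts rather than the parser's real JSON:

def deduplicate_by_url(json_docs):
    # Keep the first occurrence of each URL, drop later duplicates.
    uniq_id = set()
    unique_docs = []
    for doc in json_docs:
        url = doc.get("url", "")
        if url not in uniq_id:
            uniq_id.add(url)
            unique_docs.append(doc)
    return unique_docs

# Example: the first and third entries share a URL, so two documents remain.
docs = [{"url": "https://example.org/a"},
        {"url": "https://example.org/b"},
        {"url": "https://example.org/a"}]
assert len(deduplicate_by_url(docs)) == 2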