Commit 44b444e1 authored by delanoe's avatar delanoe

[REPEC/PARSERS] Some fixes (file path, type corrections in other parsers).

parent 7eadf91e
......@@ -8,8 +8,10 @@
from ._Crawler import *
import json
from gargantext.settings import API_TOKENS
from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
class MultivacCrawler(Crawler):
''' Multivac API CLIENT'''
......@@ -79,8 +81,9 @@ class MultivacCrawler(Crawler):
return self.results_nb
def download(self, query):
self.path = "/tmp/MultivacResults.xml"
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -92,15 +95,12 @@ class MultivacCrawler(Crawler):
print("ERROR (scrap: multivac d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
with open(self.path, 'wb') as f:
#for page in range(1, self.query_max, paging):
for page in range(1, trunc(self.query_max / 100) + 1):
docs = self._get(query, fromPage=page, count=paging)["results"]["hits"]
for doc in docs:
corpus.append(doc)
for page in range(1, trunc(self.query_max / 100) + 1):
docs = self._get(query, fromPage=page, count=paging)["results"]["hits"]
for doc in docs:
corpus.append(doc)
f.write(json.dumps(corpus).encode("utf-8"))
downloaded = True
self.path = save(json.dumps(corpus).encode("utf-8"), name='Multivac', basedir=UPLOAD_DIRECTORY )
downloaded = True
return downloaded
......@@ -104,7 +104,7 @@ class ISTexParser(Parser):
RealDate = RealDate[0]
# print( RealDate ," | length:",len(RealDate))
Decision=""
Decision = True
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
......
from ._Parser import Parser
from datetime import datetime
import json
class MultivacParser(Parser):
    """Parser for the JSON documents saved by the Multivac (REPEC) crawler."""

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]

        Read a JSON array of search hits from ``filebuf`` and convert each
        hit's ``_source`` object into a flat hyperdata dict.

        ``filebuf`` must expose ``read()`` returning UTF-8 encoded bytes and
        ``close()``; it is always closed before this method returns or
        raises.

        Each returned dict contains "id", "title", "abstract", "type",
        "source", "authors", "publication_date" and the string fields
        "publication_year", "publication_month" and "publication_day".
        "url" is present unless the "file" entry is not a mapping.
        '''
        # Close the buffer even when decoding or JSON parsing fails.
        try:
            json_docs = json.loads(filebuf.read().decode("UTF-8"))
        finally:
            filebuf.close()

        # hyperdata key -> key of the same value in the "_source" object
        hyperdata_path = {
            "id"       : "id",
            "title"    : "title",
            "abstract" : "abstract",
            "type"     : "type",
        }

        hyperdata_list = []

        for json_doc in json_docs:
            doc = json_doc["_source"]

            hyperdata = {
                key: doc.get(path, "")
                for key, path in hyperdata_path.items()
            }

            serial = doc.get("serial", {})
            hyperdata["source"] = serial.get("journaltitle", "REPEC Database")

            try:
                hyperdata["url"] = doc.get("file", {}).get("url", "")
            except AttributeError:
                # "file" is present but is not a mapping: best effort,
                # the document is kept without a "url" entry.
                pass

            hyperdata["authors"] = ", ".join(
                p.get("person", {}).get("name", "")
                for p in doc.get("hasauthor", [])
            )

            # Always bind ``date``: fall back to the current date when the
            # issue date is missing or is not a plain four-digit year.
            year = serial.get("issuedate", None)
            if year is None:
                date = datetime.now()
            else:
                try:
                    date = datetime.strptime(year, '%Y')
                except (ValueError, TypeError):
                    print("FIX DATE MULTIVAC REPEC %s" % year)
                    date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
......@@ -78,7 +78,7 @@ class PubmedParser(Parser):
if "publication_month" in hyperdata: PubmedDate+=" "+hyperdata["publication_month"]
if "publication_day" in hyperdata: PubmedDate+=" "+hyperdata["publication_day"]
Decision=""
Decision=True
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment