Commit a9f54519 authored by delanoe

[FEAT] Ajout de REPEC parser, format RIS.

parent 7bc5d3bd
...@@ -128,7 +128,7 @@ LANGUAGES = { ...@@ -128,7 +128,7 @@ LANGUAGES = {
from gargantext.util.parsers import \ from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser, RepecParser
def resourcetype(name): def resourcetype(name):
''' '''
...@@ -208,6 +208,12 @@ RESOURCETYPES = [ ...@@ -208,6 +208,12 @@ RESOURCETYPES = [
#~ "base_url": "http://api.scoap3.org/search?", #~ "base_url": "http://api.scoap3.org/search?",
}, },
# type 11
{ 'name': 'REPEC (RIS format)',
'parser': RepecParser,
'default_language': 'en',
},
] ]
# linguistic extraction parameters --------------------------------------------- # linguistic extraction parameters ---------------------------------------------
......
...@@ -18,11 +18,12 @@ class RISParser(Parser): ...@@ -18,11 +18,12 @@ class RISParser(Parser):
b"ER": {"type": "delimiter"}, b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "}, b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "}, b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "}, b"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"T2": {"type": "hyperdata", "key": "journal"}, b"T2": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"}, b"UR": {"type": "hyperdata", "key": "doi"},
b"PY": {"type": "hyperdata", "key": "publication_year"}, b"PY": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"}, b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"}, b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"}, b"WC": {"type": "hyperdata", "key": "fields"},
......
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
    """Parser for REPEC bibliographic exports written in RIS format.

    Each input line is expected to start with a two-byte tag, with the
    field value starting at byte offset ``_begin``. Lines whose first two
    bytes are spaces continue the previous field. The ``ER`` tag closes the
    current record.
    """

    # Byte offset at which the value starts on a tagged line
    # (presumably "T1  - value": tag, two spaces, dash, space).
    _begin = 6

    # Maps RIS tags to the hyperdata key they populate. "separator" is the
    # string used to join the values of a field spread over several lines
    # (defaults to the empty string when absent).
    _parameters = {
        b"ER": {"type": "delimiter"},
        b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
        b"JO": {"type": "hyperdata", "key": "journal"},
        # Stored under "doi" to stay consistent with RISParser.
        b"UR": {"type": "hyperdata", "key": "doi"},
        b"Y1": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
        b"LA": {"type": "hyperdata", "key": "language_iso2"},
        b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }

    # Keys whose presence means the record already carries a language.
    _language_keys = ("language_fullname", "language_iso3", "language_iso2")

    @staticmethod
    def _store_field(hyperdata, parameter, values):
        """Join the buffered *values* and store them under the field's key."""
        joined = parameter.get("separator", "").join(values)
        if parameter["key"] == "publication_year":
            # Only the first four characters (the year) are kept.
            joined = joined[:4]
        hyperdata[parameter["key"]] = joined

    @classmethod
    def _set_default_language(cls, hyperdata):
        """Fall back to English when the record declares no language."""
        if not any(key in hyperdata for key in cls._language_keys):
            hyperdata["language_iso2"] = "en"

    def parse(self, file):
        """Yield one hyperdata dict per record found in *file*.

        :param file: iterable of ``bytes`` lines (a file opened in binary
            mode); tags are matched against ``bytes`` keys.
        :return: generator of dicts keyed by the names in ``_parameters``.
        """
        hyperdata = {}
        last_key = None
        last_values = []
        # browse every line of the file
        for line in file:
            if len(line) <= 2:
                continue
            # extract the parameter key
            parameter_key = line[:2]
            # Two leading spaces mark a continuation of the previous field;
            # a repeated tag (e.g. several A1 lines) also accumulates.
            if parameter_key != b"  " and parameter_key != last_key:
                parameter = self._parameters.get(last_key)
                if parameter is not None:
                    if parameter["type"] == "hyperdata":
                        self._store_field(hyperdata, parameter, last_values)
                    elif parameter["type"] == "delimiter":
                        self._set_default_language(hyperdata)
                        yield hyperdata
                        hyperdata = {}
                last_key = parameter_key
                last_values = []
            try:
                # Strip only the line terminator, so an unterminated last
                # line keeps its final character and CRLF leaves no "\r".
                value = line[self._begin:].rstrip(b"\r\n").decode()
            except UnicodeDecodeError as error:
                # Best effort: report the undecodable line and keep going.
                print(error)
            else:
                last_values.append(value)
        # Flush the field still buffered when the input ends without "ER".
        parameter = self._parameters.get(last_key)
        if parameter is not None and parameter["type"] == "hyperdata":
            self._store_field(hyperdata, parameter, last_values)
        # if a hyperdata object is left in memory, yield it as well,
        # with the same language default as every other record
        if hyperdata:
            self._set_default_language(hyperdata)
            yield hyperdata
...@@ -67,19 +67,19 @@ class Parser: ...@@ -67,19 +67,19 @@ class Parser:
date_string = hyperdata[prefix + "_year"] date_string = hyperdata[prefix + "_year"]
key = prefix + "_month" key = prefix + "_month"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_day" key = prefix + "_day"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_hour" key = prefix + "_hour"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_minute" key = prefix + "_minute"
if key in hyperdata: if key in hyperdata:
date_string += ":" + hyperdata[key] date_string += ":" + hyperdata.get(key, "01")
key = prefix + "_second" key = prefix + "_second"
if key in hyperdata: if key in hyperdata:
date_string += ":" + hyperdata[key] date_string += ":" + hyperdata.get(key, "01")
try: try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S") hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error: except Exception as error:
...@@ -90,13 +90,13 @@ class Parser: ...@@ -90,13 +90,13 @@ class Parser:
except Exception as error: except Exception as error:
try: try:
print(error) print("error line 93", error)
# FIXME Date format: 1994 SPR # FIXME Date format: 1994 SPR
# By default, we take the year only # By default, we take the year only
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S") hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error: except Exception as error:
print(error) print("error line 99", error)
else: else:
print("WARNING: Date unknown at _Parser level, using now()") print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
...@@ -113,7 +113,7 @@ class Parser: ...@@ -113,7 +113,7 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H") hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M") hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S") hyperdata[prefix + "_second"] = date.strftime("%S")
print(hyperdata['publication_date']) print("line 116", hyperdata['publication_date'])
# finally, return the transformed result! # finally, return the transformed result!
return hyperdata return hyperdata
......
from .Ris import RISParser from .Ris import RISParser
from .Ris_repec import RepecParser
from .Isi import ISIParser from .Isi import ISIParser
# from .Jstor import JstorParser # from .Jstor import JstorParser
# from .Zotero import ZoteroParser # from .Zotero import ZoteroParser
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment