Commit a9f54519 authored by delanoe

[FEAT] Ajout de REPEC parser, format RIS.

parent 7bc5d3bd
...@@ -128,7 +128,7 @@ LANGUAGES = { ...@@ -128,7 +128,7 @@ LANGUAGES = {
from gargantext.util.parsers import \ from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser, RepecParser
def resourcetype(name): def resourcetype(name):
''' '''
...@@ -208,6 +208,12 @@ RESOURCETYPES = [ ...@@ -208,6 +208,12 @@ RESOURCETYPES = [
#~ "base_url": "http://api.scoap3.org/search?", #~ "base_url": "http://api.scoap3.org/search?",
}, },
# type 11
{ 'name': 'REPEC (RIS format)',
'parser': RepecParser,
'default_language': 'en',
},
] ]
# linguistic extraction parameters --------------------------------------------- # linguistic extraction parameters ---------------------------------------------
......
...@@ -18,11 +18,12 @@ class RISParser(Parser): ...@@ -18,11 +18,12 @@ class RISParser(Parser):
b"ER": {"type": "delimiter"}, b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "}, b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "}, b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "}, b"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"T2": {"type": "hyperdata", "key": "journal"}, b"T2": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"}, b"UR": {"type": "hyperdata", "key": "doi"},
b"PY": {"type": "hyperdata", "key": "publication_year"}, b"PY": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"}, b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"}, b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"}, b"WC": {"type": "hyperdata", "key": "fields"},
......
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
    """Parser for REPEC bibliographic exports written in the RIS format.

    Every meaningful line starts with a two-byte tag (``T1``, ``A1``, ...)
    and its value starts at byte offset ``_begin``.  A line starting with
    two spaces continues the value of the previous tag, consecutive lines
    carrying the same tag are accumulated, and the ``ER`` tag closes the
    current record.
    """

    # Offset of the first value byte on a line such as b"T1  - Some title".
    _begin = 6

    # A continuation line has blanks where the two-byte tag would be.  This
    # must be exactly two bytes long because it is compared with line[:2].
    _continuation = b"  "

    # Hyperdata keys that carry a language; when none is present the record
    # falls back to English.
    _language_keys = ("language_fullname", "language_iso3", "language_iso2")

    _parameters = {
        b"ER": {"type": "delimiter"},
        b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
        b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
        b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
        b"JO": {"type": "hyperdata", "key": "journal"},
        b"UR": {"type": "hyperdata", "key": "doi"},
        b"Y1": {"type": "hyperdata", "key": "publication_year"},
        b"PD": {"type": "hyperdata", "key": "publication_month"},
        b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
        b"LA": {"type": "hyperdata", "key": "language_iso2"},
        b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
        b"WC": {"type": "hyperdata", "key": "fields"},
    }

    def _store_field(self, hyperdata, key, values):
        """Write the accumulated ``values`` of tag ``key`` into ``hyperdata``.

        Unknown tags are ignored.  Returns True when ``key`` is the record
        delimiter (nothing is stored in that case), False otherwise.
        """
        parameter = self._parameters.get(key)
        if parameter is None:
            return False
        if parameter["type"] == "delimiter":
            return True
        if parameter["type"] == "hyperdata":
            value = parameter.get("separator", "").join(values)
            if parameter["key"] == "publication_year":
                # keep only the leading four characters (the year)
                value = value[:4]
            hyperdata[parameter["key"]] = value
        return False

    def _apply_default_language(self, hyperdata):
        """Set ``language_iso2`` to ``'en'`` when no language key is present."""
        if not any(key in hyperdata for key in self._language_keys):
            hyperdata['language_iso2'] = 'en'

    def parse(self, file):
        """Yield one hyperdata dict per record found in ``file``.

        ``file`` is iterated line by line and must produce ``bytes`` lines:
        tags are matched against the bytes keys of ``_parameters`` and each
        value is decoded with the default codec.  Lines whose value cannot
        be decoded are reported with ``print`` and skipped.
        """
        hyperdata = {}
        last_key = None
        last_values = []

        for line in file:
            # too short to carry a tag: blank line between records
            if len(line) <= 2:
                continue

            parameter_key = line[:2]
            if parameter_key != self._continuation and parameter_key != last_key:
                # a new tag begins: the previous one is complete
                if self._store_field(hyperdata, last_key, last_values):
                    self._apply_default_language(hyperdata)
                    yield hyperdata
                    hyperdata = {}
                last_key = parameter_key
                last_values = []

            # Strip the line terminator explicitly instead of dropping the
            # last byte, so CRLF files and an unterminated final line work.
            raw_value = line[self._begin:].rstrip(b"\r\n")
            try:
                last_values.append(raw_value.decode())
            except UnicodeDecodeError as error:
                print(error)

        # End of input: the last tag was never followed by another one, so
        # store it now, then emit the record still in memory (if any) with
        # the same language default as every other record.
        self._store_field(hyperdata, last_key, last_values)
        if hyperdata:
            self._apply_default_language(hyperdata)
            yield hyperdata
...@@ -60,26 +60,26 @@ class Parser: ...@@ -60,26 +60,26 @@ class Parser:
print(error, 'Date not parsed for:', date_string) print(error, 'Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None: elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"] prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes: for prefix in prefixes:
date_string = hyperdata[prefix + "_year"] date_string = hyperdata[prefix + "_year"]
key = prefix + "_month" key = prefix + "_month"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_day" key = prefix + "_day"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_hour" key = prefix + "_hour"
if key in hyperdata: if key in hyperdata:
date_string += " " + hyperdata[key] date_string += " " + hyperdata.get(key, "01")
key = prefix + "_minute" key = prefix + "_minute"
if key in hyperdata: if key in hyperdata:
date_string += ":" + hyperdata[key] date_string += ":" + hyperdata.get(key, "01")
key = prefix + "_second" key = prefix + "_second"
if key in hyperdata: if key in hyperdata:
date_string += ":" + hyperdata[key] date_string += ":" + hyperdata.get(key, "01")
try: try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S") hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error: except Exception as error:
...@@ -90,13 +90,13 @@ class Parser: ...@@ -90,13 +90,13 @@ class Parser:
except Exception as error: except Exception as error:
try: try:
print(error) print("error line 93", error)
# FIXME Date format: 1994 SPR # FIXME Date format: 1994 SPR
# By default, we take the year only # By default, we take the year only
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S") hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error: except Exception as error:
print(error) print("error line 99", error)
else: else:
print("WARNING: Date unknown at _Parser level, using now()") print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
...@@ -113,7 +113,7 @@ class Parser: ...@@ -113,7 +113,7 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H") hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M") hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S") hyperdata[prefix + "_second"] = date.strftime("%S")
print(hyperdata['publication_date']) print("line 116", hyperdata['publication_date'])
# finally, return the transformed result! # finally, return the transformed result!
return hyperdata return hyperdata
......
from .Ris import RISParser from .Ris import RISParser
from .Isi import ISIParser from .Ris_repec import RepecParser
from .Isi import ISIParser
# from .Jstor import JstorParser # from .Jstor import JstorParser
# from .Zotero import ZoteroParser # from .Zotero import ZoteroParser
from .Pubmed import PubmedParser from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1 # # 2015-12-08: parser 2 en 1
from .Europress import EuropressParser from .Europress import EuropressParser
from .ISTex import ISTexParser from .ISTex import ISTexParser
from .CSV import CSVParser from .CSV import CSVParser
from .Cern import CernParser from .Cern import CernParser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment