Commit a9f54519 authored by delanoe's avatar delanoe

[FEAT] Ajout de REPEC parser, format RIS.

parent 7bc5d3bd
......@@ -128,7 +128,7 @@ LANGUAGES = {
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser, RepecParser
def resourcetype(name):
'''
......@@ -208,6 +208,12 @@ RESOURCETYPES = [
#~ "base_url": "http://api.scoap3.org/search?",
},
# type 11
{ 'name': 'REPEC (RIS format)',
'parser': RepecParser,
'default_language': 'en',
},
]
# linguistic extraction parameters ---------------------------------------------
......
......@@ -18,11 +18,12 @@ class RISParser(Parser):
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"T2": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"},
b"PY": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
......
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
# def __init__(self, language_cache=None):
#
# #super(Parser, self).__init__()
# #super(Parser, self).__init__()
# self._languages_cache = LanguagesCache() if language_cache is None else language_cache
_begin = 6
_parameters = {
b"ER": {"type": "delimiter"},
b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"JO": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"},
b"Y1": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def parse(self, file):
hyperdata = {}
last_key = None
last_values = []
# browse every line of the file
for line in file:
if len(line) > 2 :
# extract the parameter key
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
# translate the parameter key
parameter = self._parameters[last_key]
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
if parameter["key"] == "publication_year":
hyperdata[parameter["key"]] = separator.join(last_values)[:4]
else:
hyperdata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
hyperdata = {}
last_key = parameter_key
last_values = []
try:
last_values.append(line[self._begin:-1].decode())
except Exception as error:
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
yield hyperdata
......@@ -60,26 +60,26 @@ class Parser:
print(error, 'Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata[key]
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_day"
if key in hyperdata:
date_string += " " + hyperdata[key]
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_hour"
if key in hyperdata:
date_string += " " + hyperdata[key]
date_string += " " + hyperdata.get(key, "01")
key = prefix + "_minute"
if key in hyperdata:
date_string += ":" + hyperdata[key]
date_string += ":" + hyperdata.get(key, "01")
key = prefix + "_second"
if key in hyperdata:
date_string += ":" + hyperdata[key]
date_string += ":" + hyperdata.get(key, "01")
try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
......@@ -90,13 +90,13 @@ class Parser:
except Exception as error:
try:
print(error)
print("error line 93", error)
# FIXME Date format: 1994 SPR
# By default, we take the year only
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
print(error)
print("error line 99", error)
else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
......@@ -113,7 +113,7 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
print(hyperdata['publication_date'])
print("line 116", hyperdata['publication_date'])
# finally, return the transformed result!
return hyperdata
......
from .Ris import RISParser
from .Isi import ISIParser
from .Ris import RISParser
from .Ris_repec import RepecParser
from .Isi import ISIParser
# from .Jstor import JstorParser
# from .Zotero import ZoteroParser
from .Pubmed import PubmedParser
from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1
from .Europress import EuropressParser
from .ISTex import ISTexParser
from .CSV import CSVParser
from .Cern import CernParser
from .ISTex import ISTexParser
from .CSV import CSVParser
from .Cern import CernParser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment