From 852f71b624d05d30018147566f8d197b113ebdc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <alexandre@delanoe.org> Date: Tue, 12 May 2015 17:00:11 +0100 Subject: [PATCH] [FIX] Adding zotero parser --- parsing/FileParsers/FileParser.py | 26 +++++++++++++------------ parsing/FileParsers/RisFileParser.py | 16 +++++++++++---- parsing/FileParsers/ZoteroFileParser.py | 23 ++++++++++++++++++++++ parsing/FileParsers/__init__.py | 1 + parsing/parsers_config.py | 4 ++-- 5 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 parsing/FileParsers/ZoteroFileParser.py diff --git a/parsing/FileParsers/FileParser.py b/parsing/FileParsers/FileParser.py index eb5bc723..80cce5c0 100644 --- a/parsing/FileParsers/FileParser.py +++ b/parsing/FileParsers/FileParser.py @@ -4,21 +4,21 @@ import zipfile import chardet from ..Caches import LanguagesCache - + class FileParser: """Base class for performing files parsing depending on their type. """ def __init__(self, language_cache=None): self._languages_cache = LanguagesCache() if language_cache is None else language_cache - + def detect_encoding(self, string): """Useful method to detect the document encoding. """ encoding = chardet.detect(string) return encoding.get('encoding', 'UTF-8') - - + + def format_hyperdata_dates(self, hyperdata): """Format the dates found in the hyperdata. Examples: @@ -27,7 +27,7 @@ class FileParser: {"publication_year": "2014"} -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...} """ - + # First, check the split dates... prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"] for prefix in prefixes: @@ -51,21 +51,23 @@ class FileParser: hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S") except: pass - + # ...then parse all the "date" fields, to parse it into separate elements prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"] for prefix in prefixes: date = dateutil.parser.parse(hyperdata[prefix + "_date"]) + print('date') + hyperdata[prefix + "_year"] = date.strftime("%Y") hyperdata[prefix + "_month"] = date.strftime("%m") hyperdata[prefix + "_day"] = date.strftime("%d") hyperdata[prefix + "_hour"] = date.strftime("%H") hyperdata[prefix + "_minute"] = date.strftime("%M") hyperdata[prefix + "_second"] = date.strftime("%S") - + # finally, return the transformed result! return hyperdata - + def format_hyperdata_languages(self, hyperdata): """format the languages found in the hyperdata.""" language = None @@ -81,18 +83,18 @@ class FileParser: hyperdata["language_iso3"] = language.iso3 hyperdata["language_fullname"] = language.fullname return hyperdata - + def format_hyperdata(self, hyperdata): """Format the hyperdata.""" hyperdata = self.format_hyperdata_dates(hyperdata) hyperdata = self.format_hyperdata_languages(hyperdata) return hyperdata - - + + def _parse(self, file): """This method shall be overriden by inherited classes.""" return list() - + def parse(self, file): """Parse the file, and its children files found in the file. """ diff --git a/parsing/FileParsers/RisFileParser.py b/parsing/FileParsers/RisFileParser.py index d05caf51..888f4bc9 100644 --- a/parsing/FileParsers/RisFileParser.py +++ b/parsing/FileParsers/RisFileParser.py @@ -3,15 +3,17 @@ from .FileParser import FileParser from ..Caches import LanguagesCache +from admin.utils import PrintException + class RisFileParser(FileParser): def __init__(self, language_cache=None): - + super(FileParser, self).__init__() self._languages_cache = LanguagesCache() if language_cache is None else language_cache - + self._begin = 6 - + self._parameters = { b"ER": {"type": "delimiter"}, b"TI": {"type": "hyperdata", "key": "title", "separator": " "}, @@ -24,7 +26,7 @@ class RisFileParser(FileParser): b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, b"WC": {"type": "hyperdata", "key": "fields"}, } - + def _parse(self, file): hyperdata = {} @@ -57,5 +59,11 @@ class RisFileParser(FileParser): print(error) # if a hyperdata object is left in memory, yield it as well if hyperdata: +# try: +# if hyperdata['date_to_parse']: +# print(hyperdata['date_to_parse']) +# except: +# pass +# #print(hyperdata['title']) yield hyperdata diff --git a/parsing/FileParsers/ZoteroFileParser.py b/parsing/FileParsers/ZoteroFileParser.py new file mode 100644 index 00000000..3d886af7 --- /dev/null +++ b/parsing/FileParsers/ZoteroFileParser.py @@ -0,0 +1,23 @@ +from .RisFileParser import RisFileParser + +from ..Caches import LanguagesCache + +class ZoteroFileParser(RisFileParser): + def __init__(self): + super(RisFileParser, self).__init__() + + self._begin = 6 + + self._parameters = { + b"ER": {"type": "delimiter"}, + b"TI": {"type": "hyperdata", "key": "title", "separator": " "}, + b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "}, + b"UR": {"type": "hyperdata", "key": "doi"}, + b"DA": {"type": "hyperdata", "key": "publication_date"}, + b"PY": {"type": "hyperdata", "key": "publication_year"}, + b"PD": {"type": "hyperdata", "key": "publication_month"}, + b"LA": {"type": "hyperdata", "key": "language_iso2"}, + b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, + b"WC": {"type": "hyperdata", "key": "fields"}, + } + diff --git a/parsing/FileParsers/__init__.py b/parsing/FileParsers/__init__.py index 09871c6a..e642ec9d 100644 --- a/parsing/FileParsers/__init__.py +++ b/parsing/FileParsers/__init__.py @@ -1,6 +1,7 @@ from .RisFileParser import RisFileParser from .IsiFileParser import IsiFileParser from .JstorFileParser import JstorFileParser +from .ZoteroFileParser import ZoteroFileParser from .PubmedFileParser import PubmedFileParser from .EuropressFileParser import EuropressFileParser from .ISText import ISText diff --git a/parsing/parsers_config.py b/parsing/parsers_config.py index d2b772e7..e1765e53 100644 --- a/parsing/parsers_config.py +++ b/parsing/parsers_config.py @@ -4,11 +4,11 @@ parsers = { 'Pubmed (xml format)' : PubmedFileParser, 'Web of Science (ISI format)' : IsiFileParser, 'Scopus (RIS format)' : RisFileParser, - 'Zotero (RIS format)' : JstorFileParser, + 'Zotero (RIS format)' : ZoteroFileParser, 'Jstor (RIS format)' : JstorFileParser, #'Europress' : EuropressFileParser, 'Europress (French)' : EuropressFileParser, 'Europress (English)' : EuropressFileParser, - + } -- 2.21.0