Commit 852f71b6 authored by Administrator's avatar Administrator

[FIX] Adding zotero parser

parent 69333ff9
......@@ -4,21 +4,21 @@ import zipfile
import chardet
from ..Caches import LanguagesCache
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def detect_encoding(self, string):
"""Useful method to detect the document encoding.
"""
encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8')
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
......@@ -27,7 +27,7 @@ class FileParser:
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
......@@ -51,21 +51,23 @@ class FileParser:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
print('date')
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
# finally, return the transformed result!
return hyperdata
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
......@@ -81,18 +83,18 @@ class FileParser:
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
return hyperdata
def format_hyperdata(self, hyperdata):
"""Format the hyperdata."""
hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
......
......@@ -3,15 +3,17 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
from admin.utils import PrintException
class RisFileParser(FileParser):
def __init__(self, language_cache=None):
super(FileParser, self).__init__()
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
self._begin = 6
self._parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
......@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def _parse(self, file):
hyperdata = {}
......@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield hyperdata
from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache
class ZoteroFileParser(RisFileParser):
def __init__(self):
super(RisFileParser, self).__init__()
self._begin = 6
self._parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
b"UR": {"type": "hyperdata", "key": "doi"},
b"DA": {"type": "hyperdata", "key": "publication_date"},
b"PY": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -4,11 +4,11 @@ parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : JstorFileParser,
'Zotero (RIS format)' : ZoteroFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment