Commit fbfb5d60 authored by sim's avatar sim

[FIX] Date parsing for ISI (Web of Science)

parent c59d8b79
import re
from .RIS import RISParser
......@@ -17,3 +19,34 @@ class ISIParser(RISParser):
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
}
# Pre-compiled patterns used by _preprocess_PD to clean up the "PD" value.
# A standalone run of exactly four digits, used to detect a year.
_year = re.compile(r'\b\d{4}\b')
# One of the season abbreviations SPR, SUM, FAL or WIN (case-insensitive).
_season = re.compile(r'\b(SPR|SUM|FAL|WIN)\b', re.I)
# Two three-letter words joined by a hyphen, e.g. "MAR-APR" (case-insensitive);
# group 1 is the first word, group 2 the second.
_month_interval = re.compile(r'\b([A-Z]{3})-([A-Z]{3})\b', re.I)
# Two one- or two-digit numbers joined by a hyphen, e.g. "12-15";
# group 1 is the first number, group 2 the second.
_day_interval = re.compile(r'\b(\d{1,2})-(\d{1,2})\b')
def _preprocess_PD(self, PD, PY):
    """Simplify a "PD" date string before it is handed on for date parsing.

    The following clean-ups are applied, in order:

    1. If ``PY`` contains a four-digit year and ``PD`` does not, ``PY`` is
       prepended to ``PD``.
    2. Season abbreviations (SPR, SUM, FAL, WIN) are removed.  A season
       interval such as "SPR-SUM" would otherwise leave its hyphen behind,
       so stray leading/trailing hyphens are removed as well.
    3. A month interval ("MAR-APR") is reduced to its first month.
    4. A day interval ("12-15") is reduced to its first day.

    Args:
        PD: the raw date string to clean up.
        PY: the year string, or a falsy value when no year is available.

    Returns:
        The simplified date string.
    """
    # Add a year to date if applicable
    if PY and self._year.search(PY) and not self._year.search(PD):
        PD = PY + " " + PD
    # Drop season if any
    PD = self._season.sub('', PD).strip()
    # Removing both halves of a season interval leaves a dangling "-"
    # (e.g. "2013 SPR-SUM" -> "2013 -"); drop it so only the date remains
    PD = PD.strip('-').strip()
    # If a month interval is present, keep only the first month
    PD = self._month_interval.sub(r'\1', PD)
    # If a day interval is present, keep only the first day
    PD = self._day_interval.sub(r'\1', PD)
    return PD
def parse(self, file):
    """Yield the parent parser's entries with their "PD" value cleaned up.

    Every entry produced by ``super().parse(file)`` is passed through
    unchanged, except that when it contains the key configured for "PD",
    that value is replaced by the result of ``_preprocess_PD``.

    Args:
        file: forwarded as-is to the parent class's ``parse``.

    Yields:
        Each entry from the parent parser, updated in place when applicable.
    """
    PD = self._parameters["PD"]["key"]
    PY = self._parameters["PY"]["key"]
    for entry in super().parse(file):
        if PD in entry:
            # An entry may have a PD value without a PY one; indexing
            # entry[PY] directly would raise KeyError in that case.
            # _preprocess_PD already accepts a falsy year.
            year = entry[PY] if PY in entry else None
            entry[PD] = self._preprocess_PD(entry[PD], year)
        yield entry
......@@ -86,6 +86,9 @@ class Parser:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.now()
# XXX Handling prefixes is most likely useless: there seems to be only
# one prefix, which is "publication" (as in "publication_date").
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment