Commit 96447bcd authored by Romain Loth

fix RIS parser end of records etc

parent 5ce859aa
......@@ -15,7 +15,7 @@ class RISParser(Parser):
_begin = 6
_parameters = {
"ER": {"type": "delimiter"},
"ER": {"type": "delimiter"}, # the record delimiter
"TI": {"type": "hyperdata", "key": "title", "separator": " "},
"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
......@@ -30,36 +30,70 @@ class RISParser(Parser):
}
def parse(self, file):
    """Parse a RIS stream and yield one hyperdata dict per record.

    Each non-blank line is read as a two-character tag followed by a
    value that starts at offset ``self._begin``.  Consecutive lines that
    carry the same tag are accumulated and, once a different tag shows
    up, joined with the separator configured for that tag in
    ``self._parameters`` and stored under the configured hyperdata key.
    Tags missing from ``self._parameters`` are ignored.

    A record ends when a tag of type ``"delimiter"`` is followed by a
    blank line, by another tag, or by the end of the input.  Input that
    ends without a delimiter still yields the record collected so far.

    Args:
        file: iterable of ``bytes`` lines encoded as UTF-8.

    Yields:
        dict: the hyperdata of one record.  ``language_iso2`` is set to
        ``'en'`` when the record has none of ``language_fullname``,
        ``language_iso3`` and ``language_iso2``.
    """
    print("=====> PARSING RIS", file)

    language_keys = ('language_fullname', 'language_iso3', 'language_iso2')

    def finish_record(record):
        # Records that declare no language at all default to English.
        if not any(key in record for key in language_keys):
            record['language_iso2'] = 'en'
        return record

    hyperdata = {}
    last_key = None
    last_values = []

    for line in file:
        # bytes ~~> str
        line = line.decode("UTF-8").rstrip('\r\n')

        if len(line) < 2:
            # Blank line: close the record only if the PREVIOUS line was a
            # record delimiter.  The parameter is looked up from last_key
            # here; reusing a variable left over from an earlier iteration
            # would close records in the middle of their fields.
            parameter = self._parameters.get(last_key)
            if parameter is not None and parameter["type"] == "delimiter":
                yield finish_record(hyperdata)
                last_key = None
                hyperdata = {}
            continue

        # Extract the tag, and keep the rest until we know what to do with it.
        parameter_key = line[:2]
        current_value = line[self._begin:]

        # A new tag means the previous one is finished.
        if parameter_key != last_key:
            parameter = self._parameters.get(last_key)
            if parameter is not None:
                if parameter["type"] == "hyperdata":
                    # 1 - store the values accumulated for the previous tag...
                    separator = parameter.get("separator", "")
                    hyperdata[parameter["key"]] = separator.join(last_values)
                elif parameter["type"] == "delimiter":
                    # ...or finish the record (rare here, since a blank line
                    # usually follows the delimiter and closes it first).
                    yield finish_record(hyperdata)
                    hyperdata = {}
            # 2 - start a fresh value buffer for the new tag.
            last_values = []
            last_key = parameter_key

        # 3 - new tag or repeated one: feed the value buffer.
        last_values.append(current_value)

    # End of input: store the field that was still being accumulated, so an
    # unterminated final record does not lose its last field.
    parameter = self._parameters.get(last_key)
    if parameter is not None and parameter["type"] == "hyperdata":
        separator = parameter.get("separator", "")
        hyperdata[parameter["key"]] = separator.join(last_values)

    # If a record is left in memory, yield it as well.
    if hyperdata:
        yield finish_record(hyperdata)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment