1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import collections
import datetime
import dateutil.parser
import zipfile
import chardet
import re
from ..Caches import LanguagesCache
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def detect_encoding(self, string):
"""Useful method to detect the document encoding.
"""
encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8')
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
# This part mainly deal with Zotero data but can be usefull for others
# parts
date_string = hyperdata.get('publication_date_to_parse', None)
if date_string is not None:
date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
#date_string = re.sub(r'undefined', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
date_string,
default=DEFAULT_DATE
).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
print(error, 'Parser Zotero, Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_day"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_hour"
if key in hyperdata:
date_string += " " + hyperdata[key]
key = prefix + "_minute"
if key in hyperdata:
date_string += ":" + hyperdata[key]
key = prefix + "_second"
if key in hyperdata:
date_string += ":" + hyperdata[key]
try:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except:
pass
else:
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
for prefix in prefixes:
date = dateutil.parser.parse(hyperdata[prefix + "_date"])
#print(date)
hyperdata[prefix + "_year"] = date.strftime("%Y")
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
# finally, return the transformed result!
return hyperdata
print(hyperdata['publication_date'])
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
language_key = "language_" + key
if language_key in hyperdata:
language_symbol = hyperdata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
if language:
hyperdata["language_iso2"] = language.iso2
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
return hyperdata
def format_hyperdata(self, hyperdata):
"""Format the hyperdata."""
hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of hyperdata
hyperdata_list = []
if zipfile.is_zipfile(file):
#print(file, "# is the file is a ZIP archive, recurse on each of its files...")
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
f = zipArchive.open(filename, 'r')
hyperdata_list += self.parse(f)
f.close()
except Exception as error:
print(error)
# ...otherwise, let's parse it directly!
else:
#print(file, "it is not a zip file")
try:
for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata))
if hasattr(file, 'close'):
file.close()
except Exception as error:
print(error)
# return the list of formatted hyperdata
return hyperdata_list