1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import collections
import datetime
import zipfile

import chardet
import dateutil.parser

from ..Caches import LanguagesCache
class FileParser:
    """Base class for parsing files into lists of metadata dictionaries.

    Subclasses override ``_parse`` to extract the raw metadata records of one
    file format. ``parse`` handles ZIP archives (recursing into every member)
    and normalizes the date and language fields of each record.
    """

    # Optional date components, from coarsest to finest, paired with the
    # separator written before each one when a date string is rebuilt from
    # split fields ("<year> <month> <day> <hour>:<minute>:<second>").
    _DATE_COMPONENTS = (
        ("_month", " "),
        ("_day", " "),
        ("_hour", " "),
        ("_minute", ":"),
        ("_second", ":"),
    )

    # Split fields written back from a parsed date, with their strftime format.
    _DATE_FIELDS = (
        ("_year", "%Y"),
        ("_month", "%m"),
        ("_day", "%d"),
        ("_hour", "%H"),
        ("_minute", "%M"),
        ("_second", "%S"),
    )

    # Exceptions treated as "this value is not a usable date".
    _DATE_PARSE_ERRORS = (ValueError, OverflowError, TypeError)

    def __init__(self, language_cache=None):
        """Create a parser.

        Args:
            language_cache: mapping from a language symbol (full name, ISO3
                or ISO2 code) to a language object exposing ``iso2``, ``iso3``
                and ``fullname``. A new ``LanguagesCache`` is created when
                omitted.
        """
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    def detect_encoding(self, string):
        """Return the encoding guessed by chardet for ``string``.

        Falls back to ``'UTF-8'`` when chardet cannot decide: in that case it
        reports the encoding as ``None`` rather than omitting the key, so a
        plain ``dict.get`` default would never apply.
        """
        detected = chardet.detect(string)
        return detected.get('encoding') or 'UTF-8'

    @classmethod
    def _parse_date(cls, date_string):
        """Parse ``date_string`` with dateutil; return ``None`` if it is not a date.

        Components missing from the string default to January 1st at midnight
        of the current year, so the result does not depend on the day the
        parser happens to run.
        """
        default = datetime.datetime.now().replace(
            month=1, day=1, hour=0, minute=0, second=0, microsecond=0
        )
        try:
            return dateutil.parser.parse(date_string, default=default)
        except cls._DATE_PARSE_ERRORS:
            return None

    def format_metadata_dates(self, metadata):
        """Format the dates found in the metadata (in place).

        Every ``<prefix>_year`` entry (optionally refined by ``_month``,
        ``_day``, ``_hour``, ``_minute`` and ``_second``) is combined into a
        ``<prefix>_date`` entry, then every ``<prefix>_date`` entry is split
        back into zero-padded components. Values that cannot be parsed as a
        date are left untouched.

        Examples:
            {"publication_date": "2014-10-23 09:57:42"}
            -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
            {"publication_year": "2014"}
            -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}

        Args:
            metadata: dictionary of metadata fields; it is modified in place.

        Returns:
            The same ``metadata`` dictionary.
        """
        # First, rebuild a full date from the split fields...
        year_prefixes = [key[:-5] for key in metadata if key.endswith("_year")]
        for prefix in year_prefixes:
            parts = [str(metadata[prefix + "_year"])]
            for suffix, separator in self._DATE_COMPONENTS:
                key = prefix + suffix
                # A finer component is meaningless without the coarser ones
                # before it, so stop at the first gap.
                if key not in metadata:
                    break
                parts.append(separator + str(metadata[key]))
            date = self._parse_date("".join(parts))
            if date is not None:
                metadata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")

        # ...then split every "date" field into its separate elements.
        date_prefixes = [key[:-5] for key in metadata if key.endswith("_date")]
        for prefix in date_prefixes:
            date = self._parse_date(metadata[prefix + "_date"])
            if date is None:
                continue
            for suffix, date_format in self._DATE_FIELDS:
                metadata[prefix + suffix] = date.strftime(date_format)

        # Finally, return the transformed result.
        return metadata

    def format_metadata_languages(self, metadata):
        """Complete the language fields of the metadata (in place).

        The language is looked up in the languages cache from the first of
        ``language_fullname``, ``language_iso3`` and ``language_iso2`` that
        resolves to a known language; the three fields are then all set from
        that language. The metadata is left unchanged when none resolves.

        Args:
            metadata: dictionary of metadata fields; it is modified in place.

        Returns:
            The same ``metadata`` dictionary.
        """
        language = None
        for key in ("fullname", "iso3", "iso2"):
            language_key = "language_" + key
            if language_key in metadata:
                language = self._languages_cache[metadata[language_key]]
                if language:
                    break
        if language:
            metadata["language_iso2"] = language.iso2
            metadata["language_iso3"] = language.iso3
            metadata["language_fullname"] = language.fullname
        return metadata

    def format_metadata(self, metadata):
        """Normalize the date fields, then the language fields, of ``metadata``."""
        metadata = self.format_metadata_dates(metadata)
        metadata = self.format_metadata_languages(metadata)
        return metadata

    def _parse(self, file):
        """Return the raw metadata records found in ``file``.

        This method shall be overridden by inherited classes; the base
        implementation finds nothing.
        """
        return []

    @staticmethod
    def _is_zip_archive(file):
        """Tell whether ``file`` is a ZIP archive, preserving its read position.

        ``zipfile.is_zipfile`` seeks inside file objects and does not rewind
        them, which would leave nothing to read for the actual parser.
        """
        tell = getattr(file, "tell", None)
        seek = getattr(file, "seek", None)
        position = None
        if callable(tell) and callable(seek):
            try:
                position = tell()
            except (OSError, ValueError):
                # Not a seekable stream: is_zipfile cannot move it either.
                position = None
        try:
            return zipfile.is_zipfile(file)
        finally:
            if position is not None:
                try:
                    seek(position)
                except (OSError, ValueError):
                    # Best effort only: the stream refused to rewind.
                    pass

    def parse(self, file):
        """Parse the file, and its children files found in the file.

        Args:
            file: a path or a binary file object. A file object that is not a
                ZIP archive is closed once it has been parsed.

        Returns:
            The list of formatted metadata dictionaries. Errors raised while
            parsing a file are printed and that file's remaining records are
            skipped.
        """
        # initialize the list of metadata
        metadata_list = []
        # if the file is a ZIP archive, recurse on each of its files...
        if self._is_zip_archive(file):
            with zipfile.ZipFile(file) as archive:
                for filename in archive.namelist():
                    try:
                        with archive.open(filename, 'r') as member:
                            metadata_list += self.parse(member)
                    except Exception as error:
                        print(error)
        # ...otherwise, let's parse it directly!
        else:
            try:
                for metadata in self._parse(file):
                    metadata_list.append(self.format_metadata(metadata))
            except Exception as error:
                print(error)
            finally:
                # Paths have no close(); file objects are released even when
                # parsing failed.
                if hasattr(file, 'close'):
                    file.close()
        # return the list of formatted metadata
        return metadata_list