Commit a83006e4 authored by delanoe

[FIX] dates for Europresse parser: ok.

parent 2fac6579
......@@ -29,9 +29,9 @@ class EuropressFileParser(FileParser):
format_date = re.compile('.*\d{4}.*', re.UNICODE)
if isinstance(file, str):
file_open = open(file, 'rb')
file = open(file, 'rb')
contents = file_open.read()
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
......@@ -91,19 +91,27 @@ class EuropressFileParser(FileParser):
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
else:
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date, languages=['fr', 'en'])
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now()
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
......@@ -118,8 +126,6 @@ class EuropressFileParser(FileParser):
yield hyperdata
file_open.close()
except :
PrintException()
pass
......@@ -133,4 +139,3 @@ if __name__ == "__main__":
except:
pass
......@@ -126,7 +126,7 @@ class FileParser:
# initialize the list of hyperdata
hyperdata_list = []
if zipfile.is_zipfile(file):
print(file, "# is the file is a ZIP archive, recurse on each of its files...")
#print(file, "# is the file is a ZIP archive, recurse on each of its files...")
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
......@@ -137,7 +137,7 @@ class FileParser:
print(error)
# ...otherwise, let's parse it directly!
else:
print(file, "it is not a zip file")
#print(file, "it is not a zip file")
try:
for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment