Commit a83006e4 authored by delanoe

[FIX] dates for Europresse parser: ok.

parent 2fac6579
...@@ -29,9 +29,9 @@ class EuropressFileParser(FileParser): ...@@ -29,9 +29,9 @@ class EuropressFileParser(FileParser):
format_date = re.compile('.*\d{4}.*', re.UNICODE) format_date = re.compile('.*\d{4}.*', re.UNICODE)
if isinstance(file, str): if isinstance(file, str):
file_open = open(file, 'rb') file = open(file, 'rb')
contents = file_open.read() contents = file.read()
encoding = self.detect_encoding(contents) encoding = self.detect_encoding(contents)
if encoding != "utf-8": if encoding != "utf-8":
...@@ -91,19 +91,27 @@ class EuropressFileParser(FileParser): ...@@ -91,19 +91,27 @@ class EuropressFileParser(FileParser):
header = header.split(', ') header = header.split(', ')
if format_date.match(header[0]): if format_date.match(header[0]):
date = header[0] date = header[0]
else: elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0] hyperdata['rubrique'] = header[0]
date = header[1] date = header[1]
try:
try: hyperdata['page'] = header[2].split(' ')[1]
hyperdata['page'] = header[2].split(' ')[1] except:
except: pass
pass else:
date = header[2]
try: try:
hyperdata['publication_date'] = dateparser.parse(date, languages=['fr', 'en']) hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except: except:
hyperdata['publication_date'] = timezone.now() hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try: try:
title = paragraph_list(html_article.xpath(title_xpath)) title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0] hyperdata['title'] = title[0]
...@@ -118,8 +126,6 @@ class EuropressFileParser(FileParser): ...@@ -118,8 +126,6 @@ class EuropressFileParser(FileParser):
yield hyperdata yield hyperdata
file_open.close()
except : except :
PrintException() PrintException()
pass pass
...@@ -133,4 +139,3 @@ if __name__ == "__main__": ...@@ -133,4 +139,3 @@ if __name__ == "__main__":
except: except:
pass pass
...@@ -126,7 +126,7 @@ class FileParser: ...@@ -126,7 +126,7 @@ class FileParser:
# initialize the list of hyperdata # initialize the list of hyperdata
hyperdata_list = [] hyperdata_list = []
if zipfile.is_zipfile(file): if zipfile.is_zipfile(file):
print(file, "# is the file is a ZIP archive, recurse on each of its files...") #print(file, "# is the file is a ZIP archive, recurse on each of its files...")
zipArchive = zipfile.ZipFile(file) zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist(): for filename in zipArchive.namelist():
try: try:
...@@ -137,7 +137,7 @@ class FileParser: ...@@ -137,7 +137,7 @@ class FileParser:
print(error) print(error)
# ...otherwise, let's parse it directly! # ...otherwise, let's parse it directly!
else: else:
print(file, "it is not a zip file") #print(file, "it is not a zip file")
try: try:
for hyperdata in self._parse(file): for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata)) hyperdata_list.append(self.format_hyperdata(hyperdata))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment