Commit 6f11445e authored by delanoe's avatar delanoe

[FIX] Europresse parser for English documents: ok.

parent f04dbe16
......@@ -24,9 +24,13 @@ from admin.utils import PrintException
class EuropressFileParser(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
def parse_date(date, lang):
    """Parse ``date`` with dateparser, restricted to the language ``lang``.

    Leading and trailing whitespace is stripped from ``date`` before it is
    handed to ``dateparser.parse``; the parser's result is returned
    unchanged.
    """
    return dateparser.parse(date.strip(), languages=[lang])
if isinstance(file, str):
file = open(file, 'rb')
......@@ -89,17 +93,26 @@ class EuropressFileParser(FileParser):
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
if parse_date(header[0], 'fr') is not None:
date = header[0]
elif format_date.match(header[1]):
elif parse_date(header[1], 'fr') is not None:
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
elif parse_date(header[2], 'fr') is not None:
date = header[2]
elif parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment