Commit 0f39af9a authored by Administrator

[FIX] Europresse parser, Date parser fix.

parent af167ccf
...@@ -3,6 +3,7 @@ import locale ...@@ -3,6 +3,7 @@ import locale
from lxml import etree from lxml import etree
from datetime import datetime, date from datetime import datetime, date
from django.utils import timezone from django.utils import timezone
import dateutil.parser
from .FileParser import FileParser from .FileParser import FileParser
from ..NgramsExtractors import * from ..NgramsExtractors import *
...@@ -23,9 +24,9 @@ class EuropressFileParser(FileParser): ...@@ -23,9 +24,9 @@ class EuropressFileParser(FileParser):
#print(len(contents)) #print(len(contents))
#return [] #return []
encoding = self.detect_encoding(contents) encoding = self.detect_encoding(contents)
print(encoding) #print(encoding)
if encoding != "utf-8": #if encoding != "utf-8":
contents = contents.decode(encoding, errors='replace').encode(codif) contents = contents.decode(encoding, errors='replace').encode(codif)
try: try:
html_parser = etree.HTMLParser(encoding=codif) html_parser = etree.HTMLParser(encoding=codif)
...@@ -78,14 +79,19 @@ class EuropressFileParser(FileParser): ...@@ -78,14 +79,19 @@ class EuropressFileParser(FileParser):
text = text.replace('ű', 'û') text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ') text = text.replace(' aot ', ' août ')
try : try :
metadata['publication_date'] = datetime.strptime(text, '%d %B %Y') metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except : except :
try: try:
metadata['publication_date'] = datetime.strptime(text, '%B %Y') metadata['publication_date'] = datetime.strptime(text, '%B %Y')
except : except :
print(text) try:
pass metadata['publication_date'] = dateutil.parser.parse(text)
except Exception as error:
print(error)
print(text)
pass
if test_date_en is not None: if test_date_en is not None:
localeEncoding = "en_GB.UTF-8" localeEncoding = "en_GB.UTF-8"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment