Commit 8527685b authored by Administrator

[BUGFIX] Encoding error with corpus coming from Windows OS.

parent a1c7dd0e
......@@ -2,6 +2,7 @@ import re
import locale
from lxml import etree
from datetime import datetime, date
from django.utils import timezone
from .FileParser import FileParser
from ..NgramsExtractors import *
......@@ -17,14 +18,17 @@ class EuropressFileParser(FileParser):
if isinstance(file, str):
file = open(file, 'rb')
print(file)
#print(file)
contents = file.read()
print(len(contents))
#print(len(contents))
#return []
encoding = self.detect_encoding(contents)
print(encoding)
if encoding != "utf-8":
contents = contents.decode(encoding, errors='replace').encode(codif)
try:
html_parser = etree.HTMLParser(encoding=encoding)
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_articles = html.xpath('/html/body/table')
except:
......@@ -69,12 +73,18 @@ class EuropressFileParser(FileParser):
if test_date_fr is not None:
self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding)
if encoding != "utf-8":
text = text.replace('י', 'é')
text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ')
try :
metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except :
try:
metadata['publication_date'] = datetime.strptime(text, '%B %Y')
except :
print(text)
pass
if test_date_en is not None:
......@@ -122,7 +132,7 @@ class EuropressFileParser(FileParser):
try:
back = metadata['publication_date']
except Exception as e:
print(e)
#print(e)
pass
else:
try:
......@@ -130,7 +140,7 @@ class EuropressFileParser(FileParser):
except Exception as e:
print(e)
except :
metadata['publication_date'] = datetime.now()
metadata['publication_date'] = timezone.now()
#if lang == 'fr':
#metadata['language_iso2'] = 'fr'
......@@ -161,12 +171,5 @@ class EuropressFileParser(FileParser):
# pprint(metadata_list)
# return []
return metadata_list
#
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment