Commit 8527685b authored by Administrator

[BUGFIX] Encoding error with corpus coming from Windows OS.

parent a1c7dd0e
...@@ -2,6 +2,7 @@ import re ...@@ -2,6 +2,7 @@ import re
import locale import locale
from lxml import etree from lxml import etree
from datetime import datetime, date from datetime import datetime, date
from django.utils import timezone
from .FileParser import FileParser from .FileParser import FileParser
from ..NgramsExtractors import * from ..NgramsExtractors import *
...@@ -17,14 +18,17 @@ class EuropressFileParser(FileParser): ...@@ -17,14 +18,17 @@ class EuropressFileParser(FileParser):
if isinstance(file, str): if isinstance(file, str):
file = open(file, 'rb') file = open(file, 'rb')
print(file) #print(file)
contents = file.read() contents = file.read()
print(len(contents)) #print(len(contents))
#return [] #return []
encoding = self.detect_encoding(contents) encoding = self.detect_encoding(contents)
print(encoding)
if encoding != "utf-8":
contents = contents.decode(encoding, errors='replace').encode(codif)
try: try:
html_parser = etree.HTMLParser(encoding=encoding) html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser) html = etree.fromstring(contents, html_parser)
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
except: except:
...@@ -69,12 +73,18 @@ class EuropressFileParser(FileParser): ...@@ -69,12 +73,18 @@ class EuropressFileParser(FileParser):
if test_date_fr is not None: if test_date_fr is not None:
self.localeEncoding = "fr_FR" self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding) locale.setlocale(locale.LC_ALL, localeEncoding)
if encoding != "utf-8":
text = text.replace('י', 'é')
text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ')
try : try :
metadata['publication_date'] = datetime.strptime(text, '%d %B %Y') metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except : except :
try: try:
metadata['publication_date'] = datetime.strptime(text, '%B %Y') metadata['publication_date'] = datetime.strptime(text, '%B %Y')
except : except :
print(text)
pass pass
if test_date_en is not None: if test_date_en is not None:
...@@ -122,7 +132,7 @@ class EuropressFileParser(FileParser): ...@@ -122,7 +132,7 @@ class EuropressFileParser(FileParser):
try: try:
back = metadata['publication_date'] back = metadata['publication_date']
except Exception as e: except Exception as e:
print(e) #print(e)
pass pass
else: else:
try: try:
...@@ -130,7 +140,7 @@ class EuropressFileParser(FileParser): ...@@ -130,7 +140,7 @@ class EuropressFileParser(FileParser):
except Exception as e: except Exception as e:
print(e) print(e)
except : except :
metadata['publication_date'] = datetime.now() metadata['publication_date'] = timezone.now()
#if lang == 'fr': #if lang == 'fr':
#metadata['language_iso2'] = 'fr' #metadata['language_iso2'] = 'fr'
...@@ -161,12 +171,5 @@ class EuropressFileParser(FileParser): ...@@ -161,12 +171,5 @@ class EuropressFileParser(FileParser):
# pprint(metadata_list) # pprint(metadata_list)
# return [] # return []
return metadata_list return metadata_list
#
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment