Commit 6a2e10fb authored by Administrator

[BUG FIX] Adding try/except for ugly encoded corpora.

parent 0c97f772
......@@ -42,7 +42,7 @@ class EuropressFileParser(FileParser):
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
try:
for html_article in html_articles:
metadata = {}
......@@ -59,7 +59,12 @@ class EuropressFileParser(FileParser):
metadata['source'] = name.text.encode(codif)
for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
try:
text = header.text
except Exception as error:
print(error)
if isinstance(text, bytes):
text = text.decode(encoding)
......@@ -177,6 +182,10 @@ class EuropressFileParser(FileParser):
metadata_list.append(metadata)
count += 1
except Exception as error:
print(error)
pass
# from pprint import pprint
# pprint(metadata_list)
# return []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment