Commit 6a2e10fb authored by Administrator

[BUG FIX] Adding try/except for ugly encoded corpora.

parent 0c97f772
...@@ -42,7 +42,7 @@ class EuropressFileParser(FileParser): ...@@ -42,7 +42,7 @@ class EuropressFileParser(FileParser):
# initialize the list of metadata # initialize the list of metadata
metadata_list = [] metadata_list = []
# parse all the articles, one by one # parse all the articles, one by one
try:
for html_article in html_articles: for html_article in html_articles:
metadata = {} metadata = {}
...@@ -59,7 +59,12 @@ class EuropressFileParser(FileParser): ...@@ -59,7 +59,12 @@ class EuropressFileParser(FileParser):
metadata['source'] = name.text.encode(codif) metadata['source'] = name.text.encode(codif)
for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"): for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
try:
text = header.text text = header.text
except Exception as error:
print(error)
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode(encoding) text = text.decode(encoding)
...@@ -177,6 +182,10 @@ class EuropressFileParser(FileParser): ...@@ -177,6 +182,10 @@ class EuropressFileParser(FileParser):
metadata_list.append(metadata) metadata_list.append(metadata)
count += 1 count += 1
except Exception as error:
print(error)
pass
# from pprint import pprint # from pprint import pprint
# pprint(metadata_list) # pprint(metadata_list)
# return [] # return []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment