Commit 3b34158e authored by delanoe

[BUG FIX] Some files caused HTML/lxml parsing errors (without any error being logged).

parent 75a7e329
...@@ -62,12 +62,15 @@ class EuropressParser(Parser): ...@@ -62,12 +62,15 @@ class EuropressParser(Parser):
except: except:
ValueError('Error while decoding from "latin1" to "%s"' % encoding) ValueError('Error while decoding from "latin1" to "%s"' % encoding)
html_parser = etree.HTMLParser(encoding=codif) try:
html = etree.fromstring(contents, html_parser) html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif) html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser) html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article') html_articles = html.xpath('//article')
except Exception as error:
print ("Europresse lxml error:", error)
# all except detail_header are mandatory to parse the article # all except detail_header are mandatory to parse the article
name_xpath = "./header/div/span[@class = 'DocPublicationName']" name_xpath = "./header/div/span[@class = 'DocPublicationName']"
...@@ -261,7 +264,7 @@ class EuropressParser(Parser): ...@@ -261,7 +264,7 @@ class EuropressParser(Parser):
yield hyperdata yield hyperdata
except: except:
raise Exception('Something bad happened.') print('Something bad happened.')
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment