Commit 91e2c2b8 authored by delanoe's avatar delanoe

[FIX] Bug when files are empty + collective email address for Gargantext work.

parent 774060ae
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
__author__ = "Gargantext Team" __author__ = "Gargantext Team"
__copyright__ = "Copyright 2014-16 ISCPIF-CNRS" __copyright__ = "Copyright 2014-16 ISCPIF-CNRS"
__version__ = "0.2" __version__ = "0.2"
__email__ = "romain.loth@iscpif.fr" __email__ = "team@gargantext.org"
__status__ = "Test" __status__ = "Test"
import re import re
...@@ -63,13 +63,13 @@ class EuropresseParser(Parser): ...@@ -63,13 +63,13 @@ class EuropresseParser(Parser):
ValueError('Error while decoding from "latin1" to "%s"' % encoding) ValueError('Error while decoding from "latin1" to "%s"' % encoding)
try: try:
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif) html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser) html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article') html_articles = html.xpath('//article')
except Exception as error: except Exception as error:
html_articles = None
print ("Europresse lxml error:", error) print ("Europresse lxml error:", error)
# all except detail_header are mandatory to parse the article # all except detail_header are mandatory to parse the article
...@@ -113,6 +113,7 @@ class EuropresseParser(Parser): ...@@ -113,6 +113,7 @@ class EuropresseParser(Parser):
# parse all the articles, one by one # parse all the articles, one by one
if html_articles is not None:
for html_article in html_articles: for html_article in html_articles:
try: try:
# s'il n'y a pas du tout de header on doit skip # s'il n'y a pas du tout de header on doit skip
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment