[BUG FIX] Some files caused HTML/LXML parsing errors (without any error being logged).

3b34158e · delanoe · 75a7e329 · 3b34158e
Commit 3b34158e authored May 24, 2016 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 6 deletions

Europress.py gargantext/util/parsers/Europress.py +9 -6

No files found.
--- a/gargantext/util/parsers/Europress.py
+++ b/gargantext/util/parsers/Europress.py
@@ -62,12 +62,15 @@ class EuropressParser(Parser):
            except:
                ValueError('Error while decoding from "latin1" to "%s"' % encoding)

-        html_parser = etree.HTMLParser(encoding=codif)
-        html = etree.fromstring(contents, html_parser)
+        try:
+            html_parser = etree.HTMLParser(encoding=codif)
+            html = etree.fromstring(contents, html_parser)

-        html_parser = html5parser.etree.HTMLParser(encoding=codif)
-        html = html5parser.etree.fromstring(contents, html_parser)
-        html_articles = html.xpath('//article')
+            html_parser = html5parser.etree.HTMLParser(encoding=codif)
+            html = html5parser.etree.fromstring(contents, html_parser)
+            html_articles = html.xpath('//article')
+        except Exception as error:
+            print ("Europresse lxml error:", error)

        # all except detail_header are mandatory to parse the article
        name_xpath  = "./header/div/span[@class = 'DocPublicationName']"
@@ -261,7 +264,7 @@ class EuropressParser(Parser):
                yield hyperdata

        except:
-            raise Exception('Something bad happened.')
+            print('Something bad happened.')


 if __name__ == "__main__":