Commit 5ce424f9 authored by Romain Loth

parsers: fix europresse exceptions handling

parent ee50eb3c
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
""" """
__author__ = "Gargantext Team" __author__ = "Gargantext Team"
__copyright__ = "Copyright 2014-15 ISCPIF-CNRS" __copyright__ = "Copyright 2014-16 ISCPIF-CNRS"
__version__ = "0.1" __version__ = "0.2"
__email__ = "romain.loth@iscpif.fr" __email__ = "romain.loth@iscpif.fr"
__status__ = "Test" __status__ = "Test"
...@@ -27,6 +27,7 @@ import sys ...@@ -27,6 +27,7 @@ import sys
#from admin.env import * #from admin.env import *
#from parsing.FileParsers.FileParser import FileParser #from parsing.FileParsers.FileParser import FileParser
from ._Parser import Parser from ._Parser import Parser
from traceback import format_tb
class EuropresseParser(Parser): class EuropresseParser(Parser):
...@@ -112,15 +113,17 @@ class EuropresseParser(Parser): ...@@ -112,15 +113,17 @@ class EuropresseParser(Parser):
# parse all the articles, one by one # parse all the articles, one by one
try: for html_article in html_articles:
for html_article in html_articles: try:
print("==============================new article") print("==============================new article")
# s'il n'y a pas du tout de header on doit skip # s'il n'y a pas du tout de header on doit skip
all_header = html_article.xpath(entire_header_xpath) all_header = html_article.xpath(entire_header_xpath)
if len(all_header) == 0: all_header_text = " ".join(scrap_text(all_header))
print("WARNING: europress (skip) article without header") if len(all_header) == 0 or len(all_header_text) == 0:
hyperdata['error']="Europresse: html doc with no header"
yield(hyperdata)
print("WARNING: europresse (skip) article without header")
continue continue
hyperdata = {} hyperdata = {}
...@@ -134,7 +137,9 @@ class EuropresseParser(Parser): ...@@ -134,7 +137,9 @@ class EuropresseParser(Parser):
hyperdata['title'] = title[0] hyperdata['title'] = title[0]
except: except:
# il y aura un problème d'affichage si pas de titre ! # il y aura un problème d'affichage si pas de titre !
print("WARNING: europress (skip) article without title") print("WARNING: europresse (skip) article without title")
hyperdata['error']="Europresse: doc with no title"
yield(hyperdata)
continue continue
...@@ -189,12 +194,19 @@ class EuropresseParser(Parser): ...@@ -189,12 +194,19 @@ class EuropresseParser(Parser):
else: else:
# occasionellment DocHeader absent # occasionellment DocHeader absent
# (on se rabat sur le header entier) # (on se rabat sur le header entier)
search_text = " ".join(scrap_text(all_header[0])) search_text = all_header_text
# print("---using all header: '%s'" % search_text) # print("---using all header: '%s'" % search_text)
# si on n'a pas trouvé de zone du tout
if not search_text:
the_err = "europresse (skip) doc without detailed header"
print("WARNING:" + the_err)
hyperdata['error']= the_err
yield(hyperdata)
continue
# on poursuit date/langue avec la zone obtenue # on poursuit date/langue avec la zone obtenue...
# 1) Une REGEXP identifie la langue ET attrape la date # 1) Une REGEXP identifie la langue ET attrape la date
test_date_fr = re.search(format_date_fr,search_text) test_date_fr = re.search(format_date_fr,search_text)
...@@ -223,7 +235,7 @@ class EuropresseParser(Parser): ...@@ -223,7 +235,7 @@ class EuropresseParser(Parser):
# match str # match str
date_str = test_date_en.group() date_str = test_date_en.group()
else: else:
print("WARNING europress: echec diagnostic date/langue header sur '%s'" % header) print("WARNING europresse: echec diagnostic date/langue header sur '%s'" % header)
# default lg value, used locally, not saved # default lg value, used locally, not saved
doc_language = 'en' doc_language = 'en'
# default date value, will be saved # default date value, will be saved
...@@ -260,8 +272,12 @@ class EuropresseParser(Parser): ...@@ -260,8 +272,12 @@ class EuropresseParser(Parser):
# most probably news_topic before beginning of date # most probably news_topic before beginning of date
hyperdata['rubrique'] = header_elts[0] hyperdata['rubrique'] = header_elts[0]
print(hyperdata) # print(hyperdata)
yield hyperdata yield hyperdata
except Exception as err: except Exception as err:
print('Europresse parser: Something bad happened:' + str(err)) print('WARNING: europresse (skip) unknown error:"' + str(err) + '"'
+ "\n>>>" + (">>>".join(format_tb(err.__traceback__))))
hyperdata['error']= err
yield(hyperdata)
continue
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment