Commit 91e2c2b8 authored by delanoe

[FIX] Bug if files are empty + collective email address for Gargantext work.

parent 774060ae
@@ -6,7 +6,7 @@
 __author__ = "Gargantext Team"
 __copyright__ = "Copyright 2014-16 ISCPIF-CNRS"
 __version__ = "0.2"
-__email__ = "romain.loth@iscpif.fr"
+__email__ = "team@gargantext.org"
 __status__ = "Test"

 import re
@@ -63,13 +63,13 @@ class EuropresseParser(Parser):
             ValueError('Error while decoding from "latin1" to "%s"' % encoding)

         try:
-            html_parser = etree.HTMLParser(encoding=codif)
-            html = etree.fromstring(contents, html_parser)
             html_parser = html5parser.etree.HTMLParser(encoding=codif)
             html = html5parser.etree.fromstring(contents, html_parser)
             html_articles = html.xpath('//article')
         except Exception as error:
+            html_articles = None
             print ("Europresse lxml error:", error)

         # all except detail_header are mandatory to parse the article
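The point of the added line: before this commit, a failed parse (e.g. an empty file) left `html_articles` unbound, so the loop below crashed instead of skipping. Here is a minimal sketch of the failure mode and the fix, not the parser itself; `count_articles` is a hypothetical helper, and it assumes lxml (plus html5lib, which `lxml.html.html5parser` imports) is installed.

    # Minimal sketch: if parsing raises, `html_articles` was never assigned,
    # so iterating over it later died with an UnboundLocalError.
    from lxml.html import html5parser   # requires lxml + html5lib

    def count_articles(contents, codif='UTF-8'):   # hypothetical helper
        try:
            html_parser = html5parser.etree.HTMLParser(encoding=codif)
            html = html5parser.etree.fromstring(contents, html_parser)
            html_articles = html.xpath('//article')
        except Exception as error:
            html_articles = None                   # the fix: bind the name on failure
            print("Europresse lxml error:", error)
        # the guard added by this commit: skip the work entirely on parse failure
        if html_articles is not None:
            return len(html_articles)
        return 0

    print(count_articles(b""))                          # empty file: caught, returns 0
    print(count_articles(b"<article>Hello</article>"))  # parses fine, returns 1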
@@ -113,169 +113,170 @@ class EuropresseParser(Parser):
         # parse all the articles, one by one
+        if html_articles is not None:
             for html_article in html_articles:
                 try:
                     # if there is no header at all we must skip
                     all_header = html_article.xpath(entire_header_xpath)
                     all_header_text = " ".join(scrap_text(all_header))
                     if len(all_header) == 0 or len(all_header_text) == 0:
                         hyperdata['error'] = "Europresse: html doc with no header"
                         yield(hyperdata)
                         print("WARNING: europresse (skip) article without header")
                         continue

                     hyperdata = {}

                     # TITLE
                     # -----
                     title = []
                     try:
                         title = scrap_text(html_article.xpath(title_xpath))
                         hyperdata['title'] = title[0]
                     except:
                         # there will be a display problem if there is no title!
                         print("WARNING: europresse (skip) article without title")
                         hyperdata['error'] = "Europresse: doc with no title"
                         yield(hyperdata)
                         continue

                     # FULLTEXT
                     # --------
                     try:
                         text = scrap_text(html_article.xpath(text_xpath))
                         hyperdata['abstract'] = '\n'.join(['<p>'+p_text+'</p>' for p_text in title[1:] + text])
                     except:
                         pass

                     # PUBLICATIONNAME
                     # ---------------
                     try:
                         pub_name = html_article.xpath(name_xpath)[0].text
                         name = pub_name.split(', ')
                         hyperdata['journal'] = name[0]
                         hyperdata['number'] = name[1]
                     except:
                         try:
                             hyperdata['journal'] = pub_name.strip()
                         except:
                             pass

                     # DATE and LANGUAGE
                     # -----------------
                     # local language detection via the date format
                     #
                     # lets us choose ResourceType "Europress" without caring
                     # about the detail of the source language
                     doc_language = None
                     date = None

                     # the text in which we will look for the date/language
                     search_text = None
                     # DocHeader zone giving precisely the section and the date
                     detailed_text = None

                     get_detail_header = html_article.xpath(detail_header_xpath)
                     if len(get_detail_header) != 0:
                         # most common case
                         # ----------------
                         # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
                         # ex: "Votre ville, jeudi 6 février 2014"
                         # ex: "World, Friday, November 13, 2015"
                         detailed_text = get_detail_header[0].text
                         search_text = detailed_text
                     else:
                         # DocHeader occasionally absent
                         # (we fall back on the entire header)
                         search_text = all_header_text
                         # print("---using all header: '%s'" % search_text)

                     # if we found no zone at all
                     if not search_text:
                         the_err = "europresse (skip) doc without detailed header"
                         print("WARNING:" + the_err)
                         hyperdata['error'] = the_err
                         yield(hyperdata)
                         continue

                     # we go on with date/language on the zone we obtained...

                     # 1) one REGEXP identifies the language AND captures the date
                     test_date_fr = re.search(format_date_fr, search_text)

                     if test_date_fr:
                         doc_language = 'fr'
                         # print("=============== Header date fr")

                         # save for FileParser
+                        # TODO this does not work
                         hyperdata["language_iso2"] = 'fr'

                         # match str
                         date_str = test_date_fr.group()
                     else:
                         # ex: November 7, 2012
                         test_date_en = re.search(format_date_en, search_text)

                         if test_date_en:
                             doc_language = 'en'
                             # print("=============== Header date en")
                             # save for FileParser
+                            # TODO this does not work
                             hyperdata["language_iso2"] = 'fr'
                             # match str
                             date_str = test_date_en.group()
                         else:
                             print("WARNING europresse: echec diagnostic date/langue header sur '%s'" % header)
                             # default language value, used locally, not saved
                             doc_language = 'en'
                             # default date value, will be saved
                             date_str = "2016"

                     # 2) we parse the retrieved datestring into a formal date
                     try:
                         the_date = dateparser.parse(
                                         date_str.strip(),
                                         languages=[doc_language],
                                         date_formats=['%d %B %Y','%B %d, %Y']
                                         )
                     except:
                         the_date = timezone.now()

                     hyperdata['publication_date'] = the_date.strftime("%Y-%m-%d %H:%M:%S")
                     # print("RES POSTPROC:", hyperdata['publication_date'])

                     # derived info
                     hyperdata['publication_year'] = the_date.strftime('%Y')
                     hyperdata['publication_month'] = the_date.strftime('%m')
                     hyperdata['publication_day'] = the_date.strftime('%d')

                     # RUBRIQUE (news section)
                     # -----------------------
                     # when we have the detailed DocHeader we can check the section
                     # (if present, it sits right before the date)
                     if detailed_text is not None:
                         header_elts = detailed_text.split(', ')
                         # check that the first element is not a date or a date fragment
                         if parse_date(header_elts[0], doc_language) is None:
                             # most probably news_topic before beginning of date
                             hyperdata['rubrique'] = header_elts[0]

                     # print(hyperdata)
                     yield hyperdata

                 except Exception as err:
                     print('WARNING: europresse (skip) unknown error:"' + str(err) + '"'
                           + "\n>>>" + (">>>".join(format_tb(err.__traceback__))))
                     hyperdata['error'] = err
                     yield(hyperdata)
                     continue
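For reference, the date/language heuristic in this hunk can be exercised on its own. The sketch below is a simplified stand-in, not the module's code: `FORMAT_DATE_FR`/`FORMAT_DATE_EN` are hypothetical approximations of the real `format_date_fr`/`format_date_en` patterns (which this diff does not show), `guess_date` is an invented helper, and `datetime.now()` stands in for Django's `timezone.now()`. It needs the `dateparser` package.

    import re
    from datetime import datetime
    import dateparser   # pip install dateparser

    # hypothetical stand-ins for the parser's format_date_fr / format_date_en
    FORMAT_DATE_FR = r'\d{1,2}\s+\w+\s+\d{4}'    # ex: "28 janvier 2013"
    FORMAT_DATE_EN = r'\w+\s+\d{1,2},\s+\d{4}'   # ex: "November 13, 2015"

    def guess_date(search_text):
        """Guess (language, date) from a Europresse header line, as above."""
        test_fr = re.search(FORMAT_DATE_FR, search_text)
        if test_fr:
            doc_language, date_str = 'fr', test_fr.group()
        else:
            test_en = re.search(FORMAT_DATE_EN, search_text)
            if test_en:
                doc_language, date_str = 'en', test_en.group()
            else:
                # same defaults as the parser: English, bare year "2016"
                doc_language, date_str = 'en', "2016"
        # dateparser first translates the string using `languages`, then tries
        # the given formats; that is why 'fr' dates work with '%d %B %Y'
        the_date = dateparser.parse(
            date_str.strip(),
            languages=[doc_language],
            date_formats=['%d %B %Y', '%B %d, %Y'],
        ) or datetime.now()   # dateparser returns None on failure
        return doc_language, the_date

    print(guess_date("Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"))
    # ('fr', datetime.datetime(2013, 1, 28, 0, 0))
    print(guess_date("World, Friday, November 13, 2015"))
    # ('en', datetime.datetime(2015, 11, 13, 0, 0))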