Commit c22ee4aa authored by Alexandre Delanoë

[PARSER] Europress Article text fix + adding authors extraction.

parent 2d470e64
......@@ -82,7 +82,8 @@ class EuropresseParser(Parser):
#
# title_xpath (chemin plus générique)
title_xpath = "./header//*[contains(@class,'titreArticle')]"
text_xpath = "./section/div[@class='DocText']//p"
authors_xpath = "./header//*[contains(@class,'docAuthors')]"
text_xpath = "./section/div[@class='DocText clearfix']//p"
entire_header_xpath = "./header"
# diagnosed during date retrieval and used for rubrique
......@@ -144,6 +145,15 @@ class EuropresseParser(Parser):
yield(hyperdata)
continue
# Authors
# --------
try:
authors = scrap_text(html_article.xpath(authors_xpath))
hyperdata['authors'] = '; '.join([author for author in authors])
except:
pass
# FULLTEXT
# --------
......@@ -154,6 +164,7 @@ class EuropresseParser(Parser):
except:
pass
# PUBLICATIONNAME
# ----------------
try:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment