Commit c22ee4aa authored by Alexandre Delanoë

[PARSER] Europresse: fix article text extraction and add authors extraction.

parent 2d470e64
...@@ -82,7 +82,8 @@ class EuropresseParser(Parser): ...@@ -82,7 +82,8 @@ class EuropresseParser(Parser):
# #
# title_xpath (chemin plus générique) # title_xpath (chemin plus générique)
title_xpath = "./header//*[contains(@class,'titreArticle')]" title_xpath = "./header//*[contains(@class,'titreArticle')]"
text_xpath = "./section/div[@class='DocText']//p" authors_xpath = "./header//*[contains(@class,'docAuthors')]"
text_xpath = "./section/div[@class='DocText clearfix']//p"
entire_header_xpath = "./header" entire_header_xpath = "./header"
# diagnosed during date retrieval and used for rubrique # diagnosed during date retrieval and used for rubrique
...@@ -144,6 +145,15 @@ class EuropresseParser(Parser): ...@@ -144,6 +145,15 @@ class EuropresseParser(Parser):
yield(hyperdata) yield(hyperdata)
continue continue
# Authors
# --------
try:
authors = scrap_text(html_article.xpath(authors_xpath))
hyperdata['authors'] = '; '.join([author for author in authors])
except:
pass
# FULLTEXT # FULLTEXT
# -------- # --------
...@@ -154,6 +164,7 @@ class EuropresseParser(Parser): ...@@ -154,6 +164,7 @@ class EuropresseParser(Parser):
except: except:
pass pass
# PUBLICATIONNAME # PUBLICATIONNAME
# ---------------- # ----------------
try: try:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment