[PARSER] Europresse: fix article text extraction and add authors extraction.

c22ee4aa · Alexandre Delanoë · 2d470e64 · c22ee4aa
Commit c22ee4aa authored Jan 25, 2018 by Alexandre Delanoë
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 12 additions and 1 deletion

EUROPRESSE.py gargantext/util/parsers/EUROPRESSE.py +12 -1

No files found.
--- a/gargantext/util/parsers/EUROPRESSE.py
+++ b/gargantext/util/parsers/EUROPRESSE.py
@@ -82,7 +82,8 @@ class EuropresseParser(Parser):
        #
        # title_xpath (chemin plus générique)
        title_xpath         = "./header//*[contains(@class,'titreArticle')]"
-        text_xpath          = "./section/div[@class='DocText']//p"
+        authors_xpath       = "./header//*[contains(@class,'docAuthors')]"
+        text_xpath          = "./section/div[@class='DocText clearfix']//p"
        entire_header_xpath = "./header"
        # diagnosed during date retrieval and used for rubrique
@@ -144,6 +145,15 @@ class EuropresseParser(Parser):
                        yield(hyperdata)
                        continue
+                    # Authors
+                    # --------
+                    try:
+                        authors    = scrap_text(html_article.xpath(authors_xpath))
+                        hyperdata['authors'] = '; '.join([author for author in authors])
+                    except:
+                        pass
                    # FULLTEXT
                    # --------
@@ -154,6 +164,7 @@ class EuropresseParser(Parser):
                    except:
                        pass
                    # PUBLICATIONNAME
                    # ----------------
                    try: