Commit 2ca89c51 authored by Romain Loth

europarl text retrieval modified: paragraph-by-paragraph handling moved to the XPath level, then the text is obtained by local iteration
parent 72267265
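
For context, a minimal self-contained sketch of the strategy this commit adopts: select each <p> with XPath, then flatten its subtree locally with lxml's itertext(). The HTML snippet and variable names below are illustrative assumptions, not code from this repository.

from lxml import etree

# Illustrative markup mimicking a Europresse article body (assumed structure)
html = etree.HTML("""
<section><div class="DocText"><div class="docOcurrContainer">
  <p>Le Parlement <b>européen</b> a voté.</p>
  <p>Deuxième <i>paragraphe</i> aussi.</p>
</div></div></section>""")

texts = []
for p in html.xpath("//div[@class='docOcurrContainer']/p"):
    # itertext(with_tail=True) yields every text chunk of the subtree once,
    # regardless of nesting depth (text inside <b>, <i>, ...)
    chunks = [t.strip() for t in p.itertext(with_tail=True) if t.strip()]
    texts.append(" ".join(chunks))

print(texts)
# ['Le Parlement européen a voté.', 'Deuxième paragraphe aussi.']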
@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException

 class EuropressFileParser_fr(FileParser):
     def _parse(self, file):
         localeEncoding = "fr_FR"
@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
         name_xpath = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath = "./header/div[@class='titreArticle']"
+        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of an entire subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
             result = list()
+            # a priori a single title, or several <p>, in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # itertext is used to get every sub-element's text
+                # exactly once, whatever the nesting depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
                 #print(hyperdata['publication_date'])

                 try:
-                    title = paragraph_list(html_article.xpath(title_xpath))
+                    title = scrap_text(html_article.xpath(title_xpath))
                     hyperdata['title'] = title[0]
                 except:
                     pass

                 try:
-                    text = paragraph_list(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                    text = scrap_text(html_article.xpath(text_xpath))
+                    hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
                 except:
                     pass
...
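
A side effect visible in the last hunk: the abstract is no longer wrapped in literal <p> markers. A small illustration with made-up values (not taken from a real Europresse file):

title = ["Titre principal", "Sous-titre"]
text = ["Premier paragraphe.", "Second paragraphe."]

# old behaviour: every chunk wrapped in literal ' <p> ... </p> ' markers
old_abstract = ' '.join([' <p> ' + p + ' </p> ' for p in title[1:] + text])

# new behaviour: plain space-separated join of the same chunks
new_abstract = ' '.join([p_text for p_text in title[1:] + text])

print(new_abstract)
# Sous-titre Premier paragraphe. Second paragraphe.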