Commit 2ca89c51 authored by Romain Loth

europarl text retrieval modified: paragraph-by-paragraph handling moved to the XPath level, then the text is obtained by local iteration
parent 72267265
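
For context, a minimal self-contained sketch of the strategy this commit adopts: select each <p> with XPath, then flatten its subtree locally with lxml's itertext(). The HTML snippet and variable names below are illustrative assumptions, not code from this repository.

from lxml import etree

# Illustrative markup mimicking a Europresse article body (assumed structure)
html = etree.HTML("""
<section><div class="DocText"><div class="docOcurrContainer">
  <p>Le Parlement <b>européen</b> a voté.</p>
  <p>Deuxième <i>paragraphe</i> aussi.</p>
</div></div></section>""")

texts = []
for p in html.xpath("//div[@class='docOcurrContainer']/p"):
    # itertext(with_tail=True) yields every text chunk of the subtree once,
    # regardless of nesting depth (text inside <b>, <i>, ...)
    chunks = [t.strip() for t in p.itertext(with_tail=True) if t.strip()]
    texts.append(" ".join(chunks))

print(texts)
# ['Le Parlement européen a voté.', 'Deuxième paragraphe aussi.']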
@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException

 class EuropressFileParser_fr(FileParser):
     def _parse(self, file):
         localeEncoding = "fr_FR"
@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
         name_xpath = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath = "./header/div[@class='titreArticle']"
+        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of an entire subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
             result = list()
+            # a priori a single title, or several <p>, in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # itertext is used to get every sub-element's text
+                # exactly once, whatever the nesting depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
                 #print(hyperdata['publication_date'])

                 try:
-                    title = paragraph_list(html_article.xpath(title_xpath))
+                    title = scrap_text(html_article.xpath(title_xpath))
                     hyperdata['title'] = title[0]
                 except:
                     pass

                 try:
-                    text = paragraph_list(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                    text = scrap_text(html_article.xpath(text_xpath))
+                    hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
                 except:
                     pass
...
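
A side effect visible in the last hunk: the abstract is no longer wrapped in literal <p> markers. A small illustration with made-up values (not taken from a real Europresse file):

title = ["Titre principal", "Sous-titre"]
text = ["Premier paragraphe.", "Second paragraphe."]

# old behaviour: every chunk wrapped in literal ' <p> ... </p> ' markers
old_abstract = ' '.join([' <p> ' + p + ' </p> ' for p in title[1:] + text])

# new behaviour: plain space-separated join of the same chunks
new_abstract = ' '.join([p_text for p_text in title[1:] + text])

print(new_abstract)
# Sous-titre Premier paragraphe. Second paragraphe.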