Commit fd6e272f authored by delanoe

Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain

parents 4b38f1f2 ca537f58
@@ -54,24 +54,33 @@ class EuropressFileParser_en(FileParser):
         name_xpath = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath = "./header/div[@class='titreArticle']"
+        text_xpath = "./section/div[@class='DocText']//p"
-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of an entire subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings.
+            """
             result = list()
+            # a priori a single title, or several <p>, in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # use itertext to get every sub-element
+                # exactly once, whatever its depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result
         # parse all the articles, one by one
         try:
             for html_article in html_articles:
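Review note on this hunk: the tighter text_xpath and the new scrap_text go together. The old paragraph_list only read elem.text, so the old xpath had to enumerate every descendant; scrap_text recurses by itself via itertext, so selecting just the <p> nodes is enough. A minimal standalone sketch, assuming lxml (already the parser here) and a made-up HTML sample:

from lxml import etree

html = etree.HTML("<section><div class='DocText'>"
                  "<p>One <b>bold</b> word.</p><p>Two.</p></div></section>")

def scrap_text(data_xpath):
    result = list()
    for elem in data_xpath:
        all_text = list()
        # itertext(with_tail=True) yields the text of every descendant
        # exactly once, whatever its depth, tail text included
        for sub_txt in elem.itertext(with_tail=True):
            sub_txt_clean = sub_txt.strip()
            if sub_txt_clean != '':
                all_text.append(sub_txt_clean)
        result.append(" ".join(all_text))
    return result

print(scrap_text(html.xpath("//div[@class='DocText']//p")))
# ['One bold word.', 'Two.'] -- the old element-by-element walk never
# saw ' word.', the tail text after </b>, because it only read elem.text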
@@ -88,26 +97,25 @@ class EuropressFileParser_en(FileParser):
                     hyperdata['journal'] = pub_name.strip()
                 except:
                     pass
-                #print(hyperdata['publication_date'])
-                try:
-                    title = paragraph_list(html_article.xpath(title_xpath))
-                    hyperdata['title'] = title[0]
-                except:
-                    pass
                 header = html_article.xpath(header_xpath)[0].text
                 if header is not None:
+                    # caution: in English the date contains 1 or 2 commas
+                    # e.g. "Tuesday, November 7, 2012"
+                    # ==> in all these cases 'en' dateparser.parse
+                    # will be run on header[i:] and not header[i]
                     header = header.split(', ')
                     header = list(filter(lambda x: format_page.match(x) is None, header))
-                    print(header)
                     if parse_date(header[0], 'en') is not None:
                         date = ' '.join(header[0:])
                     elif parse_date(header[1], 'en') is not None:
+                        hyperdata['rubrique'] = header[0]
                         date = ' '.join(header[1:])
                     elif parse_date(header[2], 'en') is not None:
+                        hyperdata['rubrique'] = header[0]
                         date = ' '.join(header[2:])
                     elif parse_date(header[3], 'en') is not None:
+                        hyperdata['rubrique'] = header[0]
                         date = ' '.join(header[3:])
                     else:
                         date = '2016'
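Review note: a toy run of the comma handling the new comments describe, calling dateparser directly (the code above reaches it through parse_date; the header string is made up):

import dateparser

header = "News, Tuesday, November 7, 2012".split(', ')
# -> ['News', 'Tuesday', 'November 7', '2012']
# header[1] ('Tuesday') already parses as a date, so header[0]
# becomes the rubrique and the full date is the re-joined tail:
print(dateparser.parse(' '.join(header[1:]), languages=['en']))
# -> datetime.datetime(2012, 11, 7, 0, 0)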
@@ -127,10 +135,16 @@ class EuropressFileParser_en(FileParser):
                 print(hyperdata['title'])
                 print(date)
+                try:
+                    title = scrap_text(html_article.xpath(title_xpath))
+                    hyperdata['title'] = title[0]
+                except:
+                    pass
                 try:
-                    text = paragraph_list(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                    text = scrap_text(html_article.xpath(text_xpath))
+                    hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
                 except:
                     pass
...
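Review note: a quick before/after of the abstract markup the two joins produce, on made-up title/text values:

title = ['Headline', 'Subtitle']
text = ['First paragraph.', 'Second paragraph.']

old = ' '.join([' <p> ' + p + ' </p> ' for p in title[1:] + text])
new = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])

print(old)
#  <p> Subtitle </p>   <p> First paragraph. </p>   <p> Second paragraph. </p>
print(new)
# <p>
# Subtitle</p>
#
# <p>
# First paragraph.</p>
#
# <p>
# Second paragraph.</p>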
@@ -143,7 +143,10 @@ class EuropressFileParser_fr(FileParser):
                 try:
                     text = scrap_text(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
+                    hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
+                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                 except:
                     pass
...