Commit 930a34a1 authored by Administrator

[FEAT] Europresse parser can now parse a single-article visualisation page

parent c777cc99
...@@ -41,12 +41,43 @@ class EuropressFileParser(FileParser): ...@@ -41,12 +41,43 @@ class EuropressFileParser(FileParser):
html = etree.fromstring(contents, html_parser) html = etree.fromstring(contents, html_parser)
try: try:
format_europresse = 50
html_articles = html.xpath('/html/body/table/tbody') html_articles = html.xpath('/html/body/table/tbody')
if len(html_articles) < 1: if len(html_articles) < 1:
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
if len(html_articles) < 1:
format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]')
except Exception as error: except Exception as error:
print(error) print(error)
if format_europresse == 50:
name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif format_europresse == 1:
name_xpath = "//span[@class = 'DocPublicationName']"
header_xpath = "//span[@class = 'DocHeader']"
title_xpath = "string(//div[@class = 'titreArticleVisu'])"
text_xpath = "./descendant::*[\
not(\
self::div[@class='Doc-SourceText'] \
or self::span[@class='DocHeader'] \
or self::span[@class='DocPublicationName'] \
or self::span[@id='docNameVisu'] \
or self::span[@class='DocHeader'] \
or self::div[@class='titreArticleVisu'] \
or self::span[@id='docNameContType'] \
or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] \
or descendant-or-self::span[@id='ucPubliC_lblEndDate'] \
or self::td[@class='txtCertificat'] \
)]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
except Exception as error: except Exception as error:
print(error) print(error)
...@@ -58,7 +89,7 @@ class EuropressFileParser(FileParser): ...@@ -58,7 +89,7 @@ class EuropressFileParser(FileParser):
metadata = {} metadata = {}
if len(html_article): if len(html_article):
for name in html_article.xpath("./tr/td/span[@class = 'DocPublicationName']"): for name in html_article.xpath(name_xpath):
if name.text is not None: if name.text is not None:
format_journal = re.compile('(.*), (.*)', re.UNICODE) format_journal = re.compile('(.*), (.*)', re.UNICODE)
test_journal = format_journal.match(name.text) test_journal = format_journal.match(name.text)
...@@ -68,9 +99,10 @@ class EuropressFileParser(FileParser): ...@@ -68,9 +99,10 @@ class EuropressFileParser(FileParser):
else: else:
metadata['source'] = name.text.encode(codif) metadata['source'] = name.text.encode(codif)
for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"): for header in html_article.xpath(header_xpath):
try: try:
text = header.text text = header.text
#print("header", text)
except Exception as error: except Exception as error:
print(error) print(error)
...@@ -136,8 +168,8 @@ class EuropressFileParser(FileParser): ...@@ -136,8 +168,8 @@ class EuropressFileParser(FileParser):
if test_page is not None: if test_page is not None:
metadata['page'] = test_page.group(1).encode(codif) metadata['page'] = test_page.group(1).encode(codif)
metadata['title'] = html_article.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(codif) metadata['title'] = html_article.xpath(title_xpath).encode(codif)
metadata['text'] = html_article.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()") metadata['text'] = html_article.xpath(text_xpath)
line = 0 line = 0
br_tag = 10 br_tag = 10
...@@ -185,13 +217,18 @@ class EuropressFileParser(FileParser): ...@@ -185,13 +217,18 @@ class EuropressFileParser(FileParser):
metadata['publication_day'] = metadata['publication_date'].strftime('%d') metadata['publication_day'] = metadata['publication_date'].strftime('%d')
metadata['publication_date'] = "" metadata['publication_date'] = ""
if len(metadata['text'])>0: if len(metadata['text'])>0 and format_europresse == 50:
metadata['doi'] = str(metadata['text'][-9]) metadata['doi'] = str(metadata['text'][-9])
metadata['text'].pop() metadata['text'].pop()
metadata['text'] = str(' '.join(metadata['text'])) metadata['text'] = str(' '.join(metadata['text']))
metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text'])) metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))
elif format_europresse == 1:
metadata['doi'] = ' '.join(html_article.xpath(doi_xpath))
metadata['text'] = metadata['text'][:-9]
metadata['text'] = str(' '.join(metadata['text']))
else: metadata['doi'] = "not found" else:
metadata['doi'] = "not found"
metadata['bdd'] = u'europresse' metadata['bdd'] = u'europresse'
metadata['url'] = u'' metadata['url'] = u''
...@@ -201,7 +238,8 @@ class EuropressFileParser(FileParser): ...@@ -201,7 +238,8 @@ class EuropressFileParser(FileParser):
metadata[key] = value.decode() if isinstance(value, bytes) else value metadata[key] = value.decode() if isinstance(value, bytes) else value
yield metadata yield metadata
count += 1 count += 1
file.close()
except Exception as error: except Exception as error:
print(error) print(error)
pass pass
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment