Commit 1fbed046 authored by delanoe's avatar delanoe

[BUG] Europresse format has changed (HTML5) here is the beginning of the...

[BUG] Europresse format has changed (HTML5) here is the beginning of the debug. Still help needed on this.
parent 65922ca0
import re import re
import locale import locale
from lxml import etree from lxml import etree
from lxml.html import html5parser
from datetime import datetime, date from datetime import datetime, date
from django.utils import timezone from django.utils import timezone
import dateutil.parser import dateutil.parser
...@@ -51,6 +52,16 @@ class EuropressFileParser(FileParser): ...@@ -51,6 +52,16 @@ class EuropressFileParser(FileParser):
if len(html_articles) < 1: if len(html_articles) < 1:
format_europresse = 1 format_europresse = 1
html_articles = html.xpath('//div[@id="docContain"]') html_articles = html.xpath('//div[@id="docContain"]')
if len(html_articles) < 1:
format_europresse = 50.2
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
if len(html_articles) < 1:
print("no article found")
except : except :
PrintException() PrintException()
...@@ -77,6 +88,11 @@ class EuropressFileParser(FileParser): ...@@ -77,6 +88,11 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat'] \ or self::td[@class='txtCertificat'] \
)]/text()" )]/text()"
doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()" doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
elif format_europresse == 50.2 :
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "string(./header/div/span[@class = 'TitreArticleVisu'])"
text_xpath = "./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
except Exception as error : except Exception as error :
...@@ -90,7 +106,9 @@ class EuropressFileParser(FileParser): ...@@ -90,7 +106,9 @@ class EuropressFileParser(FileParser):
if len(html_article): if len(html_article):
for name in html_article.xpath(name_xpath): for name in html_article.xpath(name_xpath):
#print("test name.text")
if name.text is not None: if name.text is not None:
#print(name.text)
format_journal = re.compile('(.*), (.*)', re.UNICODE) format_journal = re.compile('(.*), (.*)', re.UNICODE)
test_journal = format_journal.match(name.text) test_journal = format_journal.match(name.text)
if test_journal is not None: if test_journal is not None:
...@@ -104,7 +122,6 @@ class EuropressFileParser(FileParser): ...@@ -104,7 +122,6 @@ class EuropressFileParser(FileParser):
for header in html_article.xpath(header_xpath): for header in html_article.xpath(header_xpath):
# print(count) # print(count)
# countbis += 1 # countbis += 1
# try: # try:
# print('109', hyperdata['publication_date']) # print('109', hyperdata['publication_date'])
# except: # except:
...@@ -113,16 +130,26 @@ class EuropressFileParser(FileParser): ...@@ -113,16 +130,26 @@ class EuropressFileParser(FileParser):
try: try:
text = header.text text = header.text
#print("header", text) print("header", text)
except Exception as error: except Exception as error:
print(error) print(error)
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode(encoding) text = text.decode(encoding)
if format_europresse == 50.2:
# TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
text = text.split(', ')[1]
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE) format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
format_date_fr_v2 = re.compile('\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
if text is not None: if text is not None:
test_date_fr = format_date_fr.match(text) test_date_fr = format_date_fr.match(text)
#TODO check the v2 format here
test_date_fr_v2 = format_date_fr_v2.match(text)
format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE) format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
test_date_en = format_date_en.match(text) test_date_en = format_date_en.match(text)
format_sect = re.compile('(\D+),', re.UNICODE) format_sect = re.compile('(\D+),', re.UNICODE)
...@@ -131,13 +158,13 @@ class EuropressFileParser(FileParser): ...@@ -131,13 +158,13 @@ class EuropressFileParser(FileParser):
test_page = format_page.match(text) test_page = format_page.match(text)
else: else:
test_date_fr = None test_date_fr = None
test_date_fr_v2 = None
test_date_en = None test_date_en = None
test_sect = None test_sect = None
test_page = None test_page = None
if test_date_fr is not None or test_date_fr_v2 is not None:
if test_date_fr is not None:
self.localeEncoding = "fr_FR" self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding) locale.setlocale(locale.LC_ALL, localeEncoding)
if encoding != "utf-8": if encoding != "utf-8":
...@@ -155,9 +182,13 @@ class EuropressFileParser(FileParser): ...@@ -155,9 +182,13 @@ class EuropressFileParser(FileParser):
locale.setlocale(locale.LC_ALL, "fr_FR") locale.setlocale(locale.LC_ALL, "fr_FR")
hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y') hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
# hyperdata['publication_date'] = dateutil.parser.parse(text) # hyperdata['publication_date'] = dateutil.parser.parse(text)
except Exception as error: except :
print(error, text) # TODO format to parse: ' mercredi 26 novembre 2014'
pass try :
hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
except Exception as error:
print(error, text)
pass
if test_date_en is not None: if test_date_en is not None:
...@@ -227,7 +258,6 @@ class EuropressFileParser(FileParser): ...@@ -227,7 +258,6 @@ class EuropressFileParser(FileParser):
#elif lang == 'en': #elif lang == 'en':
# hyperdata['language_iso2'] = 'en' # hyperdata['language_iso2'] = 'en'
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y') hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m') hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d') hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
...@@ -248,7 +278,11 @@ class EuropressFileParser(FileParser): ...@@ -248,7 +278,11 @@ class EuropressFileParser(FileParser):
else: else:
hyperdata['doi'] = "not found" hyperdata['doi'] = "not found"
hyperdata['length_words'] = len(hyperdata['abstract'].split(' ')) try:
hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
except:
PrintException()
hyperdata['length_letters'] = len(hyperdata['abstract']) hyperdata['length_letters'] = len(hyperdata['abstract'])
hyperdata['bdd'] = u'europresse' hyperdata['bdd'] = u'europresse'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment