Commit 842efbe4 authored by delanoe

[FEAT] EUROPRESSE PARSER FOR HTML5. Still a bug with zip files.

parent ecd6640d
@@ -238,7 +238,7 @@ def compute_tfidf_global(corpus):
     lnD = log(D)
     cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
     # show off
-    dbg.show('insert tfidf for %d documents' % (D, ))
+    dbg.show('insert tfidf')
     cursor.execute('''
         INSERT INTO
             %s (nodex_id, nodey_id, ngram_id, score)
......
import re
import locale
import sys
from datetime import datetime

from lxml import etree
from lxml.html import html5parser
from django.utils import timezone
import dateparser

from .FileParser import FileParser
from ..NgramsExtractors import *
from admin.utils import PrintException


class EuropressFileParser(FileParser):
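    # Europresse HTML exports come in several flavours; _parse detects which
    # one it has and records it in `format_europresse`:
    #   50   -> table-based export, one article per /html/body/table[/tbody]
    #   1    -> single-document export, rooted at div#docContain
    #   50.2 -> newer HTML5 export, one <article> element per document
    #           (parsed with lxml's html5parser)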
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        count = 0
        # matches any field containing a 4-digit year
        format_date = re.compile(r'.*\d{4}.*', re.UNICODE)

        # accept either a path or an already-open binary file object,
        # and read the contents once
        if isinstance(file, str):
            file = open(file, 'rb')
        contents = file.read()
        encoding = self.detect_encoding(contents)
        if encoding != "utf-8":
            # Europresse exports are frequently latin-1; re-encode to UTF-8
            # so the HTML parsers below can use a single codec
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()
        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
            try:
                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')
                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
                if len(html_articles) < 1:
                    format_europresse = 1
                    html_articles = html.xpath('//div[@id="docContain"]')
                if len(html_articles) < 1:
                    # HTML5 export: re-parse with html5parser, which handles
                    # the <article> elements that etree.HTMLParser misses
                    format_europresse = 50.2
                    html_parser = html5parser.etree.HTMLParser(encoding=codif)
                    html = html5parser.etree.fromstring(contents, html_parser)
                    html_articles = html.xpath('//article')
                if len(html_articles) < 1:
                    print("no article found")
            except:
                PrintException()
                # last resort: assume the HTML5 export
                format_europresse = 50.2
                html_parser = html5parser.etree.HTMLParser(encoding=codif)
                html = html5parser.etree.fromstring(contents, html_parser)
                html_articles = html.xpath('//article')
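            # shape of a format-50 document, for orientation (abridged,
            # reconstructed from the xpaths below):
            #   <table><tbody><tr><td>
            #     <span class="DocPublicationName">journal, volume</span>
            #     <span class="DocHeader">date / section / page</span>
            #     <span class="TitreArticleVisu">title</span>
            #     ... article text ...
            #   </td></tr></tbody></table>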
            if format_europresse == 50:
                name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath = "./tr/td/span[@class = 'DocHeader']"
                title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
                text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
            elif format_europresse == 1:
                name_xpath = "//span[@class = 'DocPublicationName']"
                header_xpath = "//span[@class = 'DocHeader']"
                title_xpath = "string(//div[@class = 'titreArticleVisu'])"
                text_xpath = "./descendant::*[\
                    not(\
                    self::div[@class='Doc-SourceText'] \
                    or self::span[@class='DocHeader'] \
                    or self::span[@class='DocPublicationName'] \
                    or self::span[@id='docNameVisu'] \
                    or self::div[@class='titreArticleVisu'] \
                    or self::span[@id='docNameContType'] \
                    or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] \
                    or descendant-or-self::span[@id='ucPubliC_lblEndDate'] \
                    or self::td[@class='txtCertificat'] \
                    )]/text()"
                doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
            elif format_europresse == 50.2:
                name_xpath = "./header/div/span[@class = 'DocPublicationName']"
                header_xpath = "./header/div/span[@class = 'DocHeader']"
                # 50.2 title/text xpaths return element lists, flattened into
                # paragraphs by paragraph_list() below
                title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
                text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
        except Exception as error:
            PrintException()
        def paragraph_list(data_xpath):
            # flatten a list of elements into paragraph strings: a <p> opens
            # a new paragraph, any other element's text is appended to the
            # current one
            result = list()
            for elem in data_xpath:
                if elem.text is not None and elem.text.strip() != '':
                    if elem.tag == 'p':
                        result.append(elem.text)
                    elif len(result) > 0:
                        result.append(result.pop() + elem.text)
                    else:
                        result.append(elem.text)
            return result
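        # e.g. (hypothetical) elements <p>"Foo", <span>" bar", <p>"Baz"
        # come out as ['Foo bar', 'Baz']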
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}
                if len(html_article):
                    for name in html_article.xpath(name_xpath):
                        if name.text is not None:
                            # DocPublicationName reads "<journal>, <volume>"
                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
                            test_journal = format_journal.match(name.text)
                            if test_journal is not None:
                                hyperdata['journal'] = test_journal.group(1)
                                hyperdata['volume'] = test_journal.group(2)
                            else:
                                hyperdata['journal'] = name.text.encode(codif)
                    for header in html_article.xpath(header_xpath):
                        text = header.text
                        if isinstance(text, bytes):
                            text = text.decode(encoding)
                        if format_europresse == 50.2 and text is not None:
                            # 50.2 headers read 'Brest Ville, mercredi 26 novembre 2014';
                            # keep only the date part after the first comma
                            try:  # 2015-oct-08 exception added
                                text = text.split(', ')[1]
                            except:
                                pass
                        format_date_fr = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)
                        # v2 includes the weekday, e.g. 'mercredi 26 novembre 2014'
                        format_date_fr_v2 = re.compile(r'\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
                        if text is not None:
                            test_date_fr = format_date_fr.match(text)
                            test_date_fr_v2 = format_date_fr_v2.match(text)
                            format_date_en = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
                            test_date_en = format_date_en.match(text)
                            format_sect = re.compile(r'(\D+),', re.UNICODE)
                            test_sect = format_sect.match(text)
                            format_page = re.compile(r', p. (\w+)', re.UNICODE)
                            test_page = format_page.match(text)
                        else:
                            test_date_fr = None
                            test_date_fr_v2 = None
                            test_date_en = None
                            test_sect = None
                            test_page = None
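                        # typical header values these patterns target
                        # (illustrative): '26 novembre 2014' (fr),
                        # 'November 26, 2014' (en), 'Economie, ...' (section),
                        # ', p. 12' (page)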
                        if test_date_fr is not None or test_date_fr_v2 is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, "fr_FR.utf-8")
                            if encoding != "utf-8":
                                # repair characters mangled by the latin1 round-trip
                                text = text.replace('י', 'é')
                                text = text.replace('ű', 'û')
                                text = text.replace(' aot ', ' août ')
                            # try dateparser first, then progressively looser
                            # strptime formats: '26 novembre 2014',
                            # 'novembre 2014', ' mercredi 26 novembre 2014'
                            try:
                                hyperdata['publication_date'] = dateparser.parse(text, languages=['fr'])
                            except:
                                try:
                                    hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                except:
                                    try:
                                        hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
                                    except:
                                        try:
                                            locale.setlocale(locale.LC_ALL, "fr_FR")
                                            hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                        except:
                                            try:
                                                hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
                                            except Exception as error:
                                                print(error, text)
                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            try:
                                hyperdata['publication_date'] = datetime.strptime(text, '%B %d, %Y')
                            except:
                                try:
                                    hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
                                except:
                                    pass
                        if test_sect is not None:
                            hyperdata['section'] = test_sect.group(1).encode(codif)
                        if test_page is not None:
                            hyperdata['page'] = test_page.group(1).encode(codif)

                    # formats 50 and 1 use string(...) xpaths here; 50.2
                    # returns element lists that paragraph_list() handles below
                    title = html_article.xpath(title_xpath)
                    if isinstance(title, str):
                        hyperdata['title'] = title.encode(codif)
                    hyperdata['abstract'] = html_article.xpath(text_xpath)
                    line = 0
                    br_tag = 10
                    # the author line sits two <br/> after the TitreArticleVisu span
                    for i in html_articles[count].iter():
                        if i.tag == "span" and "class" in i.attrib:
                            if i.attrib['class'] == 'TitreArticleVisu':
                                line = 1
                                br_tag = 2
                        if line == 1 and i.tag == "br":
                            br_tag -= 1
                        if line == 1 and br_tag == 0:
                            try:
                                # tostring() returns bytes: decode before title-casing
                                authors = etree.tostring(i, method="text", encoding=codif)
                                hyperdata['authors'] = authors.decode(codif).title().encode(codif)
                            except:
                                hyperdata['authors'] = 'not found'
                            line = 0
                            br_tag = 10
                    try:
                        pub_name = html_article.xpath(name_xpath)[0].text
                        name = pub_name.split(', ')
                        hyperdata['journal'] = name[0]
                        hyperdata['number'] = name[1]
                    except:
                        pass

                    # articles sometimes omit their date: keep the last good
                    # one in `back` and reuse it when the current one is
                    # missing; fall back to now() as a last resort
                    try:
                        if hyperdata.get('publication_date'):
                            back = hyperdata['publication_date']
                        else:
                            hyperdata['publication_date'] = back
                    except:
                        hyperdata['publication_date'] = timezone.now()

                    try:
                        if len(hyperdata['abstract']) > 0 and format_europresse == 50:
                            # in format 50 the document id sits 9 text nodes
                            # from the end of the article
                            hyperdata['doi'] = str(hyperdata['abstract'][-9])
                            hyperdata['abstract'].pop()
                            # TODO add a real separator between paragraphs
                            hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                            hyperdata['abstract'] = str(re.sub('Tous droits réservés.*$', '', hyperdata['abstract']))
                        elif format_europresse == 1:
                            hyperdata['doi'] = ' '.join(html_article.xpath(doi_xpath))
                            hyperdata['abstract'] = hyperdata['abstract'][:-9]
                            # TODO add a real separator between paragraphs
                            hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                            hyperdata['journal'] = pub_name.strip()
                        else:
                            hyperdata['doi'] = "not found"
                    except:
                        pass
                    # format 50.2: DocHeader reads 'rubrique, date[, page]'
                    headers = html_article.xpath(header_xpath)
                    header = headers[0].text if headers else None
                    if header is not None:
                        header = header.split(', ')
                        if format_date.match(header[0]):
                            date = header[0]
                        else:
                            hyperdata['rubrique'] = header[0]
                            date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                        try:
                            # dateparser.parse returns None (rather than
                            # raising) on failure, so only assign real dates
                            parsed = dateparser.parse(date, languages=['fr', 'en'])
                            if parsed is not None:
                                hyperdata['publication_date'] = parsed
                        except:
                            hyperdata['publication_date'] = timezone.now()
                    try:
                        title = paragraph_list(html_article.xpath(title_xpath))
                        hyperdata['title'] = title[0]
                    except:
                        pass
                    try:
                        text = paragraph_list(html_article.xpath(text_xpath))
                        hyperdata['abstract'] = ' '.join([' <p> ' + p + ' </p> '
                                                          for p in title[1:] + text])
                    except:
                        pass
                    hyperdata['length_letters'] = len(hyperdata['abstract'])
                    if not hyperdata.get('publication_date'):
                        hyperdata['publication_date'] = timezone.now()
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')

                    hyperdata['bdd'] = u'europresse'
                    hyperdata['url'] = u''
                    # values may still be bytes at this point; normalise to str
                    for key, value in hyperdata.items():
                        hyperdata[key] = value.decode() if isinstance(value, bytes) else value
                    yield hyperdata
                count += 1
            file.close()
        except:
            PrintException()


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
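

# A minimal sketch for the remaining zip-file bug mentioned in the commit
# message: unpack each HTML member of a Europresse .zip export and feed it
# to the parser. `parse_zip` and the extension filter are illustrative
# assumptions, not part of the existing FileParser API.
import io
import zipfile

def parse_zip(path):
    parser = EuropressFileParser()
    with zipfile.ZipFile(path) as zf:
        for member in zf.namelist():
            if member.lower().endswith(('.html', '.htm')):
                with zf.open(member) as member_file:
                    yield from parser._parse(io.BytesIO(member_file.read()))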