Commit 389cc1e5 authored by delanoe

[FIX] Europresse parser for English; needs refactoring (see EuropressFileParser.py for the inheritance-based parent class).
parent f6398952
EuropressFileParser.py

@@ -23,14 +23,12 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException


 class EuropressFileParser(FileParser):
+    def _parse_header(self, header):
+        pass

     def _parse(self, file):
         localeEncoding = "fr_FR"
         codif = "UTF-8"
-        format_date = re.compile('.*\d{4}.*', re.UNICODE)
-
-        def parse_date(date, lang):
-            d = dateparser.parse(date.strip(), languages=[lang])
-            return d

         if isinstance(file, str):
             file = open(file, 'rb')
@@ -75,6 +73,7 @@ class EuropressFileParser(FileParser):
         # parse all the articles, one by one
         try:
             for html_article in html_articles:
+                print('article')

                 hyperdata = {}
@@ -91,57 +90,83 @@ class EuropressFileParser(FileParser):
                 header = html_article.xpath(header_xpath)[0].text
-                if header is not None:
-                    header = header.split(', ')
-                    if parse_date(header[0], 'fr') is not None:
-                        date = header[0]
-                    elif parse_date(header[1], 'fr') is not None:
-                        hyperdata['rubrique'] = header[0]
-                        date = header[1]
-                        try:
-                            hyperdata['page'] = header[2].split(' ')[1]
-                        except:
-                            pass
-                    elif parse_date(header[2], 'fr') is not None:
-                        date = header[2]
-                    elif parse_date(header[0], 'en') is not None:
-                        date = ' '.join(header[0:])
-                    elif parse_date(header[1], 'en') is not None:
-                        date = ' '.join(header[1:])
-                    elif parse_date(header[2], 'en') is not None:
-                        date = ' '.join(header[2:])
-                try:
-                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
-                except:
-                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
-                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
-                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
-                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
-                #print(hyperdata['publication_date'])
-                try:
-                    title = paragraph_list(html_article.xpath(title_xpath))
-                    hyperdata['title'] = title[0]
-                except:
-                    pass
-                try:
-                    text = paragraph_list(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
-                except:
-                    pass
-                yield hyperdata
-        except :
-            PrintException()
-            pass
+                hyperdata.update(self._parse_header(header))
+                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
+                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
+                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
+                try:
+                    title = paragraph_list(html_article.xpath(title_xpath))
+                    hyperdata['title'] = title[0]
+                except:
+                    pass
+                try:
+                    text = paragraph_list(html_article.xpath(text_xpath))
+                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                except:
+                    pass
+                yield hyperdata
+        except :
+            PrintException()
+            pass
+
+
+class EuropressFileParser_fr(EuropressFileParser):
+    def _parse_header(self, header):
+        format_date = re.compile('.*\d{4}.*', re.UNICODE)
+        hyperdata = dict()
+        if header is not None:
+            header = header.split(', ')
+            if format_date.match(header[0]):
+                date = header[0]
+            elif format_date.match(header[1]):
+                hyperdata['rubrique'] = header[0]
+                date = header[1]
+                try:
+                    hyperdata['page'] = header[2].split(' ')[1]
+                except:
+                    pass
+            else:
+                date = header[2]
+            try:
+                hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
+            except:
+                hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
+            return(hyperdata)
+            #print(hyperdata['publication_date'])
+
+
+class EuropressFileParser_en(EuropressFileParser):
+    def _parse_header(self, header):
+        format_date = re.compile('.*\d{4}.*', re.UNICODE)
+        if header is not None:
+            header = header.split(', ')
+            if format_date.match(header[0]):
+                date = header[0]
+            elif format_date.match(header[1]):
+                hyperdata['rubrique'] = header[0]
+                date = header[1]
+                try:
+                    hyperdata['page'] = header[2].split(' ')[1]
+                except:
+                    pass
+            else:
+                date = header[2]
+            try:
+                hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
+            except:
+                hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

 if __name__ == "__main__":
     e = EuropressFileParser()
...
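The diff above is the inheritance-based design the commit message points to: the shared _parse() loop stays in the base class and only the language-specific header handling is overridden in _parse_header(). A minimal, self-contained sketch of that pattern, with toy class names and a made-up header format (nothing below is taken from the project except the _parse_header idea):

from datetime import datetime

class HeaderParser:
    """Base class: shared parsing loop, language-specific header parsing via a hook."""
    def _parse_header(self, header):
        raise NotImplementedError   # each language subclass overrides this

    def parse(self, headers):
        for header in headers:
            hyperdata = {}
            hyperdata.update(self._parse_header(header))
            yield hyperdata

class HeaderParser_fr(HeaderParser):
    def _parse_header(self, header):
        # toy French header: "rubrique, jj/mm/aaaa"
        rubrique, date = header.split(', ')
        return {'rubrique': rubrique,
                'publication_date': datetime.strptime(date, '%d/%m/%Y')}

class HeaderParser_en(HeaderParser):
    def _parse_header(self, header):
        # toy English header: "section, mm/dd/yyyy"
        section, date = header.split(', ')
        return {'rubrique': section,
                'publication_date': datetime.strptime(date, '%m/%d/%Y')}

if __name__ == "__main__":
    for h in HeaderParser_fr().parse(["Économie, 07/03/2016"]):
        print(h['rubrique'], h['publication_date'].strftime('%Y-%m-%d'))

The two standalone files below (EuropressFileParser_en.py and EuropressFileParser_fr.py) duplicate the whole parsing loop and differ mainly in this header and date logic, which is the duplication the commit message says still needs refactoring.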
EuropressFileParser_en.py (new file)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser

from itertools import chain
from datetime import datetime, date

from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException


class EuropressFileParser_en(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_page = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"

        def paragraph_list(data_xpath):
            result = list()
            for elem in data_xpath:
                if elem.text is not None:
                    if elem.text.strip() != '':
                        if elem.tag == 'p':
                            result.append(elem.text)
                        else:
                            if len(result) > 0:
                                result.append(result.pop() + elem.text)
                            else:
                                result.append(elem.text)
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:

                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                #print(hyperdata['publication_date'])

                try:
                    title = paragraph_list(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    print(header)
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                try:
                    text = paragraph_list(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except :
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
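For reference, a small self-contained sketch of what the English header branch above does: drop the page token ('p. …') from the DocHeader fields, then hand a suffix of the remaining fields to dateparser. The sample header below is made up for illustration; dateparser and its languages= argument are the same third-party API the parser already imports.

import re
import dateparser   # third-party library already used by the parser

format_page = re.compile(r'p\. .*', re.UNICODE)

# hypothetical DocHeader content for an English article
header = "World news, Monday, March 7, 2016, p. 12"

fields = header.split(', ')
fields = [f for f in fields if format_page.match(f) is None]   # drops 'p. 12'
print(fields)   # ['World news', 'Monday', 'March 7', '2016']

# try successively shorter suffixes of the header until dateparser
# recognises a date (the parser's if/elif chain does this field by field)
date = None
for i in range(len(fields)):
    date = dateparser.parse(' '.join(fields[i:]).strip(), languages=['en'])
    if date is not None:
        break

print(date)   # expected: 2016-03-07 00:00:00 (parsed from 'Monday March 7 2016')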
EuropressFileParser_fr.py (new file)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser

from itertools import chain
from datetime import datetime, date

from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException


class EuropressFileParser_fr(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_date = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"

        def paragraph_list(data_xpath):
            result = list()
            for elem in data_xpath:
                if elem.text is not None:
                    if elem.text.strip() != '':
                        if elem.tag == 'p':
                            result.append(elem.text)
                        else:
                            if len(result) > 0:
                                result.append(result.pop() + elem.text)
                            else:
                                result.append(elem.text)
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:

                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')

                #print(hyperdata['publication_date'])

                try:
                    title = paragraph_list(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = paragraph_list(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except :
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
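Similarly, a self-contained sketch of the French branch above: the DocHeader is expected to carry an optional rubrique, then the date, then an optional page number. The sample header is made up for illustration; dateparser resolves the French day and month names.

import dateparser   # third-party library already used by the parser

def parse_date(date, lang):
    return dateparser.parse(date.strip(), languages=[lang])

# hypothetical DocHeader content for a French article: rubrique, date, page
header = "Économie, samedi 12 mars 2016, p. 24"

hyperdata = {}
fields = header.split(', ')

if parse_date(fields[0], 'fr') is not None:        # header starts with the date
    date = fields[0]
elif parse_date(fields[1], 'fr') is not None:      # rubrique first, then the date
    hyperdata['rubrique'] = fields[0]
    date = fields[1]
    try:
        hyperdata['page'] = fields[2].split(' ')[1]   # 'p. 24' -> '24'
    except IndexError:
        pass
else:
    date = fields[2]

hyperdata['publication_date'] = parse_date(date, 'fr')
print(hyperdata)
# expected: {'rubrique': 'Économie', 'page': '24',
#            'publication_date': datetime.datetime(2016, 3, 12, 0, 0)}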
__init__.py

@@ -3,6 +3,7 @@ from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
 from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
-from .EuropressFileParser import EuropressFileParser
+from .EuropressFileParser_en import EuropressFileParser_en
+from .EuropressFileParser_fr import EuropressFileParser_fr
 from .ISTex import ISTex
 from .CSVParser import CSVParser
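With the package now exporting one class per language, calling code has to choose the parser itself; this commit does not show where that choice happens. A plausible, minimal dispatch (the mapping, the function name and the language argument are assumptions for illustration, not gargantext's actual API):

from parsing.FileParsers import EuropressFileParser_en, EuropressFileParser_fr

# hypothetical language -> parser table; the real dispatch may differ
EUROPRESS_PARSERS = {
    'en': EuropressFileParser_en,
    'fr': EuropressFileParser_fr,
}

def parse_europress(path, language='fr'):
    # FileParser.parse() yields one hyperdata dict per article
    # (see the __main__ blocks above)
    parser = EUROPRESS_PARSERS[language]()
    return parser.parse(path)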