Commit 389cc1e5 authored by delanoe's avatar delanoe

[FIX] Europresse parser for english, need refactoring (see...

[FIX] Europresse parser for english, need refactoring (see EuropresseFileParser.py for heritage class).
parent f6398952
......@@ -23,14 +23,12 @@ from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser(FileParser):
def _parse_header(self, header):
pass
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
localeEncoding = "fr_FR"
codif = "UTF-8"
if isinstance(file, str):
file = open(file, 'rb')
......@@ -75,6 +73,7 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one
try:
for html_article in html_articles:
print('article')
hyperdata = {}
......@@ -91,40 +90,12 @@ class EuropressFileParser(FileParser):
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if parse_date(header[0], 'fr') is not None:
date = header[0]
elif parse_date(header[1], 'fr') is not None:
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
elif parse_date(header[2], 'fr') is not None:
date = header[2]
elif parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata.update(self._parse_header(header))
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
......@@ -143,6 +114,60 @@ class EuropressFileParser(FileParser):
PrintException()
pass
class EuropressFileParser_fr(EuropressFileParser):
def _parse_header(self, header):
format_date = re.compile('.*\d{4}.*', re.UNICODE)
hyperdata = dict()
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
return(hyperdata)
#print(hyperdata['publication_date'])
class EuropressFileParser_en(EuropressFileParser):
def _parse_header(self, header):
format_date = re.compile('.*\d{4}.*', re.UNICODE)
if header is not None:
header = header.split(', ')
if format_date.match(header[0]):
date = header[0]
elif format_date.match(header[1]):
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
else:
date = header[2]
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
......
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_en(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_page = re.compile('p\. .*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
def paragraph_list(data_xpath):
result = list()
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
header = list(filter(lambda x: format_page.match(x) is None, header))
print(header)
if parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
elif parse_date(header[3], 'en') is not None:
date = ' '.join(header[3:])
else:
date = '2016'
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
try:
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
except:
print(hyperdata['title'])
print(date)
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
def paragraph_list(data_xpath):
result = list()
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if parse_date(header[0], 'fr') is not None:
date = header[0]
elif parse_date(header[1], 'fr') is not None:
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
elif parse_date(header[2], 'fr') is not None:
date = header[2]
elif parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
......@@ -3,6 +3,7 @@ from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .EuropressFileParser_en import EuropressFileParser_en
from .EuropressFileParser_fr import EuropressFileParser_fr
from .ISTex import ISTex
from .CSVParser import CSVParser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment