Commit ba026add authored by Romain Loth

A single EuropressFileParser for both languages.

parent 5149e7ce
@@ -125,21 +125,14 @@ def project(request, project_id):
         thefile = form.cleaned_data['file']
         resourcetype = cache.ResourceType[form.cleaned_data['type']]
-        # which default language shall be used?
-        if resourcetype.name == "Europress (French)":
-            language_id = cache.Language['fr'].id
-        elif resourcetype.name == "Europress (English)":
-            language_id = cache.Language['en'].id
-        else:
-            language_id = None
         # corpus node instantiation as a Django model
         corpus = Node(
             name = name,
             user_id = request.user.id,
             parent_id = project_id,
             type_id = cache.NodeType['Corpus'].id,
-            language_id = language_id,
+            # no default language at this point
+            language_id = None,
             hyperdata = {'Processing' : "Parsing documents",}
         )
         session.add(corpus)
...
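With the per-resource-type language default removed from the view, the language is now detected per document by the parser itself and stored in hyperdata["language_iso2"] (see the hunks below). A minimal usage sketch under that assumption; the export filename is hypothetical, and parse() is the entry point already used in the parser's __main__ block:

# minimal sketch: each parsed article carries its own detected language,
# so the corpus node no longer needs a language_id up front
from parsing.FileParsers.EuropressFileParser import EuropressFileParser

for hyperdata in EuropressFileParser().parse("export_europresse.html"):  # hypothetical file
    print(hyperdata.get("language_iso2"), "-", hyperdata.get("title"))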
"""
Parses Europress 2015 html format (both for english and french)
=> recognizes language according to date format
=> scraps text for each paragraph to fill hyperdata['abstract']
"""
__author__ = "Gargantext Team"
__copyright__ = "Copyright 2014-15 ISCPIF-CNRS"
__version__ = "0.1"
__email__ = "romain.loth@iscpif.fr"
__status__ = "Test"
import re import re
import locale import locale
@@ -23,12 +34,24 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException

 class EuropressFileParser(FileParser):
-    def _parse_header(self, header):
-        pass
-
     def _parse(self, file):
+        #print("europr_parser file", file)
         localeEncoding = "fr_FR"
         codif = "UTF-8"
+        format_page = re.compile('p\. .*', re.UNICODE)
+
+        # Europresse docs in en and fr are mainly
+        # distinguished by the form of their date
+        # e.g. November 7, 2012
+        format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')
+        # e.g. 16 mars 2011
+        format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')
+
+        def parse_date(date, lang):
+            d = dateparser.parse(date.strip(), languages=[lang])
+            return d

         if isinstance(file, str):
             file = open(file, 'rb')
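The whole language diagnosis rests on those two date patterns, so they are worth checking in isolation. A minimal standalone sketch, runnable on its own; the sample headers are the ones quoted in the comments of the hunks below:

import re

format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')
format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')

# same order as in the parser: French first, then English, else warn
for header in ["Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17",
               "World, Friday, November 13, 2015"]:
    if format_date_fr.search(header):
        print("fr:", header)
    elif format_date_en.search(header):
        print("en:", header)
    else:
        print("??:", header)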
@@ -52,31 +75,47 @@ class EuropressFileParser(FileParser):
         name_xpath = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath = "./header/div[@class='titreArticle']"
+        text_xpath = "./section/div[@class='DocText']//p"

-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of an entire subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
             result = list()
+            # a priori a single title node or several <p> in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # itertext yields each sub-element's text exactly once,
+                # whatever its depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
-                print('article')
+                # print("2 en 1 ==============================new article")
                 hyperdata = {}
+
+                # language analysis => needed for the date;
+                # done locally so users can choose the "Europress"
+                # ResourceType without worrying about the source language
+                doc_language = None

                 try:
                     pub_name = html_article.xpath(name_xpath)[0].text
                     name = pub_name.split(', ')
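The switch from paragraph_list to scrap_text is what lets nested inline markup survive: itertext walks the whole subtree instead of stopping at elem.text. A minimal standalone sketch with lxml (the sample paragraph is invented):

from lxml import etree

# invented example of a paragraph with nested inline tags
p = etree.fromstring("<p>Le maire a <b>inauguré</b> la salle <i>polyvalente</i>.</p>")

all_text = [t.strip() for t in p.itertext(with_tail=True) if t.strip() != '']
print(" ".join(all_text))
# -> "Le maire a inauguré la salle polyvalente ."
# (the old elem.text approach would have stopped at "Le maire a ")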
@@ -87,24 +126,101 @@ class EuropressFileParser(FileParser):
                         hyperdata['journal'] = pub_name.strip()
                     except:
                         pass

                 header = html_article.xpath(header_xpath)[0].text
-                hyperdata.update(self._parse_header(header))
-
-                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
-                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
-                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
+                if header is not None:
+                    # Article headers in europress
+                    # -----------------------------
+                    # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
+                    # ex: "Votre ville, jeudi 6 février 2014"
+                    # ex: "World, Friday, November 13, 2015"
+
+                    # 1) test the language before splitting
+                    if re.search(format_date_fr, header):
+                        doc_language = 'fr'
+                        # print("=============== Header date fr")
+                        # save for FileParser
+                        hyperdata["language_iso2"] = 'fr'
+                    elif re.search(format_date_en, header):
+                        doc_language = 'en'
+                        # print("=============== Header date en")
+                        # save for FileParser
+                        hyperdata["language_iso2"] = 'en'
+                    else:
+                        print("WARNING europress: header language diagnosis failed on '%s'" % header)
+                        # default value, used locally, not saved
+                        doc_language = 'en'
+
+                    # nota bene: in English the date contains 1 or 2 commas
+                    # e.g. "Tuesday, November 7, 2012"
+                    # ==> in all those cases the 'en' dateparser.parse
+                    # will be run on header[i:], not header[i]
+                    header = header.split(', ')
+
+                    # but dateparser rejects extra elements after the date
+                    # ==> filter out the page indications that europress
+                    # often appends
+                    header = list(filter(lambda x: format_page.match(x) is None, header))
+
+                    date = None
+                    if parse_date(header[0], doc_language) is not None:
+                        if doc_language == 'fr':
+                            date = header[0]
+                            # print("match 1 fre => 0 = %s " % date)
+                        else:
+                            date = ' '.join(header[0:])
+                            # print("match 0 eng => 0: = %s " % date)
+                    else:
+                        # most probably a news_topic before the beginning of the date
+                        hyperdata['rubrique'] = header[0]
+
+                        # [1..last_header_fragment]
+                        for i in range(1, len(header)):
+                            if parse_date(header[i], doc_language) is not None:
+                                if doc_language == 'fr':
+                                    date = header[i]
+                                    # print("match %i fre => %i = %s " % (i,i,date))
+                                else:
+                                    date = ' '.join(header[i:])
+                                    # print("match %i eng => %i: = %s " % (i,i,date))
+
+                    # default
+                    if date is None:
+                        date = '2016'
+                        # print("no match => 2016")
+
+                    # we parse the retrieved datestring into a formal date
+                    try:
+                        # NB: dateparser.parse takes the languages as a keyword
+                        # list, not as a positional string
+                        hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=[doc_language])
+                        # print("RES POSTPROC:", hyperdata['publication_date'])
+                    except:
+                        hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                 try:
-                    title = paragraph_list(html_article.xpath(title_xpath))
+                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
+                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
+                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
+                except:
+                    # .get() avoids a KeyError here: the title is not set yet
+                    print(hyperdata.get('title'))
+                    print(date)
+                    #print(hyperdata['publication_date'])
+
+                try:
+                    title = scrap_text(html_article.xpath(title_xpath))
                     hyperdata['title'] = title[0]
                 except:
                     pass

                 try:
-                    text = paragraph_list(html_article.xpath(text_xpath))
-                    hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                    text = scrap_text(html_article.xpath(text_xpath))
+                    hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
                 except:
                     pass
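Both recovery paths end in dateparser with an explicit language list. A quick sketch of the two cases, using sample headers from the comments above (the printed values are the expected results, not captured output):

import dateparser

# French: the date sits in a single comma-separated fragment
header = "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17".split(', ')
print(dateparser.parse(header[1], languages=['fr']))
# expected: 2013-01-28 00:00:00

# English: the date itself contains commas, so the fragments from the
# first match onwards are re-joined before parsing
header = "World, Friday, November 13, 2015".split(', ')
print(dateparser.parse(' '.join(header[1:]), languages=['en']))
# expected: 2015-11-13 00:00:00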
@@ -114,60 +230,6 @@ class EuropressFileParser(FileParser):
             PrintException()
             pass

-class EuropressFileParser_fr(EuropressFileParser):
-    def _parse_header(self, header):
-        format_date = re.compile('.*\d{4}.*', re.UNICODE)
-        hyperdata = dict()
-        if header is not None:
-            header = header.split(', ')
-            if format_date.match(header[0]):
-                date = header[0]
-            elif format_date.match(header[1]):
-                hyperdata['rubrique'] = header[0]
-                date = header[1]
-                try:
-                    hyperdata['page'] = header[2].split(' ')[1]
-                except:
-                    pass
-            else:
-                date = header[2]
-        try:
-            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
-        except:
-            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
-        return(hyperdata)
-        #print(hyperdata['publication_date'])
-
-class EuropressFileParser_en(EuropressFileParser):
-    def _parse_header(self, header):
-        format_date = re.compile('.*\d{4}.*', re.UNICODE)
-        if header is not None:
-            header = header.split(', ')
-            if format_date.match(header[0]):
-                date = header[0]
-            elif format_date.match(header[1]):
-                hyperdata['rubrique'] = header[0]
-                date = header[1]
-                try:
-                    hyperdata['page'] = header[2].split(' ')[1]
-                except:
-                    pass
-            else:
-                date = header[2]
-        try:
-            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
-        except:
-            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
-
 if __name__ == "__main__":
     e = EuropressFileParser()
     hyperdata = e.parse(str(sys.argv[1]))
...
parsing/FileParsers/EuropressFileParser_en.py (deleted file)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain

from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser_en(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_page = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']//p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of an entire subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title node or several <p> in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext yields each sub-element's text exactly once,
                # whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    # nota bene: in English the date contains 1 or 2 commas
                    # e.g. "Tuesday, November 7, 2012"
                    # ==> in all those cases the 'en' dateparser.parse
                    # is run on header[i:], not header[i]
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
                except:
                    pass

                yield hyperdata
        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
...

parsing/FileParsers/EuropressFileParser_fr.py (deleted file)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain

from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser_fr(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_date = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of an entire subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title node or several <p> in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext yields each sub-element's text exactly once,
                # whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata
        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
...

parsing/FileParsers/__init__.py

@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
 from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
-from .EuropressFileParser_en import EuropressFileParser_en
-from .EuropressFileParser_fr import EuropressFileParser_fr
+# 2015-12-08: 2-in-1 parser
+from .EuropressFileParser import EuropressFileParser
 from .ISTex import ISTex
 from .CSVParser import CSVParser
...

+# import * via __init__.py
 from .FileParsers import *

 parsers = {
@@ -6,9 +7,16 @@ parsers = {
     'Scopus (RIS format)' : RisFileParser,
     'Zotero (RIS format)' : ZoteroFileParser,
     'Jstor (RIS format)' : JstorFileParser,
+    'Europress (French)' : EuropressFileParser,
+    'Europress (English)' : EuropressFileParser,
+    # a single 'Europress' entry could replace the French/English
+    # variants, but (TODO) consistency must first be checked:
+    #   - with the DB: node_resourcetype
+    #   - with admin/update_corpus.py
     #'Europress' : EuropressFileParser,
-    'Europress (French)' : EuropressFileParser_fr,
-    'Europress (English)' : EuropressFileParser_en,
     'CSVParser' : CSVParser,
     'ISTex' : ISTex,
 }
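Both Europress entries now resolve to the same class, so whatever code consumes this registry needs no change. A minimal sketch of such a lookup (the variable names and filename are hypothetical; the dict keys are the ones defined above):

# hypothetical dispatch sketch: both resource types resolve to the same
# parser class, which detects fr/en per article at parse time
resourcetype_name = 'Europress (French)'   # or 'Europress (English)'
parser = parsers[resourcetype_name]()      # an EuropressFileParser instance
for hyperdata in parser.parse('export_europresse.html'):
    print(hyperdata.get('language_iso2'), hyperdata.get('title'))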
...