humanities / gargantext · Commit b192ddd8
authored Jul 28, 2016 by c24b
parent 8ff1648c

    M EUROPRESSE > EUROPRESS.py

1 changed file, 278 additions, 0 deletions:
gargantext/util/parsers/EUROPRESS.py (new file, 0 → 100644)
"""Parses Europress 2015 html format (both for English and French)
- recognizes language according to date format
- scraps text for each paragraph to fill hyperdata['abstract']
"""
__author__
=
"Gargantext Team"
__copyright__
=
"Copyright 2014-15 ISCPIF-CNRS"
__version__
=
"0.1"
__email__
=
"romain.loth@iscpif.fr"
__status__
=
"Test"
import
re
import
locale
from
lxml
import
etree
from
lxml.etree
import
tostring
from
lxml.html
import
html5parser
from
itertools
import
chain
from
datetime
import
datetime
,
date
from
django.utils
import
timezone
import
dateutil.parser
import
dateparser
import
sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from
._Parser
import
Parser
class EuropressParser(Parser):

    def parse(self, file):
        #print("europr_parser file", file)
        localeEncoding = "fr_FR"
        codif = "UTF-8"

        # en/fr europresse docs are mainly
        # distinguished by the shape of their date
        # ex: November 7, 2012
        format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')

        # ex: 16 mars 2011
        format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except Exception:
                raise ValueError('Error while decoding from "latin1" to "%s"' % codif)

        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
            html_parser = html5parser.etree.HTMLParser(encoding=codif)
            html = html5parser.etree.fromstring(contents, html_parser)
            html_articles = html.xpath('//article')
        except Exception as error:
            print("Europresse lxml error:", error)
        # all except detail_header are mandatory to parse the article
        name_xpath          = "./header/div/span[@class = 'DocPublicationName']"

        # title_xpath (normal case):
        #     "./header/div[@class='titreArticle']"
        # title_xpath (reports):
        #     "./header/div/p[@class='titreArticleVisu grandTitre']"
        #
        # title_xpath (more generic path)
        title_xpath         = "./header//*[contains(@class,'titreArticle')]"
        text_xpath          = "./section/div[@class='DocText']//p"
        entire_header_xpath = "./header"

        # diagnosed during date retrieval and used for rubrique
        detail_header_xpath = "./header/div/span[@class = 'DocHeader']"

        def scrap_text(data_xpath):
            """
            Retrieves the text of an entire subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title or several <p> in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext gives us each sub-element
                # exactly once, whatever the depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                # print("==============================new article")

                # if there is no header at all we must skip the article
                all_header = html_article.xpath(entire_header_xpath)
                if len(all_header) == 0:
                    print("WARNING: europress (skip) article without header")
                    continue

                hyperdata = {}

                # TITLE
                # -----
                title = []
                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except Exception:
                    # there would be a display problem if there is no title!
                    print("WARNING: europress (skip) article without title")
                    continue

                # FULLTEXT
                # --------
                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(
                        ['<p>' + p_text + '</p>' for p_text in title[1:] + text]
                    )
                except Exception:
                    pass

                # PUBLICATIONNAME
                # ----------------
                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number']  = name[1]
                except Exception:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except Exception:
                        pass

                # DATE and LANGUAGE
                # -----------------
                # local language diagnosis via the date format
                #
                # lets us choose the "Europress" ResourceType without
                # worrying about the details of the source language
                doc_language = None
                date = None

                # the text where we will look for the date/language
                search_text = None

                # DocHeader zone giving rubrique and date precisely
                detailed_text = None

                get_detail_header = html_article.xpath(detail_header_xpath)

                if len(get_detail_header) != 0:
                    # most common case
                    # ----------------
                    # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
                    # ex: "Votre ville, jeudi 6 février 2014"
                    # ex: "World, Friday, November 13, 2015"
                    detailed_text = get_detail_header[0].text
                    search_text = detailed_text
                else:
                    # DocHeader occasionally absent
                    # (we fall back on the entire header)
                    search_text = " ".join(scrap_text(all_header[0]))
                    # print("---using all header: '%s'" % search_text)

                # we go on with date/language on the zone we obtained
                # 1) a REGEXP identifies the language AND catches the date
                test_date_fr = re.search(format_date_fr, search_text)

                if test_date_fr:
                    doc_language = 'fr'
                    # print("=============== Header date fr")
                    # save for FileParser
                    hyperdata["language_iso2"] = 'fr'
                    # match str
                    date_str = test_date_fr.group()
                else:
                    # ex: November 7, 2012
                    test_date_en = re.search(format_date_en, search_text)
                    if test_date_en:
                        doc_language = 'en'
                        # print("=============== Header date en")
                        # save for FileParser
                        hyperdata["language_iso2"] = 'en'
                        # match str
                        date_str = test_date_en.group()
                    else:
                        print("WARNING europress: date/language diagnosis failed on header '%s'" % search_text)
                        # default lg value, used locally, not saved
                        doc_language = 'en'
                        # default date value, will be saved
                        date_str = "2016"

                # 2) we parse the retrieved datestring into a formal date
                try:
                    the_date = dateparser.parse(
                        date_str.strip(),
                        languages=[doc_language],
                        date_formats=['%d %B %Y', '%B %d, %Y']
                    )
                except Exception:
                    the_date = timezone.now()

                hyperdata['publication_date'] = the_date.strftime("%Y-%m-%d %H:%M:%S")
                # print("RES POSTPROC:", hyperdata['publication_date'])

                # derived infos
                hyperdata['publication_year']  = the_date.strftime('%Y')
                hyperdata['publication_month'] = the_date.strftime('%m')
                hyperdata['publication_day']   = the_date.strftime('%d')

                # RUBRIQUE
                # --------
                # when we have the detailed DocHeader we can check the rubrique
                # (if present, it comes just before the date)
                if detailed_text is not None:
                    header_elts = detailed_text.split(', ')

                    # check that the first element is not a date or a date fragment
                    if parse_date(header_elts[0], doc_language) is None:
                        # most probably news_topic before beginning of date
                        hyperdata['rubrique'] = header_elts[0]

                yield hyperdata
        except Exception:
            print('Something bad happened.')


if __name__ == "__main__":
    e = EuropressParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except Exception:
            pass
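
For a quick standalone check of the date-based language detection described in the module docstring, the following sketch (not part of the commit) replays the same regex-then-dateparser flow on the two sample headers quoted in the code comments; the shortened month alternations are an assumption for brevity, the full lists are in the file above.

# Standalone sketch of the date-based language detection used in parse();
# sample headers come from the comments above, month lists shortened.
import re
import dateparser

format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|mars|novembre)\s+(?:19|20)\d\d')
format_date_en = re.compile(r'\b(?:March|November)\s+[0-3]?\d,\s+(?:19|20)\d\d')

for header in ("Seine-Saint-Denis, 16 mars 2011, p. 93", "World, November 7, 2012"):
    m = format_date_fr.search(header)
    lang = 'fr' if m else 'en'
    if m is None:
        m = format_date_en.search(header)
    date_str = m.group() if m else "2016"   # same fallback as the parser
    the_date = dateparser.parse(date_str, languages=[lang],
                                date_formats=['%d %B %Y', '%B %d, %Y'])
    print(header, "->", lang, the_date.date() if the_date else None)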