humanities / gargantext / Commits

Commit d87916f9, authored Dec 10, 2015 by delanoe
Merge branch 'romain' into unstable
Parents: 485f2b02, d5a8e664
Showing 10 changed files with 368 additions and 477 deletions (+368 / -477)
README.rst                                        +0    -0
gargantext_web/views_optimized.py                 +2    -9
ngram/importExport.py                           +198   -49
ngram/stop.py                                     +1    -0
parsing/FileParsers/EuropressFileParser.py      +144   -82
parsing/FileParsers/EuropressFileParser_en.py     +0  -165
parsing/FileParsers/EuropressFileParser_fr.py     +0  -167
parsing/FileParsers/ISTex.py                     +11    -1
parsing/FileParsers/__init__.py                   +2    -2
parsing/parsers_config.py                        +10    -2
init/README.rst → README.rst: file moved (view file @ d87916f9)
gargantext_web/views_optimized.py (view file @ d87916f9)

@@ -125,21 +125,14 @@ def project(request, project_id):
            thefile = form.cleaned_data['file']
            resourcetype = cache.ResourceType[form.cleaned_data['type']]

            # which default language shall be used?
            if resourcetype.name == "Europress (French)":
                language_id = cache.Language['fr'].id
            elif resourcetype.name == "Europress (English)":
                language_id = cache.Language['en'].id
            else:
                language_id = None

            # corpus node instanciation as a Django model
            corpus = Node(
                name        = name,
                user_id     = request.user.id,
                parent_id   = project_id,
                type_id     = cache.NodeType['Corpus'].id,
                language_id = language_id,
                # no default language at this point
                language_id = None,
                hyperdata   = {'Processing' : "Parsing documents",}
            )
            session.add(corpus)
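The block above is where a corpus-wide default language used to be picked from the resource type; after this commit the merged EuropressFileParser (further down in this diff) stamps each document with its own language hint, so the view keeps the corpus language unset. A minimal sketch of the new flow, with illustrative values only:

# per corpus, in views_optimized.py: no default language any more
corpus_language_id = None
# per document, set by EuropressFileParser depending on the date format it detects
doc_hyperdata = {'language_iso2': 'fr'}   # or 'en'
print(corpus_language_id, doc_hyperdata)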
ngram/importExport.py (view file @ d87916f9)

import re

from admin.utils import PrintException

from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram
from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext_web.db import cache, session, get_or_create_node, bulk_insert

import sqlalchemy as sa
@@ -13,73 +13,222 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
from analysis.lists import WeightedList, UnweightedList

def exportNgramList(node, filename):

    from collections import defaultdict
    from csv import writer, reader, QUOTE_MINIMAL

    def get_id(ngram_terms):
        query = session.query(Ngram.id).filter(Ngram.terms == ngram_terms).first()
        return (query)
def exportNgramList(node, filename, delimiter="\t"):
    # the nodes holding the lists
    # ---------------------------
    stop_node  = get_or_create_node(nodetype='StopList', corpus=node)
    miam_node  = get_or_create_node(nodetype='MiamList', corpus=node)
    map_node   = get_or_create_node(nodetype='MapList',  corpus=node)
    group_node = get_or_create_node(nodetype='Group',    corpus=node)

    stop_ngrams  = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stop_node.id).all()
    miam_ngrams  = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == miam_node.id).all()
    map_ngrams   = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == map_node.id).all()
    group_ngrams = (session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
                           .filter(NodeNgramNgram.node_id == group_node.id)
                           .all())

    all_ngrams = set()
    grouped = defaultdict(lambda: defaultdict(set))
    toList = list()

    for ngram in group_ngrams:
        grouped[ngram[0]].add(ngram[1])
        all_ngrams.add(ngram[0])
        all_ngrams.add(ngram[1])

    # the corresponding lists of ngram_ids
    # ------------------------------------
    #~~ content: list of the ids [2562,...]
    stop_ngram_ids = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection]
    # same thing for miam and map
    miam_ngram_ids = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection]
    map_ngram_ids  = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection]

    def add_ngram(fromList, toList=toList, grouplist=grouped, all_ngrams=all_ngrams, weight=0):
        for ngram_id in from_list:
            all_ngrams.add(ngram_id)
            if ngram_id in grouplist.keys():
                ngrams.append((ngram_id, grouped[ngram_id], weight))
            else:
                ngram.append((ngram_id, "", weight))

    add_ngrams(stop_ngrams, weight=0)
    add_ngrams(miam_ngrams, weight=1)
    add_ngrams(map_ngrams, weight=2)

    # for the group_list we have couples of ngram_ids
    # ------------------------------------------------
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = [(nd_ng_ng.ngramx_id, nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection]

    # to csv
    with open(filename, "w") as f:
        f.write(ngram) for ngram in ngrams

    # the k couples as a set
    # ----------------------
    # [(a => x) (a => y)] => [a => {x,y}]
    grouped = defaultdict(set)
    for ngram in group_ngram_id_couples:
        # /!\ just in one direction /!\
        # a => {x} but not x => {a}
        grouped[ngram[0]].add(ngram[1])

    # helper func
    def ngrams_to_csv_rows(ngram_ids, id_groupings={}, list_type=7):
        """
        Basic info table per ngram:
            (ng_id, term form, weight, list_type)
        with an optional extra column:
            ngrams grouped with this id, ex: "4|42"

        Returns a csv_rows matrix as a list of lists
           [
            [line1_colA, line1_colB..],
            [line2_colA, line2_colB..],
            ..
           ]
        (then for instance csv.writer.writerows(csv_rows))
        """
        # retrieve all the Ngram objects (with their terms) in one go
        ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()

        # transcribe them into a table (list of lists)
        csv_rows = list()
        for ng_obj in ng_objs:
            ng_id = ng_obj.id
            if ng_id in id_groupings.keys():
                this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
            else:
                this_grouped = ""

            # transcription: 5 columns
            # ID , term , n , list_type , gid|gid|gid
            csv_rows.append(
                [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
            )
        # csv_rows = [[line1_a, line1_b..],[line2_a, line2_b..],..]
        return csv_rows

    # we apply our ng_to_csv function to each list
    # --------------------------------------------
    stop_csv_rows = ngrams_to_csv_rows(stop_ngram_ids, id_groupings=grouped, list_type=0)

    # miam contains map, so a preliminary step is needed here
    miam_without_map = [ng for ng in miam_ngram_ids if ng not in map_ngram_ids]
    miam_csv_rows = ngrams_to_csv_rows(miam_without_map, id_groupings=grouped, list_type=1)

    map_csv_rows = ngrams_to_csv_rows(map_ngram_ids, id_groupings=grouped, list_type=2)

    # all lists together now
    this_corpus_all_rows = stop_csv_rows + miam_csv_rows + map_csv_rows

    # output
    with open(filename, 'w') as out_file:
        # csv.writer()
        csv_wr = writer(out_file, delimiter=delimiter, quoting=QUOTE_MINIMAL)
        # write to outfile
        csv_wr.writerows(this_corpus_all_rows)
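Each row written above has the five columns described in the ngrams_to_csv_rows docstring (ID, term, n, list type, grouped ids). A quick, purely illustrative check of that format (the values below are made up):

# one tab-separated line as exportNgramList would write it (illustrative values)
row = "3544\tglobal warming\t2\t1\t2353|4032"
ng_id, terms, n, list_type, grouped = row.split("\t")
grouped_ids = [int(gid) for gid in grouped.split("|")] if grouped else []
print(ng_id, terms, n, list_type, grouped_ids)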
def importNgramList(node, filename):
def importNgramList(node, filename, delimiter="\t", modify_lists=[0, 1, 2]):
    '''
    Assumes
    Assumes a CSV table with columns as in the export function.

    /!\ erases and replaces the existing lists /!\
    /!\ (deletes their NodeNgram collections)  /!\
    '''
    list_types_shortcuts = {
        0: "StopList",
        1: "MiamList",
        2: "MapList",
    }

    # delete all the NodeNgrams of the lists to be modified
    # ------------------------------------------------------
    for list_shortcut in modify_lists:
        # find previous listnode id
        list_type = list_types_shortcuts[list_shortcut]
        list_node = get_or_create_node(nodetype=list_type, corpus=node)
        node_id = listnode.id

        # delete previous lists
        session.query(NodeNgram).filter(NodeNgram.node_id == list_node.id).delete()
        session.commit()

    # read the CSV
    # ------------
    ngrams_csv_rows = []
    with open(filename, "r") as f:
        ngrams_list = f.read().splitlines()
        ngrams_csv_rows = reader(f, delimiter=delimiter, quoting=QUOTE_MINIMAL)

    # for row delete others and
    stop_words = set(stop_list)
    stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])

    stop_node = get_or_create_node(nodetype='StopList', corpus=node)
    all_read_terms = list()
    session.add(stop_node)
    session.commit()

    for csv_row in ngrams_csv_rows:
        this_ng_id           = csv_row[0]
        this_ng_terms        = csv_row[1]
        this_ng_nlen         = csv_row[2]
        this_ng_list_type_id = csv_row[3]
        this_ng_grouped_ngs  = csv_row[4]

        # --- which target list?
        #     ex: "MiamList"
        list_type = type_ids_cache[this_ng_list_type_id]
        tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)

        # --- test 1: does this form already exist in node_ngram?
        #preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()

        #if preexisting is None:
        #    # todo add the Ngram to the node_ngram table
        #    #      with a new ID

        # --- test 2: is the form already in a list?
        #if preexisting is not None:
        #    # first node of type "list" mentioning this ngram_id
        #    node_ngram = preexisting.node_node_ngram_collection[0]
        #    previous_list = node_ngram.node_id
        #
        # ---------------
        data[0] = tgt_list_node.id
        data[1] = this_ng_id   # we assume the same ngram_id
        data[2] = size = len(list(stop_words))

    data = zip(
        [stop_node.id for i in range(0, size)],
        [stop_ids[word] for word in list(stop_words)],
        [-1 for i in range(0, size)]
    )
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
    # bulk_insert(NodeNgramNgram, ['node_id', 'ngramx_id', 'ngramy_id', 'weight'], [d for d in data])

    # reading the pre-existing ngrams
    # -------------------------------
    # Note: when we have a list_node li, we can simply do:
    #       li.node_node_ngram_collection
    #       (gives all the node_ngram)
    #       (faster than launching a new session.query)
    #
    # TODO simply use:
    #      [w.node_ngram for w in listnode.node_node_ngram_collection]
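A possible round trip with the two helpers above (corpus_node and the file name are illustrative; note that importNgramList is still work in progress in this commit and, per its docstring, replaces the existing lists):

from ngram.importExport import exportNgramList, importNgramList

exportNgramList(corpus_node, "corpus_lists.csv", delimiter="\t")
importNgramList(corpus_node, "corpus_lists.csv", delimiter="\t", modify_lists=[0, 1, 2])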
ngram/stop.py (view file @ d87916f9)

@@ -56,6 +56,7 @@ def isStopWord(ngram, stop_words=None):
            , "(.*)\d(.*)"
            , "(.*)(\.)(.*)"
            , "(.*)(\,)(.*)"
            , "(.*)(< ?/?p ?>)(.*)"   # paragraph marks
            , "(.*)(study)(.*)"
            , "(.*)(xx|xi|xv)(.*)"
            , "(.*)(result)(.*)"
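To illustrate how such patterns act inside a stop-word test (isStopWord is the function named in the hunk header; the snippet below is only a sketch of the idea, not the actual implementation):

import re

# hypothetical: an ngram is flagged when any pattern matches its surface form
patterns = [
    r"(.*)\d(.*)",            # contains a digit
    r"(.*)(< ?/?p ?>)(.*)",   # leftover <p>/</p> paragraph marks
    r"(.*)(study)(.*)",
    r"(.*)(result)(.*)",
]
compiled = [re.compile(p) for p in patterns]

def looks_like_stop_word(ngram_terms):
    return any(p.match(ngram_terms) for p in compiled)

print(looks_like_stop_word("results of the study"))  # True
print(looks_like_stop_word("climate change"))        # False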
parsing/FileParsers/EuropressFileParser.py (view file @ d87916f9)

"""
Parses Europress 2015 html format (both for english and french)
 => recognizes language according to date format
 => scraps text for each paragraph to fill hyperdata['abstract']
"""
__author__    = "Gargantext Team"
__copyright__ = "Copyright 2014-15 ISCPIF-CNRS"
__version__   = "0.1"
__email__     = "romain.loth@iscpif.fr"
__status__    = "Test"

import re
import locale
@@ -23,12 +34,24 @@ from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser(FileParser):

    def _parse_header(self, header):
        pass

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"

        #print("europr_parser file", file)

        localeEncoding = "fr_FR"
        codif = "UTF-8"

        format_page = re.compile('p\. .*', re.UNICODE)

        # the en/fr europresse docs are mainly distinguished
        # by the shape of their date
        # ex: November 7, 2012
        format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')
        # ex: 16 mars 2011
        format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')
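These two date regexes drive the language split. Checked against the example headers quoted later in this file ("Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17" and "World, Friday, November 13, 2015"), with the regexes copied from the code above and the rest purely illustrative:

import re

format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')
format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')

fr_header = "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
en_header = "World, Friday, November 13, 2015"

print(bool(re.search(format_date_fr, fr_header)))  # True -> doc_language = 'fr'
print(bool(re.search(format_date_en, en_header)))  # True -> doc_language = 'en'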
@@ -52,31 +75,47 @@ class EuropressFileParser(FileParser):
        name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath  = "./header/div[@class='titreArticle']/descendant-or-self::*"
        text_xpath   = "./section/div[@class='DocText']/descendant-or-self::*"
        title_xpath  = "./header/div[@class='titreArticle']"
        text_xpath   = "./section/div[@class='DocText']//p"

        def paragraph_list(data_xpath):
        def scrap_text(data_xpath):
            """
            Retrieves the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title or several p's in data_xpath
            for elem in data_xpath:
                if elem.text is not None:
                    if elem.text.strip() != '':
                        if elem.tag == 'p':
                            result.append(elem.text)
                        else:
                            if len(result) > 0:
                                result.append(result.pop() + elem.text)
                            else:
                                result.append(elem.text)
                all_text = list()
                # we use itertext to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                print('article')
                # print("2 en 1 ==============================new article")
                hyperdata = {}

                # language analysis => useful for the date;
                # done locally so that users can pick the "Europress"
                # ResourceType without worrying about the source language
                doc_language = None

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
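A standalone sketch of what the itertext-based scrap_text above does on a tiny HTML fragment (lxml usage as in the parser; the sample markup is made up):

from lxml import etree

html = etree.fromstring(
    "<div class='DocText'><p>Première <b>phrase</b> ici.</p><p>Seconde phrase.</p></div>",
    etree.HTMLParser()
)

def scrap_text(data_xpath):
    # join all text fragments of each node, whatever their depth
    result = []
    for elem in data_xpath:
        all_text = [t.strip() for t in elem.itertext(with_tail=True) if t.strip() != '']
        result.append(" ".join(all_text))
    return result

print(scrap_text(html.xpath("//div[@class='DocText']//p")))
# expected: ['Première phrase ici.', 'Seconde phrase.']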
@@ -87,24 +126,101 @@ class EuropressFileParser(FileParser):
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                hyperdata.update(self._parse_header(header))
                hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')

                if header is not None:
                    # Article headers in europress
                    # -----------------------------
                    # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
                    # ex: "Votre ville, jeudi 6 février 2014"
                    # ex: "World, Friday, November 13, 2015"

                    # 1) test language before splitting
                    if re.search(format_date_fr, header):
                        doc_language = 'fr'
                        # print("=============== Header date fr")
                        # save for FileParser
                        hyperdata["language_iso2"] = 'fr'
                    elif re.search(format_date_en, header):
                        doc_language = 'en'
                        # print("=============== Header date en")
                        # save for FileParser
                        hyperdata["language_iso2"] = 'en'
                    else:
                        print("WARNING europress: echec diagnostic langue header sur '%s'" % header)
                        # default value, used locally, not saved
                        doc_language = 'en'

                    # beware: in english the date contains 1 or 2 commas
                    # ex: "Tuesday, November 7, 2012"
                    # ==> in all those 'en' cases dateparser.parse
                    #     will be run on header[i:] and not on header[i]
                    header = header.split(', ')

                    # but dateparser rejects extra elements after the date
                    # ==> we filter out the page indications europress often appends
                    header = list(filter(lambda x: format_page.match(x) is None, header))

                    date = None
                    if parse_date(header[0], doc_language) is not None:
                        if doc_language == 'fr':
                            date = header[0]
                            # print("match 1 fre => 0 = %s " % date)
                        else:
                            date = ' '.join(header[0:])
                            # print("match 0 eng => 0: = %s " % date)
                    else:
                        # most probably news_topic before beginning of date
                        hyperdata['rubrique'] = header[0]

                        # [1..last_header_fragment]
                        for i in range(1, len(header)):
                            if parse_date(header[i], doc_language) is not None:
                                if doc_language == 'fr':
                                    date = header[i]
                                    # print("match %i fre => %i = %s " % (i,i,date))
                                else:
                                    date = ' '.join(header[i:])
                                    # print("match %i eng => %i: = %s " % (i,i,date))

                    # default
                    if date is None:
                        date = '2016'
                        # print("no match => 2016")

                    # we parse the retrieved datestring into a formal date
                    try:
                        hyperdata['publication_date'] = dateparser.parse(date.strip(), doc_language)
                        # print("RES POSTPROC:",hyperdata['publication_date'])
                    except:
                        hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    title = paragraph_list(html_article.xpath(title_xpath))
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = paragraph_list(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = ' '.join([' <p> ' + p + ' </p> ' for p in title[1:] + text])
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                except:
                    pass
@@ -114,60 +230,6 @@ class EuropressFileParser(FileParser):
            PrintException()
            pass

class EuropressFileParser_fr(EuropressFileParser):
    def _parse_header(self, header):
        format_date = re.compile('.*\d{4}.*', re.UNICODE)
        hyperdata = dict()
        if header is not None:
            header = header.split(', ')
            if format_date.match(header[0]):
                date = header[0]
            elif format_date.match(header[1]):
                hyperdata['rubrique'] = header[0]
                date = header[1]
                try:
                    hyperdata['page'] = header[2].split(' ')[1]
                except:
                    pass
            else:
                date = header[2]
        try:
            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
        except:
            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
        return (hyperdata)
        #print(hyperdata['publication_date'])

class EuropressFileParser_en(EuropressFileParser):
    def _parse_header(self, header):
        format_date = re.compile('.*\d{4}.*', re.UNICODE)
        if header is not None:
            header = header.split(', ')
            if format_date.match(header[0]):
                date = header[0]
            elif format_date.match(header[1]):
                hyperdata['rubrique'] = header[0]
                date = header[1]
                try:
                    hyperdata['page'] = header[2].split(' ')[1]
                except:
                    pass
            else:
                date = header[2]
        try:
            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
        except:
            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
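A usage sketch for the merged parser, modelled on the __main__ blocks of the two deleted files below (the import path and file name are illustrative):

# python parsing/FileParsers/EuropressFileParser.py europresse_export.html
from parsing.FileParsers.EuropressFileParser import EuropressFileParser

e = EuropressFileParser()
for h in e.parse("europresse_export.html"):
    print(h.get('journal'), ":", h.get('publication_date'))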
parsing/FileParsers/EuropressFileParser_en.py (deleted, 100644 → 0; view file @ 485f2b02)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException

class EuropressFileParser_en(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_page = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath  = "./header/div[@class='titreArticle']"
        text_xpath   = "./section/div[@class='DocText']//p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title or several p's in data_xpath
            for elem in data_xpath:
                all_text = list()
                # we use itertext to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number']  = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    # beware: in english the date contains 1 or 2 commas
                    # ex: "Tuesday, November 7, 2012"
                    # ==> in all those cases 'en' dateparser.parse
                    #     will be run on header[i:] and not on header[i]
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                except:
                    pass

                yield hyperdata
        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
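The encoding fallback used by these parsers (detect_encoding, then a latin1 → UTF-8 re-encode) can be tried in isolation; chardet is only an assumption about what detect_encoding might wrap, not something this commit shows:

import chardet

def reencode_to_utf8(contents):
    # guess the encoding of the raw bytes (assumption: chardet-style detection)
    guess = chardet.detect(contents)
    encoding = (guess.get('encoding') or 'utf-8').lower()
    if encoding != 'utf-8':
        # same fallback as the parsers: decode as latin1, re-encode as UTF-8
        contents = contents.decode('latin1', errors='replace').encode('UTF-8')
    return contents

print(reencode_to_utf8("café près de l'hôtel".encode('latin1')).decode('utf-8'))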
parsing/FileParsers/EuropressFileParser_fr.py (deleted, 100644 → 0; view file @ 485f2b02)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException

class EuropressFileParser_fr(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_date = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath  = "./header/div[@class='titreArticle']"
        text_xpath   = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title or several p's in data_xpath
            for elem in data_xpath:
                all_text = list()
                # we use itertext to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number']  = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata
        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
parsing/FileParsers/ISTex.py (view file @ d87916f9)

@@ -82,10 +82,20 @@ class ISTex(FileParser):
        if len(hyperdata["genre"]) == 0:
            hyperdata.pop("genre")

        if "language_iso3" in hyperdata:
            if len(hyperdata["language_iso3"]) > 0:
            # retrieve lang if lang != [] and lang != ["unknown"]
            # ---------------------------------------------------
            if len(hyperdata["language_iso3"]) > 0 and hyperdata["language_iso3"][0] != "unknown":
                hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
            # default value = eng
            # possible even better: langid.classify(abstract)
            else:
                # NB 97% of istex docs are eng, hence the default
                # ------------------------------------------------
                hyperdata["language_iso3"] = "eng"
                # (cf. api.istex.fr/document/?q=*&facet=language
                #  and langid tests on the language=["unknown"] docs)

        if "publication_date" in hyperdata:
            RealDate = hyperdata["publication_date"]
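The "possible even better: langid.classify(abstract)" remark above could look roughly like this; langid returns ISO 639-1 codes, so a 2-letter to 3-letter mapping would still be needed (the abstract and the tiny mapping are illustrative only):

import langid

abstract = "This study analyses the press coverage of climate policy."
lang2, score = langid.classify(abstract)     # e.g. ('en', -54.3)

iso2_to_iso3 = {'en': 'eng', 'fr': 'fra'}    # deliberately incomplete, just for the sketch
print(iso2_to_iso3.get(lang2, 'eng'))        # fall back to "eng" as the parser does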
parsing/FileParsers/__init__.py (view file @ d87916f9)

@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser_en import EuropressFileParser_en
from .EuropressFileParser_fr import EuropressFileParser_fr
# 2015-12-08: 2-in-1 parser
from .EuropressFileParser import EuropressFileParser
from .ISTex import ISTex
from .CSVParser import CSVParser
parsing/parsers_config.py (view file @ d87916f9)

# import * via __init__.py
from .FileParsers import *

parsers = {
@@ -6,9 +7,16 @@ parsers = {
    'Scopus (RIS format)'   : RisFileParser,
    'Zotero (RIS format)'   : ZoteroFileParser,
    'Jstor (RIS format)'    : JstorFileParser,
    'Europress (French)'    : EuropressFileParser,
    'Europress (English)'   : EuropressFileParser,
    # A single entry could replace the French/English variants,
    # but (TODO) we would just need to check consistency:
    #  - with the DB: node_resourcetype
    #  - with admin/update_corpus.py
    #'Europress'            : EuropressFileParser,
    'Europress (French)'    : EuropressFileParser_fr,
    'Europress (English)'   : EuropressFileParser_en,
    'CSVParser'             : CSVParser,
    'ISTex'                 : ISTex,
}
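A hedged sketch of how this mapping is typically consumed, picking a parser class by resource type name and running it on an uploaded file (the dispatch shown here is an illustration, not code from this commit):

from parsing.parsers_config import parsers

ParserClass = parsers['Europress (French)']   # EuropressFileParser after this commit
parser = ParserClass()
for hyperdata in parser.parse('europresse_export.html'):   # file name is made up
    print(hyperdata.get('title'))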