humanities / gargantext, commit d87916f9
Authored Dec 10, 2015 by delanoe
Merge branch 'romain' into unstable
Parents: 485f2b02, d5a8e664

Showing 10 changed files with 368 additions and 477 deletions.
Changed files:
  README.rst (moved from init/README.rst, +0 -0)
  gargantext_web/views_optimized.py (+2 -9)
  ngram/importExport.py (+198 -49)
  ngram/stop.py (+1 -0)
  parsing/FileParsers/EuropressFileParser.py (+144 -82)
  parsing/FileParsers/EuropressFileParser_en.py (deleted, +0 -165)
  parsing/FileParsers/EuropressFileParser_fr.py (deleted, +0 -167)
  parsing/FileParsers/ISTex.py (+11 -1)
  parsing/FileParsers/__init__.py (+2 -2)
  parsing/parsers_config.py (+10 -2)

init/README.rst → README.rst: file moved, no content changes.
gargantext_web/views_optimized.py

@@ -125,21 +125,14 @@ def project(request, project_id):

        thefile = form.cleaned_data['file']
        resourcetype = cache.ResourceType[form.cleaned_data['type']]

        # which default language shall be used?
        if resourcetype.name == "Europress (French)":
            language_id = cache.Language['fr'].id
        elif resourcetype.name == "Europress (English)":
            language_id = cache.Language['en'].id
        else:
            language_id = None

        # corpus node instanciation as a Django model
        corpus = Node(
            name = name,
            user_id = request.user.id,
            parent_id = project_id,
            type_id = cache.NodeType['Corpus'].id,
            language_id = language_id,
            # no default language at this point
            language_id = None,
            hyperdata = {'Processing' : "Parsing documents",}
        )
        session.add(corpus)
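For reference, the nine removed lines boil down to a small lookup that chose a corpus-wide default language from the resource type. A standalone sketch of that logic (the helper name and the cache argument are ours, not the project's):

def default_language_id(resourcetype_name, cache):
    # hypothetical helper restating the branch removed from the view:
    # Europress resource types used to force a default corpus language
    if resourcetype_name == "Europress (French)":
        return cache.Language['fr'].id
    elif resourcetype_name == "Europress (English)":
        return cache.Language['en'].id
    return None

After this commit the corpus is created with language_id=None, and the language question is presumably left to the merged Europress parser.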
ngram/importExport.py
View file @
d87916f9
import
re
from
admin.utils
import
PrintException
from
gargantext_web.db
import
Node
,
Ngram
,
NodeNgram
,
NodeNode
Ngram
from
gargantext_web.db
import
Node
,
Ngram
,
NodeNgram
,
NodeNodeNgram
,
NodeNgram
Ngram
from
gargantext_web.db
import
cache
,
session
,
get_or_create_node
,
bulk_insert
import
sqlalchemy
as
sa
...
...
@@ -13,73 +13,222 @@ from sqlalchemy.orm import aliased
from
ngram.tools
import
insert_ngrams
from
analysis.lists
import
WeightedList
,
UnweightedList
def exportNgramList(node, filename):

from collections import defaultdict
from csv import writer, reader, QUOTE_MINIMAL

def get_id(ngram_terms):
    query = session.query(Ngram.id).filter(Ngram.terms == ngram_terms).first()
    return(query)

def exportNgramList(node, filename, delimiter="\t"):
    # the nodes covering the lists
    # -----------------------------
    stop_node  = get_or_create_node(nodetype='StopList', corpus=node)
    miam_node  = get_or_create_node(nodetype='MiamList', corpus=node)
    map_node   = get_or_create_node(nodetype='MapList',  corpus=node)
    group_node = get_or_create_node(nodetype='Group',    corpus=node)

    stop_ngrams = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stop_node.id).all()
    miam_ngrams = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == miam_node.id).all()
    map_ngrams  = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == map_node.id).all()

    group_ngrams = (session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
                           .filter(NodeNgramNgram.node_id == group_node.id)
                           .all())

    all_ngrams = set()
    grouped = defaultdict(lambda: defaultdict(set))
    toList = list()
    # the matching lists of ngram_ids
    # ------------------------------------
    #~~ content: list of the ids [2562,...]
    stop_ngram_ids = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection]
    # same for miam and map
    miam_ngram_ids = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection]
    map_ngram_ids  = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection]

    # for the group_list we have couples of ngram_ids
    # -------------------
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = [(nd_ng_ng.ngramx_id, nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection]
    for ngram in group_ngrams:

    # the k couples as a set
    # --------------------
    # [(a => x) (a => y)] => [a => {x,y}]
    grouped = defaultdict(set)
    for ngram in group_ngram_id_couples:
        # /!\ just in one direction /!\
        # a => {x} but not x => {a}
        grouped[ngram[0]].add(ngram[1])
        all_ngrams.add(ngram[0])
        all_ngrams.add(ngram[1])
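    # e.g. couples like [(3544, 2353), (3544, 4032)] end up as
    # grouped == {3544: {2353, 4032}}  (only the x => {y, ...} direction is kept)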
    def add_ngram(fromList, toList=toList, grouplist=grouped, all_ngrams=all_ngrams, weight=0):
        for ngram_id in fromList:
            all_ngrams.add(ngram_id)
            if ngram_id in grouplist.keys():
                toList.append((ngram_id, grouped[ngram_id], weight))
            else:
                toList.append((ngram_id, "", weight))

    add_ngram(stop_ngrams, weight=0)
    add_ngram(miam_ngrams, weight=1)
    add_ngram(map_ngrams,  weight=2)
    # helper func
    def ngrams_to_csv_rows(ngram_ids, id_groupings={}, list_type=7):
        """
        Basic info table per ngram:
              (ng_id, term form, weight, list_type)
        with one optional extra column:
              the ngrams grouped with this id, e.g. "4|42"

        Returns a csv_rows matrix as a list of lists
           [
            [row1_colA, row1_colB..],
            [row2_colA, row2_colB..],
            ..
           ]
        (then for instance csv.writer.writerows(csv_rows))
        """
        # fetch the Ngram objects (with their terms) in one go
        ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()

        # transcribe them into a table (list of lists)
        csv_rows = list()
        for ng_obj in ng_objs:
            ng_id = ng_obj.id
            if ng_id in id_groupings.keys():
                this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
            else:
                this_grouped = ""

    # to csv
    with open(filename, "w") as f:
        f.write(ngram) for ngram in ngrams

            # transcription: 5 columns
            # ID , term , n , list_type , gid|gid|gid
            csv_rows.append([ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped])

        # csv_rows = [[row1_a, row1_b..],[row2_a, row2_b..],..]
        return csv_rows
    # we apply our ng_to_csv function to each list
    # ------------------------------------------------------
    stop_csv_rows = ngrams_to_csv_rows(stop_ngram_ids, id_groupings=grouped, list_type=0)

    # miam contains map, so a preliminary filter is needed here
    miam_without_map = [ng for ng in miam_ngram_ids if ng not in map_ngram_ids]
    miam_csv_rows = ngrams_to_csv_rows(miam_without_map, id_groupings=grouped, list_type=1)

    map_csv_rows = ngrams_to_csv_rows(map_ngram_ids, id_groupings=grouped, list_type=2)

    # all lists together now
    this_corpus_all_rows = stop_csv_rows + miam_csv_rows + map_csv_rows
def importNgramList(node, filename):

    # output
    with open(filename, 'w') as out_file:
        # csv.writer()
        csv_wr = writer(out_file, delimiter=delimiter, quoting=QUOTE_MINIMAL)
        # write to outfile
        csv_wr.writerows(this_corpus_all_rows)
def importNgramList(node, filename, delimiter="\t", modify_lists=[0, 1, 2]):
    '''
    Expects a CSV table with the same columns as in the export function.
    /!\ erases and replaces the existing lists /!\
    /!\ (removes their collection of NodeNgrams) /!\
    '''
    list_types_shortcuts = {
        0: "StopList",
        1: "MiamList",
        2: "MapList",
    }
    # we delete all the NodeNgrams of the lists to modify
    # ------------------------------------------------------
    for list_shortcut in modify_lists:
        # find previous listnode id
        list_type = list_types_shortcuts[list_shortcut]
        list_node = get_or_create_node(nodetype=list_type, corpus=node)
        node_id = list_node.id

        # delete previous lists
        session.query(NodeNgram).filter(NodeNgram.node_id == list_node.id).delete()
        session.commit()
    # read the CSV
    # --------------
    ngrams_csv_rows = []
    with open(filename, "r") as f:
        ngrams_list = f.read().splitlines()
        ngrams_csv_rows = reader(f, delimiter=delimiter, quoting=QUOTE_MINIMAL)

    all_read_terms = list()

    # for row delete others and
    for csv_row in ngrams_csv_rows:
        this_ng_id           = csv_row[0]
        this_ng_terms        = csv_row[1]
        this_ng_nlen         = csv_row[2]
        this_ng_list_type_id = csv_row[3]
        this_ng_grouped_ngs  = csv_row[4]
        # --- which target list?
        stop_words = set(stop_list)
        stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])

        # e.g. "MiamList"
        list_type = type_ids_cache[this_ng_list_type_id]
        stop_node = get_or_create_node(nodetype='StopList', corpus=node)
        tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)

        # --- test 1: does this form already exist in node_ngram?
        #preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()
        #if preexisting is None:
        #    # todo add the Ngram to the node_ngram table
        #    # with a new ID
        session.add(stop_node)
        session.commit()
        # --- test 2: is this form already in a list?
        #if preexisting is not None:
        #    # first node of type "list" mentioning this ngram_id
        #    #
        #    node_ngram = preexisting.node_node_ngram_collection[0]
        #    previous_list = node_ngram.node_id
        #
        # ---------------
        data[0] = tgt_list_node.id
        data[1] = this_ng_id
        # we assume the same ngram_id
        data[2] = size = len(list(stop_words))

        data = zip(
            [stop_node.id for i in range(0, size)],
            [stop_ids[word] for word in list(stop_words)],
            [-1 for i in range(0, size)]
        )
        bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
        # bulk_insert(NodeNgramNgram, ['node_id', 'ngramx_id', 'ngramy_id', 'weight'], [d for d in data])

    # reading the pre-existing ngrams
    # ------------------
    # Note: when we have a list_node li, we can do:
    #       li.node_node_ngram_collection
    #       (gives all the node_ngrams)
    #       (faster than launching a new session.query)
    #
    # TODO simply use:
    # [w.node_ngram for w in listnode.node_node_ngram_collection]
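To make the file format concrete, here is a minimal, self-contained sketch of the 5-column TSV that exportNgramList writes and importNgramList reads back. It uses only the standard csv module; the file name and sample rows are invented:

import csv

# columns: ngram_id, terms, n, list_type (0=Stop, 1=Miam, 2=Map), grouped ids "gid|gid"
rows = [
    [3544, "climate change", 2, 1, "2353|4032"],
    [2562, "xxviii", 1, 0, ""],
]

with open("ngram_lists.tsv", "w", newline="") as out_file:
    csv.writer(out_file, delimiter="\t", quoting=csv.QUOTE_MINIMAL).writerows(rows)

with open("ngram_lists.tsv", "r", newline="") as f:
    for ng_id, terms, n, list_type, grouped_ids in csv.reader(f, delimiter="\t"):
        grouped = [int(g) for g in grouped_ids.split("|")] if grouped_ids else []
        print(ng_id, terms, n, list_type, grouped)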
ngram/stop.py

@@ -56,6 +56,7 @@ def isStopWord(ngram, stop_words=None):

            , "(.*)\d(.*)"
            , "(.*)(\.)(.*)"
            , "(.*)(\,)(.*)"
            , "(.*)(< ?/?p ?>)(.*)"   # paragraph markers
            , "(.*)(study)(.*)"
            , "(.*)(xx|xi|xv)(.*)"
            , "(.*)(result)(.*)"
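The one added line filters out leftover paragraph tags. As a rough sketch of how such a pattern list is typically applied (only the patterns below come from the file; the surrounding function is our guess, not the project's isStopWord):

import re

stop_regexps = [
    r"(.*)\d(.*)",
    r"(.*)(\.)(.*)",
    r"(.*)(\,)(.*)",
    r"(.*)(< ?/?p ?>)(.*)",   # paragraph markers
    r"(.*)(study)(.*)",
    r"(.*)(xx|xi|xv)(.*)",
    r"(.*)(result)(.*)",
]

def looks_like_stop_word(ngram_terms):
    # hypothetical re-implementation: reject the ngram if any pattern matches
    return any(re.match(pattern, ngram_terms) for pattern in stop_regexps)

print(looks_like_stop_word("some <p> markup"))   # True
print(looks_like_stop_word("neural network"))    # False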
parsing/FileParsers/EuropressFileParser.py (+144 -82)

This diff is collapsed in the original view.
parsing/FileParsers/EuropressFileParser_en.py (deleted, previously mode 100644; last version at 485f2b02)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException
class EuropressFileParser_en(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_page = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']//p"
        def scrap_text(data_xpath):
            """
            Fetches the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori one single title, or several p, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # we use itertext to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass
                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    # beware: in English the date contains 1 or 2 commas
                    # ex: "Tuesday, November 7, 2012"
                    # ==> in all these cases 'en' dateparser.parse
                    #     will be run on header[i:] and not on header[i]
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'
                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)
                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
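The comma handling above is easier to see on a concrete header. A small standalone illustration (it needs the dateparser package; the sample header is invented):

import dateparser

# a Europress-style English header: rubric first, then a date with two commas
header = "Business News, Tuesday, November 7, 2012".split(', ')
# header == ['Business News', 'Tuesday', 'November 7', '2012']

print(dateparser.parse(header[0], languages=['en']))             # most likely None: not a date
print(dateparser.parse(' '.join(header[1:]), languages=['en']))  # datetime for 2012-11-07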
parsing/FileParsers/EuropressFileParser_fr.py (deleted, previously mode 100644; last version at 485f2b02)

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_date = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"
        def scrap_text(data_xpath):
            """
            Fetches the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori one single title, or several p, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # we use itertext to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])
                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
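Both deleted parsers flatten nested markup with lxml's itertext(), as in scrap_text above. A self-contained illustration of that pattern (the sample HTML is ours):

from lxml import html

fragment = html.fromstring("<div><p>Le climat <b>change</b> vite.</p><p>Deuxième paragraphe.</p></div>")
paragraphs = []
for elem in fragment.xpath('.//p'):
    # itertext yields every text chunk once, whatever the nesting depth
    pieces = [t.strip() for t in elem.itertext(with_tail=True) if t.strip() != '']
    paragraphs.append(' '.join(pieces))
print(paragraphs)   # ['Le climat change vite.', 'Deuxième paragraphe.']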
parsing/FileParsers/ISTex.py

@@ -82,10 +82,20 @@ class ISTex(FileParser):

        if len(hyperdata["genre"]) == 0:
            hyperdata.pop("genre")

        if "language_iso3" in hyperdata:
            if len(hyperdata["language_iso3"]) > 0:
            # retrieve lang if lang != [] and lang != ["unknown"]
            # ---------------------------------------------------
            if len(hyperdata["language_iso3"]) > 0 and hyperdata["language_iso3"][0] != "unknown":
                hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
            # default value = eng
            # possible even better: langid.classify(abstract)
            else:
                # NB 97% of the istex docs are eng, hence the default
                # ----------------------------------------------
                hyperdata["language_iso3"] = "eng"
                # (cf. api.istex.fr/document/?q=*&facet=language
                #  and langid tests on the language=["unknown"] docs)

        if "publication_date" in hyperdata:
            RealDate = hyperdata["publication_date"]
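The new branch amounts to a small normalisation rule for the ISTex language field; as a standalone sketch (the function name is ours):

def normalise_language_iso3(langs):
    # keep the first declared language unless the list is empty or says "unknown";
    # otherwise fall back to "eng" (per the comment, ~97% of ISTex docs are English)
    if len(langs) > 0 and langs[0] != "unknown":
        return langs[0]
    return "eng"

print(normalise_language_iso3(["fre"]))       # fre
print(normalise_language_iso3(["unknown"]))   # eng
print(normalise_language_iso3([]))            # eng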
parsing/FileParsers/__init__.py

@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser

from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser_en import EuropressFileParser_en
from .EuropressFileParser_fr import EuropressFileParser_fr
# 2015-12-08: 2-in-1 parser
from .EuropressFileParser import EuropressFileParser
from .ISTex import ISTex
from .CSVParser import CSVParser
parsing/parsers_config.py

# import * via __init__.py
from .FileParsers import *

parsers = {

@@ -6,9 +7,16 @@ parsers = {

    'Scopus (RIS format)'  : RisFileParser,
    'Zotero (RIS format)'  : ZoteroFileParser,
    'Jstor (RIS format)'   : JstorFileParser,
    'Europress (French)'   : EuropressFileParser,
    'Europress (English)'  : EuropressFileParser,
    # A single entry could replace the French/English variants
    # but (TODO) we just need to check consistency:
    #   - with the DB: node_resourcetype
    #   - with admin/update_corpus.py
    #'Europress'           : EuropressFileParser,
    'Europress (French)'   : EuropressFileParser_fr,
    'Europress (English)'  : EuropressFileParser_en,
    'CSVParser'            : CSVParser,
    'ISTex'                : ISTex,
}
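A hedged sketch of how this mapping is presumably consumed elsewhere in the application; the dispatch function below is illustrative, only the parsers dict and the .parse() generator interface (seen in the __main__ blocks above) come from the repository:

from parsing.parsers_config import parsers

def parse_resource(resourcetype_name, path):
    # look up the parser class by the resource type's display name,
    # instantiate it and iterate over the hyperdata dicts it yields
    parser_class = parsers[resourcetype_name]
    for hyperdata in parser_class().parse(path):
        yield hyperdata

# e.g. (hypothetical file path):
# for doc in parse_resource('Europress (French)', '/tmp/europress_export.html'):
#     print(doc.get('title'), doc.get('publication_date'))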