humanities / gargantext · Commits

Commit fd6e272f, authored Dec 08, 2015 by delanoe
Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain

Parents: 4b38f1f2, ca537f58
Showing 2 changed files with 41 additions and 24 deletions:

parsing/FileParsers/EuropressFileParser_en.py (+37, -23)
parsing/FileParsers/EuropressFileParser_fr.py (+4, -1)
parsing/FileParsers/EuropressFileParser_en.py
@@ -54,24 +54,33 @@ class EuropressFileParser_en(FileParser):
         name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath  = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath   = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath  = "./header/div[@class='titreArticle']"
+        text_xpath   = "./section/div[@class='DocText']//p"

-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of a whole subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
             result = list()
+            # a priori a single title, or several <p>, in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # itertext is used to get every sub-element
+                # exactly once, whatever its depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
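The new scrap_text() replaces the tag-by-tag handling of paragraph_list() with lxml's itertext(). A minimal standalone sketch of its behaviour; the sample markup and variable names below are illustrative, not from the commit:

from lxml import etree

fragment = etree.fromstring(
    "<section><div class='DocText'>"
    "<p>First <b>bold</b> words.</p>"
    "<p>Second paragraph.</p>"
    "</div></section>"
)

result = []
for elem in fragment.xpath("./div[@class='DocText']//p"):
    # itertext(with_tail=True) yields each text node exactly once,
    # at any depth, including tails after nested tags like <b>
    parts = [t.strip() for t in elem.itertext(with_tail=True)]
    result.append(" ".join(p for p in parts if p != ''))

print(result)  # ['First bold words.', 'Second paragraph.']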
@@ -88,26 +97,25 @@ class EuropressFileParser_en(FileParser):
                     hyperdata['journal'] = pub_name.strip()
                 except:
                     pass

             #print(hyperdata['publication_date'])
-            try:
-                title = paragraph_list(html_article.xpath(title_xpath))
-                hyperdata['title'] = title[0]
-            except:
-                pass

             header = html_article.xpath(header_xpath)[0].text
             if header is not None:
+                # NB: in English the date contains 1 or 2 commas
+                # ex: "Tuesday, November 7, 2012"
+                # ==> in all these cases 'en' dateparser.parse
+                #     will be run on header[i:] and not header[i]
                 header = header.split(', ')
                 header = list(filter(lambda x: format_page.match(x) is None, header))
                 print(header)
                 if parse_date(header[0], 'en') is not None:
                     date = ' '.join(header[0:])
                 elif parse_date(header[1], 'en') is not None:
                     hyperdata['rubrique'] = header[0]
                     date = ' '.join(header[1:])
                 elif parse_date(header[2], 'en') is not None:
                     hyperdata['rubrique'] = header[0]
                     date = ' '.join(header[2:])
                 elif parse_date(header[3], 'en') is not None:
                     hyperdata['rubrique'] = header[0]
                     date = ' '.join(header[3:])
                 else:
                     date = '2016'
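The elif cascade scans the comma-split header for the first part that parses as a date; anything before it is treated as the section ('rubrique'). A sketch of the same scan, assuming parse_date() wraps dateparser.parse as the in-code comment suggests; the sample header string is made up:

import dateparser

def parse_date(text, lang):
    # assumption: a thin wrapper over dateparser.parse
    return dateparser.parse(text, languages=[lang])

header = "International, Tuesday, November 7, 2012".split(', ')
# -> ['International', 'Tuesday', 'November 7', '2012']

date = '2016'                          # same fallback as the commit
for i in range(min(4, len(header))):
    if parse_date(header[i], 'en') is not None:
        if i > 0:
            rubrique = header[0]       # parts before the date: section name
        date = ' '.join(header[i:])    # 'Tuesday November 7 2012'
        break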
@@ -127,10 +135,16 @@ class EuropressFileParser_en(FileParser):
                 print(hyperdata['title'])
                 print(date)

+            try:
+                title = scrap_text(html_article.xpath(title_xpath))
+                hyperdata['title'] = title[0]
+            except:
+                pass

             try:
-                text = paragraph_list(html_article.xpath(text_xpath))
-                hyperdata['abstract'] = ' '.join([' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                text = scrap_text(html_article.xpath(text_xpath))
+                hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
             except:
                 pass
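The abstract is now assembled with explicit <p>…</p> markers and newlines rather than space-joined text, with title[1:] (anything after the first title line) prepended to the body paragraphs. With hypothetical values:

title = ['Headline', 'Standfirst kept out of the title field']
text = ['First paragraph.', 'Second paragraph.']

abstract = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
# "<p>\nStandfirst kept out of the title field\n</p>\n\n<p>\nFirst paragraph.\n</p>\n..."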
parsing/FileParsers/EuropressFileParser_fr.py
@@ -143,7 +143,10 @@ class EuropressFileParser_fr(FileParser):
             try:
                 text = scrap_text(html_article.xpath(text_xpath))
-                hyperdata['abstract'] = ' '.join([p_text for p_text in title[1:] + text])
+                hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
+                # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
             except:
                 pass