Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
fd6e272f
Commit
fd6e272f
authored
Dec 08, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'romain' of
ssh://delanoe.org:1979/gargantext
into romain
parents
4b38f1f2
ca537f58
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
41 additions
and
24 deletions
+41
-24
EuropressFileParser_en.py
parsing/FileParsers/EuropressFileParser_en.py
+37
-23
EuropressFileParser_fr.py
parsing/FileParsers/EuropressFileParser_fr.py
+4
-1
No files found.
parsing/FileParsers/EuropressFileParser_en.py
View file @
fd6e272f
...
@@ -54,24 +54,33 @@ class EuropressFileParser_en(FileParser):
...
@@ -54,24 +54,33 @@ class EuropressFileParser_en(FileParser):
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
title_xpath
=
"./header/div[@class='titreArticle']
/descendant-or-self::*
"
title_xpath
=
"./header/div[@class='titreArticle']"
text_xpath
=
"./section/div[@class='DocText']/
descendant-or-self::*
"
text_xpath
=
"./section/div[@class='DocText']/
/p
"
def
paragraph_list
(
data_xpath
):
def
scrap_text
(
data_xpath
):
"""
Récupère le texte de toute arborescence
sous une liste de noeuds (par ex liste de <p>)
et renvoie une liste de string
"""
result
=
list
()
result
=
list
()
# a priori un seul titre ou plusieurs p dans data_xpath
for
elem
in
data_xpath
:
for
elem
in
data_xpath
:
if
elem
.
text
is
not
None
:
all_text
=
list
()
if
elem
.
text
.
strip
()
!=
''
:
# on utilise itertext pour avoir
if
elem
.
tag
==
'p'
:
# tous les sous éléments 1 fois
result
.
append
(
elem
.
text
)
# quelque soit la profondeur
else
:
for
sub_txt
in
elem
.
itertext
(
with_tail
=
True
)
:
if
len
(
result
)
>
0
:
sub_txt_clean
=
sub_txt
.
strip
()
result
.
append
(
result
.
pop
()
+
elem
.
text
)
if
sub_txt_clean
!=
''
:
else
:
all_text
.
append
(
sub_txt_clean
)
result
.
append
(
elem
.
text
)
result
.
append
(
" "
.
join
(
all_text
)
)
return
result
return
result
# parse all the articles, one by one
# parse all the articles, one by one
try
:
try
:
for
html_article
in
html_articles
:
for
html_article
in
html_articles
:
...
@@ -88,26 +97,25 @@ class EuropressFileParser_en(FileParser):
...
@@ -88,26 +97,25 @@ class EuropressFileParser_en(FileParser):
hyperdata
[
'journal'
]
=
pub_name
.
strip
()
hyperdata
[
'journal'
]
=
pub_name
.
strip
()
except
:
except
:
pass
pass
#print(hyperdata['publication_date'])
try
:
title
=
paragraph_list
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'title'
]
=
title
[
0
]
except
:
pass
header
=
html_article
.
xpath
(
header_xpath
)[
0
]
.
text
header
=
html_article
.
xpath
(
header_xpath
)[
0
]
.
text
if
header
is
not
None
:
if
header
is
not
None
:
# attention en anglais la date contient 1 ou 2 virgules
# ex: "Tuesday, November 7, 2012"
# ==> dans tous ces cas 'en' dateparser.parse
# sera lancé sur header[i:] et non header[i]
header
=
header
.
split
(
', '
)
header
=
header
.
split
(
', '
)
header
=
list
(
filter
(
lambda
x
:
format_page
.
match
(
x
)
is
None
,
header
))
header
=
list
(
filter
(
lambda
x
:
format_page
.
match
(
x
)
is
None
,
header
))
print
(
header
)
if
parse_date
(
header
[
0
],
'en'
)
is
not
None
:
if
parse_date
(
header
[
0
],
'en'
)
is
not
None
:
date
=
' '
.
join
(
header
[
0
:])
date
=
' '
.
join
(
header
[
0
:])
elif
parse_date
(
header
[
1
],
'en'
)
is
not
None
:
elif
parse_date
(
header
[
1
],
'en'
)
is
not
None
:
hyperdata
[
'rubrique'
]
=
header
[
0
]
date
=
' '
.
join
(
header
[
1
:])
date
=
' '
.
join
(
header
[
1
:])
elif
parse_date
(
header
[
2
],
'en'
)
is
not
None
:
elif
parse_date
(
header
[
2
],
'en'
)
is
not
None
:
hyperdata
[
'rubrique'
]
=
header
[
0
]
date
=
' '
.
join
(
header
[
2
:])
date
=
' '
.
join
(
header
[
2
:])
elif
parse_date
(
header
[
3
],
'en'
)
is
not
None
:
elif
parse_date
(
header
[
3
],
'en'
)
is
not
None
:
hyperdata
[
'rubrique'
]
=
header
[
0
]
date
=
' '
.
join
(
header
[
3
:])
date
=
' '
.
join
(
header
[
3
:])
else
:
else
:
date
=
'2016'
date
=
'2016'
...
@@ -127,10 +135,16 @@ class EuropressFileParser_en(FileParser):
...
@@ -127,10 +135,16 @@ class EuropressFileParser_en(FileParser):
print
(
hyperdata
[
'title'
])
print
(
hyperdata
[
'title'
])
print
(
date
)
print
(
date
)
try
:
title
=
scrap_text
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'title'
]
=
title
[
0
]
except
:
pass
try
:
try
:
text
=
paragraph_list
(
html_article
.
xpath
(
text_xpath
))
text
=
scrap_text
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
' <p> '
+
p
+
' </p> '
for
p
in
title
[
1
:]
+
text
])
hyperdata
[
'abstract'
]
=
'
\n
'
.
join
([
'<p>
\n
'
+
p_text
+
'</p>
\n
'
for
p_text
in
title
[
1
:]
+
text
])
except
:
except
:
pass
pass
...
...
parsing/FileParsers/EuropressFileParser_fr.py
View file @
fd6e272f
...
@@ -143,7 +143,10 @@ class EuropressFileParser_fr(FileParser):
...
@@ -143,7 +143,10 @@ class EuropressFileParser_fr(FileParser):
try
:
try
:
text
=
scrap_text
(
html_article
.
xpath
(
text_xpath
))
text
=
scrap_text
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
p_text
for
p_text
in
title
[
1
:]
+
text
])
hyperdata
[
'abstract'
]
=
'
\n
'
.
join
([
'<p>
\n
'
+
p_text
+
'</p>
\n
'
for
p_text
in
title
[
1
:]
+
text
])
# join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except
:
except
:
pass
pass
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment