Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
4b38f1f2
Commit
4b38f1f2
authored
Dec 05, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain
parents
d1049a2e
f0cdf7d4
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
30 additions
and
20 deletions
+30
-20
README.rst
init/README.rst
+3
-3
2-requirements.txt
init/install/2-requirements.txt
+2
-2
EuropressFileParser_fr.py
parsing/FileParsers/EuropressFileParser_fr.py
+25
-15
No files found.
init/README.rst
View file @
4b38f1f2
...
...
@@ -25,7 +25,7 @@ Install the requirements
3) Type: source [your virtual environment directory]/bin/activate
4) sudo chown -R user:user /srv/gargantext_env
pip install -r /srv/gargantext/init/requirements.txt
pip install -r /srv/gargantext/init/install/2-requirements.txt
5) Type: deactivate
...
...
@@ -73,7 +73,7 @@ Last steps of configuration
rm gargantext_lib.tar.bz2
3) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
/srv/gargantext/manage.py shell < /srv/gargantext/init.py
4) patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
...
...
@@ -89,7 +89,7 @@ Last steps of configuration
Start Turbo parser server
-------------------------
See dependences in init/dependences.sh
See README for install instructions /srv/gargantext/parsing/Taggers/nlpserver/README.rst
See README for install instructions /srv/gargantext/parsing/Taggers/lib/nlpserver/README.rst
Start the Python Notebook server
...
...
init/install/2-requirements.txt
View file @
4b38f1f2
...
...
@@ -17,6 +17,7 @@ certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
cryptography==0.6
dateparser==0.3.0
decorator==3.4.0
django-autoslug==1.7.2
django-autoslug-field==0.2.3
...
...
@@ -39,9 +40,8 @@ ipython==2.2.0
jedi==0.9.0
kombu==3.0.24
lxml==3.4.1
matplotlib==1.4.0
networkx==1.9
nltk==3.0a4
nltk==3.1
nose==1.3.4
numpy==1.8.2
pandas==0.14.1
...
...
parsing/FileParsers/EuropressFileParser_fr.py
View file @
4b38f1f2
...
...
@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
from
admin.utils
import
PrintException
class
EuropressFileParser_fr
(
FileParser
):
def
_parse
(
self
,
file
):
localeEncoding
=
"fr_FR"
...
...
@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
title_xpath
=
"./header/div[@class='titreArticle']
/descendant-or-self::*
"
text_xpath
=
"./section/div[@class='DocText']/d
escendant-or-self::*
"
title_xpath
=
"./header/div[@class='titreArticle']"
text_xpath
=
"./section/div[@class='DocText']/d
iv[@class='docOcurrContainer']/p
"
def
paragraph_list
(
data_xpath
):
def
scrap_text
(
data_xpath
):
"""
Récupère le texte de toute arborescence
sous une liste de noeuds (par ex liste de <p>)
et renvoie une liste de string
"""
result
=
list
()
# a priori un seul titre ou plusieurs p dans data_xpath
for
elem
in
data_xpath
:
if
elem
.
text
is
not
None
:
if
elem
.
text
.
strip
()
!=
''
:
if
elem
.
tag
==
'p'
:
result
.
append
(
elem
.
text
)
else
:
if
len
(
result
)
>
0
:
result
.
append
(
result
.
pop
()
+
elem
.
text
)
else
:
result
.
append
(
elem
.
text
)
all_text
=
list
()
# on utilise itertext pour avoir
# tous les sous éléments 1 fois
# quelque soit la profondeur
for
sub_txt
in
elem
.
itertext
(
with_tail
=
True
)
:
sub_txt_clean
=
sub_txt
.
strip
()
if
sub_txt_clean
!=
''
:
all_text
.
append
(
sub_txt_clean
)
result
.
append
(
" "
.
join
(
all_text
)
)
return
result
# parse all the articles, one by one
try
:
for
html_article
in
html_articles
:
...
...
@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
#print(hyperdata['publication_date'])
try
:
title
=
paragraph_lis
t
(
html_article
.
xpath
(
title_xpath
))
title
=
scrap_tex
t
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'title'
]
=
title
[
0
]
except
:
pass
try
:
text
=
paragraph_lis
t
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
' <p> '
+
p
+
' </p> '
for
p
in
title
[
1
:]
+
text
])
text
=
scrap_tex
t
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
p_text
for
p_text
in
title
[
1
:]
+
text
])
except
:
pass
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment