Commit 4b38f1f2 authored by delanoe's avatar delanoe

Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain

parents d1049a2e f0cdf7d4
......@@ -25,7 +25,7 @@ Install the requirements
3) Type: source [your virtual environment directory]/bin/activate
4) sudo chown -R user:user /srv/gargantext_env
pip install -r /srv/gargantext/init/requirements.txt
pip install -r /srv/gargantext/init/install/2-requirements.txt
5) Type: deactivate
......@@ -73,7 +73,7 @@ Last steps of configuration
rm gargantext_lib.tar.bz2
3) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
/srv/gargantext/manage.py shell < /srv/gargantext/init.py
4) patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
......@@ -89,7 +89,7 @@ Last steps of configuration
Start Turbo parser server
-------------------------
See dependencies in init/dependences.sh
See README for install instructions /srv/gargantext/parsing/Taggers/nlpserver/README.rst
See README for install instructions /srv/gargantext/parsing/Taggers/lib/nlpserver/README.rst
Start the Python Notebook server
......
......@@ -17,6 +17,7 @@ certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
cryptography==0.6
dateparser==0.3.0
decorator==3.4.0
django-autoslug==1.7.2
django-autoslug-field==0.2.3
......@@ -39,9 +40,8 @@ ipython==2.2.0
jedi==0.9.0
kombu==3.0.24
lxml==3.4.1
matplotlib==1.4.0
networkx==1.9
nltk==3.0a4
nltk==3.1
nose==1.3.4
numpy==1.8.2
pandas==0.14.1
......
......@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
......@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
title_xpath = "./header/div[@class='titreArticle']"
text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"
def paragraph_list(data_xpath):
def scrap_text(data_xpath):
"""
Récupère le texte de toute arborescence
sous une liste de noeuds (par ex liste de <p>)
et renvoie une liste de string
"""
result = list()
# a priori un seul titre ou plusieurs p dans data_xpath
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
all_text = list()
# on utilise itertext pour avoir
# tous les sous éléments 1 fois
# quelque soit la profondeur
for sub_txt in elem.itertext(with_tail=True):
sub_txt_clean = sub_txt.strip()
if sub_txt_clean != '':
all_text.append(sub_txt_clean)
result.append(" ".join(all_text))
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
......@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
title = scrap_text(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
text = scrap_text(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
except:
pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment