Commit 4b38f1f2 authored by delanoe

Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain

parents d1049a2e f0cdf7d4
@@ -25,7 +25,7 @@ Install the requirements
 3) Type: source [your virtual environment directory]/bin/activate
 4) sudo chown -R user:user /srv/gargantext_env
-pip install -r /srv/gargantext/init/requirements.txt
+pip install -r /srv/gargantext/init/install/2-requirements.txt
 5) Type: deactivate
@@ -73,7 +73,7 @@ Last steps of configuration
 rm gargantext_lib.tar.bz2
 3) init nodetypes and main variables
-/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
+/srv/gargantext/manage.py shell < /srv/gargantext/init.py
 4) patch CTE:
 patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
@@ -89,7 +89,7 @@ Last steps of configuration
 Start Turbo parser server
 -------------------------
 See dependences in init/dependences.sh
-See README for install instructions /srv/gargantext/parsing/Taggers/nlpserver/README.rst
+See README for install instructions /srv/gargantext/parsing/Taggers/lib/nlpserver/README.rst
 Start the Python Notebook server
...
@@ -17,6 +17,7 @@ certifi==14.05.14
 cffi==0.8.6
 chardet==2.3.0
 cryptography==0.6
+dateparser==0.3.0
 decorator==3.4.0
 django-autoslug==1.7.2
 django-autoslug-field==0.2.3
@@ -39,9 +40,8 @@ ipython==2.2.0
 jedi==0.9.0
 kombu==3.0.24
 lxml==3.4.1
-matplotlib==1.4.0
 networkx==1.9
-nltk==3.0a4
+nltk==3.1
 nose==1.3.4
 numpy==1.8.2
 pandas==0.14.1
...
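Note on the new dateparser dependency: presumably it was added to parse free-form French publication dates in the Europress articles; that use is an assumption, not stated anywhere in this commit. A minimal sketch of the library's top-level API:

    import dateparser

    # The input language is detected automatically; parse() returns a
    # datetime.datetime, or None when the string cannot be interpreted.
    print(dateparser.parse("11 juin 2015"))  # 2015-06-11 00:00:00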
@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException
 class EuropressFileParser_fr(FileParser):
     def _parse(self, file):
         localeEncoding = "fr_FR"
@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
         name_xpath = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath = "./header/div[@class='titreArticle']"
+        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"
-        def paragraph_list(data_xpath):
-            result = list()
-            for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
-            return result
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of an entire subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
+            result = list()
+            # normally a single title, or several <p>, in data_xpath
+            for elem in data_xpath:
+                all_text = list()
+                # itertext is used to get every
+                # sub-element exactly once,
+                # whatever its depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
+            return result
         # parse all the articles, one by one
         try:
             for html_article in html_articles:
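Why the rewrite matters: the old paragraph_list() read only elem.text, which stops at the first child tag, so any text nested inside inline markup was silently dropped; itertext() walks every descendant text node once, in document order. A minimal standalone sketch (the sample HTML is invented here, not taken from Europress):

    from lxml import html

    fragment = html.fromstring("<div class='titreArticle'>Le <b>titre</b> de l'article</div>")

    # .text stops at the first child element
    print(fragment.text)  # "Le "

    # itertext() yields every text node, whatever its depth
    print(" ".join(t.strip() for t in fragment.itertext(with_tail=True) if t.strip()))
    # "Le titre de l'article"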
@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
             #print(hyperdata['publication_date'])
             try:
-                title = paragraph_list(html_article.xpath(title_xpath))
+                title = scrap_text(html_article.xpath(title_xpath))
                 hyperdata['title'] = title[0]
             except:
                 pass
             try:
-                text = paragraph_list(html_article.xpath(text_xpath))
-                hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
+                text = scrap_text(html_article.xpath(text_xpath))
+                hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
             except:
                 pass
...
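Taken together, the narrower xpaths plus scrap_text() yield a plain-text abstract with no injected <p> markup. A hedged end-to-end sketch (the sample article HTML is invented here; scrap_text is abridged from the diff above):

    from lxml import html

    article = html.fromstring("""
    <article>
      <header><div class="titreArticle">Un <em>titre</em></div></header>
      <section><div class="DocText"><div class="docOcurrContainer">
        <p><b>Premier</b> paragraphe.</p>
        <p>Second paragraphe.</p>
      </div></div></section>
    </article>""")

    def scrap_text(data_xpath):
        # one joined string per matched node, nested markup included
        return [" ".join(t.strip() for t in elem.itertext(with_tail=True) if t.strip())
                for elem in data_xpath]

    title = scrap_text(article.xpath("./header/div[@class='titreArticle']"))
    text = scrap_text(article.xpath("./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"))
    print(title[0])                                           # "Un titre"
    print(' '.join([p_text for p_text in title[1:] + text]))  # "Premier paragraphe. Second paragraphe."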