Commit 4b38f1f2 authored by delanoe's avatar delanoe

Merge branch 'romain' of ssh://delanoe.org:1979/gargantext into romain

parents d1049a2e f0cdf7d4
......@@ -25,7 +25,7 @@ Install the requirements
3) Type: source [your virtual environment directory]/bin/activate
4) sudo chown -R user:user /srv/gargantext_env
pip install -r /srv/gargantext/init/requirements.txt
pip install -r /srv/gargantext/init/install/2-requirements.txt
5) Type: deactivate
......@@ -73,7 +73,7 @@ Last steps of configuration
rm gargantext_lib.tar.bz2
3) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
/srv/gargantext/manage.py shell < /srv/gargantext/init.py
4) patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
......@@ -89,7 +89,7 @@ Last steps of configuration
Start Turbo parser server
-------------------------
See dependencies in init/dependences.sh
See README for install instructions /srv/gargantext/parsing/Taggers/nlpserver/README.rst
See README for install instructions /srv/gargantext/parsing/Taggers/lib/nlpserver/README.rst
Start the Python Notebook server
......
......@@ -17,6 +17,7 @@ certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
cryptography==0.6
dateparser==0.3.0
decorator==3.4.0
django-autoslug==1.7.2
django-autoslug-field==0.2.3
......@@ -39,9 +40,8 @@ ipython==2.2.0
jedi==0.9.0
kombu==3.0.24
lxml==3.4.1
matplotlib==1.4.0
networkx==1.9
nltk==3.0a4
nltk==3.1
nose==1.3.4
numpy==1.8.2
pandas==0.14.1
......
......@@ -22,6 +22,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
......@@ -54,24 +55,33 @@ class EuropressFileParser_fr(FileParser):
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
title_xpath = "./header/div[@class='titreArticle']"
text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"
def paragraph_list(data_xpath):
def scrap_text(data_xpath):
"""
Récupère le texte de toute arborescence
sous une liste de noeuds (par ex liste de <p>)
et renvoie une liste de string
"""
result = list()
# a priori un seul titre ou plusieurs p dans data_xpath
for elem in data_xpath:
if elem.text is not None:
if elem.text.strip() != '':
if elem.tag == 'p':
result.append(elem.text)
else:
if len(result) > 0:
result.append(result.pop() + elem.text)
else:
result.append(elem.text)
all_text = list()
# on utilise itertext pour avoir
# tous les sous éléments 1 fois
# quelque soit la profondeur
for sub_txt in elem.itertext(with_tail=True):
sub_txt_clean = sub_txt.strip()
if sub_txt_clean != '':
all_text.append(sub_txt_clean)
result.append(" ".join(all_text))
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
......@@ -126,14 +136,14 @@ class EuropressFileParser_fr(FileParser):
#print(hyperdata['publication_date'])
try:
title = paragraph_list(html_article.xpath(title_xpath))
title = scrap_text(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = paragraph_list(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
text = scrap_text(html_article.xpath(text_xpath))
hyperdata['abstract'] = ' '.join([ p_text for p_text in title[1:] + text])
except:
pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment