Merge remote-tracking branch 'origin/romain-testing' into testing

24b52471 · delanoe · d1c00c07 · 309a940e · 24b52471 · 24b52471
Commit 24b52471 authored Sep 13, 2016 by delanoe
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 15 additions and 9 deletions

EUROPRESSE.py gargantext/util/parsers/EUROPRESSE.py +7 -4

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +8 -5

No files found.
--- a/gargantext/util/parsers/EUROPRESSE.py
+++ b/gargantext/util/parsers/EUROPRESSE.py
@@ -70,7 +70,7 @@ class EuropresseParser(Parser):
        except Exception as error:
            html_articles = None
-            print ("Europresse lxml error:", error)
+            print ("Europresse lxml error:", str(error))
        # all except detail_header are mandatory to parse the article
        name_xpath  = "./header/div/span[@class = 'DocPublicationName']"
@@ -113,7 +113,10 @@ class EuropresseParser(Parser):
        # parse all the articles, one by one
-        if html_articles is not None:
+        if html_articles is None:
+            filename = file.name if hasattr(file, 'name') else 'unknown file'
+            print("WARNING: europresse (skip) 1 file with no parsable content: " + filename)
+        else:
            for html_article in html_articles:
                try:
                    # s'il n'y a pas du tout de header on doit skip

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -98,9 +98,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                                tokens = tuple(normalize_forms(token[0]) for token in ngram)
                                if do_subngrams:
                                    # ex tokens = ["very", "cool", "exemple"]
-                                    #    subterms = [['very', 'cool'],
+                                    #    subterms = [['very', 'cool'],...]
-                                    #                ['very', 'cool', 'exemple'],
-                                    #                ['cool', 'exemple']]
                                    subterms = subsequences(tokens)
                                else:
@@ -108,11 +106,16 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                                for seqterm in subterms:
                                    ngram = ' '.join(seqterm)
-                                    if len(ngram) > 1:
+                                    nbwords = len(seqterm)
+                                    nbchars = len(ngram)
+                                    if nbchars > 1:
+                                        if nbchars > 255:
+                                            # max ngram length (DB constraint)
+                                            ngram = ngram[:255]
                                        # doc <=> ngram index
                                        nodes_ngrams_count[(document.id, ngram)] += 1
                                        # add fields :   terms          n
-                                        ngrams_data.add((ngram[:255], len(seqterm), ))
+                                        ngrams_data.add((ngram, nbwords, ))
                        except:
                            #value not in doc
                            continue