add subsequences of long terms in the counts/indexation + remove debug log

826d11ad · Romain Loth · 9533d10a · 826d11ad · 826d11ad · 826d11ad
Commit 826d11ad authored Jun 01, 2016 by Romain Loth
Showing with 57 additions and 7 deletions

constants.py gargantext/constants.py +11 -0

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +44 -5

NGrams_dyna_chart_and_table.js static/lib/gargantext/NGrams_dyna_chart_and_table.js +2 -2

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -233,6 +233,17 @@ DEFAULT_ALL_LOWERCASE_FLAG      = True       # lowercase ngrams before recording
                                             #  good for variants like same term
                                             #  occurring at sentence beginning)

+DEFAULT_INDEX_SUBGRAMS         = False       # False <=> count only the
+                                             #  extracted ngram itself
+                                             # True  <=> when ngram is like:
+                                             #  "very cool example"
+                                             #  then also count:
+                                             #  "very cool" and "cool example"
+                                             #  (n-1 length ngrams, at initial
+                                             #   indexing after extraction)
+
+
+
 # ------------------------------------------------------------------------------

 # other parameters

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -33,7 +33,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
    db.commit()


-def extract_ngrams(corpus, keys=('title', 'abstract', )):
+def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
    """Extract ngrams for every document below the given corpus.
    Default language is given by the resource type.
    The result is then inserted into database.
@@ -75,10 +75,25 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
                # get ngrams
                for ngram in ngramsextractor.extract(value):
                    tokens = tuple(token[0] for token in ngram)
-                    terms = normalize_terms(' '.join(tokens))
-                    if len(terms) > 1:
-                        nodes_ngrams_count[(document.id, terms)] += 1
-                        ngrams_data.add((terms[:255], len(tokens), ))
+
+                    if do_subngrams:
+                        # ex tokens = ["very", "cool", "exemple"]
+                        #    subterms = [['very', 'cool'],
+                        #                ['very', 'cool', 'exemple'],
+                        #                ['cool', 'exemple']]
+
+                        subterms = subsequences(tokens)
+                    else:
+                        subterms = [tokens]
+
+                    for seqterm in subterms:
+                        ngram = normalize_term(' '.join(seqterm))
+                        if len(ngram) > 1:
+                            # doc <=> ngram index
+                            nodes_ngrams_count[(document.id, ngram)] += 1
+                            # add fields :   terms          n
+                            ngrams_data.add((ngram[:255], len(seqterm), ))
+
            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
@@ -116,3 +131,27 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
        term_str = term_str.lower()

    return term_str
+
+
+def subsequences(sequence):
+    """
+    For a sequence of length n, return a list holding the sequence itself
+    and its two contiguous subsequences of length n-1 (for n == 1: itself only)
+
+    Ex: subsequences(["Aaa","Bbb", "Ccc", "Ddd"])
+        [
+            ['Aaa', 'Bbb', 'Ccc'],
+            ['Aaa', 'Bbb', 'Ccc', 'Ddd'],
+                   ['Bbb', 'Ccc', 'Ddd']
+         ]
+    """
+    l = len(sequence)
+    li = []
+    lsave = li.append
+    for i in range(l):
+        for j in range(i+(l-1),l+1):
+            if i != j:
+                lsave(sequence[i:j])
+                # (the i != j guard above only skips the empty slice
+                #  sequence[0:0], which arises when l == 1)
+    return li
--- a/static/lib/gargantext/NGrams_dyna_chart_and_table.js
+++ b/static/lib/gargantext/NGrams_dyna_chart_and_table.js
@@ -1325,8 +1325,8 @@ function InferCRUDFlags(id, oldState, desiredState, registry) {
            }
        }
    }
-    console.log("registry")
-    console.log(registry)
+    // console.log("registry")
+    // console.log(registry)
    return registry
 }