Extended the optional indexing of subsequences of long terms (still disabled by default)

f9e2dbd5 · Romain Loth · 3ff6aa00 · f9e2dbd5 · f9e2dbd5
Commit f9e2dbd5 authored Jun 03, 2016 by Romain Loth
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 12 deletions

constants.py gargantext/constants.py +6 -5

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +14 -7

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -239,14 +239,15 @@ DEFAULT_ALL_LOWERCASE_FLAG      = True       # lowercase ngrams before recording
                                             #  good for variants like same term
                                             #  occurring at sentence beginning)
-DEFAULT_INDEX_SUBGRAMS         = False       # False <=> traditional
+DEFAULT_INDEX_SUBGRAMS         = False        # False <=> traditional
                                             # True  <=>
                                             #  when ngram is like:
                                             #  "very cool example"
-                                             #  then also count:
+                                             #  then also count: "very", "cool",
-                                             #  "very cool" and "cool example"
+                                             #  "example", "very cool" and
-                                             #  (n-1 length ngrams, at initial
+                                             #  "cool example".
-                                             #   indexing after extraction)
+                                             #   (all 1 to n-1 length ngrams,
+                                             #    at indexing after extraction)

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -87,7 +87,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                        subterms = [tokens]
                    for seqterm in subterms:
-                        ngram = normalize_term(' '.join(seqterm))
+                        ngram = normalize_terms(' '.join(seqterm))
                        if len(ngram) > 1:
                            # doc <=> ngram index
                            nodes_ngrams_count[(document.id, ngram)] += 1
@@ -136,20 +136,27 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
 def subsequences(sequence):
    """
    For an array of length n, returns an array of subarrays
-    with the original and all its sub arrays of length n-1
+    with the original and all its sub arrays of lengths 1 to n-1
-    Ex: subsequences(["Aaa","Bbb", "Ccc", "Ddd"])
+    Ex: subsequences(['Aa','Bb','Cc','Dd'])
        [
-            ['Aaa', 'Bbb', 'Ccc'],
+            ['Aa'],
-            ['Aaa', 'Bbb', 'Ccc', 'Ddd'],
+            ['Aa', 'Bb'],
-                   ['Bbb', 'Ccc', 'Ddd']
+            ['Aa', 'Bb', 'Cc'],
+            ['Aa', 'Bb', 'Cc', 'Dd'],
+            ['Bb'],
+            ['Bb', 'Cc'],
+            ['Bb', 'Cc', 'Dd'],
+            ['Cc'],
+            ['Cc', 'Dd'],
+            ['Dd']
         ]
    """
    l = len(sequence)
    li = []
    lsave = li.append
    for i in range(l):
-        for j in range(i+(l-1),l+1):
+        for j in range(i+1,l+1):
            if i != j:
                lsave(sequence[i:j])
                # debug