Dedent

13d5ba29 · sim · d6816ea5 · 13d5ba29
Commit 13d5ba29 authored Sep 08, 2017 by sim
Hide whitespace changes
Inline Side-by-side

Showing with 37 additions and 37 deletions

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +37 -37

No files found.
--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -81,44 +81,44 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                    corpus.hyperdata["skipped_docs"].append(document.id)
                    corpus.save_hyperdata()
                    continue
-                else:
-                    # ready !
-                    tagger = tagger_bots[language_iso2]
-
-                    # to do verify if document has no KEYS to index
-                    # eg: use set intersect (+ loop becomes direct! with no continue)
-                    for key in keys:
-                        try:
-                            value = document.hyperdata[str(key)]
-                            if not isinstance(value, str):
-                                #print("DBG wrong content in doc for key", key)
-                                continue
-                                # get ngrams
-                            for ngram in tagger.extract(value):
-                                tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                                if do_subngrams:
-                                    # ex tokens = ["very", "cool", "exemple"]
-                                    #    subterms = [['very', 'cool'],...]
-
-                                    subterms = subsequences(tokens)
-                                else:
-                                    subterms = [tokens]
-
-                                for seqterm in subterms:
-                                    ngram = ' '.join(seqterm)
-                                    nbwords = len(seqterm)
-                                    nbchars = len(ngram)
-                                    if nbchars > 1:
-                                        if nbchars > 255:
-                                            # max ngram length (DB constraint)
-                                            ngram = ngram[:255]
-                                        # doc <=> ngram index
-                                        nodes_ngrams_count[(document.id, ngram)] += 1
-                                        # add fields :   terms          n
-                                        ngrams_data.add((ngram, nbwords, ))
-                        except:
-                            #value not in doc
+
+                # ready !
+                tagger = tagger_bots[language_iso2]
+
+                # to do verify if document has no KEYS to index
+                # eg: use set intersect (+ loop becomes direct! with no continue)
+                for key in keys:
+                    try:
+                        value = document.hyperdata[str(key)]
+                        if not isinstance(value, str):
+                            #print("DBG wrong content in doc for key", key)
                            continue
+                            # get ngrams
+                        for ngram in tagger.extract(value):
+                            tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                            if do_subngrams:
+                                # ex tokens = ["very", "cool", "exemple"]
+                                #    subterms = [['very', 'cool'],...]
+
+                                subterms = subsequences(tokens)
+                            else:
+                                subterms = [tokens]
+
+                            for seqterm in subterms:
+                                ngram = ' '.join(seqterm)
+                                nbwords = len(seqterm)
+                                nbchars = len(ngram)
+                                if nbchars > 1:
+                                    if nbchars > 255:
+                                        # max ngram length (DB constraint)
+                                        ngram = ngram[:255]
+                                    # doc <=> ngram index
+                                    nodes_ngrams_count[(document.id, ngram)] += 1
+                                    # add fields :   terms          n
+                                    ngrams_data.add((ngram, nbwords, ))
+                    except:
+                        #value not in doc
+                        continue

            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE: