Commit 74693a64 authored by Romain Loth

fix occasional bulk_insert_ifnotexists bugs with a better term normalization sequence, and reduce the frequency of the integrate step (less DB effort, but slightly more RAM)
parent 817f2ee3
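One way such occasional bugs can arise, as a toy sketch (a dict stands in for the ngrams table; insert_ifnotexists here is an illustrative stand-in, not the project's DB layer): if the same surface form is normalized in one code path but not in another, the existence check misses the stored row and a near-duplicate gets written.

    from gargantext.util.toolchain.ngrams_extraction import normalize_forms

    store = {}                                           # term -> id, stands in for the ngrams table

    def insert_ifnotexists(term):
        if term not in store:
            store[term] = len(store) + 1
        return store[term]

    insert_ifnotexists(normalize_forms("cool term,"))    # stored under "cool term"
    insert_ifnotexists("cool term,")                     # unnormalized lookup: spurious second row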
@@ -230,10 +230,10 @@ DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
 DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .25 # quota of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2 # quota of monograms in MAPLIST
 # (vs multigrams = 1-mono)
-DEFAULT_MAPLIST_GENCLUSION_RATIO = .7 # quota of top genclusion in MAPLIST
+DEFAULT_MAPLIST_GENCLUSION_RATIO = .6 # quota of top genclusion in MAPLIST
 # (vs top specclusion = 1-gen)
 DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
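A minimal sketch of how these quotas translate into slot counts for a 350-term map list (illustrative arithmetic only; the variable names are not the project's):

    MAPLIST_MAX      = 350
    MONOGRAMS_RATIO  = .2    # share of single-word terms
    GENCLUSION_RATIO = .6    # share of terms picked by top genclusion

    n_monograms   = round(MAPLIST_MAX * MONOGRAMS_RATIO)    # 70
    n_multigrams  = MAPLIST_MAX - n_monograms                # 280 (multigrams = 1 - mono)
    n_genclusion  = round(MAPLIST_MAX * GENCLUSION_RATIO)   # 210
    n_specclusion = MAPLIST_MAX - n_genclusion               # 140 (specclusion = 1 - gen)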
@@ -279,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
-BATCH_NGRAMSEXTRACTION_SIZE = 1024
+BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
 # Scrapers config
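Assuming the extraction loop flushes its buffer to the DB once it holds this many distinct ngrams, the trade-off named in the commit message is simply a larger in-memory buffer between integrate steps. A sketch of the gating logic (not the toolchain's actual flush code; maybe_flush and integrate are illustrative names):

    BATCH_NGRAMSEXTRACTION_SIZE = 3000

    def maybe_flush(nodes_ngrams_count, integrate):
        """Flush buffered (doc_id, ngram) counts once enough *distinct* ngrams pile up."""
        distinct_ngrams = {ng for (_doc_id, ng) in nodes_ngrams_count}
        if len(distinct_ngrams) >= BATCH_NGRAMSEXTRACTION_SIZE:
            integrate(nodes_ngrams_count)   # the DB write step ("INTEGRATE" in the comment)
            nodes_ngrams_count.clear()      # fewer, bigger flushes: less DB effort, more RAM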
@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
 # import will implement the same text cleaning procedures as toolchain
 from gargantext.util.toolchain.parsing import normalize_chars
-from gargantext.util.toolchain.ngrams_extraction import normalize_terms
+from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 from sqlalchemy.sql import exists
 from os import path
@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     continue
 # get ngrams
 for ngram in ngramsextractor.extract(value):
-    tokens = tuple(token[0] for token in ngram)
+    tokens = tuple(normalize_forms(token[0]) for token in ngram)
     if do_subngrams:
         # ex tokens = ["very", "cool", "exemple"]
@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     subterms = [tokens]
 for seqterm in subterms:
-    ngram = normalize_terms(' '.join(seqterm))
+    ngram = ' '.join(seqterm)
     if len(ngram) > 1:
         # doc <=> ngram index
         nodes_ngrams_count[(document.id, ngram)] += 1
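Net effect of these two hunks: normalization now runs once per extracted token, and the joined subterm is used as-is instead of being re-normalized. A self-contained sketch of the new sequence (the tagged tokens, doc id and the do_subngrams=False branch are illustrative):

    from collections import defaultdict
    from gargantext.util.toolchain.ngrams_extraction import normalize_forms

    nodes_ngrams_count = defaultdict(int)
    doc_id = 42                                     # hypothetical document id
    tagged = [("Big", "JJ"), ("data,", "NN")]       # what the POS extractor might yield

    tokens = tuple(normalize_forms(tok) for tok, _tag in tagged)   # ('big', 'data') if lowercasing is on
    for seqterm in [tokens]:                        # do_subngrams=False branch, for brevity
        ngram = ' '.join(seqterm)                   # pieces already clean, no second normalization pass
        if len(ngram) > 1:
            nodes_ngrams_count[(doc_id, ngram)] += 1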
@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     raise error
-def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     """
     Removes unwanted trailing punctuation
     AND optionally puts everything to lowercase
@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     (benefits from normalize_chars upstream so there's less cases to consider)
     """
-    # print('normalize_terms IN: "%s"' % term_str)
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
+    # print('normalize_forms IN: "%s"' % term_str)
+    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
+    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
     if do_lowercase:
         term_str = term_str.lower()
-    # print('normalize_terms OUT: "%s"' % term_str)
+    # print('normalize_forms OUT: "%s"' % term_str)
     return term_str
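Concretely, the widened character class also strips leading/trailing apostrophes and © signs. For example (outputs shown assuming DEFAULT_ALL_LOWERCASE_FLAG is True):

    normalize_forms("'Big data' ©")   # -> 'big data'
    normalize_forms("-cool term;")    # -> 'cool term'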