Commit 74693a64 authored by Romain Loth

fix bulk_insert_ifnotexists occasional bugs with better term normalization...

fix bulk_insert_ifnotexists occasional bugs with better term normalization sequence and reduce frequency of integrate step (reduce DB effort but use slightly more RAM)
parent 817f2ee3
...@@ -230,10 +230,10 @@ DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs ...@@ -230,10 +230,10 @@ DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .25 # quota of monograms in MAPLIST DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2 # quota of monograms in MAPLIST
# (vs multigrams = 1-mono) # (vs multigrams = 1-mono)
DEFAULT_MAPLIST_GENCLUSION_RATIO = .7 # quota of top genclusion in MAPLIST DEFAULT_MAPLIST_GENCLUSION_RATIO = .6 # quota of top genclusion in MAPLIST
# (vs top specclusion = 1-gen) # (vs top specclusion = 1-gen)
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
...@@ -279,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY ...@@ -279,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing... # about batch processing...
BATCH_PARSING_SIZE = 256 BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024 BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
# Scrapers config # Scrapers config
......
...@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO ...@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
# import will implement the same text cleaning procedures as toolchain # import will implement the same text cleaning procedures as toolchain
from gargantext.util.toolchain.parsing import normalize_chars from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_terms from gargantext.util.toolchain.ngrams_extraction import normalize_forms
from sqlalchemy.sql import exists from sqlalchemy.sql import exists
from os import path from os import path
......
...@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_ ...@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
continue continue
# get ngrams # get ngrams
for ngram in ngramsextractor.extract(value): for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram) tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams: if do_subngrams:
# ex tokens = ["very", "cool", "exemple"] # ex tokens = ["very", "cool", "exemple"]
...@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_ ...@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
subterms = [tokens] subterms = [tokens]
for seqterm in subterms: for seqterm in subterms:
ngram = normalize_terms(' '.join(seqterm)) ngram = ' '.join(seqterm)
if len(ngram) > 1: if len(ngram) > 1:
# doc <=> ngram index # doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1 nodes_ngrams_count[(document.id, ngram)] += 1
...@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_ ...@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
raise error raise error
def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG): def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
""" """
Removes unwanted trailing punctuation Removes unwanted trailing punctuation
AND optionally puts everything to lowercase AND optionally puts everything to lowercase
...@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG): ...@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
(benefits from normalize_chars upstream so there's less cases to consider) (benefits from normalize_chars upstream so there's less cases to consider)
""" """
# print('normalize_terms IN: "%s"' % term_str) # print('normalize_forms IN: "%s"' % term_str)
term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str) term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str) term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
if do_lowercase: if do_lowercase:
term_str = term_str.lower() term_str = term_str.lower()
# print('normalize_terms OUT: "%s"' % term_str) # print('normalize_forms OUT: "%s"' % term_str)
return term_str return term_str
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment