Commit 74693a64 authored by Romain Loth

fix occasional bulk_insert_ifnotexists bugs with a better term normalization sequence, and reduce the frequency of the integrate step (less DB effort, but slightly more RAM)
parent 817f2ee3
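One way such occasional bugs can arise, as a toy sketch (a dict stands in for the ngrams table; insert_ifnotexists here is an illustrative stand-in, not the project's DB layer): if the same surface form is normalized in one code path but not in another, the existence check misses the stored row and a near-duplicate gets written.

    from gargantext.util.toolchain.ngrams_extraction import normalize_forms

    store = {}                                           # term -> id, stands in for the ngrams table

    def insert_ifnotexists(term):
        if term not in store:
            store[term] = len(store) + 1
        return store[term]

    insert_ifnotexists(normalize_forms("cool term,"))    # stored under "cool term"
    insert_ifnotexists("cool term,")                     # unnormalized lookup: spurious second row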
@@ -230,10 +230,10 @@ DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
 DEFAULT_MAPLIST_MAX = 350 # MAPLIST maximum terms
-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .25 # quota of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2 # quota of monograms in MAPLIST
 # (vs multigrams = 1-mono)
-DEFAULT_MAPLIST_GENCLUSION_RATIO = .7 # quota of top genclusion in MAPLIST
+DEFAULT_MAPLIST_GENCLUSION_RATIO = .6 # quota of top genclusion in MAPLIST
 # (vs top specclusion = 1-gen)
 DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
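A minimal sketch of how these quotas translate into slot counts for a 350-term map list (illustrative arithmetic only; the variable names are not the project's):

    MAPLIST_MAX      = 350
    MONOGRAMS_RATIO  = .2    # share of single-word terms
    GENCLUSION_RATIO = .6    # share of terms picked by top genclusion

    n_monograms   = round(MAPLIST_MAX * MONOGRAMS_RATIO)    # 70
    n_multigrams  = MAPLIST_MAX - n_monograms                # 280 (multigrams = 1 - mono)
    n_genclusion  = round(MAPLIST_MAX * GENCLUSION_RATIO)   # 210
    n_specclusion = MAPLIST_MAX - n_genclusion               # 140 (specclusion = 1 - gen)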
@@ -279,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
-BATCH_NGRAMSEXTRACTION_SIZE = 1024
+BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
 # Scrapers config
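Assuming the extraction loop flushes its buffer to the DB once it holds this many distinct ngrams, the trade-off named in the commit message is simply a larger in-memory buffer between integrate steps. A sketch of the gating logic (not the toolchain's actual flush code; maybe_flush and integrate are illustrative names):

    BATCH_NGRAMSEXTRACTION_SIZE = 3000

    def maybe_flush(nodes_ngrams_count, integrate):
        """Flush buffered (doc_id, ngram) counts once enough *distinct* ngrams pile up."""
        distinct_ngrams = {ng for (_doc_id, ng) in nodes_ngrams_count}
        if len(distinct_ngrams) >= BATCH_NGRAMSEXTRACTION_SIZE:
            integrate(nodes_ngrams_count)   # the DB write step ("INTEGRATE" in the comment)
            nodes_ngrams_count.clear()      # fewer, bigger flushes: less DB effort, more RAM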
@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
 # import will implement the same text cleaning procedures as toolchain
 from gargantext.util.toolchain.parsing import normalize_chars
-from gargantext.util.toolchain.ngrams_extraction import normalize_terms
+from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 from sqlalchemy.sql import exists
 from os import path
@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     continue
 # get ngrams
 for ngram in ngramsextractor.extract(value):
-    tokens = tuple(token[0] for token in ngram)
+    tokens = tuple(normalize_forms(token[0]) for token in ngram)
     if do_subngrams:
         # ex tokens = ["very", "cool", "exemple"]
@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     subterms = [tokens]
 for seqterm in subterms:
-    ngram = normalize_terms(' '.join(seqterm))
+    ngram = ' '.join(seqterm)
     if len(ngram) > 1:
         # doc <=> ngram index
         nodes_ngrams_count[(document.id, ngram)] += 1
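Net effect of these two hunks: normalization now runs once per extracted token, and the joined subterm is used as-is instead of being re-normalized. A self-contained sketch of the new sequence (the tagged tokens, doc id and the do_subngrams=False branch are illustrative):

    from collections import defaultdict
    from gargantext.util.toolchain.ngrams_extraction import normalize_forms

    nodes_ngrams_count = defaultdict(int)
    doc_id = 42                                     # hypothetical document id
    tagged = [("Big", "JJ"), ("data,", "NN")]       # what the POS extractor might yield

    tokens = tuple(normalize_forms(tok) for tok, _tag in tagged)   # ('big', 'data') if lowercasing is on
    for seqterm in [tokens]:                        # do_subngrams=False branch, for brevity
        ngram = ' '.join(seqterm)                   # pieces already clean, no second normalization pass
        if len(ngram) > 1:
            nodes_ngrams_count[(doc_id, ngram)] += 1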
@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
     raise error
-def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     """
     Removes unwanted trailing punctuation
     AND optionally puts everything to lowercase
@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     (benefits from normalize_chars upstream so there's less cases to consider)
     """
-    # print('normalize_terms IN: "%s"' % term_str)
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
+    # print('normalize_forms IN: "%s"' % term_str)
+    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
+    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
     if do_lowercase:
         term_str = term_str.lower()
-    # print('normalize_terms OUT: "%s"' % term_str)
+    # print('normalize_forms OUT: "%s"' % term_str)
     return term_str
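Concretely, the widened character class also strips leading/trailing apostrophes and © signs. For example (outputs shown assuming DEFAULT_ALL_LOWERCASE_FLAG is True):

    normalize_forms("'Big data' ©")   # -> 'big data'
    normalize_forms("-cool term;")    # -> 'cool term'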