Commit 0713e2c3 authored by Romain Loth

[FIX] prevent nodes_ngrams_count.items() KeyError by forcing...

[FIX] prevent nodes_ngrams_count.items() KeyError by forcing nodes_ngrams_count to use same key as ngrams_data when key is truncated
parent 542da873
......@@ -98,9 +98,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
# subterms = [['very', 'cool'],...]
subterms = subsequences(tokens)
else:
......@@ -108,11 +106,16 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
nbwords = len(seqterm)
nbchars = len(ngram)
if nbchars > 1:
if nbchars > 255:
# max ngram length (DB constraint)
ngram = ngram[:255]
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
ngrams_data.add((ngram, nbwords, ))
except:
#value not in doc
continue
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment