Commit 0713e2c3 authored by Romain Loth

[FIX] prevent nodes_ngrams_count.items() KeyError by forcing...

[FIX] prevent nodes_ngrams_count.items() KeyError by forcing nodes_ngrams_count to use same key as ngrams_data when key is truncated
parent 542da873
...@@ -98,9 +98,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND ...@@ -98,9 +98,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
tokens = tuple(normalize_forms(token[0]) for token in ngram) tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams: if do_subngrams:
# ex tokens = ["very", "cool", "exemple"] # ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'], # subterms = [['very', 'cool'],...]
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens) subterms = subsequences(tokens)
else: else:
...@@ -108,11 +106,16 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND ...@@ -108,11 +106,16 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
for seqterm in subterms: for seqterm in subterms:
ngram = ' '.join(seqterm) ngram = ' '.join(seqterm)
if len(ngram) > 1: nbwords = len(seqterm)
nbchars = len(ngram)
if nbchars > 1:
if nbchars > 255:
# max ngram length (DB constraint)
ngram = ngram[:255]
# doc <=> ngram index # doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1 nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n # add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), )) ngrams_data.add((ngram, nbwords, ))
except: except:
#value not in doc #value not in doc
continue continue
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment