Commit 224eae66 authored by sim

[FIX] Fix bug in ngrams normalization: remove dangling spaces

parent 13d5ba29
......@@ -95,7 +95,8 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
normal_forms = (normalize_forms(t[0]) for t in ngram)
tokens = tuple(nf for nf in normal_forms if nf)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],...]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment