Commit 6cd3ec54 authored by Romain Loth

Extend sub-ngram indexing to cover all subsequences (lengths 1 to n-1) of long terms; the option remains disabled by default.

parent bc5985ce
...@@ -239,14 +239,15 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording ...@@ -239,14 +239,15 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
# good for variants like same term # good for variants like same term
#  occurring at sentence beginning) #  occurring at sentence beginning)
DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# True <=> # True <=>
# when ngram is like: # when ngram is like:
# "very cool example" # "very cool example"
# then also count: # then also count: "very", "cool"
# "very cool" and "cool example" # "example", "very cool" and
# (n-1 length ngrams, at initial # "cool example".
# indexing after extraction) # (all 1 to n-1 length ngrams,
# at indexing after extraction)
......
...@@ -87,7 +87,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_ ...@@ -87,7 +87,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
subterms = [tokens] subterms = [tokens]
for seqterm in subterms: for seqterm in subterms:
ngram = normalize_term(' '.join(seqterm)) ngram = normalize_terms(' '.join(seqterm))
if len(ngram) > 1: if len(ngram) > 1:
# doc <=> ngram index # doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1 nodes_ngrams_count[(document.id, ngram)] += 1
...@@ -136,20 +136,27 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG): ...@@ -136,20 +136,27 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
def subsequences(sequence): def subsequences(sequence):
""" """
For an array of length n, returns an array of subarrays For an array of length n, returns an array of subarrays
with the original and all its sub arrays of length n-1 with the original and all its sub arrays of lengths 1 to n-1
Ex: subsequences(["Aaa","Bbb", "Ccc", "Ddd"]) Ex: subsequences(['Aa','Bb','Cc','Dd'])
[ [
['Aaa', 'Bbb', 'Ccc'], ['Aa'],
['Aaa', 'Bbb', 'Ccc', 'Ddd'], ['Aa', 'Bb'],
['Bbb', 'Ccc', 'Ddd'] ['Aa', 'Bb', 'Cc'],
['Aa', 'Bb', 'Cc', 'Dd'],
['Bb'],
['Bb', 'Cc'],
['Bb', 'Cc', 'Dd'],
['Cc'],
['Cc', 'Dd'],
['Dd']
] ]
""" """
l = len(sequence) l = len(sequence)
li = [] li = []
lsave = li.append lsave = li.append
for i in range(l): for i in range(l):
for j in range(i+(l-1),l+1): for j in range(i+1,l+1):
if i != j: if i != j:
lsave(sequence[i:j]) lsave(sequence[i:j])
# debug # debug
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment