Commit 26493012 authored by delanoe's avatar delanoe

[FIX] Grouping is ok.

parent e6f43a8c
......@@ -96,26 +96,12 @@ def getStemmer(corpus):
print("No language found")
def stemIt(ngram):
return(list(map(lambda x: stemmer.stem(x), ngram.split(' '))).sort())
stems = list(map(lambda x: stemmer.stem(x), ngram.split(' ')))
stems.sort()
return(str(' '.join(stems)))
return(stemIt)
def equals(ngram1,ngram2, f=None):
'''
equals :: (Int,String) -> (Int,String) -> Bool
detect if two ngrams are equivalent according to a function :: String -> [String]
'''
if ngram1[0] == ngram2[0]:
# if ngrams have same id then they are the same
# and they can not be grouped
return(False)
else:
try:
return f(ngram1) == f(ngram2)
except:
return(False)
PrintException()
def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
'''
group ngrams according to a function (stemming or lemming)
......@@ -135,45 +121,60 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
miam_to_insert = set()
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
somme = sa.func.count(NodeNgram.weight)
ngrams = (session.query(Ngram.id, Ngram.terms, somme )
frequency = sa.func.count(NodeNgram.weight)
ngrams = (session.query(Ngram.id, Ngram.terms, frequency )
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
.filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
.group_by(Ngram.id)
.order_by(desc(somme))
.all()
#.limit(limit_sup)
.order_by(desc(frequency))
#.all()
.limit(limit_sup)
)
group = defaultdict(lambda : defaultdict())
#group = defaultdict(lambda : defaultdict())
ids_dict = dict()
mainform_dict = dict()
count_dict = dict()
for n in ngrams:
stem = stemIt(n[1])
maincount = group[stem].get('count', 0)
if n[2] > maincount:
group[stem]['main form'] = n[0]
group[stem]['count'] = n[2]
else:
group[stem]['ids'] = group[stem].get('ids', []) + [n[0]]
for stem in group.keys():
miam_to_insert.add((miam_node.id, group[stem]['main form'], 1))
for ngram_id in group[stem]['ids']:
group_to_insert.add((node_group.id, group[stem]['main form'], ngram_id, 1))
# TODO see here if coherent add in miam or group...
# Deleting previous groups
stem = str(stemIt(n[1]))
if stem is not None :
ids_dict[stem] = ids_dict.get(stem, []) + [n[0]]
count = count_dict.get(stem, 0)
if n[2] > count:
mainform_dict[stem] = n[0]
count_dict[stem] = n[2]
for key in mainform_dict.keys():
miam_to_insert.add((miam_node.id, mainform_dict[key], 1))
try:
ids = ids_dict[key]
if ids is not None and len(ids) > 1:
for ngram_id in ids :
if ngram_id != mainform_dict[key]:
group_to_insert.add((node_group.id, mainform_dict[key], ngram_id, 1))
except exception as e:
print(e)
print(group[stem])
#print(ids_dict[stem])
# # Deleting previous groups
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
# # Deleting previous ngrams miam list
session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
session.commit()
bulk_insert(NodeNgramNgram
, ('node_id', 'ngramx_id', 'ngramy_id', 'score')
, [data for data in group_to_insert])
for n in group_to_insert:
#print(n)
miam_to_insert.add((miam_node.id, n[1], 1))
# Deleting previous ngrams miam list
session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment