Commit 26493012 authored by delanoe

[FIX] Grouping is ok.

parent e6f43a8c
...@@ -96,26 +96,12 @@ def getStemmer(corpus): ...@@ -96,26 +96,12 @@ def getStemmer(corpus):
print("No language found") print("No language found")
def stemIt(ngram): def stemIt(ngram):
return(list(map(lambda x: stemmer.stem(x), ngram.split(' '))).sort()) stems = list(map(lambda x: stemmer.stem(x), ngram.split(' ')))
stems.sort()
return(str(' '.join(stems)))
return(stemIt) return(stemIt)
def equals(ngram1, ngram2, f=None):
    '''
    equals :: (Int,String) -> (Int,String) -> Bool
    Tell whether two distinct ngrams are equivalent according to ``f``.

    ngram1, ngram2 -- indexable records whose element 0 is the ngram id.
    f              -- callable applied to each whole record (not only to its
                      string part); the two results are compared with ``==``.

    Returns False when both records carry the same id, and also when ``f``
    cannot be applied (for instance when it is left to None or raises);
    otherwise returns the result of ``f(ngram1) == f(ngram2)``.
    '''
    if ngram1[0] == ngram2[0]:
        # Same id means this is one single ngram: it can not be grouped
        # with itself.
        return False

    try:
        return f(ngram1) == f(ngram2)
    except Exception:
        # Best-effort comparison: any failure inside f counts as
        # "not equivalent". Only Exception is caught so KeyboardInterrupt
        # and SystemExit still propagate. The former call placed after the
        # return statement was unreachable and has been dropped.
        return False
def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'): def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
''' '''
group ngrams according to a function (stemming or lemming) group ngrams according to a function (stemming or lemming)
...@@ -135,45 +121,60 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'): ...@@ -135,45 +121,60 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
miam_to_insert = set() miam_to_insert = set()
miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus) miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
somme = sa.func.count(NodeNgram.weight) frequency = sa.func.count(NodeNgram.weight)
ngrams = (session.query(Ngram.id, Ngram.terms, somme ) ngrams = (session.query(Ngram.id, Ngram.terms, frequency )
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id) .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id) .join(Node, Node.id == NodeNgram.node_id)
.filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id) .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
.group_by(Ngram.id) .group_by(Ngram.id)
.order_by(desc(somme)) .order_by(desc(frequency))
.all() #.all()
#.limit(limit_sup) .limit(limit_sup)
) )
group = defaultdict(lambda : defaultdict()) #group = defaultdict(lambda : defaultdict())
ids_dict = dict()
mainform_dict = dict()
count_dict = dict()
for n in ngrams: for n in ngrams:
stem = stemIt(n[1]) stem = str(stemIt(n[1]))
maincount = group[stem].get('count', 0)
if n[2] > maincount: if stem is not None :
group[stem]['main form'] = n[0]
group[stem]['count'] = n[2] ids_dict[stem] = ids_dict.get(stem, []) + [n[0]]
else:
group[stem]['ids'] = group[stem].get('ids', []) + [n[0]] count = count_dict.get(stem, 0)
if n[2] > count:
mainform_dict[stem] = n[0]
count_dict[stem] = n[2]
for stem in group.keys():
miam_to_insert.add((miam_node.id, group[stem]['main form'], 1))
for ngram_id in group[stem]['ids']:
group_to_insert.add((node_group.id, group[stem]['main form'], ngram_id, 1))
# TODO see here if coherent add in miam or group... for key in mainform_dict.keys():
# Deleting previous groups miam_to_insert.add((miam_node.id, mainform_dict[key], 1))
try:
ids = ids_dict[key]
if ids is not None and len(ids) > 1:
for ngram_id in ids :
if ngram_id != mainform_dict[key]:
group_to_insert.add((node_group.id, mainform_dict[key], ngram_id, 1))
except exception as e:
print(e)
print(group[stem])
#print(ids_dict[stem])
# # Deleting previous groups
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete() session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
# # Deleting previous ngrams miam list
session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
session.commit()
bulk_insert(NodeNgramNgram bulk_insert(NodeNgramNgram
, ('node_id', 'ngramx_id', 'ngramy_id', 'score') , ('node_id', 'ngramx_id', 'ngramy_id', 'score')
, [data for data in group_to_insert]) , [data for data in group_to_insert])
for n in group_to_insert:
#print(n)
miam_to_insert.add((miam_node.id, n[1], 1))
# Deleting previous ngrams miam list
session.query(NodeNgram).filter(NodeNgram.node_id == miam_node.id).delete()
bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)]) bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), [data for data in list(miam_to_insert)])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment