Commit 5582c35f authored by delanoe's avatar delanoe

[FIX] labels for nodes.

parent 94929700
......@@ -3,7 +3,8 @@ from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, NodeNgram, NodeNgramNgram, NodeNodeNgram, NodeHyperdata, Hyperdata
from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeNodeNgram, NodeHyperdata, Hyperdata
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
......@@ -11,6 +12,7 @@ def cooc(corpus=None
, field_X=None, field_Y=None
, miam_id=None, stop_id=None, group_id=None
, cvalue_id=None
, n_min=2, n_max=None
, start=None, end=None
, limit=1000):
'''
......@@ -57,8 +59,30 @@ def cooc(corpus=None
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
)
# Size of the ngrams between n_min and n_max (both bounds inclusive).
# The Ngram table is only joined when at least one bound is requested,
# once for each side (X and Y) of the cooccurrence pair.
if n_min is not None or n_max is not None:
    NgramX = aliased(Ngram)
    NgramY = aliased(Ngram)
    cooc_query = (cooc_query
        .join(NgramX, NgramX.id == NodeNgramX.ngram_id)
        .join(NgramY, NgramY.id == NodeNgramY.ngram_id)
    )
    # Lower bound: both ngrams of the pair must have at least n_min terms.
    if n_min is not None:
        cooc_query = (cooc_query
            .filter(NgramX.n >= n_min)
            .filter(NgramY.n >= n_min)
        )
    # Upper bound: both ngrams of the pair must have at most n_max terms.
    # (The previous version repeated the n_min filter here, so n_max was
    # silently ignored.)
    if n_max is not None:
        cooc_query = (cooc_query
            .filter(NgramX.n <= n_max)
            .filter(NgramY.n <= n_max)
        )
# Cooc between the dates start and end
if start is not None:
Start=aliased(NodeHyperdata)
StartFormat = aliased(Hyperdata)
......@@ -79,11 +103,12 @@ def cooc(corpus=None
)
# Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
.group_by(Node.id, NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(func.count())
.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(desc(func.count()))
.limit(limit)
)
......@@ -91,6 +116,7 @@ def cooc(corpus=None
matrix = WeightedMatrix(cooc_query)
#print(matrix)
# Select according to some scores
if cvalue_id is not None :
#miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
......
......@@ -34,7 +34,6 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
pass
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2, miam_id=None):
......@@ -170,7 +169,9 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
return cooc.id
def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=size):
'''
get_cooc : to compute the graph.
'''
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
......@@ -185,16 +186,15 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
# data deleted each time
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).delete()
#cooc_id = cooc(corpus=corpus, miam_id=miam_id, stop_id=stop_id, limit=size)
cooc_id = cooc(corpus=corpus, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size)
#cooc_id = cooc(corpus=corpus, miam_id=miam_id, limit=size)
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all():
#print(cooccurrence)
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
labels[cooccurrence.ngramx_id] = session.query(Ngram.id).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.id).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
# TODO: clean up this part, it is useless
labels[cooccurrence.ngramx_id] = cooccurrence.ngramx_id #session.query(Ngram.id).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = cooccurrence.ngramy_id #session.query(Ngram.id).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
......@@ -205,7 +205,6 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
......@@ -280,7 +279,6 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
#node,type(labels[node])
G.node[node]['pk'] = ids[node]
G.node[node]['label'] = session.query(Ngram.terms).filter(Ngram.id==node).first()
# G.node[node]['pk'] = ids[str(node)]
G.node[node]['size'] = weight[ids[node]]
G.node[node]['group'] = partition[node]
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment