Commit 4ef20306 authored by delanoe's avatar delanoe

[GRAPH]

- Graphes asynchrones testés en mode prod
- Documentation, README amélioré
- cosmetics: aération du code (en particulier bridgeness...)
- factorisation et meilleur positionnement de la fonction compute_graph

 Modifications qui seront validées :
	modifié :         README.md
	modifié :         bridgeness.py
	modifié :         cooccurrences.py
	modifié :         graph.py
parent cce9dbcc
Module Graph Explorer: from text to graph Module Graph Explorer: from text to graph
========================================= =========================================
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## Graph Explorer main ## Graph Explorer main
0) All urls.py of the Graph Explorer 0) All urls.py of the Graph Explorer
1) Main view of the graph explorer: views.py 1) Main view of the graph explorer: views.py
2) Data are retrieved as REST: rest.py -> Graph Explorer
3) Graph is generated (graph.py) through different steps -> My graph View
-> REST API to get Data
2) Graph is generated (graph.py) through different steps
a) check the constraints (graph_constraints) in gargantext/constants.py a) check the constraints (graph_constraints) in gargantext/constants.py
b) Cooccurrences are computed (live or asynchronously): cooccurrences.py b) Data are retrieved as REST
c) Threshold and distances: distances.py rest.py: check REST parameters
d) clustering: louvain.py c) graph.py:
c) links between communities: bridgeness.py get_graph: check Graph parameters
compute_graph: compute graph
1) Cooccurrences are computed (live or asynchronously): cooccurrences.py
2) Threshold and distances: distances.py
3) clustering: louvain.py
4) links between communities: bridgeness.py
d) compress graph before returning it: utils.py
4) Additional features: 4) Additional features:
a) intersection of graphs: intersection.py a) intersection of graphs: intersection.py
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## TODO ## TODO
1) save parameters in hyperdata myGraphs view:
2) graph explorer:
* save current graph
2) myGraphs view:
* progress bar * progress bar
* Show already computed graphs vs to be computed with parameters
* show parameters * show parameters
* copy / paste and change some parameters to generate new graph * copy / paste and change some parameters to generate new graph
...@@ -17,13 +17,18 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2): ...@@ -17,13 +17,18 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
for node_id in G.nodes(): for node_id in G.nodes():
#node,type(labels[node]) #node,type(labels[node])
nodesB_dict [ ids[node_id][1] ] = True nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it do_distance). # TODO the query below is not optimized (do it do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first() the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label) the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id] G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms") G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3) # G.add_edge(node, "cluster " + str(partition[node]), weight=3)
...@@ -65,12 +70,20 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2): ...@@ -65,12 +70,20 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
if bridgeness > 0: if bridgeness > 0:
for c1 in com_link.keys(): for c1 in com_link.keys():
for c2 in com_link[c1].keys(): for c2 in com_link[c1].keys():
index = round(bridgeness*len(com_link[c1][c2]) / (len(com_ids[c1]) + len(com_ids[c2]))) index = round(
bridgeness * len( com_link[c1][c2] )
/ #----------------------------------#
( len(com_ids[c1]) + len(com_ids[c2] ))
)
#print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index) #print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
if index > 0: if index > 0:
for link in sorted(com_link[c1][c2], key=lambda x: x[2], reverse=True)[:index]: for link in sorted( com_link[c1][c2]
, key=lambda x: x[2]
, reverse=True)[:index]:
#print(c1, c2, link[2]) #print(c1, c2, link[2])
info = {"s": link[0], "t": link[1], "w": link[2]} info = {"s": link[0], "t": link[1], "w": link[2]}
links.append(info) links.append(info)
......
...@@ -3,15 +3,10 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \ ...@@ -3,15 +3,10 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext.util.db import session, aliased, func from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from sqlalchemy import desc, asc, or_, and_ from sqlalchemy import desc, asc, or_, and_
from datetime import datetime
#import inspect
from datetime import datetime
from celery import shared_task
def filterMatrix(matrix, mapList_id, groupList_id): def filterMatrix(matrix, mapList_id, groupList_id):
mapList = UnweightedList( mapList_id ) mapList = UnweightedList( mapList_id )
...@@ -19,52 +14,6 @@ def filterMatrix(matrix, mapList_id, groupList_id): ...@@ -19,52 +14,6 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc = matrix & (mapList * group_list) cooc = matrix & (mapList * group_list)
return cooc return cooc
@shared_task
def computeGraph( corpus_id=None      , cooc_id=None
                , field1='ngrams'     , field2='ngrams'
                , start=None          , end=None
                , mapList_id=None     , groupList_id=None
                , distance=None       , bridgeness=None
                , n_min=1, n_max=None , limit=1000
                , isMonopartite=True  , threshold = 3
                , save_on_db= True    , reset=True
                ):
    '''
    Run the whole graph pipeline and return the graph data.

    Steps, in order:
        1) countCooccurrences  -> (cooc_id, cooc_matrix)
        2) clusterByDistances  -> (G, partition, ids, weight)
        3) filterByBridgeness  -> data (requested as "node_link")
        4) data is stored in the hyperdata of the Node whose id is cooc_id,
           under hyperdata[distance][bridgeness], then returned.

    Parameters forwarded to countCooccurrences: corpus_id, cooc_id, field1,
    field2, start, end, mapList_id, groupList_id, threshold, distance,
    bridgeness.

    NOTE(review): n_min, n_max, limit and reset are accepted but never used
    here; isMonopartite and save_on_db are accepted but the call below always
    passes True for both. clusterByDistances is always called with
    field1="ngrams" and field2="ngrams", whatever the arguments are.
    Confirm whether this is intended before relying on those parameters.
    '''
    print("GRAPH # ... Computing cooccurrences.")
    (cooc_id, cooc_matrix) = countCooccurrences(
              corpus_id=corpus_id     , cooc_id=cooc_id
            , field1=field1           , field2=field2
            , start=start             , end=end
            , mapList_id=mapList_id   , groupList_id=groupList_id
            , isMonopartite=True      , threshold=threshold
            , distance=distance       , bridgeness=bridgeness
            , save_on_db=True
            )
    print("GRAPH #%d ... Cooccurrences computed." % (cooc_id))

    print("GRAPH #%d ... Clustering with %s distance." % (cooc_id, distance))
    G, partition, ids, weight = clusterByDistances( cooc_matrix
                                                  , field1="ngrams", field2="ngrams"
                                                  , distance=distance
                                                  )

    # %s rather than %d for bridgeness: its default is None and "%d" % None
    # raises TypeError, which would abort the task from a mere log line.
    print("GRAPH #%d ... Filtering by bridgeness %s." % (cooc_id, bridgeness))
    data = filterByBridgeness( G, partition, ids, weight
                             , bridgeness, "node_link", field1, field2
                             )

    print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()

    # Create the per-distance slot only when it is missing (or None), so that
    # graphs already stored for other bridgeness values are kept.
    if node.hyperdata.get(distance, None) is None:
        node.hyperdata[distance] = dict()

    node.hyperdata[distance][bridgeness] = data
    node.save_hyperdata()
    session.commit()

    print("GRAPH #%d ... Returning data as json." % cooc_id)
    return data
def countCooccurrences( corpus_id=None , cooc_id=None def countCooccurrences( corpus_id=None , cooc_id=None
, field1='ngrams' , field2='ngrams' , field1='ngrams' , field2='ngrams'
...@@ -269,8 +218,10 @@ def countCooccurrences( corpus_id=None , cooc_id=None ...@@ -269,8 +218,10 @@ def countCooccurrences( corpus_id=None , cooc_id=None
# Saving the parameters # Saving the parameters
print("GRAPH #%s ... Parameters saved in Node." % cooc_id) print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first() coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata[distance] = dict() coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode) session.add(coocNode)
session.commit() session.commit()
......
...@@ -4,15 +4,78 @@ from gargantext.util.lists import WeightedMatrix, UnweightedList, Transla ...@@ -4,15 +4,78 @@ from gargantext.util.lists import WeightedMatrix, UnweightedList, Transla
from gargantext.util.http import JsonHttpResponse from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata
#from gargantext.util.toolchain.ngram_coocs import compute_coocs from graph.cooccurrences import countCooccurrences
from graph.cooccurrences import computeGraph, filterMatrix
from graph.distances import clusterByDistances from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness from graph.bridgeness import filterByBridgeness
from gargantext.util.scheduling import scheduled from gargantext.util.scheduling import scheduled
from gargantext.constants import graph_constraints from gargantext.constants import graph_constraints
from datetime import datetime from celery import shared_task
from datetime import datetime
@shared_task
def compute_graph( corpus_id=None      , cooc_id=None
                 , field1='ngrams'     , field2='ngrams'
                 , start=None          , end=None
                 , mapList_id=None     , groupList_id=None
                 , distance=None       , bridgeness=None
                 , n_min=1, n_max=None , limit=1000
                 , isMonopartite=True  , threshold = 3
                 , save_on_db= True    , reset=True
                 ) :
    '''
    All steps to compute a graph:

    1) count Cooccurrences  (function countCooccurrences)
            main parameters: threshold, isMonopartite
            returns (cooc_id, cooc_matrix)

    2) filter and cluster By Distances (function clusterByDistances)
            main parameter: distance
            returns (G, partition, ids, weight)
            TODO option clustering='louvain'
                 or 'percolation' or 'random walk' or ...

    3) filter By Bridgeness (function filterByBridgeness)
            main parameter: bridgeness
            the output type requested is the literal "node_link"

    4) save the result in the hyperdata of the Node whose id is cooc_id,
       under hyperdata[distance][bridgeness], and return it.

    NOTE(review): n_min, n_max, limit and reset are accepted but never used
    here; isMonopartite and save_on_db are accepted but the call below always
    passes True for both. clusterByDistances is always called with
    field1="ngrams" and field2="ngrams", whatever the arguments are.
    Confirm whether this is intended before relying on those parameters.
    '''
    print("GRAPH # ... Computing cooccurrences.")
    (cooc_id, cooc_matrix) = countCooccurrences(
              corpus_id=corpus_id     , cooc_id=cooc_id
            , field1=field1           , field2=field2
            , start=start             , end=end
            , mapList_id=mapList_id   , groupList_id=groupList_id
            , isMonopartite=True      , threshold=threshold
            , distance=distance       , bridgeness=bridgeness
            , save_on_db=True
            )
    print("GRAPH #%d ... Cooccurrences computed." % (cooc_id))

    print("GRAPH #%d ... Clustering with %s distance." % (cooc_id, distance))
    G, partition, ids, weight = clusterByDistances( cooc_matrix
                                                  , field1="ngrams", field2="ngrams"
                                                  , distance=distance
                                                  )

    # %s rather than %d for bridgeness: its default is None and "%d" % None
    # raises TypeError, which would abort the task from a mere log line.
    print("GRAPH #%d ... Filtering by bridgeness %s." % (cooc_id, bridgeness))
    data = filterByBridgeness( G, partition, ids, weight
                             , bridgeness, "node_link", field1, field2
                             )

    print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()

    # Create the per-distance slot only when it is missing (or None), so that
    # graphs already stored for other bridgeness values are kept.
    if node.hyperdata.get(distance, None) is None:
        node.hyperdata[distance] = dict()

    node.hyperdata[distance][bridgeness] = data
    node.save_hyperdata()
    session.commit()

    print("GRAPH #%d ... Returning data as json." % cooc_id)
    return data
def get_graph( request=None , corpus=None def get_graph( request=None , corpus=None
, field1='ngrams' , field2='ngrams' , field1='ngrams' , field2='ngrams'
...@@ -22,7 +85,7 @@ def get_graph( request=None , corpus=None ...@@ -22,7 +85,7 @@ def get_graph( request=None , corpus=None
, distance='conditional', bridgeness=5 , distance='conditional', bridgeness=5
, threshold=1 , isMonopartite=True , threshold=1 , isMonopartite=True
, saveOnly=True , saveOnly=True
): ) :
''' '''
Get_graph : main steps: Get_graph : main steps:
0) Check the parameters 0) Check the parameters
...@@ -33,18 +96,9 @@ def get_graph( request=None , corpus=None ...@@ -33,18 +96,9 @@ def get_graph( request=None , corpus=None
get_graph first checks the parameters and return either graph data or a dict with get_graph first checks the parameters and return either graph data or a dict with
state "type" with an integer to indicate the size of the parameter state "type" with an integer to indicate the size of the parameter
(maybe we could add a String in that step to factor and give here the error message) (maybe we could add a String in that step to factor and give here the error message)
1) count Cooccurrences (function countCooccurrences) 1) compute_graph (see function above)
main parameters: threshold 2) return graph
2) filter and cluster By Distances (function clusterByDistances)
main parameter: distance
3) filter By Bridgeness (function filterByBridgeness)
main parameter: bridgeness
4) format the graph (formatGraph)
main parameter: format_
''' '''
...@@ -69,7 +123,7 @@ def get_graph( request=None , corpus=None ...@@ -69,7 +123,7 @@ def get_graph( request=None , corpus=None
# Case of graph has not been computed already # Case of graph has not been computed already
# First, check the parameters # First, check the parameters
# Case of mapList not big enough # Case of mapList not big enough
# ============================== # ==============================
...@@ -128,7 +182,7 @@ def get_graph( request=None , corpus=None ...@@ -128,7 +182,7 @@ def get_graph( request=None , corpus=None
corpus_size = corpus_size_query.count() corpus_size = corpus_size_query.count()
if saveOnly is not None and saveOnly == "True": if saveOnly is not None and saveOnly == "True":
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id scheduled(compute_graph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
...@@ -141,7 +195,7 @@ def get_graph( request=None , corpus=None ...@@ -141,7 +195,7 @@ def get_graph( request=None , corpus=None
elif corpus_size > graph_constraints['corpusMax']: elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery # Then compute cooc asynchronously with celery
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id scheduled(compute_graph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
...@@ -160,7 +214,7 @@ def get_graph( request=None , corpus=None ...@@ -160,7 +214,7 @@ def get_graph( request=None , corpus=None
else: else:
# If graph_constraints are ok then compute the graph in live # If graph_constraints are ok then compute the graph in live
data = computeGraph( corpus_id=corpus.id, cooc_id=cooc_id data = compute_graph( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
...@@ -173,7 +227,7 @@ def get_graph( request=None , corpus=None ...@@ -173,7 +227,7 @@ def get_graph( request=None , corpus=None
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist) # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(data) == 0: if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix") print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data data = {'nodes':[], 'links':[]} # empty data
return data return data
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment