Commit aef18d43 authored by delanoe's avatar delanoe

[GRAPH]

- Graphes asynchrones testés en mode prod
- Documentation, README amélioré
- cosmetics: aération du code (en particulier bridgeness...)
- factorisation et meilleur positionnement de la fonction compute_graph

 Modifications qui seront validées :
	modifié :         README.md
	modifié :         bridgeness.py
	modifié :         cooccurrences.py
	modifié :         graph.py
parent 6278045c
Module Graph Explorer: from text to graph
=========================================
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## Graph Explorer main
0) All urls.py of the Graph Explorer
1) Main view of the graph explorer: views.py
2) Data are retrieved as REST: rest.py
3) Graph is generated (graph.py) through different steps
-> Graph Explorer
-> My graph View
-> REST API to get Data
2) Graph is generated (graph.py) through different steps
a) check the constraints (graph_constraints) in gargantext/constants.py
b) Cooccurrences are computed (in live or asynchronously): cooccurrences.py
c) Threshold and distances : distances.py
d) clustering: louvain.py
c) links between communities: bridgeness.py
b) Data are retrieved as REST
rest.py: check REST parameters
c) graph.py:
get_graph: check Graph parameters
compute_graph: compute graph
1) Cooccurrences are computed (in live or asynchronously): cooccurrences.py
2) Threshold and distances : distances.py
3) clustering: louvain.py
4) links between communities: bridgeness.py
d) compress graph before returning it: utils.py
4) Additional features:
a) intersection of graphs: intersection.py
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## TODO
1) save parameters in hyperdata
2) graph explorer:
* save current graph
2) myGraphs view:
myGraphs view:
* progress bar
* Show already computed graphs vs to be computed with parameters
* show parameters
* copy / paste and change some parameters to generate new graph
......@@ -17,13 +17,18 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
for node_id in G.nodes():
#node,type(labels[node])
nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
......@@ -65,12 +70,20 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
if bridgeness > 0:
for c1 in com_link.keys():
for c2 in com_link[c1].keys():
index = round(bridgeness*len(com_link[c1][c2]) / (len(com_ids[c1]) + len(com_ids[c2])))
index = round(
bridgeness * len( com_link[c1][c2] )
/ #----------------------------------#
( len(com_ids[c1]) + len(com_ids[c2] ))
)
#print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
if index > 0:
for link in sorted(com_link[c1][c2], key=lambda x: x[2], reverse=True)[:index]:
for link in sorted( com_link[c1][c2]
, key=lambda x: x[2]
, reverse=True)[:index]:
#print(c1, c2, link[2])
info = {"s": link[0], "t": link[1], "w": link[2]}
links.append(info)
......
......@@ -3,15 +3,10 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from sqlalchemy import desc, asc, or_, and_
from datetime import datetime
#import inspect
from datetime import datetime
from celery import shared_task
def filterMatrix(matrix, mapList_id, groupList_id):
mapList = UnweightedList( mapList_id )
......@@ -19,52 +14,6 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc = matrix & (mapList * group_list)
return cooc
@shared_task
def computeGraph(corpus_id=None, cooc_id=None,
                 field1='ngrams', field2='ngrams',
                 start=None, end=None,
                 mapList_id=None, groupList_id=None,
                 distance=None, bridgeness=None,
                 n_min=1, n_max=None, limit=1000,
                 isMonopartite=True, threshold=3,
                 save_on_db=True, reset=True):
    '''
    Compute a graph from cooccurrences, store it on the cooccurrence node
    and return it.

    Pipeline, in this order:
        1) countCooccurrences  -> (cooc_id, cooc_matrix)
        2) clusterByDistances  -> (graph, partition, ids, weight)
        3) filterByBridgeness  -> graph data (requested as "node_link")
        4) the data is written to node.hyperdata[distance][bridgeness] of
           the Node whose id is cooc_id, then committed.

    The parameters n_min, n_max, limit, isMonopartite, save_on_db and reset
    are accepted but not read by this body: the cooccurrence step is always
    called with isMonopartite=True and save_on_db=True.

    Returns the value produced by filterByBridgeness.
    '''
    print("GRAPH # ... Computing cooccurrences.")
    # isMonopartite and save_on_db are deliberately passed as literals here,
    # whatever the caller gave for the parameters of the same name.
    cooc_id, cooc_matrix = countCooccurrences(
        corpus_id=corpus_id, cooc_id=cooc_id,
        field1=field1, field2=field2,
        start=start, end=end,
        mapList_id=mapList_id, groupList_id=groupList_id,
        isMonopartite=True, threshold=threshold,
        distance=distance, bridgeness=bridgeness,
        save_on_db=True,
    )
    print("GRAPH #%d ... Cooccurrences computed." % cooc_id)

    print("GRAPH #%d ... Clustering with %s distance." % (cooc_id, distance))
    # The clustering step always works on "ngrams" for both fields.
    graph, communities, node_ids, weights = clusterByDistances(
        cooc_matrix,
        field1="ngrams", field2="ngrams",
        distance=distance,
    )

    # NOTE(review): "%d" raises TypeError when bridgeness is None (its
    # default value in the signature) -- callers must supply a number.
    print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
    data = filterByBridgeness(graph, communities, node_ids, weights,
                              bridgeness, "node_link", field1, field2)

    print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()

    # Make sure the per-distance container exists before indexing into it.
    if node.hyperdata.get(distance) is None:
        node.hyperdata[distance] = {}
    node.hyperdata[distance][bridgeness] = data

    node.save_hyperdata()
    session.commit()

    print("GRAPH #%d ... Returning data as json." % cooc_id)
    return data
def countCooccurrences( corpus_id=None , cooc_id=None
, field1='ngrams' , field2='ngrams'
......@@ -269,8 +218,10 @@ def countCooccurrences( corpus_id=None , cooc_id=None
# Saving the parameters
print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode)
session.commit()
......
......@@ -4,15 +4,78 @@ from gargantext.util.lists import WeightedMatrix, UnweightedList, Transla
from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graph.cooccurrences import computeGraph, filterMatrix
from graph.cooccurrences import countCooccurrences
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from gargantext.util.scheduling import scheduled
from gargantext.constants import graph_constraints
from datetime import datetime
from celery import shared_task
from datetime import datetime
@shared_task
def compute_graph(corpus_id=None, cooc_id=None,
                  field1='ngrams', field2='ngrams',
                  start=None, end=None,
                  mapList_id=None, groupList_id=None,
                  distance=None, bridgeness=None,
                  n_min=1, n_max=None, limit=1000,
                  isMonopartite=True, threshold=3,
                  save_on_db=True, reset=True):
    '''
    Run every step needed to build a graph and return the graph data.

    Steps performed, in order:
        1) count cooccurrences (countCooccurrences)
           parameters forwarded: corpus_id, cooc_id, field1, field2,
           start, end, mapList_id, groupList_id, threshold, distance,
           bridgeness
        2) filter and cluster by distances (clusterByDistances)
           main parameter: distance
           TODO option clustering='louvain'
                or 'percolation' or 'random walk' or ...
        3) filter by bridgeness (filterByBridgeness)
           main parameter: bridgeness
        4) save the result in node.hyperdata[distance][bridgeness] of the
           Node whose id is the cooc_id returned by step 1, and commit.

    The parameters n_min, n_max, limit, isMonopartite, save_on_db and reset
    are part of the signature but are not read here: step 1 is always
    called with isMonopartite=True and save_on_db=True.

    Returns the value produced by filterByBridgeness.
    '''
    print("GRAPH # ... Computing cooccurrences.")
    # Gather the arguments of the cooccurrence step in one place; the two
    # literal True values intentionally ignore the same-named parameters.
    cooc_options = dict(
        corpus_id=corpus_id, cooc_id=cooc_id,
        field1=field1, field2=field2,
        start=start, end=end,
        mapList_id=mapList_id, groupList_id=groupList_id,
        isMonopartite=True, threshold=threshold,
        distance=distance, bridgeness=bridgeness,
        save_on_db=True,
    )
    (cooc_id, cooc_matrix) = countCooccurrences(**cooc_options)
    print("GRAPH #%d ... Cooccurrences computed." % (cooc_id))

    print("GRAPH #%d ... Clustering with %s distance." % (cooc_id, distance))
    # Clustering is always run on "ngrams" for both fields.
    clustered = clusterByDistances(cooc_matrix,
                                   field1="ngrams", field2="ngrams",
                                   distance=distance)
    graph, communities, node_ids, weights = clustered

    # NOTE(review): "%d" raises TypeError when bridgeness is None (its
    # default value in the signature) -- callers must supply a number.
    print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
    data = filterByBridgeness(graph, communities, node_ids, weights,
                              bridgeness, "node_link", field1, field2)

    print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()

    # Create the per-distance container on first use, then file the graph
    # under its bridgeness value.
    if node.hyperdata.get(distance) is None:
        node.hyperdata[distance] = {}
    node.hyperdata[distance][bridgeness] = data

    node.save_hyperdata()
    session.commit()

    print("GRAPH #%d ... Returning data as json." % cooc_id)
    return data
def get_graph( request=None , corpus=None
, field1='ngrams' , field2='ngrams'
......@@ -22,7 +85,7 @@ def get_graph( request=None , corpus=None
, distance='conditional', bridgeness=5
, threshold=1 , isMonopartite=True
, saveOnly=True
):
) :
'''
Get_graph : main steps:
0) Check the parameters
......@@ -33,18 +96,9 @@ def get_graph( request=None , corpus=None
get_graph first checks the parameters and return either graph data or a dict with
state "type" with an integer to indicate the size of the parameter
(maybe we could add a String in that step to factor and give here the error message)
1) count Cooccurrences (function countCooccurrences)
main parameters: threshold
2) filter and cluster By Distances (function clusterByDistances)
main parameter: distance
3) filter By Bridgeness (function filterByBridgeness)
main parameter: bridgeness
4) format the graph (formatGraph)
main parameter: format_
1) compute_graph (see function above)
2) return graph
'''
......@@ -69,7 +123,7 @@ def get_graph( request=None , corpus=None
# Case of graph has not been computed already
# First, check the parameters
# Case of mapList not big enough
# ==============================
......@@ -128,7 +182,7 @@ def get_graph( request=None , corpus=None
corpus_size = corpus_size_query.count()
if saveOnly is not None and saveOnly == "True":
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
scheduled(compute_graph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -141,7 +195,7 @@ def get_graph( request=None , corpus=None
elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
scheduled(compute_graph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -160,7 +214,7 @@ def get_graph( request=None , corpus=None
else:
# If graph_constraints are ok then compute the graph in live
data = computeGraph( corpus_id=corpus.id, cooc_id=cooc_id
data = compute_graph( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -173,7 +227,7 @@ def get_graph( request=None , corpus=None
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix")
print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
return data
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment