Commit ae011343 authored by delanoe

[GRAPH] Graph almost done: needs more factorization.

parent 76617d6b
...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5 ...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph: # Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected # Modes: live graph generation, graph asynchronously computed or errors detected
# Here are the corpus and maplist size limits required to compute the graph # Here are the corpus and maplist size limits required to compute the graph
graph_constraints = {'corpusMax' : 500 graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40 ,'corpusMin' : 40
,'mapList' : 50 ,'mapList' : 50
} }
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdata, HyperdataKey NodeHyperdata, HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances from graph.distances import clusterByDistances
...@@ -19,48 +19,61 @@ def filterMatrix(matrix, mapList_id, groupList_id): ...@@ -19,48 +19,61 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc = matrix & (mapList * group_list) cooc = matrix & (mapList * group_list)
return cooc return cooc
# computeGraph @shared_task
def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None): def computeGraph( corpus_id=None, cooc_id=None
, field1='ngrams' , field2='ngrams'
print("GRAPH#%d ... Computing cooccurrences." % (cooc_id)) , start=None , end=None
# Check if already computed cooc , mapList_id=None , groupList_id=None
# (cooc_id, cooc) = count(countCooccurrences) , distance=None , bridgeness=None
, n_min=1, n_max=None , limit=1000
print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance)) , isMonopartite=True , threshold = 3
G, partition, ids, weight = clusterByDistances ( cooc_matrix , save_on_db= True , reset=True
, field1="ngrams", field2="ngrams" ):
, distance=distance
) print("GRAPH# ... Computing cooccurrences.")
(cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id=cooc_id
, field1=field1, field2=field2
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True
)
print("GRAPH#%d ... Cooccurrences computed." % (cooc_id))
print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2) print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
, distance=distance
)
print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id) print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
node = session.query(Node).filter(Node.id == cooc_id).first() data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
if node.hyperdata.get(distance, None) is None: print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
node.hyperdata[distance] = dict() node = session.query(Node).filter(Node.id == cooc_id).first()
node.hyperdata[distance][bridgeness] = data if node.hyperdata.get(distance, None) is None:
node.hyperdata[distance] = dict()
node.save_hyperdata()
session.commit() node.hyperdata[distance][bridgeness] = data
node.save_hyperdata()
session.commit()
print("GRAPH#%d ... Returning data as json." % cooc_id) print("GRAPH#%d ... Returning data as json." % cooc_id)
return data return data
@shared_task def countCooccurrences( corpus_id=None, cooc_id=None
def countCooccurrences( corpus_id=None , test= False
, field1='ngrams' , field2='ngrams' , field1='ngrams' , field2='ngrams'
, start=None , end=None , start=None , end=None
, mapList_id=None , groupList_id=None , mapList_id=None , groupList_id=None
, distance=None , bridgeness=None
, n_min=1, n_max=None , limit=1000 , n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3 , isMonopartite=True , threshold = 3
, distance=None , bridgeness=None , save_on_db= True , reset=True
, save_on_db= True, # just return the WeightedMatrix,
# (don't write to DB)
): ):
''' '''
Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
...@@ -71,15 +84,13 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -71,15 +84,13 @@ def countCooccurrences( corpus_id=None , test= False
mapList_id :: Int mapList_id :: Int
groupList_id :: Int groupList_id :: Int
For the moment, start and end are simple, only year is implemented yet
start :: TimeStamp -- example: '2010-05-30 02:00:00+02' start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
end :: TimeStamp end :: TimeStamp
limit :: Int limit :: Int
''' '''
# TODO : add hyperdata here
# Parameters to save in hyperdata of the Node Cooc # FIXME remove the lines below after factorization of parameters
parameters = dict() parameters = dict()
parameters['field1'] = field1 parameters['field1'] = field1
parameters['field2'] = field2 parameters['field2'] = field2
...@@ -88,16 +99,16 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -88,16 +99,16 @@ def countCooccurrences( corpus_id=None , test= False
corpus = session.query(Node).filter(Node.id==corpus_id).first() corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get node of the Graph # Get node of the Graph
if not coocNode_id: if not cooc_id:
coocNode_id = ( session.query( Node.id ) cooc_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES" .filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER" , Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id , Node.parent_id == corpus.id
) )
.first() .first()
) )
if not coocNode_id: if not cooc_id:
coocNode = corpus.add_child( coocNode = corpus.add_child(
typename = "COOCCURRENCES", typename = "COOCCURRENCES",
name = "GRAPH (in corpus %s)" % corpus.id name = "GRAPH (in corpus %s)" % corpus.id
...@@ -105,12 +116,12 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -105,12 +116,12 @@ def countCooccurrences( corpus_id=None , test= False
session.add(coocNode) session.add(coocNode)
session.commit() session.commit()
coocNode_id = coocNode.id cooc_id = coocNode.id
else : else :
coocNode_id = int(coocNode_id[0]) cooc_id = int(cooc_id[0])
if reset == True : if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete() session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
session.commit() session.commit()
...@@ -191,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -191,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
# Cooc between the dates start and end # Cooc between the dates start and end
if start is not None: if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S") #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complex date format here. # TODO : more precise date format here (day is currently the smallest grain).
date_start = datetime.strptime (str(start), "%Y-%m-%d") date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S") date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
...@@ -207,7 +218,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -207,7 +218,7 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None: if end is not None:
# TODO : more complex date format here. # TODO : more precise date format here (day is currently the smallest grain).
date_end = datetime.strptime (str(end), "%Y-%m-%d") date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S") date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
...@@ -239,28 +250,27 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -239,28 +250,27 @@ def countCooccurrences( corpus_id=None , test= False
matrix = WeightedMatrix(cooc_query) matrix = WeightedMatrix(cooc_query)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % coocNode_id) print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id) cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id) parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id) parameters['GroupList_id'] = str(groupList_id)
# TODO factorize savings on db
if save_on_db: if save_on_db:
# Saving the cooccurrences # Saving the cooccurrences
cooc.save(coocNode_id) cooc.save(cooc_id)
print("GRAPH#%s ... Node Cooccurrence Matrix saved" % coocNode_id) print("GRAPH#%s ... Node Cooccurrence Matrix saved" % cooc_id)
# Saving the parameters # Saving the parameters
print("GRAPH#%s ... Parameters saved in Node." % coocNode_id) print("GRAPH#%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first() coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata[distance] = dict() coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode) session.add(coocNode)
session.commit() session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness) #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
return data #return data
else: return(coocNode.id, cooc)
data = cooc2graph(coocNode_id, cooc, distance=distance)
return data
...@@ -5,7 +5,7 @@ from gargantext.util.http import JsonHttpResponse ...@@ -5,7 +5,7 @@ from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata
#from gargantext.util.toolchain.ngram_coocs import compute_coocs #from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graph.cooccurrences import countCooccurrences, filterMatrix from graph.cooccurrences import computeGraph, filterMatrix
from graph.distances import clusterByDistances from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness from graph.bridgeness import filterByBridgeness
...@@ -19,12 +19,9 @@ def get_graph( request=None , corpus=None ...@@ -19,12 +19,9 @@ def get_graph( request=None , corpus=None
, mapList_id = None , groupList_id = None , mapList_id = None , groupList_id = None
, cooc_id=None , type='node_link' , cooc_id=None , type='node_link'
, start=None , end=None , start=None , end=None
, threshold=1 , distance='conditional', bridgeness=5
, distance='conditional' , threshold=1 , isMonopartite=True
, isMonopartite=True # By default, we compute terms/terms graph , saveOnly=True
, bridgeness=5
, saveOnly=None
#, size=1000
): ):
''' '''
Get_graph : main steps: Get_graph : main steps:
...@@ -54,7 +51,7 @@ def get_graph( request=None , corpus=None ...@@ -54,7 +51,7 @@ def get_graph( request=None , corpus=None
# Case of graph has been computed already # Case of graph has been computed already
if cooc_id is not None: if cooc_id is not None:
print("Getting data for matrix %d", int(cooc_id)) print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first() node = session.query(Node).filter(Node.id == cooc_id).first()
# Structure of the Node.hyperdata[distance][bridgeness] # Structure of the Node.hyperdata[distance][bridgeness]
...@@ -65,8 +62,6 @@ def get_graph( request=None , corpus=None ...@@ -65,8 +62,6 @@ def get_graph( request=None , corpus=None
if node.hyperdata.get(distance, None) is not None: if node.hyperdata.get(distance, None) is not None:
graph = node.hyperdata[distance] graph = node.hyperdata[distance]
print(node.hyperdata[distance].keys())
# Check bridgeness of the graph # Check bridgeness of the graph
if graph.get(str(bridgeness), None) is not None: if graph.get(str(bridgeness), None) is not None:
return graph[str(bridgeness)] return graph[str(bridgeness)]
...@@ -133,7 +128,7 @@ def get_graph( request=None , corpus=None ...@@ -133,7 +128,7 @@ def get_graph( request=None , corpus=None
corpus_size = corpus_size_query.count() corpus_size = corpus_size_query.count()
if saveOnly is not None and saveOnly == "True": if saveOnly is not None and saveOnly == "True":
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
...@@ -144,9 +139,9 @@ def get_graph( request=None , corpus=None ...@@ -144,9 +139,9 @@ def get_graph( request=None , corpus=None
) )
return {"state" : "saveOnly"} return {"state" : "saveOnly"}
if corpus_size > graph_constraints['corpusMax']: elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery # Then compute cooc asynchronously with celery
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
...@@ -155,8 +150,8 @@ def get_graph( request=None , corpus=None ...@@ -155,8 +150,8 @@ def get_graph( request=None , corpus=None
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
# Dict to inform user that corpus maximum is reached then # Dict to inform user that corpus maximum is reached
# graph is computed asynchronously # then graph is computed asynchronously
return {"state" : "corpusMax", "length" : corpus_size} return {"state" : "corpusMax", "length" : corpus_size}
elif corpus_size <= graph_constraints['corpusMin']: elif corpus_size <= graph_constraints['corpusMin']:
...@@ -165,7 +160,7 @@ def get_graph( request=None , corpus=None ...@@ -165,7 +160,7 @@ def get_graph( request=None , corpus=None
else: else:
# If graph_constraints are ok then compute the graph in live # If graph_constraints are ok then compute the graph in live
data = countCooccurrences( corpus_id=corpus.id, coocNode_id=cooc_id data = computeGraph( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
......
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from graph.graph import get_graph from graph.graph import get_graph
...@@ -8,7 +6,7 @@ from gargantext.util.http import APIView, APIException\ ...@@ -8,7 +6,7 @@ from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth , JsonHttpResponse, requires_auth
from gargantext.constants import graph_constraints from gargantext.constants import graph_constraints
from traceback import format_tb from traceback import format_tb
class Graph(APIView): class Graph(APIView):
''' '''
...@@ -29,6 +27,16 @@ class Graph(APIView): ...@@ -29,6 +27,16 @@ class Graph(APIView):
# Get the node we are working with # Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first() corpus = session.query(Node).filter(Node.id==corpus_id).first()
# TODO Parameters to save in hyperdata of the Node Cooc
# WARNING: we could factorize the parameters as dict but ...
# ... it causes a bug in asynchronous function !
# Check celery upgrades before.
# Example (for the future):
# parameters = dict()
# parameters['field1'] = field1
# parameters['field2'] = field2
# Get all the parameters in the URL # Get all the parameters in the URL
cooc_id = request.GET.get ('cooc_id' , None ) cooc_id = request.GET.get ('cooc_id' , None )
saveOnly = request.GET.get ('saveOnly' , None ) saveOnly = request.GET.get ('saveOnly' , None )
...@@ -48,6 +56,7 @@ class Graph(APIView): ...@@ -48,6 +56,7 @@ class Graph(APIView):
type_ = str(request.GET.get ('type' , 'node_link' )) type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional')) distance = str(request.GET.get ('distance' , 'conditional'))
# Get default map List of corpus # Get default map List of corpus
if mapList_id == 0 : if mapList_id == 0 :
mapList_id = ( session.query ( Node.id ) mapList_id = ( session.query ( Node.id )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment