Commit ae011343 authored by delanoe's avatar delanoe

[GRAPH] Graph almost done: needs more factorization.

parent 76617d6b
......@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
# Size limits used to decide how a graph is computed (live vs. async)
# or rejected; this commit lowered corpusMax from 500 to 100.
graph_constraints = {'corpusMax' : 100   # above this: compute asynchronously
                    ,'corpusMin' : 40    # below this: corpus too small, error
                    ,'mapList'   : 50    # maximum size of the map list
                    }
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdata, HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
......@@ -19,48 +19,61 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc = matrix & (mapList * group_list)
return cooc
# computeGraph
@shared_task
def computeGraph( corpus_id=None       , cooc_id=None
                , field1='ngrams'      , field2='ngrams'
                , start=None           , end=None
                , mapList_id=None      , groupList_id=None
                , distance=None        , bridgeness=None
                , n_min=1, n_max=None  , limit=1000
                , isMonopartite=True   , threshold = 3
                , save_on_db= True     , reset=True
                ):
    '''
    Compute the graph of a corpus in four steps:
      1) count the cooccurrences            (countCooccurrences)
      2) cluster the matrix with *distance* (clusterByDistances)
      3) filter the edges by *bridgeness*   (filterByBridgeness)
      4) save the json graph in the cooc Node hyperdata, keyed by
         distance then bridgeness, and return it.

    corpus_id :: Int -- corpus to compute the graph for
    cooc_id   :: Int -- existing cooccurrence Node id (created if None)
    Returns the json-serializable graph data (node_link format).
    '''
    print("GRAPH# ... Computing cooccurrences.")
    # countCooccurrences creates/reuses the cooc Node and returns its id
    # together with the filtered WeightedMatrix.
    (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id=cooc_id
                                               , field1=field1, field2=field2
                                               , start=start           , end=end
                                               , mapList_id=mapList_id , groupList_id=groupList_id
                                               , isMonopartite=True    , threshold=threshold
                                               , distance=distance     , bridgeness=bridgeness
                                               , save_on_db=True
                                               )
    print("GRAPH#%d ... Cooccurrences computed." % (cooc_id))

    print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id, distance))
    G, partition, ids, weight = clusterByDistances( cooc_matrix
                                                  , field1="ngrams", field2="ngrams"
                                                  , distance=distance
                                                  )

    print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
    data = filterByBridgeness(G, partition, ids, weight, bridgeness, "node_link", field1, field2)

    print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()
    # hyperdata layout: node.hyperdata[distance][bridgeness] -> graph json
    if node.hyperdata.get(distance, None) is None:
        node.hyperdata[distance] = dict()
    node.hyperdata[distance][bridgeness] = data
    node.save_hyperdata()
    session.commit()

    print("GRAPH#%d ... Returning data as json." % cooc_id)
    return data
@shared_task
def countCooccurrences( corpus_id=None , test= False
def countCooccurrences( corpus_id=None, cooc_id=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, distance=None , bridgeness=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3
, distance=None , bridgeness=None
, save_on_db= True, # just return the WeightedMatrix,
# (don't write to DB)
, save_on_db= True , reset=True
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
......@@ -71,15 +84,13 @@ def countCooccurrences( corpus_id=None , test= False
mapList_id :: Int
groupList_id :: Int
For the moment, start and end are simple, only year is implemented yet
start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
end :: TimeStamp
limit :: Int
'''
# TODO : add hyperdata here
# Parameters to save in hyperdata of the Node Cooc
# FIXME remove the lines below after factorization of parameters
parameters = dict()
parameters['field1'] = field1
parameters['field2'] = field2
......@@ -88,16 +99,16 @@ def countCooccurrences( corpus_id=None , test= False
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get node of the Graph
if not coocNode_id:
coocNode_id = ( session.query( Node.id )
if not cooc_id:
cooc_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id
)
.first()
)
if not coocNode_id:
if not cooc_id:
coocNode = corpus.add_child(
typename = "COOCCURRENCES",
name = "GRAPH (in corpus %s)" % corpus.id
......@@ -105,12 +116,12 @@ def countCooccurrences( corpus_id=None , test= False
session.add(coocNode)
session.commit()
coocNode_id = coocNode.id
cooc_id = coocNode.id
else :
coocNode_id = int(coocNode_id[0])
cooc_id = int(cooc_id[0])
if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
session.commit()
......@@ -191,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
# Cooc between the dates start and end
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complexe date format here.
# TODO : more precise date format here (day is smaller grain actually).
date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
......@@ -207,7 +218,7 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None:
# TODO : more complexe date format here.
# TODO : more precise date format here (day is smaller grain actually).
date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
......@@ -239,28 +250,27 @@ def countCooccurrences( corpus_id=None , test= False
matrix = WeightedMatrix(cooc_query)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % coocNode_id)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
# TODO factorize savings on db
if save_on_db:
# Saving the cooccurrences
cooc.save(coocNode_id)
print("GRAPH#%s ... Node Cooccurrence Matrix saved" % coocNode_id)
cooc.save(cooc_id)
print("GRAPH#%s ... Node Cooccurrence Matrix saved" % cooc_id)
# Saving the parameters
print("GRAPH#%s ... Parameters saved in Node." % coocNode_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
print("GRAPH#%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode)
session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
return data
#data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
#return data
else:
data = cooc2graph(coocNode_id, cooc, distance=distance)
return data
return(coocNode.id, cooc)
......@@ -5,7 +5,7 @@ from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graph.cooccurrences import countCooccurrences, filterMatrix
from graph.cooccurrences import computeGraph, filterMatrix
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
......@@ -19,12 +19,9 @@ def get_graph( request=None , corpus=None
, mapList_id = None , groupList_id = None
, cooc_id=None , type='node_link'
, start=None , end=None
, threshold=1
, distance='conditional'
, isMonopartite=True # By default, we compute terms/terms graph
, bridgeness=5
, saveOnly=None
#, size=1000
, distance='conditional', bridgeness=5
, threshold=1 , isMonopartite=True
, saveOnly=True
):
'''
Get_graph : main steps:
......@@ -54,7 +51,7 @@ def get_graph( request=None , corpus=None
# Case of graph has been computed already
if cooc_id is not None:
print("Getting data for matrix %d", int(cooc_id))
print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
# Structure of the Node.hyperdata[distance][bridbeness]
......@@ -65,8 +62,6 @@ def get_graph( request=None , corpus=None
if node.hyperdata.get(distance, None) is not None:
graph = node.hyperdata[distance]
print(node.hyperdata[distance].keys())
# Check bridgeness of the graph
if graph.get(str(bridgeness), None) is not None:
return graph[str(bridgeness)]
......@@ -133,7 +128,7 @@ def get_graph( request=None , corpus=None
corpus_size = corpus_size_query.count()
if saveOnly is not None and saveOnly == "True":
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -144,9 +139,9 @@ def get_graph( request=None , corpus=None
)
return {"state" : "saveOnly"}
if corpus_size > graph_constraints['corpusMax']:
elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -155,8 +150,8 @@ def get_graph( request=None , corpus=None
, save_on_db = True
#, limit=size
)
# Dict to inform user that corpus maximum is reached then
# graph is computed asynchronously
# Dict to inform user that corpus maximum is reached
# then graph is computed asynchronously
return {"state" : "corpusMax", "length" : corpus_size}
elif corpus_size <= graph_constraints['corpusMin']:
......@@ -165,7 +160,7 @@ def get_graph( request=None , corpus=None
else:
# If graph_constraints are ok then compute the graph in live
data = countCooccurrences( corpus_id=corpus.id, coocNode_id=cooc_id
data = computeGraph( corpus_id=corpus.id, cooc_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graph.graph import get_graph
......@@ -8,7 +6,7 @@ from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
from gargantext.constants import graph_constraints
from traceback import format_tb
from traceback import format_tb
class Graph(APIView):
'''
......@@ -29,6 +27,16 @@ class Graph(APIView):
# Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# TODO Parameters to save in hyperdata of the Node Cooc
# WARNING: we could factorize the parameters as dict but ...
# ... it causes a bug in asynchronous function !
# Check celery upgrades before.
# Example (for the future):
# parameters = dict()
# parameters['field1'] = field1
# parameters['field2'] = field2
# Get all the parameters in the URL
cooc_id = request.GET.get ('cooc_id' , None )
saveOnly = request.GET.get ('saveOnly' , None )
......@@ -48,6 +56,7 @@ class Graph(APIView):
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default map List of corpus
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment