Commit bb1e553b authored by delanoe's avatar delanoe

[GRAPH] Now graph as json is saved according to its distance as key (needs to...

[GRAPH] The graph JSON is now saved in the node's hyperdata under its distance as key (the bridgeness parameter still needs to be added to the key).
parent 24f4d6a3
......@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 100
graph_constraints = {'corpusMax' : 500
,'corpusMin' : 40
,'mapList' : 50
}
......@@ -19,9 +19,13 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc = matrix & (mapList * group_list)
return cooc
# computeGraph
def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None):
print("GRAPH#%d ... Computing cooccurrences." % (cooc_id))
# Check if already computed cooc
# (cooc_id, cooc) = count(countCooccurrences)
print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
......@@ -36,8 +40,11 @@ def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance
node.hyperdata[distance] = dict()
node.hyperdata[distance]["data"] = data
node.save_hyperdata()
session.commit()
print("GRAPH#%d ... Returning data as json." % cooc_id)
return data
......@@ -70,9 +77,6 @@ def countCooccurrences( corpus_id=None , test= False
'''
# TODO : add hyperdata here
# Security test
field1,field2 = str(field1), str(field2)
# Parameters to save in hyperdata of the Node Cooc
parameters = dict()
parameters['field1'] = field1
......@@ -81,10 +85,10 @@ def countCooccurrences( corpus_id=None , test= False
# Get corpus as Python object
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get node
# Get node of the Graph
if not coocNode_id:
coocNode_id0 = ( session.query( Node.id )
coocNode_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id
......@@ -101,7 +105,7 @@ def countCooccurrences( corpus_id=None , test= False
session.commit()
coocNode_id = coocNode.id
else :
coocNode_id = coocNode_id[0]
coocNode_id = int(coocNode_id[0])
if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
......@@ -233,18 +237,19 @@ def countCooccurrences( corpus_id=None , test= False
matrix = WeightedMatrix(cooc_query)
print("Node #%d Filtering the matrix with Map and Group Lists." % coocNode_id)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % coocNode_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
if save_on_db:
# Saving the cooccurrences
cooc.save(coocNode_id)
print("Node Cooccurrence Matrix saved")
print("GRAPH#%s ... Node Cooccurrence Matrix saved" % coocNode_id)
# Saving the parameters
print("Saving parameters in Node %d" % coocNode_id)
print("GRAPH#%s ... Parameters saved in Node." % coocNode_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
......@@ -252,8 +257,7 @@ def countCooccurrences( corpus_id=None , test= False
session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
print(data)
# Log message
return data
else:
data = cooc2graph(coocNode_id, cooc, distance=distance)
......
......@@ -33,7 +33,7 @@ def get_graph( request=None , corpus=None
get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
where type Length = Int
get_graph first checks the parameters and return either graph data or a dic with
get_graph first checks the parameters and returns either the graph data or a dict with
a "state" key naming the error and a "length" key giving the size of the offending parameter
(maybe we could add a String in that step to factor and give here the error message)
......@@ -51,20 +51,30 @@ def get_graph( request=None , corpus=None
'''
before_cooc = datetime.now()
# Case of graph has been computed already
if cooc_id is not None:
print("Getting data for matrix %d", int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
# Check if
if node.hyperdata.get(distance, None) is not None:
data = node.hyperdata[distance]["data"]
return data
# case of Cooccurrences have not been computed already
if cooc_id == None:
# case of mapList not big enough
# Case of graph has not been computed already
# First, check the parameters
# Case of mapList not big enough
# ==============================
# if we do not have any mapList_id already
if mapList_id is None:
mapList_id = session.query(Node.id).filter(Node.typename == "MAPLIST").first()[0]
mapList_size_query = session.query(NodeNgram).filter(NodeNgram.node_id == mapList_id)
mapList_size = mapList_size_query.count()
mapList_size = session.query(NodeNgram).filter(NodeNgram.node_id == mapList_id).count()
if mapList_size < graph_constraints['mapList']:
# Do not compute the graph if mapList is not big enough
return {'state': "mapListError", "length" : mapList_size}
......@@ -77,8 +87,8 @@ def get_graph( request=None , corpus=None
.filter(Node.parent_id == corpus.id)
)
# filter by date if any start date
# --------------------------------
# Filter corpus by date if any start date
# ---------------------------------------
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
date_start = datetime.strptime (str(start), "%Y-%m-%d")
......@@ -93,8 +103,8 @@ def get_graph( request=None , corpus=None
)
# filter by date if any end date
# --------------------------------
# Filter corpus by date if any end date
# -------------------------------------
if end is not None:
date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
......@@ -114,7 +124,7 @@ def get_graph( request=None , corpus=None
corpus_size = corpus_size_query.count()
if saveOnly is not None and saveOnly == "True":
scheduled(countCooccurrences)( corpus_id=corpus.id
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -127,7 +137,7 @@ def get_graph( request=None , corpus=None
if corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery
scheduled(countCooccurrences)( corpus_id=corpus.id
scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -146,7 +156,7 @@ def get_graph( request=None , corpus=None
else:
# If graph_constraints are ok then compute the graph in live
data = countCooccurrences( corpus_id=corpus.id
data = countCooccurrences( corpus_id=corpus.id, coocNode_id=cooc_id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
......@@ -156,22 +166,10 @@ def get_graph( request=None , corpus=None
#, limit=size
)
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
else:
print("Getting data for matrix %d", int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
data = node.hyperdata[distance]["data"]
#print(data)
#matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
#cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# normal case
return data
......@@ -3,59 +3,13 @@
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graph.graph import get_graph
from graph.utils import compress_graph, format_html
from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
from gargantext.constants import graph_constraints
from traceback import format_tb
def _rename_key(mapping, old, new):
    """Move ``mapping[old]`` to ``mapping[new]``; do nothing if ``old`` is absent."""
    if old in mapping:
        mapping[new] = mapping.pop(old)


def compress_graph(graphdata):
    """
    Shrink graph data in place before sending it over the net.

    graph data is usually a dict with 2 slots:
      "nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]

    The size reduction comes from:
      - keeping only 3 decimals for each link's weight (the weight becomes
        a string, as produced by ``format(w, '.3f')``)
      - using shorter names for node properties (eg: s/clust_default/cl/)
      - dropping the node "type" when it is the default one ("terms")

    result format:
      "nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": "0.043"},...]

    The input dict is modified in place and also returned. The function is
    idempotent: data that is already compressed is left untouched, so calling
    it again on the same dict object does not raise.
    """
    for link in graphdata['links']:
        weight = link['w']
        # An already-compressed weight is a string; formatting a str with
        # '.3f' would raise ValueError, so leave it as is.
        if not isinstance(weight, str):
            link['w'] = format(weight, '.3f')  # keep only 3 decimals

    for node in graphdata['nodes']:
        _rename_key(node, 'label', 'lb')
        _rename_key(node, 'attributes', 'at')
        if 'at' in node:
            _rename_key(node['at'], 'clust_default', 'cl')
        _rename_key(node, 'size', 's')

        if 'type' in node:
            node_type = node.pop('type')
            # "terms" is the default type for our format: so we don't need it
            if node_type != "terms":
                node['t'] = node_type

    return graphdata
def format_html(link):
    """
    Build an html link adapted to our json message format.

    The link is used both as the ``href`` target and as the visible text.
    It is HTML-escaped first, so that characters such as ``'``, ``<`` or ``&``
    cannot break out of the single-quoted attribute or inject markup.
    Links without HTML-special characters are rendered unchanged.
    """
    from html import escape

    # str() keeps the original "%s" behaviour for non-string arguments.
    safe_link = escape(str(link), quote=True)
    return "<a class='msglink' href='%s'>%s</a>" % (safe_link, safe_link)
# TODO check authentication
class Graph(APIView):
'''
REST part for graphs.
......@@ -94,8 +48,7 @@ class Graph(APIView):
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default value if no map list
# Get default map List of corpus
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
......@@ -107,7 +60,6 @@ class Graph(APIView):
mapList_id = mapList_id[0]
if mapList_id == None :
# todo add as an error msg ?
raise ValueError("MAPLIST node needed for cooccurrences")
......@@ -135,23 +87,13 @@ class Graph(APIView):
try:
# Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id
#, field1=field1 , field2=field2
, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold , distance=distance
, saveOnly=saveOnly
)
else:
data = get_graph( corpus = corpus, cooc_id = cooc_id
#, field1=field1, field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, threshold = threshold
, distance = distance
, bridgeness = bridgeness
, saveOnly=saveOnly
)
# data :: Either (Dic Nodes Links) (Dic State Length)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment