Commit cc0cecce authored by delanoe's avatar delanoe

[GRAPH] needs factorization.

parent 8c0baf85
...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5 ...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph: # Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected # Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph # here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 599 graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40 ,'corpusMin' : 40
,'mapList' : 50 ,'mapList' : 50
} }
...@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph ...@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2): def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
''' '''
What is bridgeness ? What is bridgeness ?
Measure to control links (bridges) between communities.
''' '''
# Data are stored in a dict(), (== hashmap by default for Python) # Data are stored in a dict(), (== hashmap by default with Python)
data = dict() data = dict()
if type == "node_link": if type == "node_link":
nodesB_dict = {} nodesB_dict = {}
......
...@@ -3,11 +3,13 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \ ...@@ -3,11 +3,13 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext.util.db import session, aliased, bulk_insert, func from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from sqlalchemy import desc, asc, or_, and_ from sqlalchemy import desc, asc, or_, and_
#import inspect #import inspect
import datetime from datetime import datetime
from celery import shared_task from celery import shared_task
...@@ -18,6 +20,27 @@ def filterMatrix(matrix, mapList_id, groupList_id): ...@@ -18,6 +20,27 @@ def filterMatrix(matrix, mapList_id, groupList_id):
return cooc return cooc
def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None):
    '''
    Compute a graph from a cooccurrence matrix and persist it on the cooc Node.

    Pipeline: cluster the matrix with the given distance, filter the resulting
    links by bridgeness, then store the json-serializable graph under
    hyperdata[distance]["data"] of the Node identified by cooc_id.

    cooc_id     :: Int    -- id of the Node holding the cooccurrence matrix
    cooc_matrix :: WeightedMatrix -- cooccurrences to cluster
    field1, field2 :: Str -- field names forwarded to filterByBridgeness
                          -- NOTE(review): they are NOT forwarded to
                          -- clusterByDistances below, which hard-codes
                          -- "ngrams" — looks like an oversight; confirm.
    distance    :: Str    -- clustering distance (clusterByDistances raises
                          -- ValueError if it is not an authorized one)
    bridgeness  :: Int    -- max links kept between communities

    Returns the graph data dict (node_link format) that was saved.
    '''
    print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
    # Cluster the matrix into (Graph, Partition, {ids}, {weight}).
    G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                   , field1="ngrams", field2="ngrams"
                                                   , distance=distance
                                                   )
    print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
    # Keep only a bounded number of inter-community links ("node_link" output).
    data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)

    print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
    # Cache the computed graph on the cooc Node so later requests
    # (see get_graph) can read hyperdata[distance]["data"] instead of recomputing.
    node = session.query(Node).filter(Node.id == cooc_id).first()
    node.hyperdata[distance] = dict()
    node.hyperdata[distance]["data"] = data
    node.save_hyperdata()
    session.commit()
    return data
@shared_task @shared_task
def countCooccurrences( corpus_id=None , test= False def countCooccurrences( corpus_id=None , test= False
, field1='ngrams' , field2='ngrams' , field1='ngrams' , field2='ngrams'
...@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False
, n_min=1, n_max=None , limit=1000 , n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True , coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3 , isMonopartite=True , threshold = 3
, save_on_db= False, # just return the WeightedMatrix, , distance=None , bridgeness=None
, save_on_db= True, # just return the WeightedMatrix,
# (don't write to DB) # (don't write to DB)
): ):
''' '''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to For the moment list of parameters are not supported because, lists need to
be merged before. be merged before.
corpus :: Corpus corpus :: Corpus
...@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False
if start is not None: if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S") #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complexe date format here. # TODO : more complexe date format here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d") date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S") date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata) Start=aliased(NodeHyperdata)
...@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None: if end is not None:
# TODO : more complexe date format here. # TODO : more complexe date format here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d") date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S") date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata) End=aliased(NodeHyperdata)
...@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score')) #cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query) matrix = WeightedMatrix(cooc_query)
print("Node #%d Filtering the matrix with Map and Group Lists." % coocNode_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id) cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id) parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(mapList_id) parameters['GroupList_id'] = str(groupList_id)
if save_on_db: if save_on_db:
# Saving cooc Matrix
cooc.save(coocNode_id) cooc.save(coocNode_id)
print("Node Cooccurrence Matrix saved")
# Saving the parameters # Saving the parameters
print("Saving parameters in Node %d" % coocNode_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first() coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata = parameters coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode) session.add(coocNode)
session.commit() session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
print(data)
# Log message # Log message
print("Cooccurrence Matrix saved")
return cooc else:
data = cooc2graph(coocNode_id, cooc, distance=distance)
return data
...@@ -16,16 +16,16 @@ import networkx as nx ...@@ -16,16 +16,16 @@ import networkx as nx
def clusterByDistances( cooc_matrix def clusterByDistances( cooc_matrix
, field1=None, field2=None , field1=None, field2=None
, distance='conditional'): , distance=None):
''' '''
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight}) clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
''' '''
# implicit global session # implicit global session
authorized = ['conditional', 'distributional', 'cosine'] authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized: if distance not in authorized:
distance = 'conditional' raise ValueError("Distance must be in %s" % str(authorized))
matrix = defaultdict(lambda : defaultdict(float)) matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int)) ids = defaultdict(lambda : defaultdict(int))
......
...@@ -51,7 +51,6 @@ def get_graph( request=None , corpus=None ...@@ -51,7 +51,6 @@ def get_graph( request=None , corpus=None
''' '''
before_cooc = datetime.now() before_cooc = datetime.now()
...@@ -110,7 +109,6 @@ def get_graph( request=None , corpus=None ...@@ -110,7 +109,6 @@ def get_graph( request=None , corpus=None
) )
# Finally test if the size of the corpora is big enough # Finally test if the size of the corpora is big enough
# -------------------------------- # --------------------------------
corpus_size = corpus_size_query.count() corpus_size = corpus_size_query.count()
...@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None ...@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
...@@ -133,10 +132,11 @@ def get_graph( request=None , corpus=None ...@@ -133,10 +132,11 @@ def get_graph( request=None , corpus=None
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
# Dic to inform user that corpus maximum is reached then # Dict to inform user that corpus maximum is reached then
# graph is computed asynchronously # graph is computed asynchronously
return {"state" : "corpusMax", "length" : corpus_size} return {"state" : "corpusMax", "length" : corpus_size}
...@@ -146,44 +146,32 @@ def get_graph( request=None , corpus=None ...@@ -146,44 +146,32 @@ def get_graph( request=None , corpus=None
else: else:
# If graph_constraints are ok then compute the graph in live # If graph_constraints are ok then compute the graph in live
cooc_matrix = countCooccurrences( corpus_id=corpus.id data = countCooccurrences( corpus_id=corpus.id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
else:
print("Getting data for matrix %d", int(cooc_id))
matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# fyi
after_cooc = datetime.now()
print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist) # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(cooc_matrix.items) == 0:
if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix") print("GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data data = {'nodes':[], 'links':[]} # empty data
# normal case
else:
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
, distance=distance
)
after_cluster = datetime.now()
print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
after_filter = datetime.now() else:
print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds()) print("Getting data for matrix %d", int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
data = node.hyperdata[distance]["data"]
#print(data)
#matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
#cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# normal case
return data return data
...@@ -123,18 +123,17 @@ class Graph(APIView): ...@@ -123,18 +123,17 @@ class Graph(APIView):
groupList_id = groupList_id[0] groupList_id = groupList_id[0]
if groupList_id == None : if groupList_id == None :
# todo add as an error msg ?
raise ValueError("GROUPLIST node needed for cooccurrences") raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options # Declare accepted fields
accepted_field1 = ['ngrams', 'journal', 'source', 'authors'] accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ] accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ] options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]
try: try:
# Test params # Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2): if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None : if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id data = get_graph( corpus=corpus, cooc_id = cooc_id
......
...@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id): ...@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id):
Graph explorer, also known as TinaWebJS, using SigmaJS. Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from title or abstract or journal name. Nodes are ngrams (from title or abstract or journal name.
Links represent proximity measure. Links represent proximity measure.
Data are received in RESTful mode (see rest.py).
''' '''
# we pass our corpus # we pass our corpus
...@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id): ...@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id):
@requires_auth @requires_auth
def myGraphs(request, project_id, corpus_id): def myGraphs(request, project_id, corpus_id):
''' '''
List all of my Graphs List all of my Graphs.
Each Graph has one Node of Cooccurrences.
Each Graph is saved in the hyperdata of its Node.
''' '''
user = cache.User[request.user.id] user = cache.User[request.user.id]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment