Commit 24f4d6a3 authored by delanoe's avatar delanoe

[GRAPH] need factorization.

parent b7354d82
......@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 599
graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40
,'mapList' : 50
}
......@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
'''
What is bridgeness?
Bridgeness is a measure used to control the number of links (bridges) kept between communities.
'''
# Data are stored in a dict(), (== hashmap by default for Python)
# Data are stored in a dict(), (== hashmap by default with Python)
data = dict()
if type == "node_link":
nodesB_dict = {}
......
......@@ -3,21 +3,44 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from sqlalchemy import desc, asc, or_, and_
#import inspect
import datetime
from datetime import datetime
from celery import shared_task
def filterMatrix(matrix, mapList_id, groupList_id):
    '''
    Restrict a cooccurrence matrix to the ngrams of a map list,
    after translating ngrams through a group list.

    Parameters:
        matrix       : WeightedMatrix of raw cooccurrences
        mapList_id   : id of the MAPLIST node (whitelist of ngrams)
        groupList_id : id of the GROUPLIST node (ngram -> group translations)

    Returns the filtered cooccurrence matrix.
    '''
    mapList    = UnweightedList(mapList_id)
    group_list = Translations(groupList_id)
    # Intersect the matrix with the map list projected through the group
    # translations (operator overloads provided by gargantext.util.lists).
    cooc = matrix & (mapList * group_list)
    return cooc
def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None):
    '''
    Cluster a cooccurrence matrix, filter its links by bridgeness and
    save the resulting graph as JSON in the node's hyperdata.

    Parameters:
        cooc_id     : id of the Node storing the cooccurrence matrix
        cooc_matrix : WeightedMatrix of (filtered) cooccurrences
        field1, field2 : field names forwarded to filterByBridgeness
        distance    : clustering distance key (e.g. 'conditional'); also
                      used as the hyperdata key under which the graph is saved
        bridgeness  : max number of inter-community links to keep

    Returns the graph data (node_link dict produced by filterByBridgeness).
    '''
    print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id, distance))
    # NOTE(review): field1/field2 are hard-coded to "ngrams" for clustering
    # while the function parameters are only forwarded to filterByBridgeness
    # below — confirm this asymmetry is intended.
    G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                   , field1="ngrams", field2="ngrams"
                                                   , distance=distance
                                                   )

    # Use %s here: bridgeness defaults to None and "%d" would raise TypeError.
    print("GRAPH#%d ... Filtering by bridgeness %s." % (cooc_id, bridgeness))
    data = filterByBridgeness(G, partition, ids, weight, bridgeness, "node_link", field1, field2)

    print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()
    # Store the graph under hyperdata[distance]["data"]; get_graph reads it
    # back from the same location.
    node.hyperdata[distance] = dict()
    node.hyperdata[distance]["data"] = data
    node.save_hyperdata()
    session.commit()

    return data
@shared_task
def countCooccurrences( corpus_id=None , test= False
, field1='ngrams' , field2='ngrams'
......@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3
, save_on_db= False, # just return the WeightedMatrix,
, distance=None , bridgeness=None
, save_on_db= True, # just return the WeightedMatrix,
# (don't write to DB)
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to
For the moment list of parameters are not supported because, lists need to
be merged before.
corpus :: Corpus
......@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO: handle more complex date formats here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d")
date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata)
......@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None:
# TODO: handle more complex date formats here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d")
date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata)
......@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query)
print("Node #%d Filtering the matrix with Map and Group Lists." % coocNode_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(mapList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
if save_on_db:
# Saving cooc Matrix
cooc.save(coocNode_id)
print("Node Cooccurrence Matrix saved")
# Saving the parameters
print("Saving parameters in Node %d" % coocNode_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata = parameters
coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode)
session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
print(data)
# Log message
print("Cooccurrence Matrix saved")
return cooc
else:
data = cooc2graph(coocNode_id, cooc, distance=distance)
return data
......@@ -16,16 +16,16 @@ import networkx as nx
def clusterByDistances( cooc_matrix
, field1=None, field2=None
, distance='conditional'):
, distance=None):
'''
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized:
distance = 'conditional'
raise ValueError("Distance must be in %s" % str(authorized))
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
......
......@@ -51,9 +51,8 @@ def get_graph( request=None , corpus=None
'''
before_cooc = datetime.now()
# case of Cooccurrences have not been computed already
if cooc_id == None:
......@@ -108,8 +107,7 @@ def get_graph( request=None , corpus=None
.filter( End.key == 'publication_date')
.filter( End.value_utc <= date_end_utc )
)
# Finally test if the size of the corpora is big enough
# --------------------------------
......@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True
#, limit=size
)
......@@ -133,57 +132,46 @@ def get_graph( request=None , corpus=None
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True
#, limit=size
)
# Dic to inform user that corpus maximum is reached then
# Dict to inform user that corpus maximum is reached then
# graph is computed asynchronously
return {"state" : "corpusMax", "length" : corpus_size}
elif corpus_size <= graph_constraints['corpusMin']:
# Do not compute the graph if corpus is not big enough
return {"state" : "corpusMin", "length" : corpus_size}
else:
# If graph_constraints are ok then compute the graph in live
cooc_matrix = countCooccurrences( corpus_id=corpus.id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, save_on_db = True
#, limit=size
)
else:
print("Getting data for matrix %d", int(cooc_id))
matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
data = countCooccurrences( corpus_id=corpus.id
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True
#, limit=size
)
# fyi
after_cooc = datetime.now()
print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(cooc_matrix.items) == 0:
print("GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
# normal case
else:
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
, distance=distance
)
after_cluster = datetime.now()
print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
after_filter = datetime.now()
print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds())
print("Getting data for matrix %d", int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
data = node.hyperdata[distance]["data"]
#print(data)
#matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
#cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# normal case
return data
......@@ -123,18 +123,17 @@ class Graph(APIView):
groupList_id = groupList_id[0]
if groupList_id == None :
# todo add as an error msg ?
raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options
# Declare accepted fields
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]
try:
# Test params
# Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id
......
......@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id):
Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from title or abstract or journal name).
Links represent proximity measure.
Data are received in RESTful mode (see rest.py).
'''
# we pass our corpus
......@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id):
@requires_auth
def myGraphs(request, project_id, corpus_id):
'''
List all of my Graphs
List all of my Graphs.
Each Graph has one Node of Cooccurrences.
Each Graph is saved in the hyperdata of its Node.
'''
user = cache.User[request.user.id]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment