Commit cc0cecce authored by delanoe's avatar delanoe

[GRAPH] needs factorization.

parent 8c0baf85
...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5 ...@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph: # Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected # Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph # here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 599 graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40 ,'corpusMin' : 40
,'mapList' : 50 ,'mapList' : 50
} }
...@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph ...@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2): def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
''' '''
What is bridgeness ? What is bridgeness ?
Measure to control links (bridges) between communities.
''' '''
# Data are stored in a dict(), (== hashmap by default for Python) # Data are stored in a dict(), (== hashmap by default with Python)
data = dict() data = dict()
if type == "node_link": if type == "node_link":
nodesB_dict = {} nodesB_dict = {}
......
...@@ -3,11 +3,13 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \ ...@@ -3,11 +3,13 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext.util.db import session, aliased, bulk_insert, func from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from sqlalchemy import desc, asc, or_, and_ from sqlalchemy import desc, asc, or_, and_
#import inspect #import inspect
import datetime from datetime import datetime
from celery import shared_task from celery import shared_task
...@@ -18,6 +20,27 @@ def filterMatrix(matrix, mapList_id, groupList_id): ...@@ -18,6 +20,27 @@ def filterMatrix(matrix, mapList_id, groupList_id):
return cooc return cooc
def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None):
    '''
    Compute a graph from a cooccurrence matrix and persist it on the cooc Node.

    Pipeline: cluster the matrix with the given distance, filter the resulting
    links by bridgeness, then store the json-serializable graph under
    hyperdata[distance]["data"] of the Node identified by cooc_id.

    cooc_id     :: Int    -- id of the Node holding the cooccurrence matrix
    cooc_matrix :: WeightedMatrix -- cooccurrences to cluster
    field1, field2 :: Str -- field names forwarded to filterByBridgeness
                          -- NOTE(review): they are NOT forwarded to
                          -- clusterByDistances below, which hard-codes
                          -- "ngrams" — looks like an oversight; confirm.
    distance    :: Str    -- clustering distance (clusterByDistances raises
                          -- ValueError if it is not an authorized one)
    bridgeness  :: Int    -- max links kept between communities

    Returns the graph data dict (node_link format) that was saved.
    '''
    print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
    # Cluster the matrix into (Graph, Partition, {ids}, {weight}).
    G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                   , field1="ngrams", field2="ngrams"
                                                   , distance=distance
                                                   )
    print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
    # Keep only a bounded number of inter-community links ("node_link" output).
    data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)

    print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
    # Cache the computed graph on the cooc Node so later requests
    # (see get_graph) can read hyperdata[distance]["data"] instead of recomputing.
    node = session.query(Node).filter(Node.id == cooc_id).first()
    node.hyperdata[distance] = dict()
    node.hyperdata[distance]["data"] = data
    node.save_hyperdata()
    session.commit()
    return data
@shared_task @shared_task
def countCooccurrences( corpus_id=None , test= False def countCooccurrences( corpus_id=None , test= False
, field1='ngrams' , field2='ngrams' , field1='ngrams' , field2='ngrams'
...@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False
, n_min=1, n_max=None , limit=1000 , n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True , coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3 , isMonopartite=True , threshold = 3
, save_on_db= False, # just return the WeightedMatrix, , distance=None , bridgeness=None
, save_on_db= True, # just return the WeightedMatrix,
# (don't write to DB) # (don't write to DB)
): ):
''' '''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to For the moment list of parameters are not supported because, lists need to
be merged before. be merged before.
corpus :: Corpus corpus :: Corpus
...@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False
if start is not None: if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S") #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complexe date format here. # TODO : more complexe date format here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d") date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S") date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata) Start=aliased(NodeHyperdata)
...@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None: if end is not None:
# TODO : more complexe date format here. # TODO : more complexe date format here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d") date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S") date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata) End=aliased(NodeHyperdata)
...@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False ...@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score')) #cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query) matrix = WeightedMatrix(cooc_query)
print("Node #%d Filtering the matrix with Map and Group Lists." % coocNode_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id) cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id) parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(mapList_id) parameters['GroupList_id'] = str(groupList_id)
if save_on_db: if save_on_db:
# Saving cooc Matrix
cooc.save(coocNode_id) cooc.save(coocNode_id)
print("Node Cooccurrence Matrix saved")
# Saving the parameters # Saving the parameters
print("Saving parameters in Node %d" % coocNode_id)
coocNode = session.query(Node).filter(Node.id==coocNode_id).first() coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata = parameters coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode) session.add(coocNode)
session.commit() session.commit()
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
print(data)
# Log message # Log message
print("Cooccurrence Matrix saved")
return cooc else:
data = cooc2graph(coocNode_id, cooc, distance=distance)
return data
...@@ -16,16 +16,16 @@ import networkx as nx ...@@ -16,16 +16,16 @@ import networkx as nx
def clusterByDistances( cooc_matrix def clusterByDistances( cooc_matrix
, field1=None, field2=None , field1=None, field2=None
, distance='conditional'): , distance=None):
''' '''
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight}) clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
''' '''
# implicit global session # implicit global session
authorized = ['conditional', 'distributional', 'cosine'] authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized: if distance not in authorized:
distance = 'conditional' raise ValueError("Distance must be in %s" % str(authorized))
matrix = defaultdict(lambda : defaultdict(float)) matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int)) ids = defaultdict(lambda : defaultdict(int))
......
...@@ -51,7 +51,6 @@ def get_graph( request=None , corpus=None ...@@ -51,7 +51,6 @@ def get_graph( request=None , corpus=None
''' '''
before_cooc = datetime.now() before_cooc = datetime.now()
...@@ -110,7 +109,6 @@ def get_graph( request=None , corpus=None ...@@ -110,7 +109,6 @@ def get_graph( request=None , corpus=None
) )
# Finally test if the size of the corpora is big enough # Finally test if the size of the corpora is big enough
# -------------------------------- # --------------------------------
corpus_size = corpus_size_query.count() corpus_size = corpus_size_query.count()
...@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None ...@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
...@@ -133,10 +132,11 @@ def get_graph( request=None , corpus=None ...@@ -133,10 +132,11 @@ def get_graph( request=None , corpus=None
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
# Dic to inform user that corpus maximum is reached then # Dict to inform user that corpus maximum is reached then
# graph is computed asynchronously # graph is computed asynchronously
return {"state" : "corpusMax", "length" : corpus_size} return {"state" : "corpusMax", "length" : corpus_size}
...@@ -146,44 +146,32 @@ def get_graph( request=None , corpus=None ...@@ -146,44 +146,32 @@ def get_graph( request=None , corpus=None
else: else:
# If graph_constraints are ok then compute the graph in live # If graph_constraints are ok then compute the graph in live
cooc_matrix = countCooccurrences( corpus_id=corpus.id data = countCooccurrences( corpus_id=corpus.id
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, distance=distance , bridgeness=bridgeness
, save_on_db = True , save_on_db = True
#, limit=size #, limit=size
) )
else:
print("Getting data for matrix %d", int(cooc_id))
matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# fyi
after_cooc = datetime.now()
print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist) # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if len(cooc_matrix.items) == 0:
if len(data) == 0:
print("GET_GRAPH: 0 coocs in matrix") print("GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data data = {'nodes':[], 'links':[]} # empty data
# normal case
else:
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
, distance=distance
)
after_cluster = datetime.now()
print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
after_filter = datetime.now() else:
print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds()) print("Getting data for matrix %d", int(cooc_id))
node = session.query(Node).filter(Node.id == cooc_id).first()
data = node.hyperdata[distance]["data"]
#print(data)
#matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
#cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# normal case
return data return data
...@@ -123,18 +123,17 @@ class Graph(APIView): ...@@ -123,18 +123,17 @@ class Graph(APIView):
groupList_id = groupList_id[0] groupList_id = groupList_id[0]
if groupList_id == None : if groupList_id == None :
# todo add as an error msg ?
raise ValueError("GROUPLIST node needed for cooccurrences") raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options # Declare accepted fields
accepted_field1 = ['ngrams', 'journal', 'source', 'authors'] accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ] accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ] options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]
try: try:
# Test params # Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2): if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None : if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id data = get_graph( corpus=corpus, cooc_id = cooc_id
......
...@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id): ...@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id):
Graph explorer, also known as TinaWebJS, using SigmaJS. Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from title or abstract or journal name. Nodes are ngrams (from title or abstract or journal name.
Links represent proximity measure. Links represent proximity measure.
Data are received in RESTful mode (see rest.py).
''' '''
# we pass our corpus # we pass our corpus
...@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id): ...@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id):
@requires_auth @requires_auth
def myGraphs(request, project_id, corpus_id): def myGraphs(request, project_id, corpus_id):
''' '''
List all of my Graphs List all of my Graphs.
Each Graph has one Node of Cooccurrences.
Each Graph is saved in the hyperdata of its Node.
''' '''
user = cache.User[request.user.id] user = cache.User[request.user.id]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment