[GRAPH] Graph almost done: needs more factorization.

5e84408a · delanoe · f9204fa4 · 5e84408a · 5e84408a · 5e84408a
Commit 5e84408a authored Sep 28, 2016 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 89 additions and 75 deletions

constants.py gargantext/constants.py +1 -1

cooccurrences.py graph/cooccurrences.py +65 -55

graph.py graph/graph.py +11 -16

rest.py graph/rest.py +12 -3

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
 # Graph constraints to compute the graph:
 # Modes: live graph generation, graph asynchronously computed or errors detected
 # Here are the maximum sizes of the corpus and map list required to compute the graph
-graph_constraints = {'corpusMax' : 500
+graph_constraints = {'corpusMax' : 100
                    ,'corpusMin' : 40
                    ,'mapList'   : 50
                    }
--- a/graph/cooccurrences.py
+++ b/graph/cooccurrences.py
 from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  NodeHyperdata, HyperdataKey
-from gargantext.util.db    import session, aliased, bulk_insert, func
+from gargantext.util.db    import session, aliased, func
 from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
 from graph.distances       import clusterByDistances
@@ -19,48 +19,61 @@ def filterMatrix(matrix, mapList_id, groupList_id):
    cooc       = matrix & (mapList * group_list)
    return cooc
-# computeGraph
+@shared_task
-def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance=None, bridgeness=None):
+def computeGraph( corpus_id=None, cooc_id=None    
+                , field1='ngrams'     , field2='ngrams'
-            print("GRAPH#%d ... Computing cooccurrences." % (cooc_id))
+                , start=None          , end=None
-            # Check if already computed cooc
+                , mapList_id=None     , groupList_id=None
-            # (cooc_id, cooc) = count(countCooccurrences)
+                , distance=None       , bridgeness=None
+                , n_min=1, n_max=None , limit=1000
-            print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
+                , isMonopartite=True  , threshold = 3
-            G, partition, ids, weight = clusterByDistances ( cooc_matrix
+                , save_on_db= True    , reset=True
-                                                           , field1="ngrams", field2="ngrams"
+                ):
-                                                           , distance=distance
-                                                           )
+        print("GRAPH# ... Computing cooccurrences.")
+        (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id=cooc_id
+                                    , field1=field1, field2=field2
+                                    , start=start           , end =end
+                                    , mapList_id=mapList_id , groupList_id=groupList_id
+                                    , isMonopartite=True    , threshold = threshold
+                                    , distance=distance     , bridgeness=bridgeness
+                                    , save_on_db = True
+                                    )
+        print("GRAPH#%d ... Cooccurrences computed." % (cooc_id))
-            print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
-            data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
+        print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id,distance))
+        G, partition, ids, weight = clusterByDistances ( cooc_matrix
+                                                       , field1="ngrams", field2="ngrams"
+                                                       , distance=distance
+                                                       )
-            print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
+        print("GRAPH#%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
-            node = session.query(Node).filter(Node.id == cooc_id).first()
+        data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
-            if node.hyperdata.get(distance, None) is None:
+        print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
-                node.hyperdata[distance] = dict()
+        node = session.query(Node).filter(Node.id == cooc_id).first()
-            node.hyperdata[distance][bridgeness] = data
+        if node.hyperdata.get(distance, None) is None:
+            node.hyperdata[distance] = dict()
-            node.save_hyperdata()
-            session.commit()
+        node.hyperdata[distance][bridgeness] = data
+        node.save_hyperdata()
+        session.commit()
-            print("GRAPH#%d ... Returning data as json." % cooc_id)
+        print("GRAPH#%d ... Returning data as json." % cooc_id)
-            return data
+        return data
-@shared_task
+def countCooccurrences( corpus_id=None, cooc_id=None    
-def countCooccurrences( corpus_id=None      , test= False
                      , field1='ngrams'     , field2='ngrams'
                      , start=None          , end=None
                      , mapList_id=None     , groupList_id=None
+                      , distance=None       , bridgeness=None
                      , n_min=1, n_max=None , limit=1000
-                      , coocNode_id=None    , reset=True
                      , isMonopartite=True  , threshold = 3
-                      , distance=None       , bridgeness=None
+                      , save_on_db= True    , reset=True
-                      , save_on_db= True,  # just return the WeightedMatrix,
-                                                 #    (don't write to DB)
                      ):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
@@ -71,15 +84,13 @@ def countCooccurrences( corpus_id=None      , test= False
    mapList_id       :: Int
    groupList_id     :: Int
-    For the moment, start and end are simple, only year is implemented yet
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int
    '''
-    # TODO : add hyperdata here
-    # Parameters to save in hyperdata of the Node Cooc
+    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2
@@ -88,16 +99,16 @@ def countCooccurrences( corpus_id=None      , test= False
    corpus = session.query(Node).filter(Node.id==corpus_id).first()
    # Get node of the Graph
-    if not coocNode_id:
+    if not cooc_id:
-        coocNode_id  = ( session.query( Node.id )
+        cooc_id  = ( session.query( Node.id )
                                .filter( Node.typename  == "COOCCURRENCES"
                                       , Node.name      == "GRAPH EXPLORER"
                                       , Node.parent_id == corpus.id
                                       )
                                .first()
                        )
-        if not coocNode_id:
+        if not cooc_id:
            coocNode = corpus.add_child(
            typename  = "COOCCURRENCES",
            name = "GRAPH (in corpus %s)" % corpus.id
@@ -105,12 +116,12 @@ def countCooccurrences( corpus_id=None      , test= False
            session.add(coocNode)
            session.commit()
-            coocNode_id = coocNode.id
+            cooc_id = coocNode.id
        else :
-            coocNode_id = int(coocNode_id[0])
+            cooc_id = int(cooc_id[0])
    if reset == True :
-        session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
+        session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
        session.commit()
@@ -191,7 +202,7 @@ def countCooccurrences( corpus_id=None      , test= False
    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
-        # TODO : more complexe date format here.
+        # TODO : support a more precise date format here (day is currently the smallest grain).
        date_start = datetime.strptime (str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
@@ -207,7 +218,7 @@ def countCooccurrences( corpus_id=None      , test= False
    if end is not None:
-        # TODO : more complexe date format here.
+        # TODO : support a more precise date format here (day is currently the smallest grain).
        date_end = datetime.strptime (str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
@@ -239,28 +250,27 @@ def countCooccurrences( corpus_id=None      , test= False
    matrix = WeightedMatrix(cooc_query)
-    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % coocNode_id)
+    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)
    parameters['MapList_id']   = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)
+    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
-        cooc.save(coocNode_id)
+        cooc.save(cooc_id)
-        print("GRAPH#%s ... Node Cooccurrence Matrix saved" % coocNode_id)
+        print("GRAPH#%s ... Node Cooccurrence Matrix saved" % cooc_id)
        # Saving the parameters
-        print("GRAPH#%s ... Parameters saved in Node." % coocNode_id)
+        print("GRAPH#%s ... Parameters saved in Node." % cooc_id)
-        coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
+        coocNode = session.query(Node).filter(Node.id==cooc_id).first()
        coocNode.hyperdata[distance] = dict()
        coocNode.hyperdata[distance]["parameters"] = parameters
        session.add(coocNode)
        session.commit()
-        data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
+        #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
-        return data
+        #return data
-    else:
+    return(coocNode.id, cooc)
-        data = cooc2graph(coocNode_id, cooc, distance=distance)
-        return data
--- a/graph/graph.py
+++ b/graph/graph.py
@@ -5,7 +5,7 @@ from gargantext.util.http         import JsonHttpResponse
 from gargantext.models            import Node, Ngram, NodeNgram, NodeNgramNgram, NodeHyperdata
 #from gargantext.util.toolchain.ngram_coocs import compute_coocs
-from graph.cooccurrences  import countCooccurrences, filterMatrix
+from graph.cooccurrences  import computeGraph, filterMatrix
 from graph.distances      import clusterByDistances
 from graph.bridgeness     import filterByBridgeness
@@ -19,12 +19,9 @@ def get_graph( request=None         , corpus=None
            , mapList_id = None     , groupList_id = None
            , cooc_id=None          , type='node_link'
            , start=None            , end=None
-            , threshold=1
+            , distance='conditional', bridgeness=5
-            , distance='conditional'
+            , threshold=1           , isMonopartite=True
-            , isMonopartite=True                # By default, we compute terms/terms graph
+            , saveOnly=True
-            , bridgeness=5
-            , saveOnly=None
-            #, size=1000
        ):
    '''
    Get_graph : main steps:
@@ -54,7 +51,7 @@ def get_graph( request=None         , corpus=None
    # Case of graph has been computed already
    if cooc_id is not None:
-        print("Getting data for matrix %d", int(cooc_id))
+        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()
        # Structure of the Node.hyperdata[distance][bridgeness]
@@ -65,8 +62,6 @@ def get_graph( request=None         , corpus=None
        if node.hyperdata.get(distance, None) is not None:
            graph = node.hyperdata[distance]
-            print(node.hyperdata[distance].keys())
            # Check bridgeness of the graph
            if graph.get(str(bridgeness), None) is not None:
                return graph[str(bridgeness)]
@@ -133,7 +128,7 @@ def get_graph( request=None         , corpus=None
    corpus_size = corpus_size_query.count()
    if saveOnly is not None and saveOnly == "True":
-        scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
+        scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
                                   #, field1="ngrams", field2="ngrams"
                                    , start=start           , end =end
                                    , mapList_id=mapList_id , groupList_id=groupList_id
@@ -144,9 +139,9 @@ def get_graph( request=None         , corpus=None
                                    )
        return {"state" : "saveOnly"}
-    if corpus_size > graph_constraints['corpusMax']:
+    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
-        scheduled(countCooccurrences)( corpus_id=corpus.id, coocNode_id=cooc_id
+        scheduled(computeGraph)( corpus_id=corpus.id, cooc_id=cooc_id
                                   #, field1="ngrams", field2="ngrams"
                                    , start=start           , end =end
                                    , mapList_id=mapList_id , groupList_id=groupList_id
@@ -155,8 +150,8 @@ def get_graph( request=None         , corpus=None
                                    , save_on_db = True
                                   #, limit=size
                                    )
-        # Dict to inform user that corpus maximum is reached then
+        # Dict to inform user that corpus maximum is reached 
-        # graph is computed asynchronously
+        # then graph is computed asynchronously
        return {"state" : "corpusMax", "length" : corpus_size}
    elif corpus_size <= graph_constraints['corpusMin']:
@@ -165,7 +160,7 @@ def get_graph( request=None         , corpus=None
    else:
        # If graph_constraints are satisfied, then compute the graph live (synchronously)
-        data = countCooccurrences( corpus_id=corpus.id, coocNode_id=cooc_id
+        data = computeGraph( corpus_id=corpus.id, cooc_id=cooc_id
                                  #, field1="ngrams", field2="ngrams"
                                   , start=start           , end =end
                                   , mapList_id=mapList_id , groupList_id=groupList_id

--- a/graph/rest.py
+++ b/graph/rest.py
-#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
 from gargantext.util.db      import session
 from gargantext.models.nodes import Node
 from graph.graph             import get_graph
@@ -8,7 +6,7 @@ from gargantext.util.http    import APIView, APIException\
                                  , JsonHttpResponse, requires_auth
 from gargantext.constants    import graph_constraints
-from traceback import format_tb
+from traceback               import format_tb
 class Graph(APIView):
    '''
@@ -29,6 +27,16 @@ class Graph(APIView):
        # Get the node we are working with
        corpus = session.query(Node).filter(Node.id==corpus_id).first()
+        # TODO Parameters to save in hyperdata of the Node Cooc
+        # WARNING: we could factorize the parameters as dict but ...
+        #         ...  it causes a bug in asynchronous function !
+        # Check celery upgrades before.
+        # Example (for the future):
+        #        parameters = dict()
+        #        parameters['field1'] = field1
+        #        parameters['field2'] = field2
        # Get all the parameters in the URL
        cooc_id      = request.GET.get     ('cooc_id'   , None         )
        saveOnly     = request.GET.get     ('saveOnly'  , None         )
@@ -48,6 +56,7 @@ class Graph(APIView):
        type_        = str(request.GET.get ('type'      , 'node_link'  ))
        distance     = str(request.GET.get ('distance'  , 'conditional'))
        # Get default map List of corpus
        if mapList_id == 0 :
            mapList_id = ( session.query ( Node.id )