[FACTO] split graph function into 4 main functions.

b1ac2efb · delanoe · a40f95bb · b1ac2efb · b1ac2efb · b1ac2efb
Commit b1ac2efb authored Mar 30, 2016 by delanoe
7 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -114,9 +114,9 @@ from gargantext.util.taggers import *

 LANGUAGES = {
    'en': {
-        'tagger': TurboTagger,
-        # 'tagger': EnglishMeltTagger,
-        # 'tagger': NltkTagger,
+        #'tagger': TurboTagger,
+        'tagger': EnglishMeltTagger,
+        #'tagger': NltkTagger,
    },
    'fr': {
        'tagger': FrenchMeltTagger,

--- a/graphExplorer/README.md
+++ b/graphExplorer/README.md
+Module Graph Explorer: from text to graph.
+
+Maintainer: if you find a bug, please report it to team@gargantext.org
--- a/graphExplorer/functions.py
+++ b/graphExplorer/functions.py
-# Gargantext lib
-from gargantext.util.db           import session
-from gargantext.util.http         import JsonHttpResponse
-from gargantext.models            import Node, Ngram, NodeNgram, NodeNgramNgram
+# Article coming soon

-#from gargantext.util.toolchain.ngram_coocs import compute_coocs
-from graphExplorer.distances      import do_distance
-from graphExplorer.cooccurrences  import do_cooc
+from gargantext.util.db       import session
+from gargantext.models.ngrams import Ngram
+from collections              import defaultdict

-# Prelude lib
-from copy                         import copy, deepcopy
-from collections                  import defaultdict
-from sqlalchemy.orm               import aliased
-
-# Math/Graph lib
-import math
-import pandas                     as pd
-import numpy                      as np
-
-import networkx                   as nx
 from networkx.readwrite           import json_graph

-
-def get_cooc( request=None, corpus=None
-            , field1='ngrams', field2='ngrams'
-            , cooc_id=None   , type='node_link'
-            , start=None     , end=None
-            , threshold=1
-            , distance='conditional'
-            , isMonopartite=True                # By default, we compute terms/terms graph
-            , size=1000
-            , bridgeness=5
-            , mapList_id = None , groupList_id = None
-        ):
-    '''
-    get_ccoc : to compute the graph.
-    '''
-
-
-    if mapList_id == None :
-        mapList_id  = ( session.query ( Node.id )
-                                .filter( Node.typename  == "MAPLIST"
-                                       , Node.parent_id == corpus.id
-                                       )
-                                .first()
-                       )
-        if mapList_id == None :
-            raise ValueError("MAPLIST node needed for cooccurrences")
-
-
-    if groupList_id   == None :
-        groupList_id  = ( session.query ( Node.id )
-                                 .filter( Node.typename  == "GROUPLIST"
-                                        , Node.parent_id == corpus.id
-                                        )
-                                 .first()
-                        )
-
-        if groupList_id == None :
-            raise ValueError("GROUPLIST node needed for cooccurrences")
-
-
-    if corpus is None:
-        corpus = session.query(Node).filter(Node.id==corpus_id).first()
-
-    cooc_id = do_cooc( corpus=corpus
-                    #, field1="ngrams", field2="ngrams"
-                     , mapList_id=int(mapList_id[0]), groupList_id=int(groupList_id[0])
-                    #, isMonopartite=True
-                     , start=start    , end =end
-                     , threshold      = threshold #, limit=size
-                     )
-    
-    G, partition, ids, weight = do_distance ( cooc_id
-                                            , field1="ngrams", field2="ngrams"
-                                            , isMonopartite=True
-                                            , distance=distance
-                                            )
+def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
    # Data are stored in a dict(), (== hashmap by default for Python)
    data = dict()
-    
    if type == "node_link":
        nodesB_dict = {}
        for node_id in G.nodes():
            #node,type(labels[node])
-            G.node[node_id]['pk'] = ids[node_id][1]
+            G.node[node_id]['pk']           = ids[node_id][1]
            nodesB_dict [ ids[node_id][1] ] = True
            # TODO the query below is not optimized (do it do_distance).
            the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
            the_label = ", ".join(the_label)
-            G.node[node_id]['label']   = the_label
+            G.node[node_id]['label']        = the_label
            
-            G.node[node_id]['size']    = weight[node_id]
-            G.node[node_id]['type']    = ids[node_id][0].replace("ngrams","terms")
-            G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
+            G.node[node_id]['size']         = weight[node_id]
+            G.node[node_id]['type']         = ids[node_id][0].replace("ngrams","terms")
+            G.node[node_id]['attributes']   = { "clust_default": partition[node_id]} # new format
            # G.add_edge(node, "cluster " + str(partition[node]), weight=3)

-        
-
        links = []
        i=1
-        

        if bridgeness > 0:
            com_link = defaultdict(lambda: defaultdict(list))
@@ -107,7 +34,6 @@ def get_cooc( request=None, corpus=None
            
            for k, v in partition.items():
                com_ids[v].append(k)
-        

        for e in G.edges_iter():
            s = e[0]
@@ -180,5 +106,3 @@ def get_cooc( request=None, corpus=None
        return(partition)

    return(data)
-
-
--- a/graphExplorer/cooccurrences.py
+++ b/graphExplorer/cooccurrences.py
@@ -9,13 +9,13 @@ from sqlalchemy            import desc, asc, or_, and_
 #import inspect
 import datetime

-def do_cooc( corpus=None
-           , field1='ngrams'     , field2='ngrams'
-           , start=None          , end=None
-           , mapList_id=None     , groupList_id=None
-           , n_min=1, n_max=None , limit=1000
-           , coocNode_id=None    , reset=True
-           , isMonopartite=True  , threshold = 3):
+def countCooccurrences( corpus=None
+                      , field1='ngrams'     , field2='ngrams'
+                      , start=None          , end=None
+                      , mapList_id=None     , groupList_id=None
+                      , n_min=1, n_max=None , limit=1000
+                      , coocNode_id=None    , reset=True
+                      , isMonopartite=True  , threshold = 3):
    '''
    Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
    For the moment list of paramters are not supported because, lists need to

--- a/graphExplorer/distances.py
+++ b/graphExplorer/distances.py
@@ -14,9 +14,9 @@ import numpy    as np
 import pandas   as pd
 import networkx as nx

-def do_distance( cooc_id
+def clusterByDistances( cooc_id
               , field1=None, field2=None
-               , isMonopartite=True, distance='conditional'):
+               , distance='conditional'):
    '''
    do_distance :: Int -> (Graph, Partition, {ids}, {weight})
    '''

--- a/graphExplorer/graph.py
+++ b/graphExplorer/graph.py
+# Gargantext lib
+from gargantext.util.db           import session
+from gargantext.util.http         import JsonHttpResponse
+from gargantext.models            import Node, Ngram, NodeNgram, NodeNgramNgram
+
+#from gargantext.util.toolchain.ngram_coocs import compute_coocs
+from graphExplorer.cooccurrences  import countCooccurrences
+from graphExplorer.distances      import clusterByDistances
+from graphExplorer.bridgeness     import filterByBridgeness
+
+# Prelude lib
+from copy                         import copy, deepcopy
+from collections                  import defaultdict
+from sqlalchemy.orm               import aliased
+
+# Math/Graph lib
+import math
+import pandas                     as pd
+import numpy                      as np
+
+import networkx                   as nx
+
+
+def get_graph( request=None         , corpus=None
+            , field1='ngrams'       , field2='ngrams'
+            , mapList_id = None     , groupList_id = None
+            , cooc_id=None          , type='node_link'
+            , start=None            , end=None
+            , threshold=1
+            , distance='conditional'
+            , isMonopartite=True                # By default, we compute terms/terms graph
+            , bridgeness=5
+            #, size=1000
+        ):
+    '''
+    get_graph: build the graph data for a corpus, main steps:
+
+    1) count cooccurrences (function countCooccurrences)
+            main parameter: threshold
+            skipped when the caller already provides a cooc_id
+
+    2) filter and cluster by distances (function clusterByDistances)
+            main parameter: distance
+
+    3) filter by bridgeness (function filterByBridgeness)
+            main parameter: bridgeness
+
+    4) format the graph: no separate call is made in this function;
+            the `type` argument is forwarded to filterByBridgeness.
+
+    Returns the value returned by filterByBridgeness.
+
+    NOTE(review): `request` and `isMonopartite` are never read here:
+    countCooccurrences is always called with isMonopartite=True.
+    clusterByDistances is always called with field1="ngrams" and
+    field2="ngrams"; only filterByBridgeness receives the caller's
+    field1/field2. Confirm whether this is intended.
+    '''
+
+    if cooc_id == None:
+        cooc_id = countCooccurrences( corpus=corpus
+                                   #, field1="ngrams", field2="ngrams"
+                                    , start=start           , end =end
+                                    , mapList_id=mapList_id , groupList_id=groupList_id
+                                    , isMonopartite=True    , threshold = threshold
+                                   #, limit=size
+                                    )
+    
+    G, partition, ids, weight = clusterByDistances ( cooc_id
+                                                   , field1="ngrams", field2="ngrams"
+                                                   , distance=distance
+                                                   )
+    
+    data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
+    
+    return data
+
--- a/graphExplorer/rest.py
+++ b/graphExplorer/rest.py
@@ -2,7 +2,7 @@

 from gargantext.util.db      import session
 from gargantext.models.nodes import Node
-from graphExplorer.functions import get_cooc
+from graphExplorer.graph     import get_graph
 from gargantext.util.http    import APIView, APIException\
                                  , JsonHttpResponse, requires_auth

@@ -19,38 +19,74 @@ class Graph(APIView):
        graph?field1=ngrams&field2=ngrams&
        graph?field1=ngrams&field2=ngrams&start=''&end=''
        '''
-        # implicit global session
        
-        field1      = str(request.GET.get ('field1'    , 'ngrams'     ))
-        field2      = str(request.GET.get ('field2'    , 'ngrams'     ))
+        # Get the node we are working with
+        corpus = session.query(Node).filter(Node.id==corpus_id).first()
        
-        start       = request.GET.get     ('start'     , None         )
-        end         = request.GET.get     ('end'       , None         )
+        # Get all the parameters in the URL
+        field1       = str(request.GET.get ('field1'    , 'ngrams'     ))
+        field2       = str(request.GET.get ('field2'    , 'ngrams'     ))
        
-        threshold   = int(request.GET.get ('threshold' , 1            ))
-        bridgeness  = int(request.GET.get ('bridgeness', -1           ))
-        format_     = str(request.GET.get ('format'    , 'json'       ))
-        type_       = str(request.GET.get ('type'      , 'node_link'  ))
-        distance    = str(request.GET.get ('distance'  , 'conditional'))
+        start        = request.GET.get     ('start'     , None         )
+        end          = request.GET.get     ('end'       , None         )
        
-
-        corpus = session.query(Node).filter(Node.id==corpus_id).first()
+        mapList_id   = int(request.GET.get ('mapList'   , 0            ))
+        groupList_id = int(request.GET.get ('groupList' , 0            ))
+        
+        threshold    = int(request.GET.get ('threshold' , 1            ))
+        bridgeness   = int(request.GET.get ('bridgeness', -1           ))
+        format_      = str(request.GET.get ('format'    , 'json'       ))
+        type_        = str(request.GET.get ('type'      , 'node_link'  ))
+        distance     = str(request.GET.get ('distance'  , 'conditional'))
        
+        # Get default value if no map list
+        if mapList_id == 0 :
+            mapList_id = ( session.query ( Node.id )
+                                    .filter( Node.typename  == "MAPLIST"
+                                           , Node.parent_id == corpus.id
+                                           )
+                                    .first()
+                          )
+            
+            mapList_id = mapList_id[0]
+            
+            if mapList_id == None :
+                raise ValueError("MAPLIST node needed for cooccurrences")
+
+
+        # Get default value if no group list
+        if groupList_id  == 0 :
+            groupList_id  = ( session.query ( Node.id )
+                                     .filter( Node.typename  == "GROUPLIST"
+                                            , Node.parent_id == corpus.id
+                                            )
+                                     .first()
+                            )
+            
+            groupList_id  = groupList_id[0]
+            
+            if groupList_id == None :
+                raise ValueError("GROUPLIST node needed for cooccurrences")
+
+
+        # Check the options
        accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
-        accepted_field2 = ['ngrams',]
-        options         = ['start', 'end', 'threshold', 'distance']
+        accepted_field2 = ['ngrams',                               ]
+        options         = ['start', 'end', 'threshold', 'distance' ]
        
        if field1 in accepted_field1 :
            if field2 in accepted_field2 :
                if start is not None and end is not None :
-                    data = get_cooc( corpus=corpus
-                                  #, field1=field1          , field2=field2
-                                   , start=start            , end=end
-                                   , threshold =threshold   , distance=distance
+                    data = get_graph( corpus=corpus
+                                  #, field1=field1           , field2=field2
+                                   , mapList_id = mapList_id , groupList_id = groupList_id
+                                   , start=start             , end=end
+                                   , threshold =threshold    , distance=distance
                                   )
                else:
-                    data = get_cooc( corpus = corpus
+                    data = get_graph( corpus = corpus
                                  #, field1=field1, field2=field2
+                                   , mapList_id = mapList_id , groupList_id = groupList_id
                                   , threshold  = threshold
                                   , distance   = distance
                                   , bridgeness = bridgeness