FASTER git add graphExplorer/distances.py ! remove writing each new cooc node at graph open

6f6e69a4 · Romain Loth · 471419c2 · 6f6e69a4 · 6f6e69a4 · 6f6e69a4
Commit 6f6e69a4 authored May 25, 2016 by Romain Loth
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 23 deletions

cooccurrences.py graphExplorer/cooccurrences.py +9 -3

distances.py graphExplorer/distances.py +12 -13

graph.py graphExplorer/graph.py +28 -7

No files found.
--- a/graphExplorer/cooccurrences.py
+++ b/graphExplorer/cooccurrences.py
@@ -15,7 +15,10 @@ def countCooccurrences( corpus=None
                      , mapList_id=None     , groupList_id=None
                      , n_min=1, n_max=None , limit=1000
                      , coocNode_id=None    , reset=True
-                      , isMonopartite=True  , threshold = 3):
+                      , isMonopartite=True  , threshold = 3
+                      , just_pass_result= True,  # just return the WeightedMatrix,
+                                                 #    (don't write to DB)
+                      ):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
    For the moment lists of parameters are not supported because lists need to
@@ -186,5 +189,8 @@ def countCooccurrences( corpus=None
    group_list = Translations  ( groupList_id )
    cooc       = matrix & (mapList * group_list)
-    cooc.save(coocNode_id)
+    if just_pass_result:
-    return(coocNode_id)
+        return cooc
+    else:
+        cooc.save(coocNode_id)
+        return(coocNode_id)
--- a/graphExplorer/distances.py
+++ b/graphExplorer/distances.py
@@ -14,11 +14,11 @@ import numpy    as np
 import pandas   as pd
 import networkx as nx
-def clusterByDistances( cooc_id
+def clusterByDistances( cooc_matrix
               , field1=None, field2=None
               , distance='conditional'):
    '''
-    do_distance :: Int -> (Graph, Partition, {ids}, {weight})
+    do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
    '''
    # implicit global session
@@ -32,19 +32,19 @@ def clusterByDistances( cooc_id
    labels = dict()
    weight = dict()
-    Cooc = aliased(NodeNgramNgram)
+    for cooc in cooc_matrix.items:
+        ngram1_id = cooc[0]
+        ngram2_id = cooc[1]
+        ccweight = cooc_matrix.items[cooc]
-    query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
+        matrix[ngram1_id][ngram2_id] = ccweight
+        matrix[ngram2_id][ngram1_id] = ccweight
-    for cooc in query:
+        ids[ngram1_id] = (field1, ngram1_id)
-        matrix[cooc.ngram1_id][cooc.ngram2_id] = cooc.weight
+        ids[ngram2_id] = (field2, ngram2_id)
-        matrix[cooc.ngram2_id][cooc.ngram1_id] = cooc.weight
-        ids[cooc.ngram1_id] = (field1, cooc.ngram1_id)
+        weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
-        ids[cooc.ngram2_id] = (field2, cooc.ngram2_id)
+        weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight
-        weight[cooc.ngram1_id] = weight.get(cooc.ngram1_id, 0) + cooc.weight
-        weight[cooc.ngram2_id] = weight.get(cooc.ngram2_id, 0) + cooc.weight
    x = pd.DataFrame(matrix).fillna(0)
@@ -217,4 +217,3 @@ def clusterByDistances( cooc_id
    partition = best_partition(G.to_undirected())
    return(G,partition,ids,weight)
--- a/graphExplorer/graph.py
+++ b/graphExplorer/graph.py
@@ -42,27 +42,48 @@ def get_graph( request=None         , corpus=None
    3) filter By Bridgeness (filterByBridgeness)
            main parameter: bridgeness
    4) format the graph     (formatGraph)
            main parameter: format_
    '''
+    from datetime import datetime
+    before_cooc = datetime.now()
+    # TODO change test here (always true)
+    #      to something like "if cooc.status threshold == required_threshold
+    #                         and group.creation_time < cooc.creation_time"
+    #      if False => read and give to clusterByDistances
+    #      if True => compute and give to clusterByDistances  <==
    if cooc_id == None:
-        cooc_id = countCooccurrences( corpus=corpus
+        cooc_matrix = countCooccurrences( corpus=corpus
                                   #, field1="ngrams", field2="ngrams"
                                    , start=start           , end =end
                                    , mapList_id=mapList_id , groupList_id=groupList_id
                                    , isMonopartite=True    , threshold = threshold
+                                    , just_pass_result = True
                                   #, limit=size
                                    )
+    else:
-    G, partition, ids, weight = clusterByDistances ( cooc_id
+        cooc_matrix = WeightedMatrix(cooc_id)
+    # fyi
+    after_cooc = datetime.now()
+    print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
+    G, partition, ids, weight = clusterByDistances ( cooc_matrix
                                                   , field1="ngrams", field2="ngrams"
                                                   , distance=distance
                                                   )
+    after_cluster = datetime.now()
+    print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
    data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
-    return data
+    after_filter = datetime.now()
+    print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds())
+    return data