Commit 6f6e69a4 authored by Romain Loth

FASTER git add graphExplorer/distances.py ! remove writing each new cooc node at graph open

parent 471419c2
...@@ -15,7 +15,10 @@ def countCooccurrences( corpus=None ...@@ -15,7 +15,10 @@ def countCooccurrences( corpus=None
, mapList_id=None , groupList_id=None , mapList_id=None , groupList_id=None
, n_min=1, n_max=None , limit=1000 , n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True , coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3): , isMonopartite=True , threshold = 3
, just_pass_result= True, # just return the WeightedMatrix,
# (don't write to DB)
):
''' '''
Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of parameters are not supported because lists need to For the moment list of parameters are not supported because lists need to
...@@ -186,5 +189,8 @@ def countCooccurrences( corpus=None ...@@ -186,5 +189,8 @@ def countCooccurrences( corpus=None
group_list = Translations ( groupList_id ) group_list = Translations ( groupList_id )
cooc = matrix & (mapList * group_list) cooc = matrix & (mapList * group_list)
cooc.save(coocNode_id) if just_pass_result:
return(coocNode_id) return cooc
else:
cooc.save(coocNode_id)
return(coocNode_id)
...@@ -14,11 +14,11 @@ import numpy as np ...@@ -14,11 +14,11 @@ import numpy as np
import pandas as pd import pandas as pd
import networkx as nx import networkx as nx
def clusterByDistances( cooc_id def clusterByDistances( cooc_matrix
, field1=None, field2=None , field1=None, field2=None
, distance='conditional'): , distance='conditional'):
''' '''
do_distance :: Int -> (Graph, Partition, {ids}, {weight}) do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
''' '''
# implicit global session # implicit global session
...@@ -32,19 +32,19 @@ def clusterByDistances( cooc_id ...@@ -32,19 +32,19 @@ def clusterByDistances( cooc_id
labels = dict() labels = dict()
weight = dict() weight = dict()
Cooc = aliased(NodeNgramNgram) for cooc in cooc_matrix.items:
ngram1_id = cooc[0]
ngram2_id = cooc[1]
ccweight = cooc_matrix.items[cooc]
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all() matrix[ngram1_id][ngram2_id] = ccweight
matrix[ngram2_id][ngram1_id] = ccweight
for cooc in query: ids[ngram1_id] = (field1, ngram1_id)
matrix[cooc.ngram1_id][cooc.ngram2_id] = cooc.weight ids[ngram2_id] = (field2, ngram2_id)
matrix[cooc.ngram2_id][cooc.ngram1_id] = cooc.weight
ids[cooc.ngram1_id] = (field1, cooc.ngram1_id) weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
ids[cooc.ngram2_id] = (field2, cooc.ngram2_id) weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight
weight[cooc.ngram1_id] = weight.get(cooc.ngram1_id, 0) + cooc.weight
weight[cooc.ngram2_id] = weight.get(cooc.ngram2_id, 0) + cooc.weight
x = pd.DataFrame(matrix).fillna(0) x = pd.DataFrame(matrix).fillna(0)
...@@ -217,4 +217,3 @@ def clusterByDistances( cooc_id ...@@ -217,4 +217,3 @@ def clusterByDistances( cooc_id
partition = best_partition(G.to_undirected()) partition = best_partition(G.to_undirected())
return(G,partition,ids,weight) return(G,partition,ids,weight)
...@@ -42,27 +42,48 @@ def get_graph( request=None , corpus=None ...@@ -42,27 +42,48 @@ def get_graph( request=None , corpus=None
3) filter By Bridgeness (filter By Bridgeness) 3) filter By Bridgeness (filter By Bridgeness)
main parameter: bridgeness main parameter: bridgeness
4) format the graph (formatGraph) 4) format the graph (formatGraph)
main parameter: format_ main parameter: format_
''' '''
from datetime import datetime
before_cooc = datetime.now()
# TODO change test here (always true)
# to something like "if cooc.status threshold == required_threshold
# and group.creation_time < cooc.creation_time"
# if False => read and give to clusterByDistances
# if True => compute and give to clusterByDistances <==
if cooc_id == None: if cooc_id == None:
cooc_id = countCooccurrences( corpus=corpus cooc_matrix = countCooccurrences( corpus=corpus
#, field1="ngrams", field2="ngrams" #, field1="ngrams", field2="ngrams"
, start=start , end =end , start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id , mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold , isMonopartite=True , threshold = threshold
, just_pass_result = True
#, limit=size #, limit=size
) )
else:
G, partition, ids, weight = clusterByDistances ( cooc_id cooc_matrix = WeightedMatrix(cooc_id)
# fyi
after_cooc = datetime.now()
print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams" , field1="ngrams", field2="ngrams"
, distance=distance , distance=distance
) )
after_cluster = datetime.now()
print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2) data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
return data
after_filter = datetime.now()
print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds())
return data
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment