Commit 6f6e69a4 authored by Romain Loth

FASTER git add graphExplorer/distances.py ! remove writing each new cooc node at graph open

parent 471419c2
......@@ -15,7 +15,10 @@ def countCooccurrences( corpus=None
, mapList_id=None , groupList_id=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3):
, isMonopartite=True , threshold = 3
, just_pass_result= True, # just return the WeightedMatrix,
# (don't write to DB)
):
'''
Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
For the moment, lists of parameters are not supported because lists need to
......@@ -186,5 +189,8 @@ def countCooccurrences( corpus=None
group_list = Translations ( groupList_id )
cooc = matrix & (mapList * group_list)
cooc.save(coocNode_id)
return(coocNode_id)
if just_pass_result:
return cooc
else:
cooc.save(coocNode_id)
return(coocNode_id)
......@@ -14,11 +14,11 @@ import numpy as np
import pandas as pd
import networkx as nx
def clusterByDistances( cooc_id
def clusterByDistances( cooc_matrix
, field1=None, field2=None
, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
......@@ -32,19 +32,19 @@ def clusterByDistances( cooc_id
labels = dict()
weight = dict()
Cooc = aliased(NodeNgramNgram)
for cooc in cooc_matrix.items:
ngram1_id = cooc[0]
ngram2_id = cooc[1]
ccweight = cooc_matrix.items[cooc]
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
matrix[ngram1_id][ngram2_id] = ccweight
matrix[ngram2_id][ngram1_id] = ccweight
for cooc in query:
matrix[cooc.ngram1_id][cooc.ngram2_id] = cooc.weight
matrix[cooc.ngram2_id][cooc.ngram1_id] = cooc.weight
ids[ngram1_id] = (field1, ngram1_id)
ids[ngram2_id] = (field2, ngram2_id)
ids[cooc.ngram1_id] = (field1, cooc.ngram1_id)
ids[cooc.ngram2_id] = (field2, cooc.ngram2_id)
weight[cooc.ngram1_id] = weight.get(cooc.ngram1_id, 0) + cooc.weight
weight[cooc.ngram2_id] = weight.get(cooc.ngram2_id, 0) + cooc.weight
weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight
x = pd.DataFrame(matrix).fillna(0)
......@@ -217,4 +217,3 @@ def clusterByDistances( cooc_id
partition = best_partition(G.to_undirected())
return(G,partition,ids,weight)
......@@ -42,27 +42,48 @@ def get_graph( request=None , corpus=None
3) filter By Bridgeness (filter By Bridgeness)
main parameter: bridgeness
4) format the graph (formatGraph)
main parameter: format_
'''
from datetime import datetime
before_cooc = datetime.now()
# TODO change test here (always true)
# to something like "if cooc.status threshold == required_threshold
# and group.creation_time < cooc.creation_time"
# if False => read and give to clusterByDistances
# if True => compute and give to clusterByDistances <==
if cooc_id == None:
cooc_id = countCooccurrences( corpus=corpus
cooc_matrix = countCooccurrences( corpus=corpus
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
, just_pass_result = True
#, limit=size
)
G, partition, ids, weight = clusterByDistances ( cooc_id
else:
cooc_matrix = WeightedMatrix(cooc_id)
# fyi
after_cooc = datetime.now()
print("... Cooccurrences took %f s." % (after_cooc - before_cooc).total_seconds())
G, partition, ids, weight = clusterByDistances ( cooc_matrix
, field1="ngrams", field2="ngrams"
, distance=distance
)
after_cluster = datetime.now()
print("... Clustering took %f s." % (after_cluster - after_cooc).total_seconds())
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
return data
after_filter = datetime.now()
print("... Filtering took %f s." % (after_filter - after_cluster).total_seconds())
return data
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment