Commit a40f95bb authored by delanoe

[FEAT GRAPH] Commit before factoring.

parent 0564f787
@@ -14,7 +14,9 @@ import numpy as np
import pandas as pd
import networkx as nx
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
def do_distance( cooc_id
, field1=None, field2=None
, isMonopartite=True, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
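# NOTE (editorial sketch, not part of this diff): an assumed call site for
# do_distance(), based only on the docstring signature
# Int -> (Graph, Partition, {ids}, {weight}); cooc_id and the unpacked
# names are hypothetical.
G, partition, ids, weights = do_distance( cooc_id
                                        , field1='ngrams', field2='ngrams'
                                        , isMonopartite=True
                                        , distance='conditional')
# G is expected to be a networkx Graph; partition maps node id -> community;
# ids and weights are dicts keyed by node id.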
@@ -4,7 +4,7 @@ from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graphExplorer.distance import do_distance
from graphExplorer.distances import do_distance
from graphExplorer.cooccurrences import do_cooc
# Prelude lib
@@ -27,6 +27,7 @@ def get_cooc( request=None, corpus=None
, start=None , end=None
, threshold=1
, distance='conditional'
, isMonopartite=True # By default, we compute terms/terms graph
, size=1000
, bridgeness=5
, mapList_id = None , groupList_id = None
@@ -35,8 +36,6 @@ def get_cooc( request=None, corpus=None
get_cooc : computes the graph.
'''
data = {}
if mapList_id is None:
mapList_id = ( session.query ( Node.id )
@@ -61,19 +60,6 @@ def get_cooc( request=None, corpus=None
raise ValueError("GROUPLIST node needed for cooccurrences")
# compute_cooc needs group, fields etc.
# group_id = 3
SamuelFlag = False
# if field1 == field2 == 'ngrams' :
# isMonopartite = True
# SamuelFlag = True
# else:
# isMonopartite = False
isMonopartite = True # Always. So, compute the graph B and, from these B-nodes, build the graph-A
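# NOTE (editorial sketch, not part of this diff): a generic illustration of the
# strategy described above -- project a bipartite A-B graph onto its A side so
# that two A-nodes are linked when they share a B-node. The toy graph below is
# hypothetical; the actual construction is done by get_graphA() further down.
import networkx as nx
from networkx.algorithms import bipartite

AB_toy = nx.Graph()
AB_toy.add_edges_from([("sourceX", "term1"), ("sourceX", "term2"), ("sourceY", "term2")])
a_side = {"sourceX", "sourceY"}
# 'weight' on each A-A edge counts the B-nodes shared by the two A-nodes
graphA_toy = bipartite.weighted_projected_graph(AB_toy, a_side)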
# data deleted each time
#cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
if corpus is None:
corpus = session.query(Node).filter(Node.id==corpus_id).first()
@@ -90,6 +76,9 @@ def get_cooc( request=None, corpus=None
, isMonopartite=True
, distance=distance
)
# Data are stored in a dict() (Python's built-in hash map)
data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
@@ -192,183 +181,4 @@ def get_cooc( request=None, corpus=None
return(data)
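# NOTE (editorial sketch, not part of this diff): an assumed invocation of
# get_cooc() with the isMonopartite flag introduced above; only parameters
# visible in this diff are used, and corpus, map_id and group_id are hypothetical.
graph_data = get_cooc( corpus=corpus, threshold=1
                     , distance='conditional', isMonopartite=True
                     , size=1000, bridgeness=5
                     , mapList_id=map_id, groupList_id=group_id )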
def get_graphA( nodeA_type , NodesB , links , corpus ):
from analysis.InterUnion import Utils
print(" = = = == = = = ")
print("In get_graphA(), corpus id:",corpus.id)
nodeA_type_id = cache.Hyperdata[nodeA_type].id
threshold_cotainf = 0.02
max_nodeid = -1
for nodeid in NodesB:
if nodeid > max_nodeid:
max_nodeid = nodeid
# = = = = [ 01. Getting ALL documents of the Corpus c ] = = = = #
Docs = {}
document_type_id = cache.NodeType['Document'].id
sql_query = 'select id from node_node where parent_id='+str(corpus.id)+' and type_id='+str(document_type_id)
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
for i in results:
Docs[i[0]] = True
print("docs:",len(Docs.keys()))
# = = = = [ / 01. Getting ALL documents of the Corpus c ] = = = = #
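# NOTE (editorial sketch, not part of this diff): the same document query as
# step 01, written with query parameters instead of string concatenation,
# assuming the module-level Django `connection` used above.
cursor = connection.cursor()
cursor.execute( 'select id from node_node where parent_id = %s and type_id = %s'
              , [corpus.id, document_type_id])
Docs = {row[0]: True for row in cursor.fetchall()}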
# = = = = [ 02. Getting ALL Documents related with Ngrams of the carte semantic ] = = = = #
sql_query = 'select nodey_id,ngram_id from node_nodenodengram where ngram_id IN (' + ','.join(map(str, NodesB.keys())) + ")"
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
# = = = = [ / 02. Getting ALL Documents related with Ngrams of the carte semantic ] = = = = #
# = = = = [ 03. Now we limit the retrieved Documents(step 02) to those belonging to the Corpus c ] = = = = ]
Docs_and_ = {
"nodesA":{},
"nodesB":{}
}
NodesB_and_Docs = {}
for i in results:
doc_id = i[0]
ngram_id = i[1]
if ngram_id in NodesB and doc_id in Docs:
if doc_id not in Docs_and_["nodesB"]:
Docs_and_["nodesB"][doc_id] = []
Docs_and_["nodesB"][doc_id].append( ngram_id )
if ngram_id not in NodesB_and_Docs:
NodesB_and_Docs[ngram_id] = []
NodesB_and_Docs[ngram_id].append( doc_id )
# = = = = [ / 03. Now we limit the retrieved Documents(step 02) to those belonging to the Corpus c ] = = = = ]
# # = = = = [ Getting Authors ] = = = = ]
# Authors = {}
# sql_query = 'select node_id,value_string from node_node_hyperdata where node_id IN (' + ','.join(map(str, Docs_and_["nodesB"].keys())) + ")"+' and hyperdata_id=10'# 10 -> authors
# cursor = connection.cursor()
# cursor.execute(sql_query)
# results = cursor.fetchall()
# for i in results:
# doc_id = i[0]
# authors = i[1].split(",")
# for a in authors:
# if a not in Authors:
# Authors[a] = 0
# Authors[a] += 1
# print("")
# print("#authors:")
# import pprint
# pprint.pprint(Authors)
# print("")
# # = = = = [ / Getting Authors ] = = = = ]
# = = = = [ 04. Getting A-elems and making the dictionaries] = = = = ]
sql_query = 'select node_id,value_string from node_node_hyperdata where node_id IN (' + \
','.join(map(str, Docs_and_["nodesB"].keys())) + ")"+' and hyperdata_id='+str(nodeA_type_id)
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
A_Freq = {}
A_int2str = {}
A_str2int = {}
counter = max_nodeid+1
for i in results:
doc_id = i[0]
a = i[1]
if a not in A_str2int:
A_str2int[ a ] = counter
A_int2str[counter] = a
counter += 1
for i in results:
doc_id = i[0]
a = A_str2int[i[1]]
Docs_and_["nodesA"][doc_id] = a
if a not in A_Freq:
A_Freq[ a ] = 0
A_Freq[ a ] += 1
# = = = = [ / 04. Getting A-elems and making the dictionaries ] = = = = ]
# = = = = [ Filling graph-A ] = = = = ]
Graph_A = Utils()
for i in NodesB_and_Docs:
ngram = i
docs = NodesB_and_Docs[i]
k_A_clique = {}
for doc in docs:
k_A = Docs_and_["nodesA"][doc]
k_A_clique[k_A] = True
if len(k_A_clique.keys())>1:
Graph_A.addCompleteSubGraph( k_A_clique.keys() )
# = = = = [ / Filling graph-A ] = = = = ]
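# NOTE (editorial sketch, not part of this diff): analysis.InterUnion.Utils is
# an external helper; addCompleteSubGraph() is assumed here to connect every
# pair of the given nodes and to increment an edge 'weight' on repeated pairs
# (the 'weight' attribute read below depends on that). A minimal networkx
# equivalent under that assumption:
from itertools import combinations

def add_complete_subgraph(graph, nodes):
    # Link every pair of nodes; count repeated co-memberships in 'weight'.
    for s, t in combinations(nodes, 2):
        if graph.has_edge(s, t):
            graph[s][t]['weight'] += 1
        else:
            graph.add_edge(s, t, weight=1)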
# = = = = [ graph-A to JSON ] = = = = ]
A = Graph_A.G
for node_id in A.nodes():
A.node[node_id]['label'] = A_int2str[node_id]
A.node[node_id]['size'] = A_Freq[node_id]
A.node[node_id]['type'] = nodeA_type
A.node[node_id]['attributes'] = { "clust_default": 1 }
A_links = []
min_weight = 999999
max_weight = -1
Weights_Dist = {}
for e in A.edges_iter():
s = e[0]
t = e[1]
w = A[s][t]["weight"]
if w not in Weights_Dist:
Weights_Dist[ w ] = { "freq": 0 , "deleted":0 }
Weights_Dist[ w ]["freq"] += 1
if min_weight > w:
min_weight = w
if max_weight < w:
max_weight = w
edges2remove = []
for e in A.edges_iter():
s = e[0]
t = e[1]
w = A[s][t]["weight"]
if Weights_Dist [ w ]["freq"] < ( len(A)*3 ): # weight-threshold
info = {
"s":s ,
"t":t ,
"w": w / max_weight # normalization
}
A_links.append(info)
else:
# if Weights_Dist [ w ]["deleted"] < round(Weights_Dist [ w ]["freq"]*0.95):
atuple = (s,t)
edges2remove.append(atuple)
Weights_Dist [ w ]["deleted"] += 1
A.remove_edges_from( edges2remove )
A.remove_nodes_from(nx.isolates(A))
data = json_graph.node_link_data(A) # saving nodesA
AB = nx.Graph()
for i in NodesB_and_Docs:
b = i
docs = NodesB_and_Docs[i]
for doc in docs:
a = Docs_and_["nodesA"][doc]
if A.has_node(a):
AB.add_edge( a , b )
AB_links = []
for e in AB.edges_iter():
info = { "s": e[0], "t": e[1], "w": 1 }
AB_links.append(info)
data["links"] = A_links + AB_links # saving AA-links and AB-links
# = = = = [ / graph-A to JSON ] = = = = ]
return data
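# NOTE (editorial sketch, not part of this diff): the assumed shape of the
# returned payload. json_graph.node_link_data(A) produces a dict with "nodes"
# and "links" keys; the "links" entry is then overwritten with the hand-built
# A-A and A-B links, for example:
#
#   { "nodes": [ {"id": 42, "label": "...", "size": 3, "type": nodeA_type,
#                 "attributes": {"clust_default": 1}}, ... ],
#     "links": [ {"s": 42, "t": 57, "w": 0.5},        # A-A link, weight normalized
#                {"s": 42, "t": 7,  "w": 1}, ... ] }  # A-B link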