Commit a40f95bb authored by delanoe

[FEAT GRAPH] Commit before factoring.

parent 0564f787
@@ -14,7 +14,9 @@ import numpy as np
import pandas as pd
import networkx as nx
-def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
+def do_distance( cooc_id
+               , field1=None, field2=None
+               , isMonopartite=True, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
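For readers scanning this hunk: the Haskell-style annotation in the docstring says do_distance maps the id of a stored cooccurrence Node to a 4-tuple (graph, partition, node ids, weights). A minimal usage sketch — the unpacking names are assumptions read off the signature, not code from this repository:

```python
# Hypothetical call; cooc_id is the id of a stored cooccurrence Node.
G, partition, ids, weight = do_distance( cooc_id
                                       , isMonopartite=True
                                       , distance='conditional')
```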
@@ -4,7 +4,7 @@ from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
-from graphExplorer.distance import do_distance
+from graphExplorer.distances import do_distance
from graphExplorer.cooccurrences import do_cooc
# Prelude lib
@@ -14,10 +14,10 @@ from sqlalchemy.orm import aliased
# Math/Graph lib
import math
-import pandas as pd
-import numpy as np
+import pandas as pd
+import numpy as np
-import networkx as nx
+import networkx as nx
from networkx.readwrite import json_graph
@@ -27,6 +27,7 @@ def get_cooc( request=None, corpus=None
, start=None , end=None
, threshold=1
, distance='conditional'
+, isMonopartite=True # By default, we compute the terms/terms graph
, size=1000
, bridgeness=5
, mapList_id = None , groupList_id = None
@@ -35,8 +36,6 @@ def get_cooc( request=None, corpus=None
get_cooc : compute the graph.
'''
-data = {}
if mapList_id == None :
mapList_id = ( session.query ( Node.id )
@@ -56,24 +55,11 @@ def get_cooc( request=None, corpus=None
)
.first()
)
if groupList_id == None :
raise ValueError("GROUPLIST node needed for cooccurrences")
-# compute_cooc needs group, fields etc.
-# group_id = 3
-SamuelFlag = False
-# if field1 == field2 == 'ngrams' :
-# isMonopartite = True
-# SamuelFlag = True
-# else:
-# isMonopartite = False
-isMonopartite = True # Always. So, compute the graph B and, from these B-nodes, build the graph-A
-# data deleted each time
-#cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
if corpus is None:
corpus = session.query(Node).filter(Node.id==corpus_id).first()
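The removed comment above records the design rule this commit keeps: always compute the terms/terms graph (graph B) first, then derive entity-level graphs (graph A) from its nodes. As a standalone illustration of that bipartite-projection idea — toy data and names are mine, not this commit's:

```python
import networkx as nx
from networkx.algorithms import bipartite

# Toy document/term graph: integers are documents, strings are terms (B-nodes).
G = nx.Graph()
G.add_edges_from([(0, "graph"), (0, "corpus"), (1, "graph"), (1, "ngram")])

# Project onto the term side: two terms are linked iff they share a document;
# the edge weight counts the shared documents.
B = bipartite.weighted_projected_graph(G, ["graph", "corpus", "ngram"])
print(B.edges(data=True))  # e.g. [('graph', 'corpus', {'weight': 1}), ...]
```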
@@ -90,6 +76,9 @@ def get_cooc( request=None, corpus=None
, isMonopartite=True
, distance=distance
)
+# Data are stored in a dict() (Python's default hashmap)
+data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
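This node_link branch (with the json_graph import above) serializes the graph for the explorer client. A toy example of the payload shape produced by networkx's helper — shown abbreviated; the exact link keys vary across networkx versions:

```python
import networkx as nx
from networkx.readwrite import json_graph

G = nx.Graph()
G.add_edge("a", "b", weight=1.0)

data = json_graph.node_link_data(G)
# A JSON-serializable dict with 'nodes' and 'links' lists, e.g.
# {'nodes': [{'id': 'a'}, {'id': 'b'}],
#  'links': [{'source': 0, 'target': 1, 'weight': 1.0}], ...}
```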
@@ -192,183 +181,4 @@ def get_cooc( request=None, corpus=None
return(data)
def get_graphA( nodeA_type , NodesB , links , corpus ):
from analysis.InterUnion import Utils
print(" = = = == = = = ")
print("In get_graphA(), corpus id:",corpus.id)
nodeA_type_id = cache.Hyperdata[nodeA_type].id
threshold_cotainf = 0.02
max_nodeid = -1
for nodeid in NodesB:
if nodeid > max_nodeid:
max_nodeid = nodeid
# = = = = [ 01. Getting ALL documents of the Corpus c ] = = = = #
Docs = {}
document_type_id = cache.NodeType['Document'].id
sql_query = 'select id from node_node where parent_id='+str(corpus.id)+' and type_id='+str(document_type_id)
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
for i in results:
Docs[i[0]] = True
print("docs:",len(Docs.keys()))
# = = = = [ / 01. Getting ALL documents of the Corpus c ] = = = = #
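Step 01 interpolates ids into the SQL string by hand; with Django's cursor the same lookup can use bound parameters instead (an equivalent sketch, not what this commit does):

```python
from django.db import connection

cursor = connection.cursor()
cursor.execute(
    'SELECT id FROM node_node WHERE parent_id = %s AND type_id = %s',
    [corpus.id, document_type_id],
)
Docs = {row[0]: True for row in cursor.fetchall()}
```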
# = = = = [ 02. Getting ALL Documents related to Ngrams of the semantic map ] = = = = #
sql_query = 'select nodey_id,ngram_id from node_nodenodengram where ngram_id IN (' + ','.join(map(str, NodesB.keys())) + ")"
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
# = = = = [ / 02. Getting ALL Documents related to Ngrams of the semantic map ] = = = = #
# = = = = [ 03. Now we limit the retrieved Documents (step 02) to those belonging to the Corpus c ] = = = = ]
Docs_and_ = {
"nodesA":{},
"nodesB":{}
}
NodesB_and_Docs = {}
for i in results:
doc_id = i[0]
ngram_id = i[1]
if ngram_id in NodesB and doc_id in Docs:
if doc_id not in Docs_and_["nodesB"]:
Docs_and_["nodesB"][doc_id] = []
Docs_and_["nodesB"][doc_id].append( ngram_id )
if ngram_id not in NodesB_and_Docs:
NodesB_and_Docs[ngram_id] = []
NodesB_and_Docs[ngram_id].append( doc_id )
# = = = = [ / 03. Now we limit the retrieved Documents (step 02) to those belonging to the Corpus c ] = = = = ]
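Step 03 builds two inverted indexes by hand. The same accumulation reads more directly with collections.defaultdict (an equivalent sketch reusing the commit's names):

```python
from collections import defaultdict

docs_to_ngrams = defaultdict(list)   # plays the role of Docs_and_["nodesB"]
ngrams_to_docs = defaultdict(list)   # plays the role of NodesB_and_Docs

for doc_id, ngram_id in results:
    if ngram_id in NodesB and doc_id in Docs:
        docs_to_ngrams[doc_id].append(ngram_id)
        ngrams_to_docs[ngram_id].append(doc_id)
```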
# # = = = = [ Getting Authors ] = = = = ]
# Authors = {}
# sql_query = 'select node_id,value_string from node_node_hyperdata where node_id IN (' + ','.join(map(str, Docs_and_["nodesB"].keys())) + ")"+' and hyperdata_id=10'# 10 -> authors
# cursor = connection.cursor()
# cursor.execute(sql_query)
# results = cursor.fetchall()
# for i in results:
# doc_id = i[0]
# authors = i[1].split(",")
# for a in authors:
# if a not in Authors:
# Authors[a] = 0
# Authors[a] += 1
# print("")
# print("#authors:")
# import pprint
# pprint.pprint(Authors)
# print("")
# # = = = = [ / Getting Authors ] = = = = ]
# = = = = [ 04. Getting A-elems and making the dictionaries ] = = = = ]
sql_query = 'select node_id,value_string from node_node_hyperdata where node_id IN (' + \
','.join(map(str, Docs_and_["nodesB"].keys())) + ")"+' and hyperdata_id='+str(nodeA_type_id)
cursor = connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
A_Freq = {}
A_int2str = {}
A_str2int = {}
counter = max_nodeid+1
for i in results:
doc_id = i[0]
a = i[1]
if a not in A_str2int:
A_str2int[ a ] = counter
A_int2str[counter] = a
counter += 1
for i in results:
doc_id = i[0]
a = A_str2int[i[1]]
Docs_and_["nodesA"][doc_id] = a
if a not in A_Freq:
A_Freq[ a ] = 0
A_Freq[ a ] += 1
# = = = = [ / 04. Getting A-elems and making the dictionaries ] = = = = ]
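Step 04 makes two passes over results: one to intern each A-value (an author name, a journal, etc.) as a fresh integer id above max_nodeid, one to record frequencies. The two passes can be fused (an equivalent sketch with the same names):

```python
for doc_id, value in results:
    if value not in A_str2int:
        A_str2int[value] = counter
        A_int2str[counter] = value
        counter += 1
    a = A_str2int[value]
    Docs_and_["nodesA"][doc_id] = a
    A_Freq[a] = A_Freq.get(a, 0) + 1
```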
# = = = = [ Filling graph-A ] = = = = ]
Graph_A = Utils()
for i in NodesB_and_Docs:
ngram = i
docs = NodesB_and_Docs[i]
k_A_clique = {}
for doc in docs:
k_A = Docs_and_["nodesA"][doc]
k_A_clique[k_A] = True
if len(k_A_clique.keys())>1:
Graph_A.addCompleteSubGraph( k_A_clique.keys() )
# = = = = [ / Filling graph-A ] = = = = ]
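addCompleteSubGraph comes from the project-internal analysis.InterUnion.Utils, so its exact behavior isn't visible in this diff; judging from the calling code, it links every pair of A-nodes whose documents share a term. In plain networkx that clique expansion would look roughly like this — the weight accumulation is my assumption, since the later code reads edge weights:

```python
from itertools import combinations
import networkx as nx

A = nx.Graph()
for clique in cliques:               # each clique: the A-node ids behind one term
    for s, t in combinations(clique, 2):
        # accumulate a weight so repeated co-occurrences strengthen the edge
        w = A[s][t]["weight"] + 1 if A.has_edge(s, t) else 1
        A.add_edge(s, t, weight=w)
```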
# = = = = [ graph-A to JSON ] = = = = ]
A = Graph_A.G
for node_id in A.nodes():
A.node[node_id]['label'] = A_int2str[node_id]
A.node[node_id]['size'] = A_Freq[node_id]
A.node[node_id]['type'] = nodeA_type
A.node[node_id]['attributes'] = { "clust_default": 1 }
A_links = []
min_weight = 999999
max_weight = -1
Weights_Dist = {}
for e in A.edges_iter():
s = e[0]
t = e[1]
w = A[s][t]["weight"]
if w not in Weights_Dist:
Weights_Dist[ w ] = { "freq": 0 , "deleted":0 }
Weights_Dist[ w ]["freq"] += 1
if min_weight > w:
min_weight = w
if max_weight < w:
max_weight = w
edges2remove = []
for e in A.edges_iter():
s = e[0]
t = e[1]
w = A[s][t]["weight"]
if Weights_Dist [ w ]["freq"] < ( len(A)*3 ): # weight-threshold
info = {
"s":s ,
"t":t ,
"w": w / max_weight # normalization
}
A_links.append(info)
else:
# if Weights_Dist [ w ]["deleted"] < round(Weights_Dist [ w ]["freq"]*0.95):
atuple = (s,t)
edges2remove.append(atuple)
Weights_Dist [ w ]["deleted"] += 1
A.remove_edges_from( edges2remove )
A.remove_nodes_from(nx.isolates(A))
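The pruning above keeps an edge only when its exact weight value is rare — fewer than 3·len(A) edges share that value — and normalizes kept weights by max_weight. The histogram bookkeeping, in compact form (an equivalent sketch over the same variables):

```python
from collections import Counter

# Frequency of each distinct edge weight in A.
weight_freq = Counter(d["weight"] for _, _, d in A.edges(data=True))

kept_links = [
    {"s": s, "t": t, "w": d["weight"] / max_weight}
    for s, t, d in A.edges(data=True)
    if weight_freq[d["weight"]] < 3 * len(A)
]
```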
data = json_graph.node_link_data(A) # saving nodesA
AB = nx.Graph()
for i in NodesB_and_Docs:
b = i
docs = NodesB_and_Docs[i]
for doc in docs:
a = Docs_and_["nodesA"][doc]
if A.has_node(a):
AB.add_edge( a , b )
AB_links = []
for e in AB.edges_iter():
info = { "s": e[0], "t": e[1], "w": 1 }
AB_links.append(info)
data["links"] = A_links + AB_links # saving AA-links and AB-links
# = = = = [ / graph-A to JSON ] = = = = ]
return data
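Taken together, get_graphA returns a node_link-style dict whose 'links' mix A-A edges (normalized weights) and A-B edges (weight 1). A hypothetical call — the "authors" key is an invented example; valid nodeA_type values live in cache.Hyperdata, which this diff doesn't show:

```python
# Hypothetical: project the term graph onto document authors.
data = get_graphA("authors", NodesB, links, corpus)
```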
@@ -44,8 +44,8 @@ class Graph(APIView):
if field2 in accepted_field2 :
if start is not None and end is not None :
data = get_cooc( corpus=corpus
-#, field1=field1 , field2=field2
-, start=start , end=end
+#, field1=field1 , field2=field2
+, start=start , end=end
, threshold =threshold , distance=distance
)
else: