Commit ce761204 authored by sim

Remove graph module

parent 6a0506a4
Module Graph Explorer: from text to graph
=========================================

## Graph Explorer main

0) All urls.py of the Graph Explorer
1) Main views of the graph explorer: views.py
   -> Graph Explorer
   -> My Graphs view
   -> REST API to get the data
2) The graph is generated (graph.py) through several steps
   (see the sketch after this list):
   a) check the constraints (graph_constraints) in gargantext/constants.py
   b) data are retrieved through REST
      (rest.py: check the REST parameters)
   c) graph.py:
      get_graph: check the graph parameters
      compute_graph: compute the graph
        1) cooccurrences are computed (live or asynchronously): cooccurrences.py
        2) threshold and distances: distances.py
        3) clustering: louvain.py
        4) links between communities: bridgeness.py
   d) compress the graph before returning it: utils.py
3) Additional features:
   a) intersection of graphs: intersection.py
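
Below is a minimal sketch of how these steps chain together, built from the
functions removed in this commit (countCooccurrences, clusterByDistances,
filterByBridgeness, compress_graph). The calling context and the argument
values are assumptions for illustration; the real orchestration lives in
graph.py's compute_graph.

```python
# Hypothetical pipeline: ids and parameter values are illustrative only.
cooc_id, cooc_matrix = countCooccurrences( corpus_id    = corpus.id
                                         , mapList_id   = maplist_id
                                         , groupList_id = grouplist_id
                                         , threshold    = 3
                                         , save_on_db   = True )

# Build the graph and cluster it (louvain runs inside clusterByDistances)
G, partition, ids, weight = clusterByDistances( cooc_matrix
                                              , field1   = 'ngrams'
                                              , field2   = 'ngrams'
                                              , distance = 'conditional' )

# Keep intra-community links plus the strongest inter-community bridges
data = filterByBridgeness(G, partition, ids, weight,
                          bridgeness=5, type="node_link",
                          field1='ngrams', field2='ngrams')

# Shrink the JSON payload before sending it to the client
graph = compress_graph(data)
```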
## How to contribute?

Some solutions:
1) please report to dev@gargantext.org
2) fix it in the git repo and open a pull request

## TODO

myGraphs view:
* progress bar
* show already computed graphs vs. graphs still to be computed, with their parameters
* show parameters
* copy/paste and change some parameters to generate a new graph

# Article coming soon
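# ---- graph/bridgeness.py (deleted file; name inferred from the README's module map) ----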
from gargantext.util.db import session
from gargantext.models.ngrams import Ngram
from collections import defaultdict
from networkx.readwrite import json_graph
def filterByBridgeness(G, partition, ids, weight, bridgeness, type, field1, field2):
    '''
    Bridgeness = measure to control links (bridges) between communities.
    '''
    # NB: the parameter `type` shadows the Python builtin (pre-existing naming).

    # Data are stored in a dict() (a hashmap in Python)
    data = dict()
    if type == "node_link":
        nodesB_dict = {}
        for node_id in G.nodes():
            nodesB_dict[ ids[node_id][1] ] = True
            # TODO the query below is not optimized (do it in do_distance).
            the_label = session.query(Ngram.terms).filter(Ngram.id == node_id).first()
            the_label = ", ".join(the_label)
            G.node[node_id]['label']      = the_label
            G.node[node_id]['size']       = weight[node_id]
            G.node[node_id]['type']       = ids[node_id][0].replace("ngrams", "terms")
            G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
            # G.add_edge(node, "cluster " + str(partition[node]), weight=3)

        links = []

        if bridgeness > 0:
            com_link = defaultdict(lambda: defaultdict(list))
            com_ids  = defaultdict(list)
            for k, v in partition.items():
                com_ids[v].append(k)

        for e in G.edges_iter():
            s = e[0]
            t = e[1]
            edge_weight = G[ids[s][1]][ids[t][1]]["weight"]

            if bridgeness < 0:
                # negative bridgeness: keep all links
                info = { "s": ids[s][1]
                       , "t": ids[t][1]
                       , "w": edge_weight
                       }
                links.append(info)
            else:
                # keep all intra-community links
                if partition[s] == partition[t]:
                    info = { "s": ids[s][1]
                           , "t": ids[t][1]
                           , "w": edge_weight
                           }
                    links.append(info)
                # record inter-community links for later filtering
                if bridgeness > 0:
                    if partition[s] < partition[t]:
                        com_link[partition[s]][partition[t]].append((s, t, edge_weight))

        if bridgeness > 0:
            for c1 in com_link.keys():
                for c2 in com_link[c1].keys():
                    # number of bridges to keep between c1 and c2, proportional
                    # to bridgeness and inversely to the communities' sizes
                    index = round(
                              bridgeness * len( com_link[c1][c2] )
                            / #----------------------------------#
                              ( len(com_ids[c1]) + len(com_ids[c2] ))
                            )
                    #print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
                    if index > 0:
                        # keep only the `index` heaviest inter-community links
                        for link in sorted( com_link[c1][c2]
                                          , key=lambda x: x[2]
                                          , reverse=True)[:index]:
                            #print(c1, c2, link[2])
                            info = {"s": link[0], "t": link[1], "w": link[2]}
                            links.append(info)

        B = json_graph.node_link_data(G)
        B["links"] = links

        if field1 == field2 == 'ngrams' :
            data["nodes"] = B["nodes"]
            data["links"] = B["links"]
        else:
            # NB: get_graphA and corpus are not defined in this module
            # (pre-existing issue in the removed code).
            A = get_graphA( "journal" , nodesB_dict , B["links"] , corpus )
            print("#nodesA:", len(A["nodes"]))
            print("#linksAA + #linksAB:", len(A["links"]))
            print("#nodesB:", len(B["nodes"]))
            print("#linksBB:", len(B["links"]))
            data["nodes"] = A["nodes"] + B["nodes"]
            data["links"] = A["links"] + B["links"]
            print(" total nodes :", len(data["nodes"]))
            print(" total links :", len(data["links"]))
            print("")

    elif type == "adjacency":
        for node in G.nodes():
            try:
                G.node[node]['name']  = node
                #G.node[node]['size'] = weight[node]
                G.node[node]['group'] = partition[node]
                #G.add_edge(node, partition[node], weight=3)
            except Exception as error:
                print("error02: ", error)
        data = json_graph.node_link_data(G)

    elif type == 'bestpartition':
        return(partition)

    return(data)
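# ---- graph/cooccurrences.py (deleted file; name inferred from the README's module map) ----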
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
                              NodeHyperdata, HyperdataKey
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations

from sqlalchemy import desc, asc, or_, and_
from datetime import datetime


def filterMatrix(matrix, mapList_id, groupList_id):
    # keep only the terms of the map list, applying group translations first
    mapList    = UnweightedList( mapList_id )
    group_list = Translations ( groupList_id )
    cooc       = matrix & (mapList * group_list)
    return cooc
def countCooccurrences( corpus_id=None    , cooc_id=None
                      , field1='ngrams'   , field2='ngrams'
                      , start=None        , end=None
                      , mapList_id=None   , groupList_id=None
                      , distance=None     , bridgeness=None
                      , n_min=1, n_max=None , limit=1000
                      , isMonopartite=True , threshold = 3
                      , save_on_db = True  , reset=True
                      ):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
    For the moment, lists of parameters are not supported because lists need
    to be merged beforehand.
    corpus       :: Corpus
    mapList_id   :: Int
    groupList_id :: Int
    start        :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end          :: TimeStamp
    limit        :: Int
    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as a Python object
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    # Get the node of the graph
    if not cooc_id:
        cooc_id = ( session.query( Node.id )
                           .filter( Node.typename  == "COOCCURRENCES"
                                  , Node.name      == "GRAPH EXPLORER"
                                  , Node.parent_id == corpus.id
                                  )
                           .first()
                  )
        if not cooc_id:
            coocNode = corpus.add_child(
                typename = "COOCCURRENCES",
                name     = "GRAPH (in corpus %s)" % corpus.id
            )
            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else :
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to reset it (reset = True)
    # (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
        cooc = session.query( NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight ).filter( NodeNgramNgram.node_id == cooc_id ).all()
        return(int(cooc_id), WeightedMatrix(cooc))
    NodeNgramX = aliased(NodeNgram)

    # Simple cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
    # A kind of Euclidean-distance cooccurrence:
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite :
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query( NodeNgramX.ngram_id
                                   , NodeNgramY.ngram_id
                                   , cooc_score
                                   )
                             .join( Node
                                  , Node.id == NodeNgramX.node_id
                                  )
                             .join( NodeNgramY
                                  , NodeNgramY.node_id == Node.id
                                  )
                             .filter( Node.parent_id == corpus.id
                                    , Node.typename  == "DOCUMENT"
                                    )
                     )
    else :
        # NB: NodeHyperdataNgram and Hyperdata are not imported in this module
        # (pre-existing issue in the removed code; this bipartite branch was
        # presumably dead at removal time).
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query( NodeHyperdataNgram.ngram_id
                                   , NodeNgramY.ngram_id
                                   , cooc_score
                                   )
                             .join( Node
                                  , Node.id == NodeHyperdataNgram.node_id
                                  )
                             .join( NodeNgramY
                                  , NodeNgramY.node_id == Node.id
                                  )
                             .join( Hyperdata
                                  , Hyperdata.id == NodeHyperdataNgram.hyperdata_id
                                  )
                             .filter( Node.parent_id == corpus.id
                                    , Node.typename  == "DOCUMENT"
                                    )
                             .filter( Hyperdata.name == field1 )
                     )
    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join( NgramX
                                        , NgramX.id == NodeNgramX.ngram_id
                                        )

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join( NgramY
                                    , NgramY.id == NodeNgramY.ngram_id
                                    )

    if n_min is not None:
        cooc_query = (cooc_query
                      .filter(NgramY.n >= n_min)
                      )
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        # fixed: the original filtered on n_min here too (copy-paste bug)
        cooc_query = (cooc_query
                      .filter(NgramY.n <= n_max)
                      )
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)
    # Cooccurrences between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more precise date format here (day is the smaller grain actually).
        date_start = datetime.strptime (str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join( Start
                                     , Start.node_id == Node.id
                                     )
                                .filter( Start.key == 'publication_date')
                                .filter( Start.value_utc >= date_start_utc)
                     )
        parameters['start'] = date_start_utc

    if end is not None:
        # TODO : more precise date format here (day is the smaller grain actually).
        date_end = datetime.strptime (str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join( End
                                     , End.node_id == Node.id
                                     )
                                .filter( End.key == 'publication_date')
                                .filter( End.value_utc <= date_end_utc )
                     )
        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric: take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    # Order according to some score
    # If ordering is really needed, use an ordered index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
# TODO factorize savings on db
if save_on_db:
# Saving the cooccurrences
cooc.save(cooc_id)
print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)
# Saving the parameters
print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata["parameters"] = dict()
coocNode.hyperdata["parameters"] = parameters
coocNode.save_hyperdata()
session.commit()
#data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
else:
return cooc
return(coocNode.id, cooc)
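# ---- graph/distances.py (deleted file; name inferred from the README's module map) ----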
import math
import numpy as np
import pandas as pd
import networkx as nx

from copy import copy
from collections import defaultdict
from math import log, sqrt
#from operator import itemgetter

from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
                              NodeHyperdata
from gargantext.util.db import session, aliased

from .louvain import best_partition
def clusterByDistances( cooc_matrix
                      , field1=None, field2=None
                      , distance=None):
    '''
    clusterByDistances :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
    '''
    # implicit global session

    authorized = ['conditional', 'distributional', 'cosine']
    if distance not in authorized:
        raise ValueError("Distance must be in %s" % str(authorized))

    matrix = defaultdict(lambda : defaultdict(float))
    ids    = defaultdict(lambda : defaultdict(int))
    labels = dict()
    weight = dict()

    for cooc in cooc_matrix.items:
        ngram1_id = cooc[0]
        ngram2_id = cooc[1]
        ccweight  = cooc_matrix.items[cooc]

        matrix[ngram1_id][ngram2_id] = ccweight
        matrix[ngram2_id][ngram1_id] = ccweight

        ids[ngram1_id] = (field1, ngram1_id)
        ids[ngram2_id] = (field2, ngram2_id)

        weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
        weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight

    x = pd.DataFrame(matrix).fillna(0)

    if distance == 'conditional':
        x = x / x.sum(axis=1)
        #y = y / y.sum(axis=0)

        xs = x.sum(axis=1) - x
        ys = x.sum(axis=0) - x

        # top included or excluded
        n = ( xs + ys) / (2 * (x.shape[0] - 1))
        # top generic or specific
        m = ( xs - ys) / (2 * (x.shape[0] - 1))

        n = n.sort_index(inplace=False)
        m = m.sort_index(inplace=False)

        nodes_included = 10000 #int(round(size/20,0))
        #nodes_excluded = int(round(size/10,0))
        nodes_specific = 10000 #int(round(size/10,0))
        #nodes_generic = int(round(size/10,0))

        # TODO use the included score for the node size
        n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
        # Generic:
        #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
        # Specific:
        m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
        #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

        x_index = pd.Index.union(n_index, m_index)
        xx = x[list(x_index)].T[list(x_index)]

        # Removing unconnected nodes
        xxx = xx.values
        threshold = min(xxx.max(axis=1))
        matrix_filtered = np.where(xxx >= threshold, xxx, 0)
        #matrix_filtered = matrix_filtered.resize((90,90))

        G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
        G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
    elif distance == 'cosine':
        scd = defaultdict(lambda : defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                numerator = sum(
                    [
                        matrix[i][k] * matrix[j][k]
                        for k in matrix.keys()
                        if i != j and k != i and k != j
                    ]
                )

                denominator = sqrt(
                    sum([
                        matrix[i][k]
                        for k in matrix.keys()
                        if k != i and k != j #and matrix[i][k] > 0
                    ])
                    *
                    # fixed: this second factor originally re-summed matrix[i][k],
                    # presumably a copy-paste slip for matrix[j][k]
                    sum([
                        matrix[j][k]
                        for k in matrix.keys()
                        if k != i and k != j #and matrix[j][k] > 0
                    ])
                )

                try:
                    scd[i][j] = numerator / denominator
                except Exception as error:
                    scd[i][j] = 0

        minmax = min([ max([ scd[i][j] for i in scd.keys()]) for j in scd.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [
                (i, j, {'weight': scd[i][j]})
                for i in scd.keys() for j in scd.keys()
                if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
            ]
        )
    elif distance == 'distributional':
        mi = defaultdict(lambda : defaultdict(int))
        total_cooc = x.sum().sum()

        for i in matrix.keys():
            si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
            for j in matrix[i].keys():
                sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
                if i != j :
                    # pointwise mutual information
                    mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )

        r = defaultdict(lambda : defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                sumMin = sum(
                    [
                        min(mi[i][k], mi[j][k])
                        for k in matrix.keys()
                        if i != j and k != i and k != j and mi[i][k] > 0
                    ]
                )

                sumMi = sum(
                    [
                        mi[i][k]
                        for k in matrix.keys()
                        if k != i and k != j and mi[i][k] > 0
                    ]
                )

                try:
                    r[i][j] = sumMin / sumMi
                except Exception as error:
                    r[i][j] = 0

        # Need to filter the weak links: automatic threshold here
        minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [
                (i, j, {'weight': r[i][j]})
                for i in r.keys() for j in r.keys()
                if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
            ]
        )
    # degree_max = max([(n, d) for n,d in G.degree().items()], key=itemgetter(1))[1]
    # nodes_to_remove = [n for (n,d) in G.degree().items() if d <= round(degree_max/2)]
    # G.remove_nodes_from(nodes_to_remove)

    # Removing too-connected nodes (find an automatic way to do it)
    #edges_to_remove = [ e for e in G.edges_iter() if
    # nodes_to_remove = [n for n in degree if degree[n] <= 1]
    # G.remove_nodes_from(nodes_to_remove)

    def getWeight(item):
        return item[1]

    # node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
    # #print(node_degree)
    # nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
    #
    # for n in nodes_too_connected:
    #     n_edges = list()
    #     for v in nx.neighbors(G,n):
    #         #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
    #         n_edges.append(((n, v), G[n][v]['weight']))
    #
    #     n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
    #     #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
    #     #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
    #     G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])

    G.remove_nodes_from(nx.isolates(G))

    partition = best_partition(G.to_undirected())

    return(G, partition, ids, weight)
(diff collapsed: file contents not shown)
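# ---- ngram growth helpers (deleted file; filename not shown in this diff) ----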
"""
Computes ngram growth on periods
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db import session, bulk_insert, aliased, \
func, get_engine # = sqlalchemy.func like sum() or count()
from datetime import datetime
def timeframes(start, end):
"""
timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
"""
start = datetime.strptime (str(start), "%Y-%m-%d")
end = datetime.strptime (str(end), "%Y-%m-%d")
date_0 = start - (end - start)
date_1 = start
date_2 = end
return (date_0, date_1, date_2)
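
# Illustrative example (values assumed, not from the original source):
#   timeframes("2010-01-01", "2010-01-11")
#   -> (datetime(2009, 12, 22), datetime(2010, 1, 1), datetime(2010, 1, 11))
# i.e. date_0 mirrors the (start, end) window backwards, yielding two
# consecutive periods of equal length whose occurrences OCC_HIST compares.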
def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
    """
    compute_growth :: Int -> Int -> Int -> UTCTime -> UTCTime
                   -> [(Int, Numeric)]

    This function uses the SQL function defined in
    /srv/gargantext/install/gargamelle/sqlFunctions.sql

    First computes occurrences of the mapList ngrams (with groups) on the
    first period, then on the second, and finally returns the growth.
    Computed directly within the Postgres database (C) for optimization.
    """
    connection = get_engine()

    (date_0, date_1, date_2) = timeframes(start, end)

    query = """SELECT * FROM OCC_HIST( {corpus_id}
                                     , {groupList_id}
                                     , {mapList_id}
                                     , '{date_0}'
                                     , '{date_1}'
                                     , '{date_2}'
                                     )
            """.format( corpus_id    = corpus_id
                      , groupList_id = groupList_id
                      , mapList_id   = mapList_id
                      , date_0       = date_0
                      , date_1       = date_1
                      , date_2       = date_2
                      )

    return(connection.execute(query))
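# ---- graph/intersection.py (deleted file; name inferred from the README's module map) ----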
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
                              HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http import JsonHttpResponse

from sqlalchemy import desc, asc, or_, and_, func

import datetime
import ast
import networkx as nx


def doc_freq(corpus_id, node_ids):
    '''
    doc_freq :: Corpus_id -> [(Ngram_id, Int)]
    Given a corpus, compute the number of documents that contain each ngram.
    '''
    return ( session.query(NodeNgram.ngram_id, func.count(NodeNgram.node_id))
                    .join(Node, NodeNgram.node_id == Node.id)
                    .filter( Node.parent_id == corpus_id
                           , Node.typename == 'DOCUMENT')
                    .filter( NodeNgram.weight > 0
                           , NodeNgram.ngram_id.in_(node_ids) )
                    .group_by(NodeNgram.ngram_id)
                    .all()
           )
def doc_ngram_representativity(corpus_id, node_ids):
    '''
    doc_ngram_representativity :: Corpus_ID -> Dict Ngram_id Float
    Given a corpus, compute the share of documents that contain each ngram.
    '''
    nodes_count = ( session.query(Node)
                           .filter( Node.parent_id == corpus_id
                                  , Node.typename == 'DOCUMENT'
                                  )
                           .count()
                  )

    result = dict()
    for ngram_id, somme in doc_freq(corpus_id, node_ids):
        result[ngram_id] = somme / nodes_count

    return result
def compare_corpora(Corpus_id_A, Corpus_id_B, node_ids):
    '''
    compare_corpora :: Corpus_id -> Corpus_id -> Dict Ngram_id Float

    Given two corpora:
    - if the corpora are the same, it returns the document frequency
      (as a dict with the ngram id as key)
    - if the corpora are different, it returns
      doc_ngram_representativity(Corpus_id_B) / doc_ngram_representativity(Corpus_id_A)
      (as a dict with the ngram id as key)
    '''
    result = dict()

    if int(Corpus_id_A) == int(Corpus_id_B):
        for ngram_id, somme in doc_freq(Corpus_id_A, node_ids):
            result[ngram_id] = somme
    else:
        data_A = doc_ngram_representativity(Corpus_id_A, node_ids)
        data_B = doc_ngram_representativity(Corpus_id_B, node_ids)

        queue = list()
        for k in data_A.keys():
            if k not in data_B.keys():
                queue.append(k)
            else:
                result[k] = data_B[k] / data_A[k]

        maximum = max([ result[k] for k in result.keys()])
        minimum = min([ result[k] for k in result.keys()])

        # ngrams absent from corpus B get the minimum score
        for k in queue:
            result[k] = minimum

    return result
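
# Illustrative example (numbers assumed, not from the original source):
# if ngram 42 occurs in 10% of corpus A's documents and in 5% of corpus B's,
# then compare_corpora(A, B, [42]) returns {42: 0.05 / 0.10} == {42: 0.5},
# i.e. scores below 1 mean the ngram is less representative in B than in A.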
def intersection(request, corpuses_ids, measure='cooc'):
    '''
    intersection :: (str(Int) + "a" + str(Int)) -> Dict(Ngram.id :: Int, Score :: Int)
    Returns, as a JSON HTTP response, the intersection of two graphs.
    '''
    if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"]) > 0 :

        node_ids = [int(i) for i in (ast.literal_eval( request.POST["nodeids"] )) ]
        # Here are the visible nodes of the initial semantic map.

        corpuses_ids = corpuses_ids.split('a')
        corpuses_ids = [int(i) for i in corpuses_ids]

        # corpuses_ids[1] is the corpus to compare with
        return JsonHttpResponse(compare_corpora(corpuses_ids[0], corpuses_ids[1], node_ids))
(diff collapsed: file contents not shown)
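# ---- graph computation email notification (deleted file; filename not shown in this diff) ----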
from gargantext.models.users import User
from gargantext.util.db import session

from django.core.mail import send_mail
from gargantext.settings import BASE_URL


def notify_owner(corpus, cooc_id, distance, bridgeness):
    user = session.query(User).filter(User.id == corpus.user_id).first()

    # message translated from the original French
    message = '''
    Hello,
    the graph for your corpus named:
    %s
    has just been computed.

    You can access and rename your graph at:
    http://%s/projects/%d/corpora/%d/explorer?cooc_id=%d&distance=%s&bridgeness=%d

    We remain at your disposal for any further information.
    Best regards
    --
    The Gargantext team (CNRS)
    ''' % (corpus.name, BASE_URL, corpus.parent_id, corpus.id, cooc_id, distance, bridgeness)

    if user.email != "" :
        send_mail('[Gargantext] Your graph has been computed'
                 , message
                 , 'team@gargantext.org'
                 , [user.email], fail_silently=False )
    else:
        print("User %s (%d) has no email" % (user.username, user.id) )
(two diffs collapsed: file contents not shown)
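# ---- graph/urls.py (deleted file; name inferred from the README's module map) ----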
from django.conf.urls import url

# Module "Graph Explorer"
from .rest import Graph
from .views import explorer, myGraphs
from .intersection import intersection

# TODO : factor urls
# urls will have this pattern:
#   ^explorer/$corpus_id/view
#   ^explorer/$corpus_id/data.json
#   ^explorer/$corpus_id/intersection
# GET ^api/projects/(\d+)/corpora/(\d+)/explorer$ -> data in json format

urlpatterns = [ url(r'^projects/(\d+)/corpora/(\d+)/explorer$' , explorer )
              , url(r'^projects/(\d+)/corpora/(\d+)/myGraphs$' , myGraphs )
              , url(r'^explorer/intersection/(\w+)$'           , intersection )
              ]
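# ---- graph/utils.py (deleted file; name per the README's module map) ----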
def compress_graph(graphdata):
    """
    Graph data is usually a dict with 2 slots:
      "nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]

    To send this data over the net, this function can reduce a lot of its size:
      - keep fewer decimals for the float value of each link's weight
      - use shorter names for node properties (eg: s/clust_default/cl/)

    Result format:
      "nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.042},...]
    """
    for link in graphdata['links']:
        link['w'] = format(link['w'], '.3f') # keep only 3 decimals (NB: as a string)

    for node in graphdata['nodes']:
        node['lb'] = node['label']
        del node['label']

        #node['attributes']['growth'] = 0.8

        node['at'] = node['attributes']
        del node['attributes']

        node['at']['cl'] = node['at']['clust_default']
        del node['at']['clust_default']

        node['s'] = node['size']
        del node['size']

        if node['type'] == "terms":
            # it's the default type for our format, so we don't need it
            del node['type']
        else:
            node['t'] = node['type']
            del node['type']

    return graphdata


def format_html(link):
    """
    Build an html link adapted to our json message format
    """
    return "<a class='msglink' href='%s'>%s</a>" % (link, link)
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.models import *
from gargantext.constants import *
from gargantext.settings import *

from gargantext.constants import USER_LANG

from datetime import datetime

from gargantext.views.pages.main import get_user_params


@requires_auth
def explorer(request, project_id, corpus_id):
    '''
    Graph explorer, also known as TinaWebJS, using SigmaJS.
    Nodes are ngrams (from the title, abstract or journal name).
    Links represent a proximity measure.
    Data are received RESTfully (see rest.py).
    '''
    # we pass our corpus
    corpus = cache.Node[corpus_id]

    # security check
    user = cache.User[request.user.id]

    if corpus is None:
        raise Http404()

    if not user.owns(corpus):
        return HttpResponseForbidden()

    # get the maplist_id for modifications
    maplist_id = corpus.children(typename="MAPLIST").first().id

    # and the project, just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    # rendered page : explorer.html
    return render(
        template_name = 'explorer.html',
        request = request,
        context = {
            'debug'     : settings.DEBUG ,
            'request'   : request ,
            'user'      : request.user ,
            'date'      : datetime.now() ,
            'project'   : project ,
            'corpus'    : corpus ,
            'maplist_id': maplist_id ,
            'view'      : 'graph' ,
            'user_parameters': get_user_params(request.user),
            'languages' : USER_LANG
        },
    )
@requires_auth
def myGraphs(request, project_id, corpus_id):
    '''
    List all of my graphs.
    Each graph has one COOCCURRENCES node,
    and each graph is saved in that node's hyperdata.
    '''
    user = cache.User[request.user.id]

    # we pass our corpus
    corpus = cache.Node[corpus_id]

    # and the project, just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    coocs = corpus.children('COOCCURRENCES', order=True).all()
    coocs_count = dict()

    for cooc in coocs:
        # FIXME : approximate number of nodes (not exactly what the user sees in the explorer)
        # Needs to be connected with graph clustering
        cooc_nodes = (session.query(Ngram.id, func.count(Ngram.id))
                             .join(NodeNgramNgram, NodeNgramNgram.ngram1_id == Ngram.id)
                             .filter(NodeNgramNgram.node_id == cooc.id)
                             .filter(NodeNgramNgram.weight >= 1)
                             .group_by(Ngram.id)
                             .all()
                     )

        #coocs_count[cooc.id] = len(cooc_nodes)
        coocs_count[cooc.id] = len([cooc_node for cooc_node in cooc_nodes if cooc_node[1] > 1])

    print("coocs_count a posteriori", coocs_count)

    return render(
        template_name = 'pages/corpora/myGraphs.html',
        request = request,
        context = {
            'debug'        : settings.DEBUG,
            'request'      : request,
            'user'         : request.user,
            'date'         : datetime.now(),
            'project'      : project,
            'resourcename' : get_resource_by_name(corpus),
            'corpus'       : corpus,
            'view'         : 'myGraph',
            'coocs'        : coocs,
            'coocs_count'  : coocs_count,
            'user_parameters': get_user_params(request.user),
            'languages'    : USER_LANG,
        },
    )
@@ -43,7 +43,6 @@ CELERYBEAT_SCHEDULER = 'djcelery.schedulers.DatabaseScheduler'
 CELERY_IMPORTS = (
     "gargantext.util.toolchain",
     "gargantext.util.crawlers",
-    "gargantext.graph.graph",
     "gargantext.moissonneurs.pubmed",
     "gargantext.moissonneurs.istex",
     "gargantext.util.ngramlists_tools",
@@ -65,7 +64,6 @@ INSTALLED_APPS = [
     'rest_framework',
     'djcelery',
     'gargantext.annotations',
-    'gargantext.graph',
     'gargantext.moissonneurs',
     'gargantext',
 ]
@@ -5,7 +5,6 @@ Views are shared between these modules:
 - `pages`, to present HTML views to the user
 - `contents`, for Python-generated contents
 - `annotations`, to annotate local context of a corpus (as global context)
-- `graph explorer`, to explore graphs
 """
 from django.conf.urls import include, url
@@ -21,9 +20,6 @@ import gargantext.views.pages.urls
 from gargantext.annotations import urls as annotations_urls
 from gargantext.annotations.views import main as annotations_main_view
 
-# Module for graph service
-import gargantext.graph.urls
-
 # Module Scrapers
 import gargantext.moissonneurs.urls
@@ -34,9 +30,6 @@ urlpatterns = [ url(r'^admin/' , admin.site.urls
 , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
 , permanent=False), name="favicon" )
 
-# Module Graph
-, url(r'^' , include( gargantext.graph.urls ) )
-
 # Module Annotation
 # tempo: unchanged doc-annotations routes --
 , url(r'^annotations/', include( annotations_urls ) )
@@ -10,7 +10,7 @@ from . import ngrams
 from . import metrics
 from . import ngramlists
 from . import analytics
-from gargantext.graph.rest import Graph
 
 urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view())
               , url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view())
@@ -37,14 +37,6 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
 # Metrics
 , url(r'^projects/(\d+)/corpora/(\d+)/metrics$', metrics.CorpusMetrics.as_view())
 
-# GraphExplorer
-, url(r'^projects/(\d+)/corpora/(\d+)/explorer$', Graph.as_view())
-# data for graph explorer (json)
-# GET /api/projects/43198/corpora/111107/explorer?
-#     Corresponding view is : /projects/43198/corpora/111107/explorer?
-#     Parameters (example):
-#     explorer?field1=ngrams&field2=ngrams&distance=conditional&bridgeness=5&start=1996-6-1&end=2002-10-5
-
 # Ngrams
 , url(r'^ngrams/?$' , ngrams.ApiNgrams.as_view())