Commit f64dd8ce authored by delanoe

[FEAT] Graph Explorer as a module, only template is working, need to connect the rest api.

parent 1325bf54
"""URL Configuration of GarganText
Views are shared between three main modules:
- `api`, for JSON and CSV interaction with data
- `pages`, to present HTML views to the user
- `contents`, for Python-generated contents
Views are shared between these modules:
- `api`, for JSON and CSV interaction with data
- `pages`, to present HTML views to the user
- `contents`, for Python-generated contents
- `annotations`, to annotate local context of a corpus (as global context)
- `graph explorer`, to explore graphs
"""
from django.conf.urls import include, url
......@@ -14,10 +16,15 @@ import gargantext.views.api.urls
import gargantext.views.generated.urls
import gargantext.views.pages.urls
# tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
# Module Annotation
## tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
url(r'^admin/', admin.site.urls),
......@@ -25,7 +32,13 @@ urlpatterns = [
url(r'^api/', include(gargantext.views.api.urls)),
url(r'^', include(gargantext.views.pages.urls)),
# tempo: unchanged doc-annotations routes --
# Module Annotation
# tempo: unchanged doc-annotations routes --
url(r'^annotations/', include(annotations_urls)),
url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),
url(r'^annotations/', include(annotations_urls))
# Module "Graph Explorer"
url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.nodes import Node
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdataNgram, NodeHyperdata, Hyperdata
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
import datetime
import inspect
def do_cooc(corpus=None,
            field1='ngrams', field2='ngrams',
            main_id=None, stop_id=None, group_id=None,
            cvalue_id=None,
            n_min=1, n_max=None,
            start=None, end=None,
            limit=1000,
            isMonopartite=True,
            hapax=3,
            session=None):
    '''
    Compute the cooccurrence matrix of a corpus, save it under the corpus'
    COOCCURRENCES node and return that node's id.

    The COOCCURRENCES node is created if the corpus has none yet; the rows
    previously stored for that node are deleted before the new ones are saved.

    For the moment lists of parameters are not supported because lists need
    to be merged before.

    corpus        :: Corpus -- its `id` and `user_id` are read
    field1        :: Str    -- hyperdata name crossed with the ngrams when
                               isMonopartite is False
    field2        :: Str    -- currently unused
    main_id       :: Int    -- MAINLIST id (monopartite only)
    stop_id       :: Int    -- STOPLIST id, only used together with main_id
    group_id      :: Int    -- GROUPLIST id, only used together with main_id
    cvalue_id     :: Int    -- currently unused
    n_min, n_max  :: Int    -- inclusive bounds on the ngram size `Ngram.n`
    start, end    :: Date   -- bounds on the 'publication_date' hyperdata;
                               parsed with the format '%Y-%m-%d'
                               (example: '2010-05-30')
    limit         :: Int    -- currently unused
    isMonopartite :: Bool   -- ngrams x ngrams if True, hyperdata x ngrams
                               otherwise
    hapax         :: Int    -- only pairs whose count is > hapax are kept
    session       :: SQLAlchemy session used for every query
    '''
    # TODO : add hyperdata here

    # Security test
    field1, field2 = str(field1), str(field2)

    # Get (or create) the node holding the cooccurrences of this corpus
    node_cooc = session.query(Node).filter(
        Node.parent_id == corpus.id,
        Node.typename == "COOCCURRENCES"
    ).first()

    if node_cooc is None:
        node_cooc = Node(
            name="Coccurrences node",
            parent_id=corpus.id,
            user_id=corpus.user_id,
            typename="COOCCURRENCES")
        session.add(node_cooc)
        session.commit()

    # TODO : save the parameters of the analysis (every argument except
    #        `corpus`) in the JSONB hyperdata field of node_cooc.

    # Drop the previous results attached to this node
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeNgramX.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      )
    else:
        # NOTE(review): cooc_score counts NodeNgramX.node_id, but NodeNgramX is
        # not joined in this bipartite query -- confirm this is intended.
        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeHyperdataNgram.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      .filter(Hyperdata.name == field1)
                      )

    # Size of the ngrams between n_min and n_max (both inclusive)
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id)

        if n_min is not None:
            cooc_query = cooc_query.filter(NgramY.n >= n_min)
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n >= n_min)

        if n_max is not None:
            # Upper bound: the previous code re-applied `>= n_min` here, so
            # n_max was silently ignored.
            cooc_query = cooc_query.filter(NgramY.n <= n_max)
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        # TODO : more complex date formats here.
        date_start = datetime.datetime.strptime(str(start), "%Y-%m-%d")
        start_bound = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                      .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                      .filter(StartFormat.name == 'publication_date')
                      .filter(Start.value_datetime >= start_bound)
                      )

    if end is not None:
        # TODO : more complex date formats here.
        date_end = datetime.datetime.strptime(str(end), "%Y-%m-%d")
        end_bound = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                      .join(EndFormat, EndFormat.id == End.hyperdata_id)
                      .filter(EndFormat.name == 'publication_date')
                      .filter(End.value_datetime <= end_bound)
                      )

    if isMonopartite:
        # Cooc is symmetric: keep a single (x, y) pair with x < y
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    # Keep only the pairs seen more than `hapax` times
    cooc_query = cooc_query.having(cooc_score > hapax)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    cooc_query = cooc_query.order_by(desc('cooc_score'))
    # END of the query

    matrix = LISTTYPES["COOCCURRENCES"](cooc_query)

    if isMonopartite:
        if main_id is not None:
            main_list = LISTTYPES["MAINLIST"](main_id)
        if stop_id is not None:
            stop_list = LISTTYPES["STOPLIST"](stop_id)
        if group_id is not None:
            group_list = LISTTYPES["GROUPLIST"](group_id)

        # The stop and group lists are only applied on top of a main list
        if main_id is not None and stop_id is None and group_id is None:
            cooc = matrix & main_list
        elif main_id is not None and stop_id is not None and group_id is None:
            cooc = matrix & (main_list - stop_list)
        elif main_id is not None and stop_id is not None and group_id is not None:
            print("main_id is not None and stop_id is not None and group_id is not None")
            cooc = matrix & (main_list * group_list - stop_list)
        elif main_id is not None and stop_id is None and group_id is not None:
            cooc = matrix & (main_list * group_list)
        else:
            cooc = matrix
    else:
        cooc = matrix

    cooc.save(node_cooc.id)
    return node_cooc.id
......@@ -6,9 +6,9 @@ from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
def compute_coocs(corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
def compute_coocs( corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None,
stoplist_id = None,
start = None,
......@@ -23,10 +23,10 @@ def compute_coocs(corpus,
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
--------+----------+-------- ----------+-----------+-------+
MYDOCA | 487 | 1 => 487 | 294 | 2 |
MYDOCA | 294 | 3
MYDOCB | 487 | 1
MYDOCB | 294 | 4
MyDocA | 487 | 1 => 487 | 294 | 2 |
MyDocA | 294 | 3
MyDocB | 487 | 1
MyDocB | 294 | 4
Fill that info in DB:
- a *new* COOCCURRENCES node
......@@ -103,8 +103,8 @@ def compute_coocs(corpus,
coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.filter(x1.node_id.in_(docids_subquery)) # <- b/c within corpus
.group_by(x1.ngram_id, x2.ngram_id)
)
......@@ -209,7 +209,7 @@ def compute_coocs(corpus,
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
......
from django.conf.urls import url
from . import main, auth
from . import projects, corpora, terms, graph
from . import projects, corpora, terms
urlpatterns = [
......@@ -29,7 +29,4 @@ urlpatterns = [
# terms table for the corpus
url(r'^projects/(\d+)/corpora/(\d+)/terms/?$', terms.ngramtable),
# graph explorer
url(r'^projects/(\d+)/corpora/(\d+)/graph/?$', graph.explorer),
]
from gargantext.util.db import session
from collections import defaultdict
from operator import itemgetter
from django.db import connection, transaction
import math
from math import log,sqrt
import numpy as np
import pandas as pd
from copy import copy
import networkx as nx
from networkx.readwrite import json_graph
from graphExplorer.louvain import best_partition, generate_dendogram, partition_at_level
from sqlalchemy.orm import aliased
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
    '''
    do_distance :: Int -> (Graph, Partition, {ids}, {weight})

    Build a graph from the cooccurrences stored under the node `cooc_id`.

    cooc_id  :: Int -- node_id of the NodeNgramNgram rows to read
    field1   :: label stored with the x ngram ids in `ids`
    field2   :: label stored with the y ngram ids in `ids`
    isMonopartite :: currently unused
    distance :: 'conditional' | 'distributional' | 'cosine'
                (any other value falls back to 'conditional')

    Returns a tuple (G, partition, ids, weight) where
      G         is the networkx graph, isolated nodes removed,
      partition is the result of best_partition on the undirected graph,
      ids       maps an ngram id to the tuple (field, ngram_id),
      weight    maps an ngram id to the sum of the scores of its pairs.
    '''
    # Function-scope import: NodeNgramNgram is not imported at the top of this
    # file; the path is the one used by the cooccurrence module.
    from gargantext.models.ngrams import NodeNgramNgram

    # implicit global session

    authorized = ['conditional', 'distributional', 'cosine']
    if distance not in authorized:
        distance = 'conditional'

    matrix = defaultdict(lambda: defaultdict(float))
    ids = defaultdict(lambda: defaultdict(int))
    weight = dict()

    Cooc = aliased(NodeNgramNgram)
    query = session.query(Cooc).filter(Cooc.node_id == cooc_id).all()

    for cooc in query:
        # the matrix is stored symmetrically
        matrix[cooc.ngramx_id][cooc.ngramy_id] = cooc.score
        matrix[cooc.ngramy_id][cooc.ngramx_id] = cooc.score

        ids[cooc.ngramx_id] = (field1, cooc.ngramx_id)
        ids[cooc.ngramy_id] = (field2, cooc.ngramy_id)

        weight[cooc.ngramx_id] = weight.get(cooc.ngramx_id, 0) + cooc.score
        weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score

    x = pd.DataFrame(matrix).fillna(0)

    if distance == 'conditional':
        x = x / x.sum(axis=1)
        #y = y / y.sum(axis=0)

        xs = x.sum(axis=1) - x
        ys = x.sum(axis=0) - x

        # top included or excluded
        n = (xs + ys) / (2 * (x.shape[0] - 1))
        # top generic or specific
        m = (xs - ys) / (2 * (x.shape[0] - 1))

        # NOTE(review): `.sort` is deprecated in recent pandas releases --
        # confirm the intended ordering before upgrading the pinned version.
        n = n.sort(inplace=False)
        m = m.sort(inplace=False)

        nodes_included = 500 #int(round(size/20,0))
        #nodes_excluded = int(round(size/10,0))

        nodes_specific = 500 #int(round(size/10,0))
        #nodes_generic = int(round(size/10,0))

        # TODO use the included score for the node size
        n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
        # Generic:
        #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
        # Specific:
        m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
        #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

        x_index = pd.Index.union(n_index, m_index)
        xx = x[list(x_index)].T[list(x_index)]

        # Removing unconnected nodes: the threshold is the smallest of the
        # per-row maxima, values below it are zeroed
        xxx = xx.values
        threshold = min(xxx.max(axis=1))
        matrix_filtered = np.where(xxx >= threshold, xxx, 0)
        #matrix_filtered = matrix_filtered.resize((90,90))

        G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
        # relabel the positional node ids with the ngram ids
        G = nx.relabel_nodes(G, dict(enumerate([ids[id_][1] for id_ in list(xx.columns)])))

    elif distance == 'cosine':
        scd = defaultdict(lambda: defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                numerator = sum(
                    [
                        matrix[i][k] * matrix[j][k]
                        for k in matrix.keys()
                        if i != j and k != i and k != j
                    ]
                )

                # Product of the row sums of i and of j; the previous code
                # used row i for both factors.
                denominator = sqrt(
                    sum([
                        matrix[i][k]
                        for k in matrix.keys()
                        if k != i and k != j #and matrix[i][k] > 0
                    ])
                    *
                    sum([
                        matrix[j][k]
                        for k in matrix.keys()
                        if k != i and k != j #and matrix[j][k] > 0
                    ])
                )

                try:
                    scd[i][j] = numerator / denominator
                except ZeroDivisionError:
                    scd[i][j] = 0

        # automatic threshold: smallest of the per-column maxima
        minmax = min([max([scd[i][j] for i in scd.keys()]) for j in scd.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [
                (i, j, {'weight': scd[i][j]})
                for i in scd.keys() for j in scd.keys()
                if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
            ]
        )

    elif distance == 'distributional':
        mi = defaultdict(lambda: defaultdict(int))
        total_cooc = x.sum().sum()

        for i in matrix.keys():
            si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
            for j in matrix[i].keys():
                sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
                if i != j:
                    mi[i][j] = log(matrix[i][j] / ((si * sj) / total_cooc))

        r = defaultdict(lambda: defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                sumMin = sum(
                    [
                        min(mi[i][k], mi[j][k])
                        for k in matrix.keys()
                        if i != j and k != i and k != j and mi[i][k] > 0
                    ]
                )

                sumMi = sum(
                    [
                        mi[i][k]
                        for k in matrix.keys()
                        if k != i and k != j and mi[i][k] > 0
                    ]
                )

                try:
                    r[i][j] = sumMin / sumMi
                except ZeroDivisionError:
                    r[i][j] = 0

        # Need to filter the weak links, automatic threshold here
        minmax = min([max([r[i][j] for i in r.keys()]) for j in r.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [
                (i, j, {'weight': r[i][j]})
                for i in r.keys() for j in r.keys()
                if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
            ]
        )

    # TODO : find an automatic way to prune the nodes that are too connected
    #        (e.g. keep only the heaviest edges of the top-degree nodes).

    G.remove_nodes_from(nx.isolates(G))

    partition = best_partition(G.to_undirected())

    return (G, partition, ids, weight)
This diff is collapsed.
This diff is collapsed.
from gargantext.util.http import APIView, APIException, JsonHttpResponse
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session
from graphExplorer.functions import get_cooc
class Graph(APIView):
    '''REST view returning the graph data of a corpus as JSON.'''
    #authentication_classes = (SessionAuthentication, BasicAuthentication)

    def get(self, request, corpus_id):
        '''
        Graph.get :: Get graph data as REST api.
        Get all the parameters first
        graph?field1=ngrams&field2=ngrams&
        graph?field1=ngrams&field2=ngrams&start=''&end=''

        Query-string parameters read: field1, field2, start, end, threshold,
        bridgeness, format, distance.
        field1 and field2 are only validated; they are not forwarded yet.

        Returns a JsonHttpResponse: the graph data, or a usage message when
        field1/field2 is not accepted or when the format is not 'json'.
        '''
        # Function-scope import: Node is not imported at the top of this file;
        # the path is the one used by the cooccurrence module.
        from gargantext.models.nodes import Node

        # implicit global session
        params = request.GET
        field1 = params.get('field1', 'ngrams')
        field2 = params.get('field2', 'ngrams')

        start = params.get('start', None)
        end = params.get('end', None)

        # NOTE(review): values coming from the query string are strings while
        # the defaults are ints -- confirm what get_cooc expects.
        threshold = params.get('threshold', 1)
        bridgeness = params.get('bridgeness', -1)

        format_ = params.get('format', 'json')
        distance = params.get('distance', 'conditional')

        accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
        accepted_field2 = ['ngrams',]
        options = ['start', 'end', 'threshold', 'distance']

        # Reject both bad field1 and bad field2 (a bad field2 used to fall
        # through and make the view return None).
        if field1 not in accepted_field1 or field2 not in accepted_field2:
            return JsonHttpResponse({
                'Warning USAGE' : 'One field for each range:'
                , 'field1' : accepted_field1
                , 'field2' : accepted_field2
                , 'options': options
                })

        # Any other format used to make the view return None.
        if format_ != 'json':
            return JsonHttpResponse({
                'Warning USAGE' : 'Only the json format is supported'
                , 'format' : ['json']
                })

        corpus = session.query(Node).filter(Node.id == corpus_id).first()

        # The helper imported by this module is get_cooc; the former call to
        # `compute_cooc` referenced a name that is not defined here.
        # NOTE(review): get_cooc's signature is not visible from this file;
        # corpus is passed by keyword -- confirm the parameter names.
        if start is not None and end is not None:
            # NOTE(review): bridgeness is not forwarded on this branch, as in
            # the original code -- confirm whether that is intended.
            data = get_cooc( corpus = corpus
                           #, field1=field1 , field2=field2
                           , start = start , end = end
                           , threshold = threshold
                           , distance = distance
                           )
        else:
            data = get_cooc( corpus = corpus
                           #, field1=field1, field2=field2
                           , threshold = threshold
                           , distance = distance
                           , bridgeness = bridgeness
                           )

        return JsonHttpResponse(data)
mv /srv/gargantext/static/js/tina* .
from django.conf.urls import patterns, url
from graphExplorer import views
# NOTE(review): the previous comment said these patterns are *without* the
# trailing slash, yet both regexes below require one -- confirm which is meant.
# Plain list instead of the deprecated `patterns('', ...)` wrapper, as in the
# project's main URL configuration.
urlpatterns = [
    url(r'^register/$', views.Register.as_view()), # Register
    url(r'^login/$', views.Login.as_view()), # Login
]
......@@ -26,7 +26,7 @@ def explorer(request, project_id, corpus_id):
# rendered page : journals.html
return render(
template_name = 'pages/graph.html',
template_name = 'graphExplorer/explorer.html',
request = request,
context = {
'debug' : settings.DEBUG,
......
......@@ -27,3 +27,4 @@ ujson==1.35
umalqurra==0.2 # arabic calendars (?? why use ??)
wheel==0.29.0
pandas==0.18.0
networkx==1.11
This diff is collapsed.
......@@ -66,9 +66,10 @@
<div class="col-md-12">
<div class="btn-group btn-group-justified">
<center>
<!--
<a type="button" class="btn btn-default
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/">Export corpus</a>
-->
<!-- <li class="divider"></li> -->
<a type="button" class="btn btn-default
......@@ -84,9 +85,11 @@
<!-- <li class="divider"></li> -->
<!-- FIXME put a separator here -->
<!--
<a type="button" class="btn btn-default
{% if view == "charts" %}active{%endif%}"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/terms">Advanced Charts</a>
-->
<!-- FIXME a pop up for advanced mode of graphs -->
<a type="button" class="btn btn-default
......@@ -95,9 +98,11 @@
<a type="button" class="btn btn-default
{% if view == "distributional" %}active{%endif%}"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/graph">Graphs (Distributional)</a>
<!--
<a type="button" class="btn btn-default
{% if view == "journalTerms" %}active{%endif%}"
href="/projects/{{project.id}}/corpora/{{ corpus.id }}/graph">Graphs Journal/Terms</a>
-->
</center>
</div>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment