Commit 17e3a94b authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 10a448fe 937e2c70
from admin.utils import PrintException
from gargantext_web.db import * from gargantext_web.db import *
from collections import defaultdict from collections import defaultdict
from django.db import connection, transaction from django.db import connection, transaction
import math
from math import log from math import log
import scipy
def diag_null(x):
    """Return ``x`` with the entries of its main diagonal set to zero.

    The diagonal is cancelled by subtracting from ``x`` its element-wise
    product with an identity matrix of size ``x.shape[0]``; ``x`` itself is
    not modified.

    :param x: square 2-D object exposing ``shape`` and supporting
        element-wise ``*`` and ``-`` (presumably a NumPy array or a pandas
        DataFrame -- confirm against callers).
    :returns: a new object of the same shape as ``x`` whose diagonal is 0.
    """
    # ``scipy.eye`` was a deprecated alias of ``numpy.eye`` and is absent
    # from recent SciPy releases, so call NumPy directly. The import is local
    # because this module only imports NumPy inside ``get_cooc``.
    import numpy as np

    return x - x * np.eye(x.shape[0])
def create_blacklist(user, corpus):
    """Placeholder for blacklist creation; not implemented yet.

    :param user: currently unused.
    :param corpus: currently unused.
    :returns: ``None`` (the body is empty).
    """
    pass
def create_synonymes(user, corpus):
    """Placeholder for synonym-list creation; not implemented yet.

    :param user: currently unused.
    :param corpus: currently unused.
    :returns: ``None`` (the body is empty).
    """
    pass
def create_whitelist(user, corpus_id, size=100): size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor() cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id whitelist_type_id = cache.NodeType['WhiteList'].id
...@@ -56,13 +66,13 @@ def create_whitelist(user, corpus_id, size=100): ...@@ -56,13 +66,13 @@ def create_whitelist(user, corpus_id, size=100):
GROUP BY GROUP BY
ngX.id ngX.id
Having Having
COUNT(*) >= 1 COUNT(*) >= %d
ORDER BY ORDER BY
occurrences DESC occurrences DESC
LIMIT LIMIT
%d %d
; ;
""" % (white_list.id, int(corpus_id), int(type_document_id), size) """ % (white_list.id, int(corpus_id), int(type_document_id), count_min, size)
# print("PRINTING QUERY OF WHITELIST:") # print("PRINTING QUERY OF WHITELIST:")
# print(query_whitelist) # print(query_whitelist)
cursor.execute(query_whitelist) cursor.execute(query_whitelist)
...@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100): ...@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
return white_list return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes): #def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None): def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
cursor = connection.cursor() cursor = connection.cursor()
cooc_type_id = cache.NodeType['Cooccurrence'].id cooc_type_id = cache.NodeType['Cooccurrence'].id
...@@ -135,67 +145,120 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start= ...@@ -135,67 +145,120 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
cursor.execute(query_cooc) cursor.execute(query_cooc)
return cooc.id return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150): def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
import pandas as pd import pandas as pd
from copy import copy from copy import copy
import numpy as np import numpy as np
import scipy
import networkx as nx import networkx as nx
from networkx.readwrite import json_graph from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all(): for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score) # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0] matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0] matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
df = pd.DataFrame(matrix).fillna(0) x = pd.DataFrame(matrix).fillna(0)
x = copy(df.values) y = pd.DataFrame(matrix).fillna(0)
x = x / x.sum(axis=1) # x = copy(df.values)
# y = copy(df.values)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
# import pprint xs = x.sum(axis=1) - x
# pprint.pprint(ids) ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] -1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] -1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
print(n)
print(m)
nodes_included = int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes # import pprint
threshold = min(x.max(axis=1)) # pprint.pprint(ids)
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G) # Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link": if type == "node_link":
for node in G.nodes(): for node in G.nodes():
......
...@@ -71,7 +71,7 @@ def apply_workflow(corpus_id): ...@@ -71,7 +71,7 @@ def apply_workflow(corpus_id):
print(error) print(error)
extract_ngrams(corpus, ['title']) extract_ngrams(corpus, ['title', 'abstract'])
compute_tfidf(corpus) compute_tfidf(corpus)
try: try:
......
...@@ -333,7 +333,7 @@ ...@@ -333,7 +333,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!") $("#submit_thing").html("Process a 1000 sample!")
thequeries = data thequeries = data
var N=0,k=0; var N=0,k=0;
...@@ -370,7 +370,7 @@ ...@@ -370,7 +370,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!") $("#submit_thing").html("Process a 1000 sample!")
thequeries = data thequeries = data
var N=data.length,k=0; var N=data.length,k=0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment