Commit 17e3a94b authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 10a448fe 937e2c70
from admin.utils import PrintException
from gargantext_web.db import *
from collections import defaultdict
from django.db import connection, transaction
import math
from math import log
import scipy
def diag_null(x):
    """Return ``x`` minus ``x`` multiplied by an identity matrix.

    For inputs whose ``*`` operator is element-wise (e.g. a NumPy ndarray),
    the product keeps only the main diagonal of ``x``, so the result is ``x``
    with its diagonal set to zero. ``x`` itself is not modified.

    ``numpy.eye`` is used instead of ``scipy.eye``: the latter is a
    deprecated NumPy re-export that newer SciPy releases no longer expose.

    :param x: 2-D array-like exposing ``shape`` and supporting arithmetic
        with a NumPy array. Assumed to be square (the identity is built from
        ``x.shape[0]`` only) -- TODO confirm against callers.
    :returns: ``x - x * I`` where ``I`` is the identity of size
        ``x.shape[0]``.
    """
    # Local import: the module header only imports ``scipy`` at top level.
    import numpy as np

    identity = np.eye(x.shape[0])
    return x - x * identity
def create_blacklist(user, corpus):
    """Placeholder for blacklist creation; not implemented yet.

    :param user: accepted but currently unused.
    :param corpus: accepted but currently unused.
    :returns: ``None`` -- the function performs no work.
    """
    return None
def create_synonymes(user, corpus):
    """Placeholder for synonym-list creation; not implemented yet.

    :param user: accepted but currently unused.
    :param corpus: accepted but currently unused.
    :returns: ``None`` -- the function performs no work.
    """
    return None
def create_whitelist(user, corpus_id, size=100):
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
......@@ -56,13 +66,13 @@ def create_whitelist(user, corpus_id, size=100):
GROUP BY
ngX.id
Having
COUNT(*) >= 1
COUNT(*) >= %d
ORDER BY
occurrences DESC
LIMIT
%d
;
""" % (white_list.id, int(corpus_id), int(type_document_id), size)
""" % (white_list.id, int(corpus_id), int(type_document_id), count_min, size)
# print("PRINTING QYERY OF WHITELIST:")
# print(query_whitelist)
cursor.execute(query_whitelist)
......@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
cursor = connection.cursor()
cooc_type_id = cache.NodeType['Cooccurrence'].id
......@@ -135,67 +145,120 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
cursor.execute(query_cooc)
return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
import pandas as pd
from copy import copy
import numpy as np
import scipy
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
df = pd.DataFrame(matrix).fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
# x = copy(df.values)
# y = copy(df.values)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
# import pprint
# pprint.pprint(ids)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] -1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] -1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
print(n)
print(m)
nodes_included = int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes
threshold = min(x.max(axis=1))
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
# import pprint
# pprint.pprint(ids)
partition = best_partition(G)
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
for node in G.nodes():
......
......@@ -71,7 +71,7 @@ def apply_workflow(corpus_id):
print(error)
extract_ngrams(corpus, ['title'])
extract_ngrams(corpus, ['title', 'abstract'])
compute_tfidf(corpus)
try:
......
......@@ -333,7 +333,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
$("#submit_thing").html("Process a 1000 sample!")
thequeries = data
var N=0,k=0;
......@@ -370,7 +370,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
$("#submit_thing").html("Process a 1000 sample!")
thequeries = data
var N=data.length,k=0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment