Commit 8a06bb60 authored by Administrator

[FEAT] Graph improved with generic and specific nodes.

parent cdac66e2
from admin.utils import PrintException
from gargantext_web.db import *
from collections import defaultdict
from django.db import connection, transaction
import math
from math import log
import scipy
def diag_null(x):
    """Return ``x`` with its diagonal entries zeroed out.

    The result is computed as ``x - x * I`` where ``I`` is the identity
    matrix of order ``x.shape[0]``; ``x`` itself is not modified.

    NOTE(review): this zeroes the diagonal only when ``*`` is element-wise
    for the type of ``x`` (e.g. a square ``numpy.ndarray``) -- confirm
    against callers.
    """
    # The NumPy aliases in the top-level scipy namespace (``scipy.eye`` ...)
    # were deprecated in SciPy 1.4 and are missing from recent releases, so
    # the identity matrix is taken from numpy directly.
    import numpy as np

    return x - x * np.eye(x.shape[0])
def create_blacklist(user, corpus):
    """Placeholder: not implemented yet; ignores its arguments and returns None."""
    return None
def create_synonymes(user, corpus):
    """Placeholder: not implemented yet; ignores its arguments and returns None."""
    return None
# Module-level default used as the `size=size` default argument of
# create_whitelist, create_cooc and get_cooc. Default values are bound when
# each function is defined, so rebinding `size` later does not change them.
size = 1000
def create_whitelist(user, corpus_id, size=100):
def create_whitelist(user, corpus_id, size=size):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
......@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
cursor = connection.cursor()
cooc_type_id = cache.NodeType['Cooccurrence'].id
......@@ -135,67 +145,110 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
cursor.execute(query_cooc)
return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
import pandas as pd
from copy import copy
import numpy as np
import scipy
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
# x = copy(df.values)
# y = copy(df.values)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
df = pd.DataFrame(matrix).fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus
n = ( xs + ys) / (2 * (x.shape[0] -1))
# top specific
m = ( xs - ys) / (2 * (x.shape[0] -1))
m = pd.DataFrame.abs(m)
n = n.sort(inplace=False)
m = m.sort(inplace=False)
matrix_size = int(round(size/2,0))
# import pprint
# pprint.pprint(ids)
n_index = pd.Index.intersection(x.index, n.index[-matrix_size:])
m_index = pd.Index.intersection(x.index, m.index[-matrix_size:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes
threshold = min(x.max(axis=1))
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
# import pprint
# pprint.pprint(ids)
partition = best_partition(G)
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
for node in G.nodes():
......
......@@ -28,6 +28,9 @@
##app.config_from_object('django.conf:settings')
#app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
#
from admin.utils import PrintException
from celery import shared_task
from node import models
......@@ -67,18 +70,19 @@ def apply_workflow(corpus_id):
# session.add(corpus)
# session.flush()
except Exception as error:
print(error)
except :
PrintException()
extract_ngrams(corpus, ['title'])
#extract_ngrams(corpus, ['title',])
extract_ngrams(corpus, ['title', 'abstract'])
compute_tfidf(corpus)
try:
corpus_django.metadata['Processing'] = 0
corpus_django.save()
except Exception as error:
print(error)
except :
PrintException()
......@@ -44,7 +44,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 1000
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment