Commit 37ed856a authored by Administrator

Merge branch 'unstable_new_graph' into prod-dev

parents 4bc2c64c 8a06bb60
from admin.utils import PrintException
from gargantext_web.db import * from gargantext_web.db import *
from collections import defaultdict from collections import defaultdict
from django.db import connection, transaction from django.db import connection, transaction
import math
from math import log from math import log
import scipy
def diag_null(x):
    """Return a copy of matrix ``x`` with its main diagonal set to zero.

    The diagonal is removed arithmetically (``x - x * I``), so the input is
    left untouched and the result has a floating-point dtype.

    Args:
        x: array-like object exposing ``.shape`` and supporting element-wise
            ``*`` and ``-`` with a NumPy array (e.g. a 2-D ``numpy.ndarray``).

    Returns:
        A new object of the same shape as ``x`` whose main-diagonal entries
        are zero and whose other entries equal those of ``x``.
    """
    # Local import, matching the function-scope imports used elsewhere in
    # this file; numpy.eye replaces the deprecated scipy.eye alias.
    import numpy as np

    shape = x.shape
    n_rows = shape[0]
    # For 2-D input build a mask of the same (possibly rectangular) shape;
    # otherwise keep the square mask based on the first dimension.
    n_cols = shape[1] if len(shape) == 2 else n_rows
    return x - x * np.eye(n_rows, n_cols)
def create_blacklist(user, corpus):
    """Placeholder for blacklist creation; currently does nothing.

    Args:
        user: accepted but not used by this stub.
        corpus: accepted but not used by this stub.

    Returns:
        None.
    """
    return None
def create_synonymes(user, corpus):
    """Placeholder for synonym-list creation; currently does nothing.

    Args:
        user: accepted but not used by this stub.
        corpus: accepted but not used by this stub.

    Returns:
        None.
    """
    return None
# Module-level default used for the ``size`` keyword argument of
# create_whitelist, create_cooc and get_cooc.
size = 1000
def create_whitelist(user, corpus_id, size=100): def create_whitelist(user, corpus_id, size=size):
cursor = connection.cursor() cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id whitelist_type_id = cache.NodeType['WhiteList'].id
...@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100): ...@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
return white_list return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes): #def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None): def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
cursor = connection.cursor() cursor = connection.cursor()
cooc_type_id = cache.NodeType['Cooccurrence'].id cooc_type_id = cache.NodeType['Cooccurrence'].id
...@@ -135,67 +145,110 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start= ...@@ -135,67 +145,110 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
cursor.execute(query_cooc) cursor.execute(query_cooc)
return cooc.id return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150): def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
import pandas as pd import pandas as pd
from copy import copy from copy import copy
import numpy as np import numpy as np
import scipy
import networkx as nx import networkx as nx
from networkx.readwrite import json_graph from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
type_cooc_id = cache.NodeType['Cooccurrence'].id
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None: for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
print("Coocurrences do not exist yet, create it.") # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n) labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n) labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all(): matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score) matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0] ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0] ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score x = pd.DataFrame(matrix).fillna(0)
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score y = pd.DataFrame(matrix).fillna(0)
# x = copy(df.values)
# y = copy(df.values)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
df = pd.DataFrame(matrix).fillna(0) xs = x.sum(axis=1) - x
x = copy(df.values) ys = x.sum(axis=0) - x
x = x / x.sum(axis=1)
# top inclus
n = ( xs + ys) / (2 * (x.shape[0] -1))
# top specific
m = ( xs - ys) / (2 * (x.shape[0] -1))
m = pd.DataFrame.abs(m)
n = n.sort(inplace=False)
m = m.sort(inplace=False)
matrix_size = int(round(size/2,0))
# import pprint n_index = pd.Index.intersection(x.index, n.index[-matrix_size:])
# pprint.pprint(ids) m_index = pd.Index.intersection(x.index, m.index[-matrix_size:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes # import pprint
threshold = min(x.max(axis=1)) # pprint.pprint(ids)
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G) # Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link": if type == "node_link":
for node in G.nodes(): for node in G.nodes():
......
...@@ -28,6 +28,9 @@ ...@@ -28,6 +28,9 @@
##app.config_from_object('django.conf:settings') ##app.config_from_object('django.conf:settings')
#app.autodiscover_tasks(lambda: settings.INSTALLED_APPS) #app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
# #
from admin.utils import PrintException
from celery import shared_task from celery import shared_task
from node import models from node import models
...@@ -67,18 +70,19 @@ def apply_workflow(corpus_id): ...@@ -67,18 +70,19 @@ def apply_workflow(corpus_id):
# session.add(corpus) # session.add(corpus)
# session.flush() # session.flush()
except Exception as error: except :
print(error) PrintException()
extract_ngrams(corpus, ['title']) #extract_ngrams(corpus, ['title',])
extract_ngrams(corpus, ['title', 'abstract'])
compute_tfidf(corpus) compute_tfidf(corpus)
try: try:
corpus_django.metadata['Processing'] = 0 corpus_django.metadata['Processing'] = 0
corpus_django.save() corpus_django.save()
except Exception as error: except :
print(error) PrintException()
...@@ -44,7 +44,7 @@ def getGlobalStats(request ): ...@@ -44,7 +44,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"] alist = ["bar","foo"]
if request.method == "POST": if request.method == "POST":
N = 100 N = 1000
query = request.POST["query"] query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment