Commit 02bbc918 authored by Administrator's avatar Administrator

[FEAT] PubMed Scraper, 1000 documents by default is ok.

parent cdac66e2
from admin.utils import PrintException
from gargantext_web.db import *
from collections import defaultdict
from django.db import connection, transaction
import math
from math import log
import scipy
def diag_null(x):
    """Return ``x`` with the entries of its main diagonal set to zero.

    The result is computed as ``x - x * I`` where ``I`` is an identity
    matrix with ``x.shape[0]`` rows and columns; ``x`` itself is not
    modified.

    NOTE(review): this assumes ``x`` is square and that ``*`` is
    element-wise for its type (e.g. ``numpy.ndarray``) -- confirm against
    the callers.
    """
    # ``scipy.eye`` was only an alias of ``numpy.eye`` and is no longer
    # available in the top-level SciPy namespace, so call NumPy directly.
    # The import is local so the block does not depend on a file-level one.
    import numpy as np

    identity = np.eye(x.shape[0])
    return x - x * identity
def create_blacklist(user, corpus):
    """Placeholder: not implemented yet.

    Both arguments are accepted but ignored; the function does nothing
    and returns ``None``.
    """
    return None
def create_synonymes(user, corpus):
    """Placeholder: not implemented yet.

    Both arguments are accepted but ignored; the function does nothing
    and returns ``None``.
    """
    return None
def create_whitelist(user, corpus_id, size=100):
size = 1000
def create_whitelist(user, corpus_id, size=size):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
......@@ -56,7 +66,7 @@ def create_whitelist(user, corpus_id, size=100):
GROUP BY
ngX.id
Having
COUNT(*) >= 1
COUNT(*) >= 3
ORDER BY
occurrences DESC
LIMIT
......@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
return white_list
#def create_cooc(user, corpus, whitelist, blacklist, synonymes):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
cursor = connection.cursor()
cooc_type_id = cache.NodeType['Cooccurrence'].id
......@@ -135,16 +145,20 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
cursor.execute(query_cooc)
return cooc.id
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
import pandas as pd
from copy import copy
import numpy as np
import scipy
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
......@@ -154,40 +168,79 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
df = pd.DataFrame(matrix).fillna(0)
x = copy(df.values)
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
# x = copy(df.values)
# y = copy(df.values)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] -1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] -1))
#m = pd.DataFrame.abs(m)
#n = n.sort(inplace=False)
m = m.sort(inplace=False)
matrix_size = int(round(size/5,0))
# TODO use the generic score for the node size
#n_index = pd.Index.intersection(x.index, n.index[-matrix_size:])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:matrix_size])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-matrix_size:])
x_index = m_index# pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# import pprint
# pprint.pprint(ids)
# Removing unconnected nodes
threshold = min(x.max(axis=1))
matrix_filtered = np.where(x >= threshold, 1, 0)
#matrix_filtered = np.where(x > threshold, x, 0)
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
......@@ -195,6 +248,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
# G.remove_nodes_from(to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
......
......@@ -44,7 +44,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 1000
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......
......@@ -328,7 +328,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
$("#submit_thing").html("Process a 1000 sample!")
thequeries = data
var N=0,k=0;
......@@ -365,7 +365,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
$("#submit_thing").html("Process a 1000 sample!")
thequeries = data
var N=data.length,k=0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment