[FEAT] Graph improved with generic and specific nodes.

8a06bb60 · Administrator · cdac66e2 · 8a06bb60 · 8a06bb60 · 8a06bb60
Commit 8a06bb60 authored Apr 20, 2015 by Administrator
Hide whitespace changes
Inline Side-by-side

Showing with 106 additions and 49 deletions

functions.py analysis/functions.py +96 -43

celery.py gargantext_web/celery.py +9 -5

views.py scrappers/scrap_pubmed/views.py +1 -1

No files found.
--- a/analysis/functions.py
+++ b/analysis/functions.py
+from admin.utils import PrintException
 from gargantext_web.db import *

 from collections import defaultdict
 from django.db import connection, transaction

+import math
 from math import log

+import scipy
+
+def diag_null(x):
+    return x - x * scipy.eye(x.shape[0])
+
 def create_blacklist(user, corpus):
    pass

 def create_synonymes(user, corpus):
    pass
+    
+
+size = 1000 

-def create_whitelist(user, corpus_id, size=100):
+def create_whitelist(user, corpus_id, size=size):
    cursor = connection.cursor()
    
    whitelist_type_id = cache.NodeType['WhiteList'].id
@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
    return white_list

 #def create_cooc(user, corpus, whitelist, blacklist, synonymes):
-def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
+def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
    cursor = connection.cursor()

    cooc_type_id  = cache.NodeType['Cooccurrence'].id
@@ -135,67 +145,110 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
    cursor.execute(query_cooc)
    return cooc.id

-def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
+def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
    import pandas as pd
    from copy import copy
    import numpy as np
+    import scipy
    import networkx as nx
    from networkx.readwrite import json_graph
    from gargantext_web.api import JsonHttpResponse
    
    from analysis.louvain import best_partition
+    
+    #print(corpus_id, cooc_id)
+
+    try:
+        matrix = defaultdict(lambda : defaultdict(float))
+        ids    = dict()
+        labels = dict()
+        weight = dict()
+
+        type_cooc_id = cache.NodeType['Cooccurrence'].id
+
+        if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
+            print("Coocurrences do not exist yet, create it.")
+            whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
+            cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
+        else:
+            cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
+        

-    matrix = defaultdict(lambda : defaultdict(float))
-    ids    = dict()
-    labels = dict()
-    weight = dict()
-
-    type_cooc_id = cache.NodeType['Cooccurrence'].id

-    if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
-        print("Coocurrences do not exist yet, create it.")
-        whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
-        cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
-    else:
-        cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
+        for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
+            # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
+            labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
+            labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]

-    for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
-        # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
+            matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
+            matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score

-        labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
-        labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
+            ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
+            ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id

-        ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
-        ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
+            weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
+            weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score

-        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
-        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score

-        weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
-        weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
+        x = pd.DataFrame(matrix).fillna(0)
+        y = pd.DataFrame(matrix).fillna(0)
+#    x = copy(df.values)
+#    y = copy(df.values)
+        #xo = diag_null(x)
+        #y = diag_null(y)
+        
+        x = x / x.sum(axis=1)
+        y = y / y.sum(axis=0)
+        #print(x)

-    df = pd.DataFrame(matrix).fillna(0)
-    x = copy(df.values)
-    x = x / x.sum(axis=1)
+        xs = x.sum(axis=1) - x
+        ys = x.sum(axis=0) - x
+    
+        # top inclus
+        n = ( xs + ys) / (2 * (x.shape[0] -1))
+        # top specific
+        m = ( xs - ys) / (2 * (x.shape[0] -1))
+        m = pd.DataFrame.abs(m)
+        
+        n = n.sort(inplace=False)
+        m = m.sort(inplace=False)
+        
+        matrix_size = int(round(size/2,0))

-    # import pprint
-    # pprint.pprint(ids)
+        n_index = pd.Index.intersection(x.index, n.index[-matrix_size:])
+        m_index = pd.Index.intersection(x.index, m.index[-matrix_size:])
+        
+        x_index = pd.Index.union(n_index, m_index)
+        xx = x[list(x_index)].T[list(x_index)]

-    # Removing unconnected nodes
-    threshold = min(x.max(axis=1))
-    matrix_filtered = np.where(x >= threshold, 1, 0)
-    #matrix_filtered = np.where(x > threshold, x, 0)
-    #matrix_filtered = matrix_filtered.resize((90,90))
-    G = nx.from_numpy_matrix(matrix_filtered)
-    G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
-    #G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
-    # Removing too connected nodes (find automatic way to do it)
-    #    outdeg = G.degree()
-    #    to_remove = [n for n in outdeg if outdeg[n] >= 10]
-    #    G.remove_nodes_from(to_remove)
+        # import pprint
+        # pprint.pprint(ids)

-    partition = best_partition(G)
+        # Removing unconnected nodes
+        xxx = xx.values
+        threshold = min(xxx.max(axis=1))
+        matrix_filtered = np.where(xxx > threshold, xxx, 0)
+        
+        #matrix_filtered = matrix_filtered.resize((90,90))
+    except:
+        PrintException()
+    
+    try:
+        G = nx.from_numpy_matrix(matrix_filtered)
+        G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
+        
+        #print(G)
+        #G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
+        # Removing too connected nodes (find automatic way to do it)
+        #    outdeg = G.degree()
+        #    to_remove = [n for n in outdeg if outdeg[n] >= 10]
+        #    G.remove_nodes_from(to_remove)
+
+        partition = best_partition(G)
+    except:
+        PrintException()
    
+
    if type == "node_link":

        for node in G.nodes():

--- a/gargantext_web/celery.py
+++ b/gargantext_web/celery.py
@@ -28,6 +28,9 @@
 ##app.config_from_object('django.conf:settings')
 #app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
 #
+
+from admin.utils import PrintException
+
 from celery import shared_task
 from node import models

@@ -67,18 +70,19 @@ def apply_workflow(corpus_id):
 #        session.add(corpus)
 #        session.flush()

-    except Exception as error:
-        print(error)
+    except :
+        PrintException()

       
-    extract_ngrams(corpus, ['title'])
+    #extract_ngrams(corpus, ['title',])
+    extract_ngrams(corpus, ['title', 'abstract'])
    compute_tfidf(corpus)
    
    try:
        corpus_django.metadata['Processing'] = 0
        corpus_django.save()
-    except Exception as error:
-        print(error)
+    except :
+        PrintException()



--- a/scrappers/scrap_pubmed/views.py
+++ b/scrappers/scrap_pubmed/views.py
@@ -44,7 +44,7 @@ def getGlobalStats(request ):
 	alist = ["bar","foo"]

 	if request.method == "POST":
-		N = 100
+		N = 1000
 		query = request.POST["query"]
 		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
 		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )