Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

17e3a94b · PkSM3 · 10a448fe · 937e2c70 · 17e3a94b · 17e3a94b
Commit 17e3a94b authored Apr 23, 2015 by PkSM3
Hide whitespace changes
Inline Side-by-side

Showing with 112 additions and 49 deletions

functions.py analysis/functions.py +109 -46

celery.py gargantext_web/celery.py +1 -1

project.html templates/project.html +2 -2

No files found.
--- a/analysis/functions.py
+++ b/analysis/functions.py
+from admin.utils import PrintException
 from gargantext_web.db import *
 from collections import defaultdict
 from django.db import connection, transaction
+import math
 from math import log
+import scipy
+def diag_null(x):  # NOTE(review): looks intended to return x with its diagonal zeroed, for a square x — confirm
+    return x - x * scipy.eye(x.shape[0])  # subtracts x times an identity of size x.shape[0]; assumes `*` is elementwise for x's type (for np.matrix it would be a matrix product) — verify
 def create_blacklist(user, corpus):  # placeholder: accepts user and corpus but the body is a no-op
    pass
 def create_synonymes(user, corpus):  # placeholder: accepts user and corpus but the body is a no-op
    pass
-def create_whitelist(user, corpus_id, size=100):
+size = 1000 
+def create_whitelist(user, corpus_id, size=size, count_min=2):
    cursor = connection.cursor()
    whitelist_type_id = cache.NodeType['WhiteList'].id
@@ -56,13 +66,13 @@ def create_whitelist(user, corpus_id, size=100):
        GROUP BY
            ngX.id
        Having
-            COUNT(*) >= 1
+            COUNT(*) >= %d
        ORDER BY
            occurrences DESC
        LIMIT
            %d
        ;
-    """  % (white_list.id, int(corpus_id), int(type_document_id), size)
+    """  % (white_list.id, int(corpus_id), int(type_document_id), count_min, size)
    # print("PRINTING QUERY OF WHITELIST:")
    # print(query_whitelist)
    cursor.execute(query_whitelist)
@@ -70,7 +80,7 @@ def create_whitelist(user, corpus_id, size=100):
    return white_list
 #def create_cooc(user, corpus, whitelist, blacklist, synonymes):
-def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=None, year_end=None):
+def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start=None, year_end=None):
    cursor = connection.cursor()
    cooc_type_id  = cache.NodeType['Cooccurrence'].id
@@ -135,67 +145,120 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=150, year_start=
    cursor.execute(query_cooc)
    return cooc.id
-def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150):
+def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=size):
    import pandas as pd
    from copy import copy
    import numpy as np
+    import scipy
    import networkx as nx
    from networkx.readwrite import json_graph
    from gargantext_web.api import JsonHttpResponse
    from analysis.louvain import best_partition
+    #print(corpus_id, cooc_id)
+    try:
+        matrix = defaultdict(lambda : defaultdict(float))
+        ids    = dict()
+        labels = dict()
+        weight = dict()
+        type_cooc_id = cache.NodeType['Cooccurrence'].id
+        if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
+            print("Coocurrences do not exist yet, create it.")
+            whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=size)
+            cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
+        else:
+            cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
-    matrix = defaultdict(lambda : defaultdict(float))
-    ids    = dict()
-    labels = dict()
-    weight = dict()
-    type_cooc_id = cache.NodeType['Cooccurrence'].id
-    if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
-        print("Coocurrences do not exist yet, create it.")
-        whitelist = create_whitelist(request.user, corpus_id=corpus_id, size=n)
-        cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=n)
-    else:
-        cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
-    for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
+        for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
-        # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
+            # print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
+            labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
+            labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
-        labels[cooccurrence.ngramx_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramx_id).first()[0]
+            matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
-        labels[cooccurrence.ngramy_id] = session.query(Ngram.terms).filter(Ngram.id == cooccurrence.ngramy_id).first()[0]
+            matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
-        ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
+            ids[labels[cooccurrence.ngramx_id]] = cooccurrence.ngramx_id
-        ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
+            ids[labels[cooccurrence.ngramy_id]] = cooccurrence.ngramy_id
-        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
+            weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
-        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
+            weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
-        weight[cooccurrence.ngramx_id] = weight.get(cooccurrence.ngramx_id, 0) + cooccurrence.score
-        weight[cooccurrence.ngramy_id] = weight.get(cooccurrence.ngramy_id, 0) + cooccurrence.score
-    df = pd.DataFrame(matrix).fillna(0)
+        x = pd.DataFrame(matrix).fillna(0)
-    x = copy(df.values)
+        y = pd.DataFrame(matrix).fillna(0)
-    x = x / x.sum(axis=1)
+#    x = copy(df.values)
+#    y = copy(df.values)
+        #xo = diag_null(x)
+        #y = diag_null(y)
+        x = x / x.sum(axis=1)
+        y = y / y.sum(axis=0)
+        #print(x)
-    # import pprint
+        xs = x.sum(axis=1) - x
-    # pprint.pprint(ids)
+        ys = x.sum(axis=0) - x
+        # top included or excluded
+        n = ( xs + ys) / (2 * (x.shape[0] -1))
+        # top generic or specific
+        m = ( xs - ys) / (2 * (x.shape[0] -1))
+        n = n.sort(inplace=False)
+        m = m.sort(inplace=False)
+        print(n)
+        print(m)
+        nodes_included = int(round(size/20,0))
+        #nodes_excluded = int(round(size/10,0))
+        nodes_specific = int(round(size/10,0))
+        #nodes_generic = int(round(size/10,0))
+        # TODO use the included score for the node size
+        n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
+        # Generic: 
+        #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
+        # Specific: 
+        m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
+        x_index = pd.Index.union(n_index, m_index)
+        xx = x[list(x_index)].T[list(x_index)]
-    # Removing unconnected nodes
+        # import pprint
-    threshold = min(x.max(axis=1))
+        # pprint.pprint(ids)
-    matrix_filtered = np.where(x >= threshold, 1, 0)
-    #matrix_filtered = np.where(x > threshold, x, 0)
-    #matrix_filtered = matrix_filtered.resize((90,90))
-    G = nx.from_numpy_matrix(matrix_filtered)
-    G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
-    #G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
-    # Removing too connected nodes (find automatic way to do it)
-    #    outdeg = G.degree()
-    #    to_remove = [n for n in outdeg if outdeg[n] >= 10]
-    #    G.remove_nodes_from(to_remove)
-    partition = best_partition(G)
+        # Removing unconnected nodes
+        xxx = xx.values
+        threshold = min(xxx.max(axis=1))
+        matrix_filtered = np.where(xxx > threshold, xxx, 0)
+        #matrix_filtered = matrix_filtered.resize((90,90))
+    except:
+        PrintException()
+    try:
+        G = nx.from_numpy_matrix(matrix_filtered)
+        G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
+        #print(G)
+        #G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
+        # Removing too connected nodes (find automatic way to do it)
+        #    outdeg = G.degree()
+        #    to_remove = [n for n in outdeg if outdeg[n] >= 10]
+        #    G.remove_nodes_from(to_remove)
+        partition = best_partition(G)
+    except:
+        PrintException()
    if type == "node_link":
        for node in G.nodes():

--- a/gargantext_web/celery.py
+++ b/gargantext_web/celery.py
@@ -71,7 +71,7 @@ def apply_workflow(corpus_id):
        print(error)
-    extract_ngrams(corpus, ['title'])
+    extract_ngrams(corpus, ['title', 'abstract'])
    compute_tfidf(corpus)
    try:

--- a/templates/project.html
+++ b/templates/project.html
@@ -333,7 +333,7 @@
 					console.log("enabling "+"#"+value.id)
 					$("#"+value.id).attr('onclick','getGlobalResults(this);');
 					// $("#submit_thing").prop('disabled' , false)
-					$("#submit_thing").html("Process a 100 sample!")
+					$("#submit_thing").html("Process a 1000 sample!")
 		            thequeries = data
 		            var N=0,k=0;
@@ -370,7 +370,7 @@
 					console.log("enabling "+"#"+value.id)
 					$("#"+value.id).attr('onclick','getGlobalResults(this);');
 					// $("#submit_thing").prop('disabled' , false)
-					$("#submit_thing").html("Process a 100 sample!")
+					$("#submit_thing").html("Process a 1000 sample!")
 		            thequeries = data
 		            var N=data.length,k=0;