distances.py 7.74 KB
Newer Older
delanoe's avatar
delanoe committed
1 2 3
from gargantext.models     import Node, NodeNgram, NodeNgramNgram, \
                                  NodeHyperdata
from gargantext.util.db    import session, aliased
4

5
from graph.louvain         import best_partition
delanoe's avatar
delanoe committed
6 7

from copy        import copy
8
from collections import defaultdict
delanoe's avatar
delanoe committed
9 10
from math        import log,sqrt
#from operator import itemgetter
11 12

import math
delanoe's avatar
delanoe committed
13 14
import numpy    as np
import pandas   as pd
15 16
import networkx as nx

17
def clusterByDistances( cooc_matrix
18
               , field1=None, field2=None
delanoe's avatar
delanoe committed
19
               , distance=None):
20
    '''
delanoe's avatar
delanoe committed
21
    clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
22
    '''
23

24 25 26 27
    # implicit global session

    authorized = ['conditional', 'distributional', 'cosine']
    if distance not in authorized:
delanoe's avatar
delanoe committed
28
        raise ValueError("Distance must be in %s" % str(authorized))
29 30 31 32 33 34

    matrix = defaultdict(lambda : defaultdict(float))
    ids    = defaultdict(lambda : defaultdict(int))
    labels = dict()
    weight = dict()

35 36 37 38
    for cooc in cooc_matrix.items:
        ngram1_id = cooc[0]
        ngram2_id = cooc[1]
        ccweight = cooc_matrix.items[cooc]
39

40 41
        matrix[ngram1_id][ngram2_id] = ccweight
        matrix[ngram2_id][ngram1_id] = ccweight
42

43 44
        ids[ngram1_id] = (field1, ngram1_id)
        ids[ngram2_id] = (field2, ngram2_id)
45

46 47
        weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
        weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62

    x = pd.DataFrame(matrix).fillna(0)

    if distance == 'conditional':
        x = x / x.sum(axis=1)
        #y = y / y.sum(axis=0)

        xs = x.sum(axis=1) - x
        ys = x.sum(axis=0) - x

        # top inclus ou exclus
        n = ( xs + ys) / (2 * (x.shape[0] - 1))
        # top generic or specific
        m = ( xs - ys) / (2 * (x.shape[0] - 1))

63 64
        n = n.sort_index(inplace=False)
        m = m.sort_index(inplace=False)
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90

        nodes_included = 500 #int(round(size/20,0))
        #nodes_excluded = int(round(size/10,0))

        nodes_specific = 500 #int(round(size/10,0))
        #nodes_generic = int(round(size/10,0))

        # TODO use the included score for the node size
        n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
        # Generic:
        #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
        # Specific:
        m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
        #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

        x_index = pd.Index.union(n_index, m_index)
        xx = x[list(x_index)].T[list(x_index)]

        # Removing unconnected nodes
        xxx = xx.values
        threshold = min(xxx.max(axis=1))
        matrix_filtered = np.where(xxx >= threshold, xxx, 0)
        #matrix_filtered = matrix_filtered.resize((90,90))

        G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
        G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
91

92 93 94 95 96 97 98 99 100 101 102 103
    elif distance == 'cosine':
        scd = defaultdict(lambda : defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                numerator = sum(
                                [
                                matrix[i][k] * matrix[j][k]
                                    for k in matrix.keys()
                                    if i != j and k != i and k != j
                                ]
                            )
104

105 106
                denominator  = sqrt(
                                    sum([
107
                                    matrix[i][k]
108 109 110 111 112
                                        for k in matrix.keys()
                                        if k != i and k != j #and matrix[i][k] > 0
                                       ])
                                    *
                                    sum([
113
                                    matrix[i][k]
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
                                        for k in matrix.keys()
                                        if k != i and k != j #and matrix[i][k] > 0
                                       ])

                               )

                try:
                    scd[i][j] = numerator / denominator
                except Exception as error:
                    scd[i][j] = 0

        minmax = min([ max([ scd[i][j] for i in scd.keys()]) for j in scd.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
                          [
130
                            (i, j, {'weight': scd[i][j]})
131 132 133 134 135 136 137 138 139 140
                                for i in scd.keys() for j in scd.keys()
                                if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
                          ]
                        )



    elif distance == 'distributional':
        mi = defaultdict(lambda : defaultdict(int))
        total_cooc = x.sum().sum()
141

142 143 144 145 146 147
        for i in matrix.keys():
            si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
            for j in matrix[i].keys():
                sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
                if i!=j :
                    mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
148

149
        r = defaultdict(lambda : defaultdict(int))
150

151 152 153 154 155 156 157 158 159
        for i in matrix.keys():
            for j in matrix.keys():
                sumMin = sum(
                                [
                                min(mi[i][k], mi[j][k])
                                    for k in matrix.keys()
                                    if i != j and k != i and k != j and mi[i][k] > 0
                                ]
                            )
160

161 162
                sumMi  = sum(
                                [
163
                                mi[i][k]
164 165 166 167 168 169 170 171 172
                                    for k in matrix.keys()
                                    if k != i and k != j and mi[i][k] > 0
                                ]
                            )

                try:
                    r[i][j] = sumMin / sumMi
                except Exception as error:
                    r[i][j] = 0
173

174 175 176 177 178 179
        # Need to filter the weak links, automatic threshold here
        minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
                          [
180
                            (i, j, {'weight': r[i][j]})
181 182 183 184
                                for i in r.keys() for j in r.keys()
                                if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
                          ]
                        )
185

186 187 188 189 190 191 192 193 194 195 196 197 198 199
#        degree_max = max([(n, d) for n,d in G.degree().items()], key=itemgetter(1))[1]
#        nodes_to_remove = [n for (n,d) in G.degree().items() if d <= round(degree_max/2)]
#        G.remove_nodes_from(nodes_to_remove)

    # Removing too connected nodes (find automatic way to do it)
    #edges_to_remove = [ e for e in G.edges_iter() if

    #   nodes_to_remove = [n for n in degree if degree[n] <= 1]
    #   G.remove_nodes_from(nodes_to_remove)



    def getWeight(item):
        return item[1]
200
#
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
#    node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
#    #print(node_degree)
#    nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
#
#    for n in nodes_too_connected:
#        n_edges = list()
#        for v in nx.neighbors(G,n):
#            #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
#            n_edges.append(((n, v), G[n][v]['weight']))
#
#        n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
#        G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])

    G.remove_nodes_from(nx.isolates(G))
    partition = best_partition(G.to_undirected())

    return(G,partition,ids,weight)