Matrice.hs 12.3 KB
Newer Older
1 2 3 4 5 6 7 8 9
{-|
Module      : Gargantext.Graph.Distances.Matrix
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11
This module aims at implementig distances of terms context by context is
the same referential of corpus.
12

Alexandre Delanoë's avatar
Alexandre Delanoë committed
13

14
Implementation use Accelerate library which enables GPU and CPU computation:
15

16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
  * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
    [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
    In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.

  * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
    [Optimising Purely Functional GPU Programs][MCKL13].
    In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.

  * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
    [Embedding Foreign Code][CMCK14].
    In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.

  * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
    [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
    In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.

32 33
-}

Nicolas Pouillard's avatar
Nicolas Pouillard committed
34 35 36 37 38 39
{-# LANGUAGE NoImplicitPrelude   #-}
{-# LANGUAGE FlexibleContexts    #-}
{-# LANGUAGE TypeFamilies        #-}
{-# LANGUAGE TypeOperators       #-}
{-# LANGUAGE ScopedTypeVariables #-}

40
module Gargantext.Viz.Graph.Distances.Matrice
41 42 43
  where

import Data.Array.Accelerate
44
import Data.Array.Accelerate.Interpreter (run)
45 46

import qualified Gargantext.Prelude as P
47 48 49


-----------------------------------------------------------------------
50 51 52 53
-- | Define a vector
--
-- >>> vector 3
-- Vector (Z :. 3) [0,1,2]
54 55 56
vector :: Int -> (Array (Z :. Int) Int)
vector n = fromList (Z :. n) [0..n]

57 58 59 60 61 62 63
-- | Define a matrix
--
-- >>> matrix 3 ([1..] :: [Double])
-- Matrix (Z :. 3 :. 3)
--   [ 1.0, 2.0, 3.0,
--     4.0, 5.0, 6.0,
--     7.0, 8.0, 9.0]
64 65 66 67
matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l

-- | Two ways to get the rank (as documentation)
68 69 70
--
-- >>> rank (matrix 3 ([1..] :: [Int]))
-- 2
71
rank :: (Matrix a) -> Int
72 73 74
rank m = arrayRank $ arrayShape m

-----------------------------------------------------------------------
75 76 77
-- | Dimension of a square Matrix
-- How to force use with SquareMatrix ?
type Dim = Int
78

79 80 81 82
-- | Get Dimension of a square Matrix
--
-- >>> dim (matrix 3 ([1..] :: [Int]))
-- 3
83
dim :: Matrix a -> Dim
84 85 86
dim m = n
  where
    Z :. _ :. n = arrayShape m
87
    -- indexTail (arrayShape m)
88

Alexandre Delanoë's avatar
Alexandre Delanoë committed
89
-----------------------------------------------------------------------
90

91 92 93 94 95 96 97 98 99
-- | Sum of a Matrix by Column
--
-- >>> run $ matSum 3 (use $ matrix 3 [1..])
-- Matrix (Z :. 3 :. 3)
--   [ 12.0, 15.0, 18.0,
--     12.0, 15.0, 18.0,
--     12.0, 15.0, 18.0]
matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
100

101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126

-- | Proba computes de probability matrix: all cells divided by thee sum of its column
-- if you need get the probability on the lines, just transpose it
--
-- >>> run $ matProba 3 (use $ matrix 3 [1..])
-- Matrix (Z :. 3 :. 3)
--   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
--       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
--       0.5833333333333334,  0.5333333333333333,                 0.5]
matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
matProba r mat = zipWith (/) mat (matSum r mat)

-- | Diagonal of the matrix
--
-- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
-- Vector (Z :. 3) [1,5,9]
diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m

-- | Divide by the Diagonal of the matrix
--
-- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
-- Matrix (Z :. 3 :. 3)
--   [ 1.0, 0.4, 0.3333333333333333,
--     4.0, 1.0, 0.6666666666666666,
--     7.0, 1.6,                1.0]
127 128
divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
129

130 131 132 133 134 135 136 137 138 139
-----------------------------------------------------------------------
-- | Filters the matrix with the minimum of maximums
--
-- >>> run $ matMiniMax $ use $ matrix 3 [1..]
-- Matrix (Z :. 3 :. 3)
--   [ 0.0, 4.0, 7.0,
--     0.0, 5.0, 8.0,
--     0.0, 6.0, 9.0]
matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
140 141 142
  where
    miniMax' = (the $ minimum $ maximum m)

143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
-- | Filters the matrix with a constant
--
-- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
-- Matrix (Z :. 3 :. 3)
--   [ 0.0, 0.0, 7.0,
--     0.0, 0.0, 8.0,
--     0.0, 6.0, 9.0]
matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)

-----------------------------------------------------------------------
-- * Measures of proximity
-----------------------------------------------------------------------
-- ** Conditional distance

-- *** Conditional distance (basic)

160
-- | Conditional distance (basic version)
161 162 163 164 165 166 167 168 169
--
-- 2 main measures are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional measure is an absolute measure which reflects
-- interactions of 2 terms in the corpus.
measureConditional :: Matrix Int -> Matrix Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
170 171


172 173
-- *** Conditional distance (advanced)

174
-- | Conditional distance (advanced version)
175 176 177 178 179 180 181 182
--
-- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
183
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
184 185
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
186
  where
Alexandre Delanoë's avatar
Alexandre Delanoë committed
187
    ie :: Acc (Matrix Double) -> Acc (Matrix Double)
188 189 190 191 192 193
    ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
    sg :: Acc (Matrix Double) -> Acc (Matrix Double)
    sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)

    n :: Exp Double
    n = P.fromIntegral r
194

195 196
    r :: Dim
    r = dim m
197

Alexandre Delanoë's avatar
Alexandre Delanoë committed
198
    xs :: Acc (Matrix Double) -> Acc (Matrix Double)
199
    xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
200
    ys :: Acc (Matrix Double) -> Acc (Matrix Double)
201
    ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
202 203

-----------------------------------------------------------------------
204
-- ** Distributional Distance
205

206 207 208 209 210
-- | Distributional Distance Measure
--
-- Distributional measure is a relative measure which depends on the
-- selected list, it represents structural equivalence.
--
211
-- The distributional measure P(c) of @i@ and @j@ terms is: \[
212 213
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
214 215
--
-- Mutual information
216
-- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
217
--
218
-- Number of cooccurrences of @i@ and @j@ in the same context of text
Alexandre Delanoë's avatar
Alexandre Delanoë committed
219 220
--                        \[C{ij}\]
--
221 222 223 224 225 226 227 228
-- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
--            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
--
-- Total cooccurrences of term @i@ given a map list of size @m@
--            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
--
-- Total cooccurrences of terms given a map list of size @m@
--            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
229
--
230
distributional :: Matrix Int -> Matrix Double
231
distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
232
  where
233
    
234
    -- filter  m = zipWith (\a b -> max a b) m (transpose m)
235
    
236 237
    ri mat = zipWith (/) mat1 mat2
      where
238
        mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
239
        mat2 = matSum n mat
240
    
241
    s_mi    m'  = zipWith (\a b -> log (a/b))  m'
242 243 244
              $ zipWith (/) (crossProduct m') (total m')

    total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
245
    n         = dim m
246
    
247
    crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
248
    cross mat         = zipWith (-) (matSum n mat) (mat)
249

Alexandre Delanoë's avatar
Alexandre Delanoë committed
250 251
-----------------------------------------------------------------------
-----------------------------------------------------------------------
252 253
-- * Specificity and Genericity

Alexandre Delanoë's avatar
Alexandre Delanoë committed
254
{- | Metric Specificity and genericity: select terms
Alexandre Delanoë's avatar
Alexandre Delanoë committed
255

Alexandre Delanoë's avatar
Alexandre Delanoë committed
256
- let N termes and occurrences of i \[N{i}\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
257

Alexandre Delanoë's avatar
Alexandre Delanoë committed
258 259
- Cooccurrences of i and j \[N{ij}\]
- Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
260

Alexandre Delanoë's avatar
Alexandre Delanoë committed
261 262
- Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
- Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
263

Alexandre Delanoë's avatar
Alexandre Delanoë committed
264 265
- \[Inclusion (i) = Gen(i) + Spec(i)\)
- \[GenericityScore = Gen(i)- Spec(i)\]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
266

267 268 269 270
- References: Science mapping with asymmetrical paradigmatic proximity
Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
arXiv:0803.2315 [cs.OH]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
271 272 273 274 275 276 277 278 279 280 281
-}
type InclusionExclusion    = Double
type SpecificityGenericity = Double

data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
type SymetricMatrix    = Matrix
type NonSymetricMatrix = Matrix


incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
282 283 284
  where
    run' fun mat = run $ fun $ map fromIntegral $ use mat

285
    -- | Inclusion (i) = Gen(i)+Spec(i)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
286
    inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
287
    inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
288
    
289
    -- | Genericity score = Gen(i)- Spec(i)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
290
    specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
291
    specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
292
    
293
    -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
294
    pV :: Acc (Matrix Double) -> Acc (Vector Double)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
295
    pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
296
    
297
    -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
298
    pH :: Acc (Matrix Double) -> Acc (Vector Double)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
299 300 301 302 303
    pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
    
    cardN :: Exp Double
    cardN = constant (P.fromIntegral (dim m) :: Double)

304

305
-- | P(i|j) = Nij /N(jj) Probability to get i given j
306 307
--p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
308 309 310
p_ij m = zipWith (/) m (n_jj m)
  where
    n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
311
    n_jj myMat' = backpermute (shape m)
312
                         (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
Alexandre Delanoë's avatar
Alexandre Delanoë committed
313 314
                                   -> (Z :. j :. j)
                                )
315
                         ) myMat'
316

317
-- | P(j|i) = Nij /N(ii) Probability to get i given j
Alexandre Delanoë's avatar
Alexandre Delanoë committed
318 319 320
-- to test
p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
p_ji = transpose . p_ij
321

322 323

-- | Step to ckeck the result in visual/qualitative tests
324 325 326 327
incExcSpeGen_proba :: Matrix Int -> Matrix Double
incExcSpeGen_proba m = run' pro m
  where
    run' fun mat = run $ fun $ map fromIntegral $ use mat
328

329
    pro mat = p_ji mat
Alexandre Delanoë's avatar
Alexandre Delanoë committed
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344

{-
-- | Hypothesis to test maybe later (or not)
-- TODO ask accelerate for instances to ease such writtings:
p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
p_ m = zipWith (/) m (n_ m)
  where
    n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
    n_ m = backpermute (shape m)
                         (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
                                   -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
                                )
                         ) m
-}

345 346 347 348 349 350
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest :: Matrix Double
distriTest = distributional $ matrix 100 [1..]
-----------------------------------------------------------------------
Alexandre Delanoë's avatar
Alexandre Delanoë committed
351