Commit a7462f0c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Org] Core.Methods.Distances

parent e5871a7d
...@@ -66,7 +66,6 @@ library: ...@@ -66,7 +66,6 @@ library:
- Gargantext.Prelude - Gargantext.Prelude
- Gargantext.Prelude.Crypto.Pass.User - Gargantext.Prelude.Crypto.Pass.User
- Gargantext.Prelude.Utils - Gargantext.Prelude.Utils
- Gargantext.Core.Methods.Distances.Matrice
- Gargantext.Core.Text - Gargantext.Core.Text
- Gargantext.Core.Text.Context - Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.Parsers - Gargantext.Core.Text.Corpus.Parsers
......
...@@ -20,7 +20,8 @@ import Data.Swagger ...@@ -20,7 +20,8 @@ import Data.Swagger
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.Prelude (Ord, Eq, Int, Double) import Gargantext.Prelude (Ord, Eq, Int, Double)
import Gargantext.Prelude (Show) import Gargantext.Prelude (Show)
import Gargantext.Core.Methods.Distances.Matrice (measureConditional, distributional) import Gargantext.Core.Methods.Distances.Accelerate.Conditional (measureConditional)
import Gargantext.Core.Methods.Distances.Accelerate.Distributional (distributional)
import Prelude (Enum, Bounded, minBound, maxBound) import Prelude (Enum, Bounded, minBound, maxBound)
import Test.QuickCheck (elements) import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary import Test.QuickCheck.Arbitrary
......
{-|
Module : Gargantext.Core.Methods.Distances.Accelerate.Conditional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This module aims at implementig distances of terms context by context is
the same referential of corpus.
Implementation use Accelerate library which enables GPU and CPU computation
See Gargantext.Core.Methods.Graph.Accelerate)
-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Methods.Distances.Accelerate.Conditional
where
-- import qualified Data.Foldable as P (foldl1)
-- import Debug.Trace (trace)
import Data.Array.Accelerate
import Data.Array.Accelerate.Interpreter (run)
import Gargantext.Core.Methods.Matrix.Accelerate.Utils
import Gargantext.Core.Methods.Distances.Accelerate.SpeGen
import qualified Gargantext.Prelude as P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Conditional distance
-- *** Conditional distance (basic)
-- | Conditional distance (basic version)
--
-- 2 main metrics are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional metric is an absolute metric which reflects
-- interactions of 2 terms in the corpus.
measureConditional :: Matrix Int -> Matrix Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional m = run $ matProba (dim m)
$ map fromIntegral
$ use m
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version)
--
-- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
conditional' m = ( run $ ie $ map fromIntegral $ use m
, run $ sg $ map fromIntegral $ use m
)
where
ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
n :: Exp Double
n = P.fromIntegral r
r :: Dim
r = dim m
xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
{-|
Module : Gargantext.Core.Methods.Distances.Accelerate.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This module aims at implementig distances of terms context by context is
the same referential of corpus.
Implementation use Accelerate library which enables GPU and CPU computation
See Gargantext.Core.Methods.Graph.Accelerate)
-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Methods.Distances.Accelerate.Distributional
where
-- import qualified Data.Foldable as P (foldl1)
-- import Debug.Trace (trace)
import Data.Array.Accelerate
import Data.Array.Accelerate.Interpreter (run)
import Gargantext.Core.Methods.Matrix.Accelerate.Utils
import qualified Gargantext.Prelude as P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Distributional Distance
-- | Distributional Distance metric
--
-- Distributional metric is a relative metric which depends on the
-- selected list, it represents structural equivalence of mutual information.
--
-- The distributional metric P(c) of @i@ and @j@ terms is: \[
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
--
-- Mutual information
-- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
--
-- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C{ij}\]
--
-- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
-- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
--
-- Total cooccurrences of term @i@ given a map list of size @m@
-- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
--
-- Total cooccurrences of terms given a map list of size @m@
-- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
--
distributional :: Matrix Int -> Matrix Double
distributional m = -- run {- $ matMiniMax -}
run $ diagNull n
$ rIJ n
$ filterWith 0 100
$ filter' 0
$ s_mi
$ map fromIntegral
{- from Int to Double -}
$ use m
{- push matrix in Accelerate type -}
where
_ri :: Acc (Matrix Double) -> Acc (Matrix Double)
_ri mat = mat1 -- zipWith (/) mat1 mat2
where
mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
_mat2 = total mat
_myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
_myMin = replicate (constant (Z :. n :. All)) . minimum
-- TODO fix NaN
-- Quali TEST: OK
s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
$ zipWith (/) (crossProduct n m') (total m')
-- crossProduct n m'
total :: Acc (Matrix Double) -> Acc (Matrix Double)
total = replicate (constant (Z :. n :. n)) . sum . sum
n :: Dim
n = dim m
rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
=> Dim -> Acc (Matrix a) -> Acc (Matrix a)
rIJ n m = matMiniMax $ divide a b
where
a = sumRowMin n m
b = sumColMin n m
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest :: Int -> Matrix Double
distriTest n = distributional (theMatrix n)
{-| {-|
Module : Gargantext.Core.Methods.Distances.Matrice Module : Gargantext.Core.Methods.Distances.Accelerate.SpeGen
Description : Description :
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
...@@ -20,7 +20,7 @@ See Gargantext.Core.Methods.Graph.Accelerate) ...@@ -20,7 +20,7 @@ See Gargantext.Core.Methods.Graph.Accelerate)
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Methods.Distances.Matrice module Gargantext.Core.Methods.Distances.Accelerate.SpeGen
where where
-- import qualified Data.Foldable as P (foldl1) -- import qualified Data.Foldable as P (foldl1)
...@@ -31,130 +31,6 @@ import Gargantext.Core.Methods.Matrix.Accelerate.Utils ...@@ -31,130 +31,6 @@ import Gargantext.Core.Methods.Matrix.Accelerate.Utils
import qualified Gargantext.Prelude as P import qualified Gargantext.Prelude as P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Conditional distance
-- *** Conditional distance (basic)
-- | Conditional distance (basic version)
--
-- 2 main metrics are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional metric is an absolute metric which reflects
-- interactions of 2 terms in the corpus.
measureConditional :: Matrix Int -> Matrix Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional m = run $ matProba (dim m)
$ map fromIntegral
$ use m
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version)
--
-- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
conditional' m = ( run $ ie $ map fromIntegral $ use m
, run $ sg $ map fromIntegral $ use m
)
where
ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
n :: Exp Double
n = P.fromIntegral r
r :: Dim
r = dim m
xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
-----------------------------------------------------------------------
-- ** Distributional Distance
-- | Distributional Distance metric
--
-- Distributional metric is a relative metric which depends on the
-- selected list, it represents structural equivalence of mutual information.
--
-- The distributional metric P(c) of @i@ and @j@ terms is: \[
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
--
-- Mutual information
-- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
--
-- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C{ij}\]
--
-- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
-- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
--
-- Total cooccurrences of term @i@ given a map list of size @m@
-- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
--
-- Total cooccurrences of terms given a map list of size @m@
-- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
--
distributional :: Matrix Int -> Matrix Double
distributional m = -- run {- $ matMiniMax -}
run $ diagNull n
$ rIJ n
$ filterWith 0 100
$ filter' 0
$ s_mi
$ map fromIntegral
{- from Int to Double -}
$ use m
{- push matrix in Accelerate type -}
where
_ri :: Acc (Matrix Double) -> Acc (Matrix Double)
_ri mat = mat1 -- zipWith (/) mat1 mat2
where
mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
_mat2 = total mat
_myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
_myMin = replicate (constant (Z :. n :. All)) . minimum
-- TODO fix NaN
-- Quali TEST: OK
s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
$ zipWith (/) (crossProduct n m') (total m')
-- crossProduct n m'
total :: Acc (Matrix Double) -> Acc (Matrix Double)
total = replicate (constant (Z :. n :. n)) . sum . sum
n :: Dim
n = dim m
rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
=> Dim -> Acc (Matrix a) -> Acc (Matrix a)
rIJ n m = matMiniMax $ divide a b
where
a = sumRowMin n m
b = sumColMin n m
----------------------------------------------------------------------- -----------------------------------------------------------------------
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- * Specificity and Genericity -- * Specificity and Genericity
...@@ -255,23 +131,3 @@ p_ m = zipWith (/) m (n_ m) ...@@ -255,23 +131,3 @@ p_ m = zipWith (/) m (n_ m)
) m ) m
-} -}
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest :: Int -> Matrix Double
distriTest n = distributional (theMatrix n)
{-
theResult :: Int -> Matrix Double
theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
| P.otherwise = [ 1, 1 ]
-}
colMatrix :: Elt e
=> Int -> [e] -> Acc (Array ((Z :. Int) :. Int) e)
colMatrix n ns = replicate (constant (Z :. (n :: Int) :. All)) v
where
v = use $ vector (P.length ns) ns
{-| {-|
Module : Gargantext.Core.Methods.Distances Module : Gargantext.Core.Methods.Distances.Distributional
Description : Description :
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
......
...@@ -32,7 +32,7 @@ import Data.Ord (Down(..)) ...@@ -32,7 +32,7 @@ import Data.Ord (Down(..))
import Data.Text (Text) import Data.Text (Text)
import Data.Tuple.Extra (both) import Data.Tuple.Extra (both)
import Gargantext.Core (Lang(EN)) import Gargantext.Core (Lang(EN))
import Gargantext.Core.Methods.Distances.Matrice import Gargantext.Core.Methods.Distances.Accelerate.SpeGen
import Gargantext.Core.Text.Context (splitBy, SplitContext(Sentences)) import Gargantext.Core.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Core.Text.Metrics.Count (Grouped) import Gargantext.Core.Text.Metrics.Count (Grouped)
import Gargantext.Core.Text.Metrics.Count (occurrences, cooc) import Gargantext.Core.Text.Metrics.Count (occurrences, cooc)
......
...@@ -20,7 +20,7 @@ module Gargantext.Core.Text.Metrics ...@@ -20,7 +20,7 @@ module Gargantext.Core.Text.Metrics
--import Math.KMeans (kmeans, euclidSq, elements) --import Math.KMeans (kmeans, euclidSq, elements)
import Data.Map (Map) import Data.Map (Map)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Methods.Distances.Matrice import Gargantext.Core.Methods.Distances.Accelerate.SpeGen
import Gargantext.Core.Viz.Graph.Index import Gargantext.Core.Viz.Graph.Index
import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..)) import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..))
import qualified Data.Array.Accelerate as DAA import qualified Data.Array.Accelerate as DAA
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment