Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
a7462f0c
Commit
a7462f0c
authored
Oct 27, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[Org] Core.Methods.Distances
parent
e5871a7d
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
338 additions
and
6 deletions
+338
-6
package.yaml
package.yaml
+0
-1
Distances.hs
src/Gargantext/Core/Methods/Distances.hs
+2
-1
Conditional.hs
...rgantext/Core/Methods/Distances/Accelerate/Conditional.hs
+87
-0
Distributional.hs
...ntext/Core/Methods/Distances/Accelerate/Distributional.hs
+112
-0
SpeGen.hs
src/Gargantext/Core/Methods/Distances/Accelerate/SpeGen.hs
+133
-0
Distributional.hs
src/Gargantext/Core/Methods/Distances/Distributional.hs
+2
-2
Examples.hs
src/Gargantext/Core/Text/Examples.hs
+1
-1
Metrics.hs
src/Gargantext/Core/Text/Metrics.hs
+1
-1
No files found.
package.yaml
View file @
a7462f0c
...
...
@@ -66,7 +66,6 @@ library:
-
Gargantext.Prelude
-
Gargantext.Prelude.Crypto.Pass.User
-
Gargantext.Prelude.Utils
-
Gargantext.Core.Methods.Distances.Matrice
-
Gargantext.Core.Text
-
Gargantext.Core.Text.Context
-
Gargantext.Core.Text.Corpus.Parsers
...
...
src/Gargantext/Core/Methods/Distances.hs
View file @
a7462f0c
...
...
@@ -20,7 +20,8 @@ import Data.Swagger
import
GHC.Generics
(
Generic
)
import
Gargantext.Prelude
(
Ord
,
Eq
,
Int
,
Double
)
import
Gargantext.Prelude
(
Show
)
import
Gargantext.Core.Methods.Distances.Matrice
(
measureConditional
,
distributional
)
import
Gargantext.Core.Methods.Distances.Accelerate.Conditional
(
measureConditional
)
import
Gargantext.Core.Methods.Distances.Accelerate.Distributional
(
distributional
)
import
Prelude
(
Enum
,
Bounded
,
minBound
,
maxBound
)
import
Test.QuickCheck
(
elements
)
import
Test.QuickCheck.Arbitrary
...
...
src/Gargantext/Core/Methods/Distances/Accelerate/Conditional.hs
0 → 100644
View file @
a7462f0c
{-|
Module : Gargantext.Core.Methods.Distances.Accelerate.Conditional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This module aims at implementig distances of terms context by context is
the same referential of corpus.
Implementation use Accelerate library which enables GPU and CPU computation
See Gargantext.Core.Methods.Graph.Accelerate)
-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Core.Methods.Distances.Accelerate.Conditional
where
-- import qualified Data.Foldable as P (foldl1)
-- import Debug.Trace (trace)
import
Data.Array.Accelerate
import
Data.Array.Accelerate.Interpreter
(
run
)
import
Gargantext.Core.Methods.Matrix.Accelerate.Utils
import
Gargantext.Core.Methods.Distances.Accelerate.SpeGen
import
qualified
Gargantext.Prelude
as
P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Conditional distance
-- *** Conditional distance (basic)
-- | Conditional distance (basic version)
--
-- 2 main metrics are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional metric is an absolute metric which reflects
-- interactions of 2 terms in the corpus.
measureConditional
::
Matrix
Int
->
Matrix
Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional
m
=
run
$
matProba
(
dim
m
)
$
map
fromIntegral
$
use
m
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version)
--
-- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional'
::
Matrix
Int
->
(
Matrix
GenericityInclusion
,
Matrix
SpecificityExclusion
)
conditional'
m
=
(
run
$
ie
$
map
fromIntegral
$
use
m
,
run
$
sg
$
map
fromIntegral
$
use
m
)
where
ie
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
ie
mat
=
map
(
\
x
->
x
/
(
2
*
n
-
1
))
$
zipWith
(
+
)
(
xs
mat
)
(
ys
mat
)
sg
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
sg
mat
=
map
(
\
x
->
x
/
(
2
*
n
-
1
))
$
zipWith
(
-
)
(
xs
mat
)
(
ys
mat
)
n
::
Exp
Double
n
=
P
.
fromIntegral
r
r
::
Dim
r
=
dim
m
xs
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
xs
mat
=
zipWith
(
-
)
(
matSumCol
r
$
matProba
r
mat
)
(
matProba
r
mat
)
ys
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
ys
mat
=
zipWith
(
-
)
(
matSumCol
r
$
transpose
$
matProba
r
mat
)
(
matProba
r
mat
)
src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs
0 → 100644
View file @
a7462f0c
{-|
Module : Gargantext.Core.Methods.Distances.Accelerate.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This module aims at implementig distances of terms context by context is
the same referential of corpus.
Implementation use Accelerate library which enables GPU and CPU computation
See Gargantext.Core.Methods.Graph.Accelerate)
-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Core.Methods.Distances.Accelerate.Distributional
where
-- import qualified Data.Foldable as P (foldl1)
-- import Debug.Trace (trace)
import
Data.Array.Accelerate
import
Data.Array.Accelerate.Interpreter
(
run
)
import
Gargantext.Core.Methods.Matrix.Accelerate.Utils
import
qualified
Gargantext.Prelude
as
P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Distributional Distance
-- | Distributional Distance metric
--
-- Distributional metric is a relative metric which depends on the
-- selected list, it represents structural equivalence of mutual information.
--
-- The distributional metric P(c) of @i@ and @j@ terms is: \[
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
--
-- Mutual information
-- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
--
-- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C{ij}\]
--
-- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
-- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
--
-- Total cooccurrences of term @i@ given a map list of size @m@
-- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
--
-- Total cooccurrences of terms given a map list of size @m@
-- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
--
distributional
::
Matrix
Int
->
Matrix
Double
distributional
m
=
-- run {- $ matMiniMax -}
run
$
diagNull
n
$
rIJ
n
$
filterWith
0
100
$
filter'
0
$
s_mi
$
map
fromIntegral
{- from Int to Double -}
$
use
m
{- push matrix in Accelerate type -}
where
_ri
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
_ri
mat
=
mat1
-- zipWith (/) mat1 mat2
where
mat1
=
matSumCol
n
$
zipWith
min
(
_myMin
mat
)
(
_myMin
$
filterWith
0
100
$
diagNull
n
$
transpose
mat
)
_mat2
=
total
mat
_myMin
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
_myMin
=
replicate
(
constant
(
Z
:.
n
:.
All
))
.
minimum
-- TODO fix NaN
-- Quali TEST: OK
s_mi
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
s_mi
m'
=
zipWith
(
\
x
y
->
log
(
x
/
y
))
(
diagNull
n
m'
)
$
zipWith
(
/
)
(
crossProduct
n
m'
)
(
total
m'
)
-- crossProduct n m'
total
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
total
=
replicate
(
constant
(
Z
:.
n
:.
n
))
.
sum
.
sum
n
::
Dim
n
=
dim
m
rIJ
::
(
Elt
a
,
Ord
a
,
P
.
Fractional
(
Exp
a
),
P
.
Num
a
)
=>
Dim
->
Acc
(
Matrix
a
)
->
Acc
(
Matrix
a
)
rIJ
n
m
=
matMiniMax
$
divide
a
b
where
a
=
sumRowMin
n
m
b
=
sumColMin
n
m
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest
::
Int
->
Matrix
Double
distriTest
n
=
distributional
(
theMatrix
n
)
src/Gargantext/Core/Methods/Distances/
Matrice
.hs
→
src/Gargantext/Core/Methods/Distances/
Accelerate/SpeGen
.hs
View file @
a7462f0c
{-|
Module : Gargantext.Core.Methods.Distances.
Matrice
Module : Gargantext.Core.Methods.Distances.
Accelerate.SpeGen
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
...
...
@@ -20,7 +20,7 @@ See Gargantext.Core.Methods.Graph.Accelerate)
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Core.Methods.Distances.
Matrice
module
Gargantext.Core.Methods.Distances.
Accelerate.SpeGen
where
-- import qualified Data.Foldable as P (foldl1)
...
...
@@ -31,130 +31,6 @@ import Gargantext.Core.Methods.Matrix.Accelerate.Utils
import
qualified
Gargantext.Prelude
as
P
-- * Metrics of proximity
-----------------------------------------------------------------------
-- ** Conditional distance
-- *** Conditional distance (basic)
-- | Conditional distance (basic version)
--
-- 2 main metrics are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional metric is an absolute metric which reflects
-- interactions of 2 terms in the corpus.
measureConditional
::
Matrix
Int
->
Matrix
Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional
m
=
run
$
matProba
(
dim
m
)
$
map
fromIntegral
$
use
m
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version)
--
-- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional'
::
Matrix
Int
->
(
Matrix
GenericityInclusion
,
Matrix
SpecificityExclusion
)
conditional'
m
=
(
run
$
ie
$
map
fromIntegral
$
use
m
,
run
$
sg
$
map
fromIntegral
$
use
m
)
where
ie
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
ie
mat
=
map
(
\
x
->
x
/
(
2
*
n
-
1
))
$
zipWith
(
+
)
(
xs
mat
)
(
ys
mat
)
sg
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
sg
mat
=
map
(
\
x
->
x
/
(
2
*
n
-
1
))
$
zipWith
(
-
)
(
xs
mat
)
(
ys
mat
)
n
::
Exp
Double
n
=
P
.
fromIntegral
r
r
::
Dim
r
=
dim
m
xs
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
xs
mat
=
zipWith
(
-
)
(
matSumCol
r
$
matProba
r
mat
)
(
matProba
r
mat
)
ys
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
ys
mat
=
zipWith
(
-
)
(
matSumCol
r
$
transpose
$
matProba
r
mat
)
(
matProba
r
mat
)
-----------------------------------------------------------------------
-- ** Distributional Distance
-- | Distributional Distance metric
--
-- Distributional metric is a relative metric which depends on the
-- selected list, it represents structural equivalence of mutual information.
--
-- The distributional metric P(c) of @i@ and @j@ terms is: \[
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
--
-- Mutual information
-- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
--
-- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C{ij}\]
--
-- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
-- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
--
-- Total cooccurrences of term @i@ given a map list of size @m@
-- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
--
-- Total cooccurrences of terms given a map list of size @m@
-- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
--
distributional
::
Matrix
Int
->
Matrix
Double
distributional
m
=
-- run {- $ matMiniMax -}
run
$
diagNull
n
$
rIJ
n
$
filterWith
0
100
$
filter'
0
$
s_mi
$
map
fromIntegral
{- from Int to Double -}
$
use
m
{- push matrix in Accelerate type -}
where
_ri
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
_ri
mat
=
mat1
-- zipWith (/) mat1 mat2
where
mat1
=
matSumCol
n
$
zipWith
min
(
_myMin
mat
)
(
_myMin
$
filterWith
0
100
$
diagNull
n
$
transpose
mat
)
_mat2
=
total
mat
_myMin
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
_myMin
=
replicate
(
constant
(
Z
:.
n
:.
All
))
.
minimum
-- TODO fix NaN
-- Quali TEST: OK
s_mi
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
s_mi
m'
=
zipWith
(
\
x
y
->
log
(
x
/
y
))
(
diagNull
n
m'
)
$
zipWith
(
/
)
(
crossProduct
n
m'
)
(
total
m'
)
-- crossProduct n m'
total
::
Acc
(
Matrix
Double
)
->
Acc
(
Matrix
Double
)
total
=
replicate
(
constant
(
Z
:.
n
:.
n
))
.
sum
.
sum
n
::
Dim
n
=
dim
m
rIJ
::
(
Elt
a
,
Ord
a
,
P
.
Fractional
(
Exp
a
),
P
.
Num
a
)
=>
Dim
->
Acc
(
Matrix
a
)
->
Acc
(
Matrix
a
)
rIJ
n
m
=
matMiniMax
$
divide
a
b
where
a
=
sumRowMin
n
m
b
=
sumColMin
n
m
-----------------------------------------------------------------------
-----------------------------------------------------------------------
-- * Specificity and Genericity
...
...
@@ -255,23 +131,3 @@ p_ m = zipWith (/) m (n_ m)
) m
-}
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest
::
Int
->
Matrix
Double
distriTest
n
=
distributional
(
theMatrix
n
)
{-
theResult :: Int -> Matrix Double
theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
| P.otherwise = [ 1, 1 ]
-}
colMatrix
::
Elt
e
=>
Int
->
[
e
]
->
Acc
(
Array
((
Z
:.
Int
)
:.
Int
)
e
)
colMatrix
n
ns
=
replicate
(
constant
(
Z
:.
(
n
::
Int
)
:.
All
))
v
where
v
=
use
$
vector
(
P
.
length
ns
)
ns
src/Gargantext/Core/Methods/Distances/Distributional.hs
View file @
a7462f0c
{-|
Module : Gargantext.Core.Methods.Distances
Module : Gargantext.Core.Methods.Distances
.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
...
...
src/Gargantext/Core/Text/Examples.hs
View file @
a7462f0c
...
...
@@ -32,7 +32,7 @@ import Data.Ord (Down(..))
import
Data.Text
(
Text
)
import
Data.Tuple.Extra
(
both
)
import
Gargantext.Core
(
Lang
(
EN
))
import
Gargantext.Core.Methods.Distances.
Matrice
import
Gargantext.Core.Methods.Distances.
Accelerate.SpeGen
import
Gargantext.Core.Text.Context
(
splitBy
,
SplitContext
(
Sentences
))
import
Gargantext.Core.Text.Metrics.Count
(
Grouped
)
import
Gargantext.Core.Text.Metrics.Count
(
occurrences
,
cooc
)
...
...
src/Gargantext/Core/Text/Metrics.hs
View file @
a7462f0c
...
...
@@ -20,7 +20,7 @@ module Gargantext.Core.Text.Metrics
--import Math.KMeans (kmeans, euclidSq, elements)
import
Data.Map
(
Map
)
import
Gargantext.Prelude
import
Gargantext.Core.Methods.Distances.
Matrice
import
Gargantext.Core.Methods.Distances.
Accelerate.SpeGen
import
Gargantext.Core.Viz.Graph.Index
import
Gargantext.Core.Statistics
(
pcaReduceTo
,
Dimension
(
..
))
import
qualified
Data.Array.Accelerate
as
DAA
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment