Commit 72a1cfff authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TestFlow] seems good, need to add tests on it and fix distributional distance.

parent bb989318
...@@ -50,8 +50,12 @@ library: ...@@ -50,8 +50,12 @@ library:
- Gargantext.Text.Parsers.WOS - Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search - Gargantext.Text.Search
- Gargantext.Text.Terms - Gargantext.Text.Terms
- Gargantext.Text.Terms.Mono
- Gargantext.Text.Terms.Multi.Lang.En
- Gargantext.Text.Terms.Multi.Lang.Fr
- Gargantext.Text.Terms.WithList - Gargantext.Text.Terms.WithList
- Gargantext.TextFlow - Gargantext.TextFlow
- Gargantext.Viz.Graph
- Gargantext.Viz.Graph.Distances.Matrice - Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index - Gargantext.Viz.Graph.Index
dependencies: dependencies:
......
{-|PI/Application.hs {-|
API/Count.hs
API/FrontEnd.hs
API/Node.hs
API/Auth.hs
API.hs
Database/NodeNodeNgram.hs
Database/User.hs
Database/Queries.hs
Module : Gargantext.API.Settings Module : Gargantext.API.Settings
Description : Settings of the API (Server and Client) Description : Settings of the API (Server and Client)
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
......
{-| {-|
Module : Gargantext.Database Module : Gargantext.Database
Description : Main commands of BASHQL a Domain Specific Language to deal with Gargantext Database. Description : BASHQL to deal with Gargantext Database.
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
* BASHQL is a Domain Specific Language to deal with the Database
* BASHQL = functional (Bash * SQL) * BASHQL = functional (Bash * SQL)
* Which language to chose when working with a database ? To make it * Which language to chose when working with a database ? To make it
...@@ -42,6 +44,7 @@ write Haskell bash translations. ...@@ -42,6 +44,7 @@ write Haskell bash translations.
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG). - FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References * References
[0] MIT Press has published "Category theory for the sciences". The book [0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM. AMS, and by SIAM.
...@@ -64,6 +67,7 @@ module Gargantext.Database ( module Gargantext.Database.Utils ...@@ -64,6 +67,7 @@ module Gargantext.Database ( module Gargantext.Database.Utils
, del , del' , del , del'
, tree, tree' , tree, tree'
, postCorpus, postAnnuaire , postCorpus, postAnnuaire
, Connection
) )
where where
......
...@@ -59,6 +59,9 @@ import qualified Data.Profunctor.Product as PP ...@@ -59,6 +59,9 @@ import qualified Data.Profunctor.Product as PP
-- | Types for Node Database Management -- | Types for Node Database Management
data PGTSVector data PGTSVector
------------------------------------------------------------------------
type CorpusId = Int
------------------------------------------------------------------------
instance FromField HyperdataCorpus where instance FromField HyperdataCorpus where
fromField = fromField' fromField = fromField'
......
...@@ -11,9 +11,9 @@ This file is intended for these purposes: ...@@ -11,9 +11,9 @@ This file is intended for these purposes:
- documentation for teaching and research - documentation for teaching and research
- learn basics of Haskell which is a scientific programming language - learn basics of Haskell which is a scientific programming language
- behavioral tests (that should be completed with uni-tests and scale-tests - behavioral tests (that should be completed with uni-tests and scale-tests)
This documents defines basic of Text definitions according to Gargantext.. This document defines basic of Text definitions according to Gargantext..
- What is a term ? - What is a term ?
- What is a sentence ? - What is a sentence ?
...@@ -104,7 +104,18 @@ ex_occ = occurrences <$> L.concat <$> ex_terms ...@@ -104,7 +104,18 @@ ex_occ = occurrences <$> L.concat <$> ex_terms
ex_cooc :: IO (Map (Label, Label) Int) ex_cooc :: IO (Map (Label, Label) Int)
ex_cooc = cooc <$> ex_terms ex_cooc = cooc <$> ex_terms
-- | Tests -- | Tests the specificity and genericity
--
-- >>> ex_cooc_mat
-- (fromList [(["glass"],0),(["spoon"],1),(["table"],2),(["wine"],3)],Matrix (Z :. 4 :. 4)
-- [ 4, 0, 0, 0,
-- 1, 2, 0, 0,
-- 3, 2, 4, 0,
-- 3, 1, 2, 3],Matrix (Z :. 4 :. 4)
-- [ 1.0, 0.25, 0.75, 0.75,
-- 0.0, 1.0, 1.0, 0.5,
-- 0.0, 0.0, 1.0, 0.5,
-- 0.0, 0.0, 0.0, 1.0],(Vector (Z :. 4) [0.5833333333333334,0.5833333333333334,0.75,0.5833333333333334],Vector (Z :. 4) [-0.5833333333333334,-0.4166666666666667,0.41666666666666674,0.5833333333333334]))
ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity)) ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
ex_cooc_mat = do ex_cooc_mat = do
m <- ex_cooc m <- ex_cooc
......
...@@ -36,7 +36,7 @@ import qualified Data.Array.Accelerate as DAA ...@@ -36,7 +36,7 @@ import qualified Data.Array.Accelerate as DAA
import GHC.Real (round) import GHC.Real (round)
--import Debug.Trace import Debug.Trace (trace)
data MapListSize = MapListSize Int data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int data InclusionSize = InclusionSize Int
...@@ -51,19 +51,19 @@ data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize ...@@ -51,19 +51,19 @@ data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_defaultValue :: DefaultValue , fc_defaultValue :: DefaultValue
} }
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc filterCooc fc cc = (filterCooc' fc) ts cc
where where
ts = map _scored_terms $ takeSome fc $ coocScored cc ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m') foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection M.empty selection
where where
selection = [(x,y) | x <- ts selection = [(x,y) | x <- ts
, y <- ts , y <- ts
-- , x >= y -- , x >= y
] ]
...@@ -75,7 +75,7 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore ...@@ -75,7 +75,7 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t] takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m $ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores -- splitKmeans k scores
where where
-- TODO: benchmark with accelerate-example kmeans version -- TODO: benchmark with accelerate-example kmeans version
...@@ -95,10 +95,10 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste ...@@ -95,10 +95,10 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste
$ sortWith (Down . _scored_speGen) xs $ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t data Scored ts = Scored { _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion , _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity , _scored_speGen :: !SpecificityGenericity
} deriving (Show) } deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better -- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around. -- to compute them earlier and pass them around.
......
...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono.Token.En ...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..) ( EitherList(..)
, Tokenizer , Tokenizer
, tokenize , tokenize
, run
, defaultTokenizer , defaultTokenizer
, whitespace , whitespace
, uris , uris
...@@ -64,8 +63,6 @@ import Gargantext.Prelude ...@@ -64,8 +63,6 @@ import Gargantext.Prelude
-- ,"Hyphen-words" -- ,"Hyphen-words"
-- ,"Yes/No questions" -- ,"Yes/No questions"
-- ] -- ]
---
type Tokenizer = Text -> EitherList Text Text type Tokenizer = Text -> EitherList Text Text
-- | The EitherList is a newtype-wrapped list of Eithers. -- | The EitherList is a newtype-wrapped list of Eithers.
......
...@@ -21,15 +21,22 @@ import GHC.IO (FilePath) ...@@ -21,15 +21,22 @@ import GHC.IO (FilePath)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Data.Maybe (catMaybes)
import qualified Data.Set as DS
import qualified Data.Array.Accelerate as A import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
---------------------------------------------- ----------------------------------------------
import Gargantext.Database (Connection)
import Gargantext.Database.Node
import Gargantext.Core.Types.Node
import Gargantext.Core (Lang) import Gargantext.Core (Lang)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map) import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Viz.Graph.Distances.Matrice (distributional, measureConditional)
import Gargantext.Viz.Graph (Graph(..), data2graph) import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc) import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..)) import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
...@@ -38,7 +45,7 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) ...@@ -38,7 +45,7 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
{- {-
____ _ _ ____ _ _
...@@ -50,11 +57,27 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) ...@@ -50,11 +57,27 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-} -}
contextText :: [T.Text]
contextText = ["The dog is an animal."
,"The bird is an animal."
,"The bird is an animal."
,"The bird and the dog are an animal."
,"The table is an object."
,"The pen is an object."
,"This object is a pen or a table?"
,"The girl has a human body."
,"The girl has a human body."
,"The boy has a human body."
,"The boy has a human body."
]
data TextFlow = CSV FilePath data TextFlow = CSV FilePath
| FullText FilePath | FullText FilePath
| Contexts [T.Text] | Contexts [T.Text]
| SQL Int | DB Connection CorpusId
| Database T.Text | Query T.Text
-- ExtDatabase Query -- ExtDatabase Query
-- IntDatabase NodeId -- IntDatabase NodeId
...@@ -64,6 +87,7 @@ textFlow termType workType = do ...@@ -64,6 +87,7 @@ textFlow termType workType = do
FullText path -> splitBy (Sentences 5) <$> readFile path FullText path -> splitBy (Sentences 5) <$> readFile path
CSV path -> readCsvOn [csv_title, csv_abstract] path CSV path -> readCsvOn [csv_title, csv_abstract] path
Contexts ctxt -> pure ctxt Contexts ctxt -> pure ctxt
SQL con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
_ -> undefined _ -> undefined
textFlow' termType contexts textFlow' termType contexts
...@@ -78,53 +102,61 @@ textFlow' termType contexts = do ...@@ -78,53 +102,61 @@ textFlow' termType contexts = do
-- TermsType = Mono | Multi | MonoMulti -- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList) -- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList|Ontology) -- # groupBy (Stem|GroupList|Ontology)
printDebug "terms" myterms
printDebug "myterms" (sum $ map length myterms) printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list -- Bulding the map list
-- compute copresences of terms, i.e. cooccurrences of terms in same context of text -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
-- Cooc = Map (Term, Term) Int -- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1) printDebug "myCooc1 size" (M.size myCooc1)
-- Remove Apax: appears one time only => lighting the matrix -- Remove Apax: appears one time only => lighting the matrix
let myCooc2 = M.filter (>1) myCooc1 let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2" (M.size myCooc2) printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
(InclusionSize 900 ) (InclusionSize 500 )
(SampleBins 10 ) (SampleBins 10 )
(Clusters 3 ) (Clusters 3 )
(DefaultValue 0 ) (DefaultValue 0 )
) myCooc2 ) myCooc2
printDebug "myCooc3" $ M.size myCooc3 printDebug "myCooc3 size" $ M.size myCooc3
-- putStrLn $ show myCooc3 printDebug "myCooc3" myCooc3
-- Cooc -> Matrix -- Cooc -> Matrix
let (ti, _) = createIndices myCooc3 let (ti, _) = createIndices myCooc3
printDebug "ti" $ M.size ti printDebug "ti size" $ M.size ti
printDebug "ti" ti
let myCooc4 = toIndex ti myCooc3 let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4 printDebug "myCooc4 size" $ M.size myCooc4
printDebug "myCooc4" myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4
printDebug "matCooc shape" $ A.arrayShape matCooc
printDebug "matCooc" matCooc printDebug "matCooc" matCooc
-- Matrix -> Clustering -- Matrix -> Clustering
--let distanceMat = conditional' matCooc let distanceMat = measureConditional matCooc
let distanceMat = distributional matCooc --let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat printDebug "distanceMat" distanceMat
-- --
let distanceMap = mat2map distanceMat --let distanceMap = M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap let distanceMap = M.map (\n -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap
printDebug "distanceMap" distanceMap
-- let distance = fromIndex fi distanceMap -- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance -- printDebug "distance" $ M.size distance
partitions <- cLouvain distanceMap partitions <- cLouvain distanceMap
-- Building : -> Graph -> JSON -- Building : -> Graph -> JSON
printDebug "partitions" $ length partitions printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
--printDebug "partitions" partitions --printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
......
...@@ -7,15 +7,12 @@ Maintainer : team@gargantext.org ...@@ -7,15 +7,12 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
This module aims at implementig distances of terms context by context is
the same referential of corpus.
2 main measures are actually implemented in order to compute the proximity of two terms.
- Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus. Implementation use Accelerate library which enables GPU and CPU computation:
- Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
Motivation and definition of the @Conditional@ distance.
Implementation use Accelerate library :
* Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover. * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
[Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11]. [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011. In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
...@@ -50,21 +47,27 @@ import qualified Gargantext.Prelude as P ...@@ -50,21 +47,27 @@ import qualified Gargantext.Prelude as P
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- | Test perf. -- | Define a vector
distriTest :: Matrix Double --
distriTest = distributional $ myMat 100 -- >>> vector 3
----------------------------------------------------------------------- -- Vector (Z :. 3) [0,1,2]
vector :: Int -> (Array (Z :. Int) Int) vector :: Int -> (Array (Z :. Int) Int)
vector n = fromList (Z :. n) [0..n] vector n = fromList (Z :. n) [0..n]
-- | Define a matrix
--
-- >>> matrix 3 ([1..] :: [Double])
-- Matrix (Z :. 3 :. 3)
-- [ 1.0, 2.0, 3.0,
-- 4.0, 5.0, 6.0,
-- 7.0, 8.0, 9.0]
matrix :: Elt c => Int -> [c] -> Matrix c matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l matrix n l = fromList (Z :. n :. n) l
myMat :: Int -> Matrix Int
myMat n = matrix n [1..]
-- | Two ways to get the rank (as documentation) -- | Two ways to get the rank (as documentation)
--
-- >>> rank (matrix 3 ([1..] :: [Int]))
-- 2
rank :: (Matrix a) -> Int rank :: (Matrix a) -> Int
rank m = arrayRank $ arrayShape m rank m = arrayRank $ arrayShape m
...@@ -73,6 +76,10 @@ rank m = arrayRank $ arrayShape m ...@@ -73,6 +76,10 @@ rank m = arrayRank $ arrayShape m
-- How to force use with SquareMatrix ? -- How to force use with SquareMatrix ?
type Dim = Int type Dim = Int
-- | Get Dimension of a square Matrix
--
-- >>> dim (matrix 3 ([1..] :: [Int]))
-- 3
dim :: Matrix a -> Dim dim :: Matrix a -> Dim
dim m = n dim m = n
where where
...@@ -80,34 +87,100 @@ dim m = n ...@@ -80,34 +87,100 @@ dim m = n
-- indexTail (arrayShape m) -- indexTail (arrayShape m)
----------------------------------------------------------------------- -----------------------------------------------------------------------
proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
proba r mat = zipWith (/) mat (mkSum r mat)
mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double) -- | Sum of a Matrix by Column
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat --
-- >>> run $ matSum 3 (use $ matrix 3 [1..])
-- Matrix (Z :. 3 :. 3)
-- [ 12.0, 15.0, 18.0,
-- 12.0, 15.0, 18.0,
-- 12.0, 15.0, 18.0]
matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
-- | divByDiag
-- | Proba computes de probability matrix: all cells divided by thee sum of its column
-- if you need get the probability on the lines, just transpose it
--
-- >>> run $ matProba 3 (use $ matrix 3 [1..])
-- Matrix (Z :. 3 :. 3)
-- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
-- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
-- 0.5833333333333334, 0.5333333333333333, 0.5]
matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
matProba r mat = zipWith (/) mat (matSum r mat)
-- | Diagonal of the matrix
--
-- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
-- Vector (Z :. 3) [1,5,9]
diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
-- | Divide by the Diagonal of the matrix
--
-- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
-- Matrix (Z :. 3 :. 3)
-- [ 1.0, 0.4, 0.3333333333333333,
-- 4.0, 1.0, 0.6666666666666666,
-- 7.0, 1.6, 1.0]
divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double) divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat) divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
where
diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
-----------------------------------------------------------------------
miniMax :: Acc (Matrix Double) -> Acc (Matrix Double) -----------------------------------------------------------------------
miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m -- | Filters the matrix with the minimum of maximums
--
-- >>> run $ matMiniMax $ use $ matrix 3 [1..]
-- Matrix (Z :. 3 :. 3)
-- [ 0.0, 4.0, 7.0,
-- 0.0, 5.0, 8.0,
-- 0.0, 6.0, 9.0]
matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
where where
miniMax' = (the $ minimum $ maximum m) miniMax' = (the $ minimum $ maximum m)
-- | Filters the matrix with a constant
--
-- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
-- Matrix (Z :. 3 :. 3)
-- [ 0.0, 0.0, 7.0,
-- 0.0, 0.0, 8.0,
-- 0.0, 6.0, 9.0]
matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
-----------------------------------------------------------------------
-- * Measures of proximity
-----------------------------------------------------------------------
-- ** Conditional distance
-- *** Conditional distance (basic)
-- | Conditional distance (basic version) -- | Conditional distance (basic version)
conditional :: Matrix Int -> Matrix Double --
conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m) -- 2 main measures are actually implemented in order to compute the
-- proximity of two terms: conditional and distributional
--
-- Conditional measure is an absolute measure which reflects
-- interactions of 2 terms in the corpus.
measureConditional :: Matrix Int -> Matrix Double
--measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version) -- | Conditional distance (advanced version)
-- The conditional measure \[P_c\] of 2 terms @i@ and @j@, also called "confidence" --
-- , is the maximum probability between @i@ and @j@. If \[n_i\] (resp. -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
-- \[n_j\]) is the number of occurrences of @i@ (resp. @j@) in the corpus and _[n_{ij}\] the number of its occurrences we get: -- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@.
--
-- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
-- in the corpus and _[n_{ij}\] the number of its occurrences we get:
--
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\] -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m) conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
...@@ -124,21 +197,26 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr ...@@ -124,21 +197,26 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
r = dim m r = dim m
xs :: Acc (Matrix Double) -> Acc (Matrix Double) xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat) xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double) ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat) ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- ** Distributional Distance
-- | Distributional Distance -- | Distributional Distance Measure
-- The distributional measure \[P_c\] of @i@ and @j@ terms is: --
-- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} -- Distributional measure is a relative measure which depends on the
-- \] -- selected list, it represents structural equivalence.
--
-- The distributional measure \[P_c\] of @i@ and @j@ terms is: \[
-- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
-- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
-- --
-- Mutual information -- Mutual information
-- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\] -- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
-- --
-- Number of cooccurrences of @i@ and @j@ in the same context of text -- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C{ij}\] -- \[C{ij}\]
-- --
-- The expected value of the cooccurrences -- The expected value of the cooccurrences
...@@ -147,7 +225,7 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr ...@@ -147,7 +225,7 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
-- Total cooccurrences of @i@ term -- Total cooccurrences of @i@ term
-- \[N_{i} = \sum_{i}^{} S_{i}\] -- \[N_{i} = \sum_{i}^{} S_{i}\]
distributional :: Matrix Int -> Matrix Double distributional :: Matrix Int -> Matrix Double
distributional m = run $ miniMax $ ri (map fromIntegral $ use m) distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
where where
n = dim m n = dim m
...@@ -155,8 +233,8 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m) ...@@ -155,8 +233,8 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
ri mat = zipWith (/) mat1 mat2 ri mat = zipWith (/) mat1 mat2
where where
mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat) mat1 = matSum n $ zipWith min (mi mat) (mi $ transpose mat)
mat2 = mkSum n mat mat2 = matSum n mat
mi m' = zipWith (\a b -> max (log $ a/b) 0) m' mi m' = zipWith (\a b -> max (log $ a/b) 0) m'
$ zipWith (/) (crossProduct m') (total m') $ zipWith (/) (crossProduct m') (total m')
...@@ -164,10 +242,12 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m) ...@@ -164,10 +242,12 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m'' total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m''')) crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
cross mat = zipWith (-) (mkSum n mat) (mat) cross mat = zipWith (-) (matSum n mat) (mat)
----------------------------------------------------------------------- -----------------------------------------------------------------------
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- * Specificity and Genericity
{- | Metric Specificity and genericity: select terms {- | Metric Specificity and genericity: select terms
- let N termes and occurrences of i \[N{i}\] - let N termes and occurrences of i \[N{i}\]
...@@ -181,7 +261,10 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m) ...@@ -181,7 +261,10 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
- \[Inclusion (i) = Gen(i) + Spec(i)\) - \[Inclusion (i) = Gen(i) + Spec(i)\)
- \[GenericityScore = Gen(i)- Spec(i)\] - \[GenericityScore = Gen(i)- Spec(i)\]
- References: Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH] - References: Science mapping with asymmetrical paradigmatic proximity
Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
arXiv:0803.2315 [cs.OH]
-} -}
type InclusionExclusion = Double type InclusionExclusion = Double
type SpecificityGenericity = Double type SpecificityGenericity = Double
...@@ -217,7 +300,8 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m) ...@@ -217,7 +300,8 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | P(i|j) = Nij /N(jj) Probability to get i given j -- | P(i|j) = Nij /N(jj) Probability to get i given j
p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e) --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
p_ij m = zipWith (/) m (n_jj m) p_ij m = zipWith (/) m (n_jj m)
where where
n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e) n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
...@@ -255,4 +339,10 @@ p_ m = zipWith (/) m (n_ m) ...@@ -255,4 +339,10 @@ p_ m = zipWith (/) m (n_ m)
) m ) m
-} -}
-- * For Tests (to be removed)
-- | Test perfermance with this matrix
-- TODO : add this in a benchmark folder
distriTest :: Matrix Double
distriTest = distributional $ matrix 100 [1..]
-----------------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment