Merge branch 'pipeline'

d710b723 · Alexandre Delanoë · 7e930b57 · b5124f1e · d710b723 · d710b723
Commit d710b723 authored Jun 08, 2018 by Alexandre Delanoë
18 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,8 @@
 *.swp
 *.cabal
 *purescript-gargantext
+doc
+bin
+clustering-louvain
+profiling
+servant-job
--- a/package.yaml
+++ b/package.yaml
@@ -35,7 +35,6 @@ library:
  dependencies:
  - QuickCheck
  - accelerate
-  - accelerate-io
  - aeson
  - aeson-lens
  - aeson-pretty
@@ -46,6 +45,7 @@ library:
  - bytestring
  - case-insensitive
  - cassava
+  - clustering-louvain
  - conduit
  - conduit-extra
  - containers

--- a/src/Gargantext/Core/Types.hs
+++ b/src/Gargantext/Core/Types.hs
@@ -46,15 +46,10 @@ data Terms = Terms { _terms_label :: Label

 instance Show Terms where
  show (Terms l s) = show l
-- class Inclusion where include
--instance Eq Terms where
--  (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
--                                || s2 `S.isSubsetOf` s1

 instance Eq Terms where
  (==) (Terms _ s1) (Terms _ s2) = s1 == s2

-
 ------------------------------------------------------------------------
 data Tag = POS | NER
  deriving (Show, Eq)

--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -6,7 +6,6 @@ License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
-
 -}

 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
@@ -17,27 +16,57 @@ module Gargantext.Pipeline

 import Data.Text.IO (readFile)

+import Control.Arrow ((***))
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as M
+import qualified Data.List       as L
+import Data.Tuple.Extra (both)
 ----------------------------------------------
----------------------------------------------
-import Gargantext.Core
+import Gargantext.Core (Lang(FR))
 import Gargantext.Prelude

-import Gargantext.Viz.Graph.Index (score)
-import Gargantext.Viz.Graph.Distances.Matrice (distributional)
-import Gargantext.Text.Metrics.Occurrences
-import Gargantext.Text.Terms
-import Gargantext.Text.Context
+import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
+import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
+import Gargantext.Viz.Graph.Index (Index)
+import Gargantext.Text.Metrics.Count (cooc, removeApax)
+import Gargantext.Text.Metrics
+import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
+import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
+
+import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)


+{-
+  ____                             _            _
+ / ___| __ _ _ __ __ _  __ _ _ __ | |_ _____  _| |_
+| |  _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
+| |_| | (_| | | | (_| | (_| | | | | ||  __/>  <| |_
+ \____|\__,_|_|  \__, |\__,_|_| |_|\__\___/_/\_\\__|
+                 |___/
+
+-}
+
 pipeline path = do
  -- Text  <- IO Text <- FilePath
-  text     <- readFile path  
-  let contexts = splitBy (Sentences 3) text
+  text     <- readFile path
+  let contexts = splitBy (Sentences 5) text
  myterms <- extractTerms Multi FR contexts
+  
  -- TODO    filter (\t -> not . elem t stopList) myterms
  -- TODO    groupBy (Stem | GroupList)
+  
  let myCooc = removeApax $ cooc myterms
+  --let (ti, fi) = createIndices myCooc
+  pure True
+  --pure $ incExcSpeGen myCooc
  -- Cooc -> Matrix
-  pure $ score distributional myCooc
-  -- Matrix -> Clustering -> Graph -> JSON
+  
+--  -- filter by spec/gen (dynmaic programming)
+--  let theScores = M.filter (>0) $ score conditional myCoocFiltered
+----
+------  -- Matrix -> Clustering
+------  pure $ bestpartition False $ map2graph $ toIndex ti theScores
+--  partitions <- cLouvain theScores
+--  pure partitions
+---- | Building : -> Graph -> JSON

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust
 -- maximumWith
 maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))

-
-
--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -27,7 +27,7 @@ import NLP.FullStop (segment)
 -----------------------------------------------------------------
 import Gargantext.Core
 import Gargantext.Core.Types
-import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc)
+import Gargantext.Text.Metrics.Count (Occ, occurrences, cooc)
 import Gargantext.Prelude hiding (filter)
 -----------------------------------------------------------------


--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -8,18 +8,132 @@ Stability   : experimental
 Portability : POSIX

 Mainly reexport functions in @Data.Text.Metrics@
+
+
+TODO
+noApax :: Ord a => Map a Occ -> Map a Occ
+noApax m = M.filter (>1) m
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Metrics where
+module Gargantext.Text.Metrics 
+  where

--import Data.Text (Text)
+import Data.Text (Text, pack)
+import Data.Map (Map)
+
+import qualified Data.List as L
+import qualified Data.Map  as M
+import qualified Data.Set  as S
+import qualified Data.Text as T
+import Data.Tuple.Extra (both)
 --import GHC.Real (Ratio)
 --import qualified Data.Text.Metrics as DTM
--
--import Gargantext.Prelude
--
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
+import Data.Array.Accelerate (toList)
+
+
+import Gargantext.Prelude
+
+import Gargantext.Text.Metrics.Count (occurrences, cooc)
+import Gargantext.Text.Terms (TermType(MonoMulti), terms)
+import Gargantext.Core (Lang(EN))
+import Gargantext.Core.Types (Terms(..))
+import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
+
+import Gargantext.Viz.Graph.Distances.Matrice
+import Gargantext.Viz.Graph.Index
+
+
+-- ord relevance: top n plus inclus
+-- échantillonnage de généricity
+-- 
+--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
+--filterCooc m = 
+---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
+----(ti, fi)  = createIndices m
+-- . fromIndex fi $ filterMat $ cooc2mat ti m
+
+
+import Data.Array.Accelerate (Matrix)
+
+filterMat :: Matrix Int -> [(Index, Index)]
+filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
+  where
+    (incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (conditional' m)
+    n = nIe + nSg
+    nIe = 30
+    nSg = 70
+
+
+
+incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
+incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
+  where
+    (ti,fi) = createIndices m
+    ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
+
+
+
+
+metrics_text :: Text
+metrics_text = T.intercalate " " metrics_sentences
+
+metrics_sentences' :: [Text]
+metrics_sentences' = splitBy (Sentences 0) metrics_text
+
+-- | Sentences 
+metrics_sentences :: [Text]
+metrics_sentences = [ "There is a table with a glass of wine and a spoon."
+                    , "I can see the glass on the table."
+                    , "There was only a spoon on that table."
+                    , "The glass just fall from the table, pouring wine everywhere."
+                    , "I wish the glass did not contain wine."
+                    ]
+
+metrics_sentences_Test = metrics_sentences == metrics_sentences'
+
+-- | Terms reordered to visually check occurrences
+-- >>> 
+{- [ [["table"],["glass"],["wine"],["spoon"]]
+   , [["glass"],["table"]]
+   , [["spoon"],["table"]]
+   , [["glass"],["table"],["wine"]]
+   , [["glass"],["wine"]]
+   ]
+-}
+
+metrics_terms :: IO [[Terms]]
+metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
+
+-- | Occurrences
+{-
+fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
+         , (fromList ["object"],fromList [(["object"], 3 )])
+         , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
+         , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
+-}
+metrics_occ = occurrences <$> L.concat <$> metrics_terms
+
+{- 
+-- fromList [((["glas"],["object"]),6)
+            ,((["glas"],["spoon"]),4)
+            ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
+
+-}
+metrics_cooc = cooc <$> metrics_terms
+
+metrics_cooc_mat = do
+  m <- metrics_cooc
+  let (ti,_) = createIndices m
+  let mat_cooc = cooc2mat ti m
+  pure ( ti
+       , mat_cooc
+       , incExcSpeGen_proba  mat_cooc
+       , incExcSpeGen        mat_cooc
+       )
+
+metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc

--- a/src/Gargantext/Text/Metrics/Occurrences.hs
+++ b/src/Gargantext/Text/Metrics/Occurrences.hs
 {-|
-Module      : Gargantext.Text.Metrics.Occurrences
+Module      : Gargantext.Text.Metrics.Count
 Description : 
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
@@ -25,7 +25,7 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Metrics.Occurrences
+module Gargantext.Text.Metrics.Count
  where


@@ -71,7 +71,6 @@ type Grouped = Stems
 --λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
 ---- 
-
           -}

 type Occs = Int
@@ -81,10 +80,16 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
 removeApax = DMS.filter (> 1)

 cooc :: [[Terms]] -> Map (Label, Label) Int
-cooc tss =
-  mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
+cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
+  where
+    terms_occs = occurrencesOn _terms_stem (List.concat tss)
+
+
+coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
+                                      -> [[a]] -> Map (label, label) Coocs
+coocOnWithLabel on policy tss =
+  mapKeys (delta policy) $ coocOn on tss
  where
-    terms_occs = occurrences (List.concat tss)
    delta f = f *** f


@@ -93,26 +98,29 @@ labelPolicy m g =  case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList
                     Just label -> label
                     Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)

-cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
-cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
+coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
+coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
  where
-      xs = [ ((x, y), 1)
-           | xs <- tss
-           , ys <- tss
-           , x <- Set.toList xs
-           , y <- Set.toList ys
-           , x < y
-           ]
+    coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
+    coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
+      where
+          ts' = List.nub $ map f ts
+          xs = [ ((x, y), 1)
+               | x <- ts'
+               , y <- ts'
+    --           , x /= y
+               ]


 -- | Compute the grouped occurrences (occ)
 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
-occurrences = occurrences' _terms_stem
+occurrences = occurrencesOn _terms_stem

-occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
-occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
+occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
+occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty

 -- TODO add groups and filter stops
+
 sumOcc :: Ord a => [Occ a] -> Occ a
 sumOcc xs = foldl' (unionWith (+)) empty xs


--- a/src/Gargantext/Text/Metrics/FrequentItemSet.hs
+++ b/src/Gargantext/Text/Metrics/FrequentItemSet.hs
@@ -14,42 +14,52 @@ Domain Specific Language to manage Frequent Item Set (FIS)
 {-# LANGUAGE NoImplicitPrelude #-}

 module Gargantext.Text.Metrics.FrequentItemSet
-  ( Fis, Size
+  ( Fis, Size(..)
  , occ_hlcm, cooc_hlcm
  , all, between
+  , fisWithSize
+  , fisWith
+  , fisWithSizePoly
+  , fisWithSizePoly2
  , module HLCM
  )
  where

-import Data.List (tail, filter)
-import Data.Either
+import Prelude (Functor(..)) -- TODO
+import qualified Data.Map.Strict as Map
+import Data.Map.Strict (Map)
+import qualified Data.Set as Set
+import Data.Set (Set)
+import qualified Data.Vector as V
+import Data.Vector (Vector)
+
+import Data.List (filter, concat)
+import Data.Maybe (catMaybes)

 import HLCM

 import Gargantext.Prelude

-type Size = Either Int (Int, Int)
-
--data Size = Point | Segment
+data Size = Point Int | Segment Int Int

 ------------------------------------------------------------------------
 -- | Occurrence is Frequent Item Set of size 1
 occ_hlcm  :: Frequency -> [[Item]] -> [Fis]
-occ_hlcm f is  = fisWithSize (Left 1) f is
+occ_hlcm = fisWithSize (Point 1)

 -- | Cooccurrence is Frequent Item Set of size 2
 cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
-cooc_hlcm f is = fisWithSize (Left 2) f is
+cooc_hlcm = fisWithSize (Point 2)

 all :: Frequency -> [[Item]] -> [Fis]
-all f is  = fisWith Nothing f is
+all = fisWith Nothing

 ------------------------------------------------------------------------
 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
-between (x,y) f is = fisWithSize (Right (x,y)) f is
+between (x,y) = fisWithSize (Segment x y)

 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
--maximum m f is = between (0,m) f is
+--maximum m = between (0,m)


 ------------------------------------------------------------------------
@@ -62,31 +72,48 @@ data Fis' a = Fis' { _fisCount   :: Int
                   , _fisItemSet :: [a]
                   } deriving (Show)

+instance Functor Fis' where
+  fmap f (Fis' c is) = Fis' c (fmap f is)
+
 -- | Sugar from items to FIS
 items2fis :: [Item] -> Maybe Fis
-items2fis is = case head is of
-                 Nothing -> Nothing
-                 Just h  -> Just (Fis' h (tail is))
+items2fis []     = Nothing
+items2fis (i:is) = Just $ Fis' i is

 ------------------------------------------------------------------------
 ------------------------------------------------------------------------

 fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
 fisWithSize n f is = case n of
-  Left   n'   -> fisWith (Just (\x -> length x == (n'+1)    )) f is
-  Right (a,b) -> fisWith (Just (\x -> cond1 a x && cond2 b x)) f is
+  Point  n'   -> fisWith (Just (\x -> length x == (n'+1) )) f is
+  Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is
                    where
-                      cond1 a' x = length x >= a'
-                      cond2 b' x = length x <= b'
+                      cond a' x b' = a' <= x && x <= b'


 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
-fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
+fisWith s f is = catMaybes $ map items2fis $ filter' $ runLCMmatrix is f
  where
    filter' = case s of
                Nothing  -> identity
                Just fun -> filter fun

+-- Here the sole purpose to take the keys as a Set is tell we do not want
+-- duplicates.
+fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
+fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
+  where
+    ksv = V.fromList $ Set.toList ks
+    ksm = Map.fromList . flip zip [0..] $ V.toList ksv
+    toItem = (ksm Map.!)
+    fromItem = (ksv V.!)
+
+fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
+fisWithSizePoly2 n f is = fisWithSizePoly n f ks is
+  where
+    ks = Set.fromList $ concat is
+
+
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------


--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -42,16 +42,23 @@ import Gargantext.Core.Types
 import Gargantext.Text.Terms.Multi (multiterms)
 import Gargantext.Text.Terms.Mono  (monoterms')

-data TermType = Mono | Multi
+data TermType = Mono | Multi | MonoMulti

 -- remove Stop Words
 -- map (filter (\t -> not . elem t)) $ 
 ------------------------------------------------------------------------
+-- | Sugar to extract terms from text (hiddeng mapM from end user).
 extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
 extractTerms termType lang = mapM (terms termType lang)
 ------------------------------------------------------------------------
+-- | Terms from Text
+-- Mono : mono terms
+-- Multi : multi terms
+-- MonoMulti : mono and multi
+-- TODO : multi terms should exclude mono (intersection is not empty yet)
 terms :: TermType -> Lang -> Text -> IO [Terms]
-terms Mono  lang txt = pure $ monoterms' lang txt
-terms Multi lang txt = multiterms lang txt
+terms Mono  lang txt     = pure $ monoterms' lang txt
+terms Multi lang txt     = multiterms lang txt
+terms MonoMulti lang txt = terms Multi lang txt
 ------------------------------------------------------------------------

--- a/src/Gargantext/Text/Terms/Multi.hs
+++ b/src/Gargantext/Text/Terms/Multi.hs
@@ -18,23 +18,27 @@ module Gargantext.Text.Terms.Multi (multiterms)

 import Data.Text hiding (map, group, filter, concat)
 import Data.List (concat)
+import qualified Data.Set as S

 import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Types

 import Gargantext.Text.Terms.Multi.PosTagging
+import Gargantext.Text.Terms.Mono.Stem (stem)
 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr

 multiterms :: Lang -> Text -> IO [Terms]
 multiterms lang txt = concat
-                   <$> map (map tokenTag2terms)
+                   <$> map (map (tokenTag2terms lang))
                   <$> map (filter (\t -> _my_token_pos t == Just NP)) 
                   <$> tokenTags lang txt

-tokenTag2terms :: TokenTag -> Terms
-tokenTag2terms (TokenTag w t _ _) =  Terms w t
+tokenTag2terms :: Lang -> TokenTag -> Terms
+tokenTag2terms lang (TokenTag w t _ _) =  Terms w t'
+  where
+    t' = S.fromList $ map (stem lang) $ S.toList t

 tokenTags :: Lang -> Text -> IO [[TokenTag]]
 tokenTags lang s = map (group lang) <$> tokenTags' lang s

--- a/src/Gargantext/Text/Terms/Multi/Lang/En.hs
+++ b/src/Gargantext/Text/Terms/Multi/Lang/En.hs
@@ -28,9 +28,9 @@ group :: [TokenTag] -> [TokenTag]
 group [] = []
 group ntags = group2 NP NP
           $ group2 NP VB
-           $ group2 NP IN
+--           $ group2 NP IN
           $ group2 IN DT
-           $ group2 VB NP
+--           $ group2 VB NP
           $ group2 JJ NP
           $ group2 JJ JJ
           $ group2 JJ CC

--- a/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
+++ b/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
@@ -27,8 +27,8 @@ group :: [TokenTag] -> [TokenTag]
 group [] = []
 group ntags = group2 NP NP
            $ group2 NP VB
-            $ group2 NP IN
-            $ group2 IN DT
+            -- $ group2 NP IN
+            -- $ group2 IN DT
            $ group2 VB NP
            $ group2 JJ NP
            $ group2 NP JJ

--- a/src/Gargantext/Text/Terms/Multi/PosTagging.hs
+++ b/src/Gargantext/Text/Terms/Multi/PosTagging.hs
@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators  :: Text

 $(deriveJSON (unPrefix "_properties") ''Properties)

-data Sentences = Sentences { _sentences :: [Sentence]}
+data PosSentences = PosSentences { _sentences :: [Sentence]}
  deriving (Show, Generic)

-$(deriveJSON (unPrefix "_") ''Sentences)
+$(deriveJSON (unPrefix "_") ''PosSentences)


 -- request = 
@@ -134,7 +134,7 @@ corenlpRaw lang txt = do
  pure (getResponseBody response)


-corenlp :: Lang -> Text -> IO Sentences
+corenlp :: Lang -> Text -> IO PosSentences
 corenlp lang txt = do
  response <- corenlp' lang txt
  pure (getResponseBody response)

--- a/src/Gargantext/Viz/Graph.hs
+++ b/src/Gargantext/Viz/Graph.hs
@@ -60,8 +60,3 @@ $(deriveJSON (unPrefix "g_") ''Graph)



-
-
-
-
-
--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -28,10 +28,12 @@ Implementation use Accelerate library :

 -}

-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE FlexibleContexts  #-}
-{-# LANGUAGE TypeFamilies      #-}
-{-# LANGUAGE TypeOperators     #-}
+{-# LANGUAGE NoImplicitPrelude   #-}
+{-# LANGUAGE FlexibleContexts    #-}
+{-# LANGUAGE TypeFamilies        #-}
+{-# LANGUAGE TypeOperators       #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+

 module Gargantext.Viz.Graph.Distances.Matrice
  where
@@ -46,7 +48,7 @@ import Data.Maybe (Maybe(Just))
 import qualified Gargantext.Prelude as P
 import qualified Data.Array.Accelerate.Array.Representation     as Repr

-import Gargantext.Text.Metrics.Occurrences
+import Gargantext.Text.Metrics.Count


 -----------------------------------------------------------------------
@@ -67,33 +69,48 @@ myMat n = matrix n [1..]
 rank :: (Matrix a) -> Int
 rank m = arrayRank $ arrayShape m

-rank' :: (Matrix a) -> Int
-rank' m = n
+-----------------------------------------------------------------------
+-- | Dimension of a square Matrix
+-- How to force use with SquareMatrix ?
+type Dim = Int
+
+dim :: (Matrix a) -> Dim
+dim m = n
  where
    Z :. _ :. n = arrayShape m
+    -- == indexTail (arrayShape m)

 -----------------------------------------------------------------------
-- | Conditional Distance
+proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
+proba r mat = zipWith (/) mat (mkSum r mat)

-type Rank = Int
+mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
+mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat

-proba :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
-proba r mat = zipWith (/) mat (mkSum r mat)
+-- divByDiag 
+divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
+divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
+  where
+    diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
+    diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
+-----------------------------------------------------------------------

-mkSum :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
-mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) 
-            $ fold (+) 0 mat
+miniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
+miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
+  where
+    miniMax' = (the $ minimum $ maximum m)

+-- | Conditional distance (basic version)
+conditional :: Matrix Int -> Matrix Double
+conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)

-type Matrix' a = Acc (Matrix a)
-type InclusionExclusion    = Double
-type SpecificityGenericity = Double

-conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
-conditional m = (run $ ie (use m), run $ sg (use m))
+-- | Conditional distance (advanced version)
+conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
+conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
  where

-    ie :: Matrix' Double -> Matrix' Double
+    ie :: Acc (Matrix Double) -> Acc (Matrix Double)
    ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
    sg :: Acc (Matrix Double) -> Acc (Matrix Double)
    sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
@@ -101,27 +118,21 @@ conditional m = (run $ ie (use m), run $ sg (use m))
    n :: Exp Double
    n = P.fromIntegral r

-    r :: Rank
-    r = rank' m
+    r :: Dim
+    r = dim m

-    xs :: Matrix' Double -> Matrix' Double
+    xs :: Acc (Matrix Double) -> Acc (Matrix Double)
    xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
    ys :: Acc (Matrix Double) -> Acc (Matrix Double)
    ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)

-- filter with threshold
 -----------------------------------------------------------------------

 -- | Distributional Distance
-
 distributional :: Matrix Int -> Matrix Double
-distributional m = run $ filter $ ri (map fromIntegral $ use m)
+distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
  where
-    n    = rank' m
-    
-    miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
-      where
-        miniMax' = (the $ minimum $ maximum m)
+    n    = dim m
    
    filter  m = zipWith (\a b -> max a b) m (transpose m)
    
@@ -139,6 +150,98 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m)
    cross mat      = zipWith (-) (mkSum n mat) (mat)


-int2double :: Matrix Int -> Matrix Double
-int2double m = run (map fromIntegral $ use m)
+
+-----------------------------------------------------------------------
+-----------------------------------------------------------------------
+
+{-
+Metric Specificity and genericity: select terms
+
+let N termes
+Ni      : occ de i
+Nij     : cooc i et j
+Probability to get i given j : P(i|j)=Nij/Nj
+Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
+Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
+Inclusion (i) = Gen(i)+Spec(i)
+Genericity score = Gen(i)- Spec(i)
+
+
+References:
+ *  Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH]
+
+-}
+
+type InclusionExclusion    = Double
+type SpecificityGenericity = Double
+
+data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
+type SymetricMatrix    = Matrix
+type NonSymetricMatrix = Matrix
+
+
+incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
+incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
+  where
+    run' fun mat = run $ fun $ map fromIntegral $ use mat
+
+    -- | Inclusion (i) = Gen(i)+Spec(i)
+    inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
+    inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
+--    
+    -- | Genericity score = Gen(i)- Spec(i)
+    specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
+    specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
+    
+    -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
+    pV :: Acc (Matrix Double) -> Acc (Vector Double)
+    pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
+    
+    -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
+    pH :: Acc (Matrix Double) -> Acc (Vector Double)
+    pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
+    
+    cardN :: Exp Double
+    cardN = constant (P.fromIntegral (dim m) :: Double)
+
+
+-- | P(i|j) = Nij /N(jj) Probability to get i given j
+p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
+p_ij m = zipWith (/) m (n_jj m)
+  where
+    n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
+    n_jj m = backpermute (shape m)
+                         (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
+                                   -> (Z :. j :. j)
+                                )
+                         ) m
+
+-- | P(j|i) = Nij /N(ii) Probability to get i given j
+-- to test
+p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
+p_ji = transpose . p_ij
+
+
+-- | Step to ckeck the result in visual/qualitative tests
+incExcSpeGen_proba :: Matrix Int -> Matrix Double
+incExcSpeGen_proba m = run' pro m
+  where
+    run' fun mat = run $ fun $ map fromIntegral $ use mat
+
+    pro mat = p_ji mat
+
+{-
+-- | Hypothesis to test maybe later (or not)
+-- TODO ask accelerate for instances to ease such writtings:
+p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
+p_ m = zipWith (/) m (n_ m)
+  where
+    n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
+    n_ m = backpermute (shape m)
+                         (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
+                                   -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
+                                )
+                         ) m
+-}
+

--- a/src/Gargantext/Viz/Graph/Index.hs
+++ b/src/Gargantext/Viz/Graph/Index.hs
@@ -39,6 +39,8 @@ import qualified Data.Set as S
 import Data.Map (Map)
 import qualified Data.Map.Strict    as M

+import Data.Vector (Vector)
+
 import Gargantext.Prelude

 type Index    = Int
@@ -50,7 +52,7 @@ score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
                -> Map (t, t) Double
 score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
  where
-    (toI, fromI) = createIndexes m
+    (toI, fromI) = createIndices m

 -------------------------------------------------------------------------------
 -------------------------------------------------------------------------------
@@ -79,21 +81,30 @@ toIndex ni ns = indexConversion ni ns

 fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
 fromIndex ni ns = indexConversion ni ns
---------------------------------------------------------------------------------
+
 indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
 indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
+---------------------------------------------------------------------------------
+
 -------------------------------------------------------------------------------
-------------------------------------------------------------------------------
-createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
-createIndexes = set2indexes . cooc2set
+-- TODO
+fromIndex' :: Ord t => Vector t -> Map (Index, Index) a -> Map (t,t) a
+fromIndex' vi ns = undefined
+
+-- TODO
+createIndices' :: Ord t => Map (t, t) b -> (Map t Index, Vector t)
+createIndices' = undefined
+
+createIndices :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
+createIndices = set2indices . map2set
  where
-    cooc2set :: Ord t => Map (t, t) a -> Set t
-    cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
+    map2set :: Ord t => Map (t, t) a -> Set t
+    map2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
      where
        insert as s = foldl' (\s' t -> S.insert t s') s as

-    set2indexes :: Ord t => Set t -> (Map t Index, Map Index t)
-    set2indexes s = (M.fromList toIndex', M.fromList fromIndex')
+    set2indices :: Ord t => Set t -> (Map t Index, Map Index t)
+    set2indices s = (M.fromList toIndex', M.fromList fromIndex')
      where
        fromIndex' = zip [0..] xs
        toIndex'   = zip xs [0..]

--- a/stack.yaml
+++ b/stack.yaml
@@ -14,7 +14,7 @@ extra-deps:
 - git: https://github.com/delanoe/servant-static-th.git
  commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
 - accelerate-1.2.0.0
- accelerate-io-1.2.0.0
+- hashtables-1.2.3.0 # needed by accelerate-1.2.0.0
 - aeson-1.2.4.0
 - aeson-lens-0.5.0.0
 - duckling-0.1.3.0