[NGRAMS SELECTION] Flow, dim reduction.

1f520fdb · Alexandre Delanoë · 99d21028 · 1f520fdb · 1f520fdb · 1f520fdb
Commit 1f520fdb authored Mar 14, 2019 by Alexandre Delanoë
5 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -72,6 +72,7 @@ library:
  - Gargantext.Viz.Graph.Distances.Matrice
  - Gargantext.Viz.Graph.Index
  dependencies:
+  - array
  - QuickCheck
  - accelerate
  - aeson
@@ -107,6 +108,7 @@ library:
  - http-api-data
  - http-types
  - hsparql
+  - hstatistics
  - hxt
  - hlcm
  - ini

--- a/src/Gargantext/Text/List.hs
+++ b/src/Gargantext/Text/List.hs
@@ -98,8 +98,8 @@ toTermList stop ns =  map (toTermList' stop CandidateTerm) xs
      ys = take b $ drop a ns
      zs = drop b $ drop a ns
-      a = 10
+      a = 50
-      b = 5000
+      b = 1000
 isStopTerm :: Text -> Bool
 isStopTerm x = Text.length x < 3

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -20,6 +20,9 @@ module Gargantext.Text.Metrics
 --import Data.Array.Accelerate ((:.)(..), Z(..))
 --import Math.KMeans (kmeans, euclidSq, elements)
+--import GHC.Float (exp)
 import Data.Map (Map)
 import Data.List.Extra (sortOn)
 import GHC.Real (round)
@@ -28,20 +31,28 @@ import Gargantext.Viz.Graph.Distances.Matrice
 import Gargantext.Viz.Graph.Index
 import qualified Data.Array.Accelerate as DAA
 import qualified Data.Array.Accelerate.Interpreter as DAA
-import qualified Data.List as L
+import qualified Data.List as List
-import qualified Data.Map  as M
+import qualified Data.Map  as Map
+import Numeric.Statistics.PCA (pcaReduceN)
+import qualified Data.Vector.Storable as Vec
+import Data.Array.IArray (Array, listArray, elems)
 type GraphListSize = Int
 type InclusionSize = Int
 takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
 takeScored listSize incSize = map _scored_terms
                            . linearTakes listSize incSize _scored_speGen
                                                           _scored_incExc
                            . scored
+scored :: Ord t => Map (t,t) Int -> [Scored t]
+scored = map2scored . (reduceDim 2) . scored2map
 data Scored ts = Scored
  { _scored_terms  :: !ts
  , _scored_incExc :: !InclusionExclusion
@@ -49,10 +60,26 @@ data Scored ts = Scored
  } deriving (Show)
+reduceDim :: Ord t => Int -> Map t (Vec.Vector Double)
+                          -> Map t (Vec.Vector Double)
+reduceDim d ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
+  where
+    ss'' :: Array Int (Vec.Vector Double)
+    ss'' = listArray (1, List.length ss') ss'
+    (txts,ss') = List.unzip $ Map.toList ss
+scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
+map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
+map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
 -- TODO in the textflow we end up needing these indices, it might be better
 -- to compute them earlier and pass them around.
-scored :: Ord t => Map (t,t) Int -> [Scored t]
+scored' :: Ord t => Map (t,t) Int -> [Scored t]
-scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) scores
  where
    (ti, fi) = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
@@ -67,7 +94,7 @@ linearTakes :: (Ord b1, Ord b2)
            => GraphListSize -> InclusionSize
            -> (a -> b2) -> (a -> b1) -> [a] -> [a]
 linearTakes gls incSize speGen incExc = take gls
-                      . L.concat
+                      . List.concat
                      . map (take $ round
                                  $ (fromIntegral gls     :: Double)
                                  / (fromIntegral incSize :: Double)

--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -284,11 +284,11 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
    -- | Inclusion (i) = Gen(i)+Spec(i)
    inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
-    inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
+    inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
    -- | Genericity score = Gen(i)- Spec(i)
    specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
-    specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
+    specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
    -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
    pV :: Acc (Matrix Double) -> Acc (Vector Double)

--- a/stack.yaml
+++ b/stack.yaml
@@ -19,6 +19,9 @@ extra-deps:
  commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
 - git: https://github.com/delanoe/servant-static-th.git
  commit: ba5347e7d8a13ce5275af8470c15b2305fbb23af
+- git: https://github.com/delanoe/hstatistics.git
+  commit: 90eef7604bb230644c2246eccd094d7bfefcb135
 #- opaleye-0.6.7002.0
 - KMP-0.1.0.2
 - accelerate-1.2.0.0
@@ -27,13 +30,13 @@ extra-deps:
 - full-text-search-0.2.1.4
 - fullstop-0.1.4
 - hgal-2.0.0.2
- rdf4h-3.1.1
 - located-base-0.1.1.1
+- multiset-0.3.4.1 # stack test
 - probable-0.1.3
 - rake-0.0.1
+- rdf4h-3.1.1
 - serialise-0.2.0.0
 - servant-flatten-0.2
 - servant-multipart-0.11.2
 - stemmer-0.5.2
 - validity-0.9.0.0 # patches-{map,class}
- multiset-0.3.4.1 # stack test