Commit 1f520fdb authored by Alexandre Delanoë's avatar Alexandre Delanoë

[NGRAMS SELECTION] Flow, dim reduction.

parent 99d21028
...@@ -72,6 +72,7 @@ library: ...@@ -72,6 +72,7 @@ library:
- Gargantext.Viz.Graph.Distances.Matrice - Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index - Gargantext.Viz.Graph.Index
dependencies: dependencies:
- array
- QuickCheck - QuickCheck
- accelerate - accelerate
- aeson - aeson
...@@ -107,6 +108,7 @@ library: ...@@ -107,6 +108,7 @@ library:
- http-api-data - http-api-data
- http-types - http-types
- hsparql - hsparql
- hstatistics
- hxt - hxt
- hlcm - hlcm
- ini - ini
......
...@@ -98,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs ...@@ -98,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns ys = take b $ drop a ns
zs = drop b $ drop a ns zs = drop b $ drop a ns
a = 10 a = 50
b = 5000 b = 1000
isStopTerm :: Text -> Bool isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3 isStopTerm x = Text.length x < 3
......
...@@ -20,6 +20,9 @@ module Gargantext.Text.Metrics ...@@ -20,6 +20,9 @@ module Gargantext.Text.Metrics
--import Data.Array.Accelerate ((:.)(..), Z(..)) --import Data.Array.Accelerate ((:.)(..), Z(..))
--import Math.KMeans (kmeans, euclidSq, elements) --import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import Data.Map (Map) import Data.Map (Map)
import Data.List.Extra (sortOn) import Data.List.Extra (sortOn)
import GHC.Real (round) import GHC.Real (round)
...@@ -28,20 +31,28 @@ import Gargantext.Viz.Graph.Distances.Matrice ...@@ -28,20 +31,28 @@ import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate as DAA import qualified Data.Array.Accelerate as DAA
import qualified Data.Array.Accelerate.Interpreter as DAA import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.List as L import qualified Data.List as List
import qualified Data.Map as M import qualified Data.Map as Map
import Numeric.Statistics.PCA (pcaReduceN)
import qualified Data.Vector.Storable as Vec
import Data.Array.IArray (Array, listArray, elems)
type GraphListSize = Int type GraphListSize = Int
type InclusionSize = Int type InclusionSize = Int
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t] takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
takeScored listSize incSize = map _scored_terms takeScored listSize incSize = map _scored_terms
. linearTakes listSize incSize _scored_speGen . linearTakes listSize incSize _scored_speGen
_scored_incExc _scored_incExc
. scored . scored
scored :: Ord t => Map (t,t) Int -> [Scored t]
scored = map2scored . (reduceDim 2) . scored2map
data Scored ts = Scored data Scored ts = Scored
{ _scored_terms :: !ts { _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion , _scored_incExc :: !InclusionExclusion
...@@ -49,10 +60,26 @@ data Scored ts = Scored ...@@ -49,10 +60,26 @@ data Scored ts = Scored
} deriving (Show) } deriving (Show)
reduceDim :: Ord t => Int -> Map t (Vec.Vector Double)
-> Map t (Vec.Vector Double)
reduceDim d ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
where
ss'' :: Array Int (Vec.Vector Double)
ss'' = listArray (1, List.length ss') ss'
(txts,ss') = List.unzip $ Map.toList ss
scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
-- TODO in the textflow we end up needing these indices, it might be better -- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around. -- to compute them earlier and pass them around.
scored :: Ord t => Map (t,t) Int -> [Scored t] scored' :: Ord t => Map (t,t) Int -> [Scored t]
scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) scores
where where
(ti, fi) = createIndices m (ti, fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m (is, ss) = incExcSpeGen $ cooc2mat ti m
...@@ -67,7 +94,7 @@ linearTakes :: (Ord b1, Ord b2) ...@@ -67,7 +94,7 @@ linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize => GraphListSize -> InclusionSize
-> (a -> b2) -> (a -> b1) -> [a] -> [a] -> (a -> b2) -> (a -> b1) -> [a] -> [a]
linearTakes gls incSize speGen incExc = take gls linearTakes gls incSize speGen incExc = take gls
. L.concat . List.concat
. map (take $ round . map (take $ round
$ (fromIntegral gls :: Double) $ (fromIntegral gls :: Double)
/ (fromIntegral incSize :: Double) / (fromIntegral incSize :: Double)
......
...@@ -284,11 +284,11 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m) ...@@ -284,11 +284,11 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | Inclusion (i) = Gen(i)+Spec(i) -- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double) inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pH mat) inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
-- | Genericity score = Gen(i)- Spec(i) -- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double) specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (-) (pV mat) (pH mat) specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
-- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
pV :: Acc (Matrix Double) -> Acc (Vector Double) pV :: Acc (Matrix Double) -> Acc (Vector Double)
......
...@@ -19,6 +19,9 @@ extra-deps: ...@@ -19,6 +19,9 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5 commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git - git: https://github.com/delanoe/servant-static-th.git
commit: ba5347e7d8a13ce5275af8470c15b2305fbb23af commit: ba5347e7d8a13ce5275af8470c15b2305fbb23af
- git: https://github.com/delanoe/hstatistics.git
commit: 90eef7604bb230644c2246eccd094d7bfefcb135
#- opaleye-0.6.7002.0 #- opaleye-0.6.7002.0
- KMP-0.1.0.2 - KMP-0.1.0.2
- accelerate-1.2.0.0 - accelerate-1.2.0.0
...@@ -27,13 +30,13 @@ extra-deps: ...@@ -27,13 +30,13 @@ extra-deps:
- full-text-search-0.2.1.4 - full-text-search-0.2.1.4
- fullstop-0.1.4 - fullstop-0.1.4
- hgal-2.0.0.2 - hgal-2.0.0.2
- rdf4h-3.1.1
- located-base-0.1.1.1 - located-base-0.1.1.1
- multiset-0.3.4.1 # stack test
- probable-0.1.3 - probable-0.1.3
- rake-0.0.1 - rake-0.0.1
- rdf4h-3.1.1
- serialise-0.2.0.0 - serialise-0.2.0.0
- servant-flatten-0.2 - servant-flatten-0.2
- servant-multipart-0.11.2 - servant-multipart-0.11.2
- stemmer-0.5.2 - stemmer-0.5.2
- validity-0.9.0.0 # patches-{map,class} - validity-0.9.0.0 # patches-{map,class}
- multiset-0.3.4.1 # stack test
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment