Commit 1f520fdb authored by Alexandre Delanoë's avatar Alexandre Delanoë

[NGRAMS SELECTION] Flow, dim reduction.

parent 99d21028
......@@ -72,6 +72,7 @@ library:
- Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index
dependencies:
- array
- QuickCheck
- accelerate
- aeson
......@@ -107,6 +108,7 @@ library:
- http-api-data
- http-types
- hsparql
- hstatistics
- hxt
- hlcm
- ini
......
......@@ -98,8 +98,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns
zs = drop b $ drop a ns
a = 10
b = 5000
a = 50
b = 1000
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
......
......@@ -20,6 +20,9 @@ module Gargantext.Text.Metrics
--import Data.Array.Accelerate ((:.)(..), Z(..))
--import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import Data.Map (Map)
import Data.List.Extra (sortOn)
import GHC.Real (round)
......@@ -28,20 +31,28 @@ import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate as DAA
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.List as List
import qualified Data.Map as Map
import Numeric.Statistics.PCA (pcaReduceN)
import qualified Data.Vector.Storable as Vec
import Data.Array.IArray (Array, listArray, elems)
type GraphListSize = Int
type InclusionSize = Int
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
takeScored listSize incSize = map _scored_terms
. linearTakes listSize incSize _scored_speGen
_scored_incExc
. scored
scored :: Ord t => Map (t,t) Int -> [Scored t]
scored = map2scored . (reduceDim 2) . scored2map
data Scored ts = Scored
{ _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion
......@@ -49,10 +60,26 @@ data Scored ts = Scored
} deriving (Show)
reduceDim :: Ord t => Int -> Map t (Vec.Vector Double)
-> Map t (Vec.Vector Double)
reduceDim d ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
where
ss'' :: Array Int (Vec.Vector Double)
ss'' = listArray (1, List.length ss') ss'
(txts,ss') = List.unzip $ Map.toList ss
scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
-- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around.
scored :: Ord t => Map (t,t) Int -> [Scored t]
scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
scored' :: Ord t => Map (t,t) Int -> [Scored t]
scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) scores
where
(ti, fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
......@@ -67,7 +94,7 @@ linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize
-> (a -> b2) -> (a -> b1) -> [a] -> [a]
linearTakes gls incSize speGen incExc = take gls
. L.concat
. List.concat
. map (take $ round
$ (fromIntegral gls :: Double)
/ (fromIntegral incSize :: Double)
......
......@@ -284,11 +284,11 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
-- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
-- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
pV :: Acc (Matrix Double) -> Acc (Vector Double)
......
......@@ -19,6 +19,9 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git
commit: ba5347e7d8a13ce5275af8470c15b2305fbb23af
- git: https://github.com/delanoe/hstatistics.git
commit: 90eef7604bb230644c2246eccd094d7bfefcb135
#- opaleye-0.6.7002.0
- KMP-0.1.0.2
- accelerate-1.2.0.0
......@@ -27,13 +30,13 @@ extra-deps:
- full-text-search-0.2.1.4
- fullstop-0.1.4
- hgal-2.0.0.2
- rdf4h-3.1.1
- located-base-0.1.1.1
- multiset-0.3.4.1 # stack test
- probable-0.1.3
- rake-0.0.1
- rdf4h-3.1.1
- serialise-0.2.0.0
- servant-flatten-0.2
- servant-multipart-0.11.2
- stemmer-0.5.2
- validity-0.9.0.0 # patches-{map,class}
- multiset-0.3.4.1 # stack test
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment