Commit 4a4ade7d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Cooc -> Matrix conversions tools.

parent df2a11cf
...@@ -35,6 +35,7 @@ library: ...@@ -35,6 +35,7 @@ library:
dependencies: dependencies:
- QuickCheck - QuickCheck
- accelerate - accelerate
- accelerate-io
- aeson - aeson
- aeson-lens - aeson-lens
- aeson-pretty - aeson-pretty
......
...@@ -10,29 +10,44 @@ Portability : POSIX ...@@ -10,29 +10,44 @@ Portability : POSIX
-} -}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Pipeline module Gargantext.Pipeline
where where
import Data.Text (unpack)
import qualified Data.Text as DT
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
----------------------------------------------
----------------------------------------------
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (map', createIndexes)
import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double)
import Gargantext.Text.Metrics.Occurrences import Gargantext.Text.Metrics.Occurrences
import Gargantext.Text.Terms import Gargantext.Text.Terms
import Gargantext.Text.Context import Gargantext.Text.Context
import Data.Array.Accelerate as A
pipeline pth = do pipeline pth = do
text <- readFile pth text <- readFile pth
let contexts = splitBy Sentences 4 text let contexts = splitBy Sentences 4 text
cooc <$> map occurrences <$> mapM (terms Mono FR) contexts myterms <- mapM (terms Multi FR) contexts
-- todo -- todo filter stop words
let myCooc = removeApax $ cooc myterms
--pure myCooc
-- Cooc map -> Matrix -- Cooc map -> Matrix
-- distributional or conditional --pure $ createIndexes myCooc
pure $ map' int2double myCooc
-- Matrix -> Graph -- Matrix -> Graph
...@@ -49,6 +49,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer ...@@ -49,6 +49,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, otherwise, when , otherwise, when
, undefined , undefined
, IO() , IO()
, compare
) )
-- TODO import functions optimized in Utils.Count -- TODO import functions optimized in Utils.Count
...@@ -106,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs ...@@ -106,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs
ma :: [Double] -> [Double] ma :: [Double] -> [Double]
ma = movingAverage 3 ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
-- | Function to split a range into chunks -- | Function to split a range into chunks
chunkAlong :: Int -> Int -> [a] -> [[a]] chunkAlong :: Int -> Int -> [a] -> [[a]]
...@@ -227,3 +234,8 @@ zipSnd f xs = zip xs (f xs) ...@@ -227,3 +234,8 @@ zipSnd f xs = zip xs (f xs)
unMaybe :: [Maybe a] -> [a] unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
...@@ -29,12 +29,17 @@ module Gargantext.Text.Metrics.Occurrences ...@@ -29,12 +29,17 @@ module Gargantext.Text.Metrics.Occurrences
where where
import Control.Arrow ((***))
import qualified Data.List as List
import Data.Map.Strict (Map import Data.Map.Strict (Map
, empty , empty, singleton
, insertWith, insertWithKey, unionWith , insertWith, insertWithKey, unionWith
, toList , toList, lookup, mapKeys
) )
import Data.Set (Set) import Data.Set (Set)
import qualified Data.Set as Set
import Data.Text (pack)
import qualified Data.Map.Strict as DMS import qualified Data.Map.Strict as DMS
import Control.Monad ((>>),(>>=)) import Control.Monad ((>>),(>>=))
import Data.String (String()) import Data.String (String())
...@@ -53,6 +58,7 @@ data Group = ByStem | ByOntology ...@@ -53,6 +58,7 @@ data Group = ByStem | ByOntology
type Grouped = Stems type Grouped = Stems
{-
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"] -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN) -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]] -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
...@@ -66,32 +72,45 @@ type Grouped = Stems ...@@ -66,32 +72,45 @@ type Grouped = Stems
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)] --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
---- ----
cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a -}
cooc ts = cooc' $ map cooc'' ts
type Occs = Int
type Coocs = Int
cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
cooc' = foldl' (\x y -> unionWith (+) x y) empty removeApax = DMS.filter (> 1)
cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a cooc :: [[Terms]] -> Map (Label, Label) Int
cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs cooc tss =
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
where where
xs =[ ((x'',y''), c') | x' <- toList m terms_occs = occurrences (List.concat tss)
, y' <- toList m delta f = f *** f
, let x'' = fst x'
, let y'' = fst y'
, x'' < y'' labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
, let c' = 1 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
--, let c' = snd x' + snd y' Just label -> label
] Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
where
xs = [ ((x, y), 1)
| xs <- tss
, ys <- tss
, x <- Set.toList xs
, y <- Set.toList ys
, x < y
]
-- | Compute the grouped occurrences (occ) -- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped Int occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem occurrences = occurrences' _terms_stem
occurrences' :: Ord b => (a -> b) -> [a] -> Occ b occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
-- TODO add groups and filter stops -- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a sumOcc :: Ord a => [Occ a] -> Occ a
......
...@@ -43,6 +43,8 @@ import Gargantext.Text.Terms.Mono (monoterms') ...@@ -43,6 +43,8 @@ import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi data TermType = Mono | Multi
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------ ------------------------------------------------------------------------
terms :: TermType -> Lang -> Text -> IO [Terms] terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt terms Mono lang txt = pure $ monoterms' lang txt
......
...@@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph ...@@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph
--stop :: [Ngrams] -> [Ngrams] --stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs --stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
{-|
Module : Gargantext.Graph.Distances
Description : Distance management tools
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Viz.Graph.Distances
where
...@@ -48,6 +48,13 @@ import Data.Maybe (Maybe(Just)) ...@@ -48,6 +48,13 @@ import Data.Maybe (Maybe(Just))
import qualified Gargantext.Prelude as P import qualified Gargantext.Prelude as P
import qualified Data.Array.Accelerate.Array.Representation as Repr import qualified Data.Array.Accelerate.Array.Representation as Repr
import Gargantext.Text.Metrics.Occurrences
-----------------------------------------------------------------------
-- Test perf.
distriTest = distributional $ myMat 100
-----------------------------------------------------------------------
vector :: Int -> (Array (Z :. Int) Int) vector :: Int -> (Array (Z :. Int) Int)
vector n = fromList (Z :. n) [0..n] vector n = fromList (Z :. n) [0..n]
...@@ -55,14 +62,14 @@ vector n = fromList (Z :. n) [0..n] ...@@ -55,14 +62,14 @@ vector n = fromList (Z :. n) [0..n]
matrix :: Elt c => Int -> [c] -> Matrix c matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l matrix n l = fromList (Z :. n :. n) l
myMat :: Int -> Matrix Double myMat :: Int -> Matrix Int
myMat n = matrix n [1..] myMat n = matrix n [1..]
-- | Two ways to get the rank (as documentation) -- | Two ways to get the rank (as documentation)
rank :: (Matrix Double) -> Int rank :: (Matrix a) -> Int
rank m = arrayRank $ arrayShape m rank m = arrayRank $ arrayShape m
rank' :: (Matrix Double) -> Int rank' :: (Matrix a) -> Int
rank' m = n rank' m = n
where where
Z :. _ :. n = arrayShape m Z :. _ :. n = arrayShape m
...@@ -109,8 +116,8 @@ conditional m = (run $ ie (use m), run $ sg (use m)) ...@@ -109,8 +116,8 @@ conditional m = (run $ ie (use m), run $ sg (use m))
-- | Distributional Distance -- | Distributional Distance
distributional :: Matrix Double -> Matrix Double distributional :: Matrix Int -> Matrix Double
distributional m = run $ filter $ ri (use m) distributional m = run $ filter $ ri (map fromIntegral $ use m)
where where
n = rank' m n = rank' m
...@@ -131,3 +138,7 @@ distributional m = run $ filter $ ri (use m) ...@@ -131,3 +138,7 @@ distributional m = run $ filter $ ri (use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m)) crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat) cross mat = zipWith (-) (mkSum n mat) (mat)
int2double :: Matrix Int -> Matrix Double
int2double m = run (map fromIntegral $ use m)
{-|
Module : Gargantext.Graph.Distances.Utils
Description : Tools to compute distances from Cooccurrences
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Basically @compute@ takes an accelerate function as first input, a Map
of coccurrences as second input and outputs a Map automatically using
indexes.
TODO:
--cooc2fgl :: Ord t, Integral n => Map (t, t) n -> Graph
--fgl2json
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Viz.Graph.Index
where
import qualified Data.Array.Accelerate as A
import qualified Data.Array.Accelerate.IO.Data.Vector.Unboxed as AU
import qualified Data.Vector.Unboxed as DVU
import Data.List (concat)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Map (Map)
import qualified Data.Map.Strict as M
import Gargantext.Prelude
type Index = Int
-------------------------------------------------------------------------------
{-
map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map'' f m = back . f' . from m
where
from (fs, m') = unzip $ M.toAscList m
f' = f $ A.fromList shape m'
shape = (A.Z A.:. n A.:. n)
back = M.fromAscList . zip fs . A.toList
-}
-------------------------------------------------------------------------------
map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
map'' m = cooc2mat toI m
where
(toI, fromI) = createIndexes m
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> A.Matrix Int
cooc2mat ti m = A.fromFunction shape (\(A.Z A.:. x A.:. y) -> lookup' x y)
where
shape = (A.Z A.:. n A.:. n)
n = M.size ti
lookup' x y = maybe 0 identity (M.lookup (x,y) (toIndex ti m))
mat2cooc :: A.Matrix Double -> Map (Index, Index) Double
mat2cooc m = M.fromList $ concat -- [((Int,Int), Double)]
$ map (\(x,xs) -> map (\(y,ys) -> ((x,y),ys)) xs) -- [[((Int,Int), Double)]]
$ zip ([1..] :: [Int]) -- [(Int, [(Int, Double)]]
$ map (zip ([1..] :: [Int])) -- [[(Int, Double)]]
$ splitEvery n (A.toList m) -- [[Double]]
where
A.Z A.:. _ A.:. n = A.arrayShape m
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Ord t => Map t Index -> Map (t,t) a -> Map (Index,Index) a
toIndex ni ns = indexConversion ni ns
fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
fromIndex ni ns = indexConversion ni ns
---------------------------------------------------------------------------------
indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
createIndexes = set2indexes . cooc2set
where
cooc2set :: Ord t => Map (t, t) a -> Set t
cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
where
insert as s = foldl' (\s' t -> S.insert t s') s as
set2indexes :: Ord t => Set t -> (Map t Index, Map Index t)
set2indexes s = (M.fromList toIndex', M.fromList fromIndex')
where
fromIndex' = zip [1..] (S.toList s)
toIndex' = zip (S.toList s) [1..]
...@@ -7,6 +7,8 @@ Maintainer : team@gargantext.org ...@@ -7,6 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
These functions are used for Vector.Matrix only.
-} -}
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
...@@ -34,65 +36,16 @@ import Gargantext.Prelude ...@@ -34,65 +36,16 @@ import Gargantext.Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Some utils to build the matrix from cooccurrence results -- | Some utils to build the matrix from cooccurrence results
type Distance = Double -- | For tests only, to be removed
type Cooc = Int -- m1 :: Matrix Double
type NgramId = Int -- m1 = fromList 300 300 [1..]
type Index = Int ------------------------------------------------------------------------
-- Type Families
--type Matrix' Index a
--type Matrix' NgramId a
data Matrice a = Matrice { matrice_fromIndex :: !(Map Index NgramId)
, matrice_toIndex :: !(Map NgramId Index)
, matrice :: !(Matrix a)
} deriving (Show)
--fromMatrice :: Matrice Double -> [(NgramId, NgramId, Double)]
--fromMatrice m = undefined
toMatrice :: [(NgramId, NgramId, Int)] -> Matrice Double
toMatrice ns = Matrice fromIndx toIndx m
where
s = cooc2set ns
(fromIndx, toIndx) = set2indexes s
n = (length (S.toList s))
idx = toIndex toIndx ns
m = matrix n n (\x -> maybe 0 identity (fromIntegral <$> M.lookup x idx))
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Map NgramId Index -> [(NgramId, NgramId, a)] -> Map (Index,Index) a
toIndex ni ns = to ni ns
fromIndex :: Map Index NgramId -> [(Index, Index, a)] -> Map (NgramId,NgramId) a
fromIndex ni ns = to ni ns
-------------------------------------------------------------------------------
to :: (Ord b, Ord k) => Map k b -> [(k, k, a)] -> Map (b, b) a
to index ns = M.fromList $ map (\(a1,a2,c) -> ( ( (M.!) index a1
, (M.!) index a2
)
, c
)
) ns
-------------------------------------------------------------------------------
cooc2set :: [(NgramId, NgramId, a)] -> Set NgramId
cooc2set cs' = foldl' (\s (a1,a2,_) -> insert [a1,a2] s ) S.empty cs'
where
insert as s = foldl' (\s' a -> S.insert a s') s as
set2indexes :: Set NgramId -> (Map Index NgramId, Map NgramId Index)
set2indexes s = (M.fromList fromIndex', M.fromList toIndex')
where
s' = S.toList s
fromIndex' = zip [1..] s'
toIndex' = zip s' [1..]
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Axis = Col | Row
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
-- Data.Vector.Additions -- Data.Vector.Additions
dropAt :: Int -> Vector a -> Vector a dropAt :: Int -> Vector a -> Vector a
dropAt n v = debut <> (V.tail fin) dropAt n v = debut <> (V.tail fin)
...@@ -100,28 +53,6 @@ dropAt n v = debut <> (V.tail fin) ...@@ -100,28 +53,6 @@ dropAt n v = debut <> (V.tail fin)
debut = V.take n v debut = V.take n v
fin = V.drop n v fin = V.drop n v
------------------------------------------------------------------------
data Axis = Col | Row
---- | Matrix Algebra
--data Algebra a = Point a | Vector a | Matrix a
--
--multiply :: Algebra a -> Matrix a -> Matrix a
--multiply (Point a) = undefined
--multiply (Vector a) = undefined
--multiply (Matrix a) = undefined
--
--div :: Fractional a => Matrix a -> Matrix a
--div m = foldl' (\m c -> divCol c m) m [1.. (ncols m)]
-- where
-- divCol c m = mapCol (\_ x -> 1/x) c m
--
--divide :: Fractional a => Matrix a -> Matrix a -> Matrix a
--divide a b = a `multStd` (div b)
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
total :: Num a => Matrix a -> a total :: Num a => Matrix a -> a
total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m)) total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
...@@ -141,7 +72,3 @@ toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m ...@@ -141,7 +72,3 @@ toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
-- | For tests only, to be removed
m1 :: Matrix Double
m1 = fromList 300 300 [1..]
...@@ -13,9 +13,8 @@ extra-deps: ...@@ -13,9 +13,8 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5 commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git - git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434 commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
#- git: https://github.com/delanoe/accelerate.git
#commit: 007fd483a4410441fb5dd1b689a5f7dab66d27ad
- accelerate-1.2.0.0 - accelerate-1.2.0.0
- accelerate-io-1.2.0.0
- aeson-1.2.4.0 - aeson-1.2.4.0
- aeson-lens-0.5.0.0 - aeson-lens-0.5.0.0
- duckling-0.1.3.0 - duckling-0.1.3.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment