Commit 4a4ade7d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Cooc -> Matrix conversions tools.

parent df2a11cf
......@@ -35,6 +35,7 @@ library:
dependencies:
- QuickCheck
- accelerate
- accelerate-io
- aeson
- aeson-lens
- aeson-pretty
......
......@@ -10,29 +10,44 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Pipeline
where
import Data.Text (unpack)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
----------------------------------------------
----------------------------------------------
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (map', createIndexes)
import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double)
import Gargantext.Text.Metrics.Occurrences
import Gargantext.Text.Terms
import Gargantext.Text.Context
import Data.Array.Accelerate as A
pipeline pth = do
text <- readFile pth
let contexts = splitBy Sentences 4 text
cooc <$> map occurrences <$> mapM (terms Mono FR) contexts
-- todo
myterms <- mapM (terms Multi FR) contexts
-- todo filter stop words
let myCooc = removeApax $ cooc myterms
--pure myCooc
-- Cooc map -> Matrix
-- distributional or conditional
--pure $ createIndexes myCooc
pure $ map' int2double myCooc
-- Matrix -> Graph
......@@ -49,6 +49,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, otherwise, when
, undefined
, IO()
, compare
)
-- TODO import functions optimized in Utils.Count
......@@ -106,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs
ma :: [Double] -> [Double]
ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
-- | Function to split a range into chunks
chunkAlong :: Int -> Int -> [a] -> [[a]]
......@@ -227,3 +234,8 @@ zipSnd f xs = zip xs (f xs)
unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
......@@ -29,12 +29,17 @@ module Gargantext.Text.Metrics.Occurrences
where
import Control.Arrow ((***))
import qualified Data.List as List
import Data.Map.Strict (Map
, empty
, empty, singleton
, insertWith, insertWithKey, unionWith
, toList
, toList, lookup, mapKeys
)
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Text (pack)
import qualified Data.Map.Strict as DMS
import Control.Monad ((>>),(>>=))
import Data.String (String())
......@@ -53,6 +58,7 @@ data Group = ByStem | ByOntology
type Grouped = Stems
{-
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN)
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
......@@ -66,32 +72,45 @@ type Grouped = Stems
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
----
cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a
cooc ts = cooc' $ map cooc'' ts
-}
type Occs = Int
type Coocs = Int
cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a
cooc' = foldl' (\x y -> unionWith (+) x y) empty
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a
cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss =
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
where
xs =[ ((x'',y''), c') | x' <- toList m
, y' <- toList m
, let x'' = fst x'
, let y'' = fst y'
, x'' < y''
, let c' = 1
--, let c' = snd x' + snd y'
]
terms_occs = occurrences (List.concat tss)
delta f = f *** f
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
where
xs = [ ((x, y), 1)
| xs <- tss
, ys <- tss
, x <- Set.toList xs
, y <- Set.toList ys
, x < y
]
-- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped Int
occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem
occurrences' :: Ord b => (a -> b) -> [a] -> Occ b
occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
-- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a
......
......@@ -43,6 +43,8 @@ import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
......
......@@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
{-|
Module : Gargantext.Graph.Distances
Description : Distance management tools
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Viz.Graph.Distances
where
......@@ -48,6 +48,13 @@ import Data.Maybe (Maybe(Just))
import qualified Gargantext.Prelude as P
import qualified Data.Array.Accelerate.Array.Representation as Repr
import Gargantext.Text.Metrics.Occurrences
-----------------------------------------------------------------------
-- Test perf.
distriTest = distributional $ myMat 100
-----------------------------------------------------------------------
vector :: Int -> (Array (Z :. Int) Int)
vector n = fromList (Z :. n) [0..n]
......@@ -55,14 +62,14 @@ vector n = fromList (Z :. n) [0..n]
matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l
myMat :: Int -> Matrix Double
myMat :: Int -> Matrix Int
myMat n = matrix n [1..]
-- | Two ways to get the rank (as documentation)
rank :: (Matrix Double) -> Int
rank :: (Matrix a) -> Int
rank m = arrayRank $ arrayShape m
rank' :: (Matrix Double) -> Int
rank' :: (Matrix a) -> Int
rank' m = n
where
Z :. _ :. n = arrayShape m
......@@ -109,8 +116,8 @@ conditional m = (run $ ie (use m), run $ sg (use m))
-- | Distributional Distance
distributional :: Matrix Double -> Matrix Double
distributional m = run $ filter $ ri (use m)
distributional :: Matrix Int -> Matrix Double
distributional m = run $ filter $ ri (map fromIntegral $ use m)
where
n = rank' m
......@@ -131,3 +138,7 @@ distributional m = run $ filter $ ri (use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
int2double :: Matrix Int -> Matrix Double
int2double m = run (map fromIntegral $ use m)
{-|
Module : Gargantext.Graph.Distances.Utils
Description : Tools to compute distances from Cooccurrences
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Basically @compute@ takes an accelerate function as first input, a Map
of coccurrences as second input and outputs a Map automatically using
indexes.
TODO:
--cooc2fgl :: Ord t, Integral n => Map (t, t) n -> Graph
--fgl2json
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Viz.Graph.Index
where
import qualified Data.Array.Accelerate as A
import qualified Data.Array.Accelerate.IO.Data.Vector.Unboxed as AU
import qualified Data.Vector.Unboxed as DVU
import Data.List (concat)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Map (Map)
import qualified Data.Map.Strict as M
import Gargantext.Prelude
type Index = Int
-------------------------------------------------------------------------------
{-
map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map'' f m = back . f' . from m
where
from (fs, m') = unzip $ M.toAscList m
f' = f $ A.fromList shape m'
shape = (A.Z A.:. n A.:. n)
back = M.fromAscList . zip fs . A.toList
-}
-------------------------------------------------------------------------------
map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
-> Map (t, t) Int
-> Map (t, t) Double
map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m
where
(toI, fromI) = createIndexes m
map'' m = cooc2mat toI m
where
(toI, fromI) = createIndexes m
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> A.Matrix Int
cooc2mat ti m = A.fromFunction shape (\(A.Z A.:. x A.:. y) -> lookup' x y)
where
shape = (A.Z A.:. n A.:. n)
n = M.size ti
lookup' x y = maybe 0 identity (M.lookup (x,y) (toIndex ti m))
mat2cooc :: A.Matrix Double -> Map (Index, Index) Double
mat2cooc m = M.fromList $ concat -- [((Int,Int), Double)]
$ map (\(x,xs) -> map (\(y,ys) -> ((x,y),ys)) xs) -- [[((Int,Int), Double)]]
$ zip ([1..] :: [Int]) -- [(Int, [(Int, Double)]]
$ map (zip ([1..] :: [Int])) -- [[(Int, Double)]]
$ splitEvery n (A.toList m) -- [[Double]]
where
A.Z A.:. _ A.:. n = A.arrayShape m
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Ord t => Map t Index -> Map (t,t) a -> Map (Index,Index) a
toIndex ni ns = indexConversion ni ns
fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
fromIndex ni ns = indexConversion ni ns
---------------------------------------------------------------------------------
indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
createIndexes = set2indexes . cooc2set
where
cooc2set :: Ord t => Map (t, t) a -> Set t
cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
where
insert as s = foldl' (\s' t -> S.insert t s') s as
set2indexes :: Ord t => Set t -> (Map t Index, Map Index t)
set2indexes s = (M.fromList toIndex', M.fromList fromIndex')
where
fromIndex' = zip [1..] (S.toList s)
toIndex' = zip (S.toList s) [1..]
......@@ -7,6 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
These functions are used for Vector.Matrix only.
-}
{-# LANGUAGE BangPatterns #-}
......@@ -34,65 +36,16 @@ import Gargantext.Prelude
------------------------------------------------------------------------
-- | Some utils to build the matrix from cooccurrence results
type Distance = Double
type Cooc = Int
type NgramId = Int
type Index = Int
-- Type Families
--type Matrix' Index a
--type Matrix' NgramId a
data Matrice a = Matrice { matrice_fromIndex :: !(Map Index NgramId)
, matrice_toIndex :: !(Map NgramId Index)
, matrice :: !(Matrix a)
} deriving (Show)
--fromMatrice :: Matrice Double -> [(NgramId, NgramId, Double)]
--fromMatrice m = undefined
toMatrice :: [(NgramId, NgramId, Int)] -> Matrice Double
toMatrice ns = Matrice fromIndx toIndx m
where
s = cooc2set ns
(fromIndx, toIndx) = set2indexes s
n = (length (S.toList s))
idx = toIndex toIndx ns
m = matrix n n (\x -> maybe 0 identity (fromIntegral <$> M.lookup x idx))
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Map NgramId Index -> [(NgramId, NgramId, a)] -> Map (Index,Index) a
toIndex ni ns = to ni ns
fromIndex :: Map Index NgramId -> [(Index, Index, a)] -> Map (NgramId,NgramId) a
fromIndex ni ns = to ni ns
-------------------------------------------------------------------------------
to :: (Ord b, Ord k) => Map k b -> [(k, k, a)] -> Map (b, b) a
to index ns = M.fromList $ map (\(a1,a2,c) -> ( ( (M.!) index a1
, (M.!) index a2
)
, c
)
) ns
-------------------------------------------------------------------------------
cooc2set :: [(NgramId, NgramId, a)] -> Set NgramId
cooc2set cs' = foldl' (\s (a1,a2,_) -> insert [a1,a2] s ) S.empty cs'
where
insert as s = foldl' (\s' a -> S.insert a s') s as
set2indexes :: Set NgramId -> (Map Index NgramId, Map NgramId Index)
set2indexes s = (M.fromList fromIndex', M.fromList toIndex')
where
s' = S.toList s
fromIndex' = zip [1..] s'
toIndex' = zip s' [1..]
-- | For tests only, to be removed
-- m1 :: Matrix Double
-- m1 = fromList 300 300 [1..]
------------------------------------------------------------------------
------------------------------------------------------------------------
data Axis = Col | Row
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
-- Data.Vector.Additions
dropAt :: Int -> Vector a -> Vector a
dropAt n v = debut <> (V.tail fin)
......@@ -100,28 +53,6 @@ dropAt n v = debut <> (V.tail fin)
debut = V.take n v
fin = V.drop n v
------------------------------------------------------------------------
data Axis = Col | Row
---- | Matrix Algebra
--data Algebra a = Point a | Vector a | Matrix a
--
--multiply :: Algebra a -> Matrix a -> Matrix a
--multiply (Point a) = undefined
--multiply (Vector a) = undefined
--multiply (Matrix a) = undefined
--
--div :: Fractional a => Matrix a -> Matrix a
--div m = foldl' (\m c -> divCol c m) m [1.. (ncols m)]
-- where
-- divCol c m = mapCol (\_ x -> 1/x) c m
--
--divide :: Fractional a => Matrix a -> Matrix a -> Matrix a
--divide a b = a `multStd` (div b)
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
total :: Num a => Matrix a -> a
total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
......@@ -141,7 +72,3 @@ toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
-- | For tests only, to be removed
m1 :: Matrix Double
m1 = fromList 300 300 [1..]
......@@ -13,9 +13,8 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
#- git: https://github.com/delanoe/accelerate.git
#commit: 007fd483a4410441fb5dd1b689a5f7dab66d27ad
- accelerate-1.2.0.0
- accelerate-io-1.2.0.0
- aeson-1.2.4.0
- aeson-lens-0.5.0.0
- duckling-0.1.3.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment