Commit 46330fdd authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLEAN] Graph: unoptmized distances using Data.Matrix (conditional and

                                                      distributional)
        Prelude: ideas on $ and & removed, kept in git history anyway.
parent e4783e24
......@@ -8,15 +8,13 @@ Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
......@@ -27,6 +25,10 @@ import Data.Text (unpack)
import Gargantext.Prelude
import Gargantext.API (startGargantext, startGargantextMock)
import qualified Gargantext.Graph.Utils as U
import qualified Gargantext.Graph.Distances.Conditional as C
import qualified Gargantext.Graph.Distances.Distributional as D
--------------------------------------------------------
data Mode = Dev | Mock | Prod
......@@ -46,8 +48,11 @@ instance ParseRecord (MyOptions Wrapped)
deriving instance Show (MyOptions Unwrapped)
main :: IO ()
main = do
main = putStrLn $ show $ C.conditional U.m1
--main = putStrLn $ show $ map show $ take 10 $ D.distributional U.m1
main' :: IO ()
main' = do
MyOptions myMode myPort myIniFile <- unwrapRecord
"Gargantext: collaborative platform for text-mining"
......
{-|
Module : Gargantext.
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
......@@ -19,7 +19,8 @@ library:
ghc-options:
- -Wincomplete-uni-patterns
- -Wincomplete-record-updates
- -Werror
- -Wmissing-signatures
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.Database
......@@ -33,6 +34,9 @@ library:
- Gargantext.Database.NodeNodeNgram
- Gargantext.Database.Utils
- Gargantext.Database.User
- Gargantext.Graph.Utils
- Gargantext.Graph.Distances.Conditional
- Gargantext.Graph.Distances.Distributional
- Gargantext.Ngrams
- Gargantext.Ngrams.Analysis
- Gargantext.Ngrams.TFICF
......@@ -86,6 +90,7 @@ library:
- jose-jwt
- lens
- logging-effect
- matrix
- monad-logger
- mtl
- natural-transformation
......@@ -138,7 +143,12 @@ library:
executable:
main: Main.hs
source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
ghc-options:
#- -threaded
- -rtsopts
# - -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- gargantext
......@@ -173,6 +183,7 @@ tests:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -Wmissing-signatures
dependencies:
- doctest
- Glob
......
{-|
Module : Gargantext.Graph.Distances.Conditional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Distances.Conditional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.List (concat, sortOn)
import qualified Data.List as L
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Graph.Utils
------------------------------------------------------------------------
-------------------------------------------------------
conditional :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
conditional m = x' -- filter (threshold m') m'
where
------------------------------------------------------------------------
-- | Main Operations
-- x' = x / (sum Col x)
x' = proba Col m
------------------------------------------------------------------------
-- xs = (sum Col x') - x'
xs = distFromSum Col x'
-- ys = (sum Row x') - x'
ys = distFromSum Row x'
------------------------------------------------------------------------
-- | Top included or excluded
ie = opWith (+) xs ys
-- ie = ( xs + ys) / (2 * (x.shape[0] - 1))
-- | Top specific or generic
sg = opWith (-) xs ys
-- sg = ( xs - ys) / (2 * (x.shape[0] - 1))
nodes_kept :: [Int]
nodes_kept = take k' $ S.toList
$ foldl' (\s (n1,n2) -> insert [n1,n2] s) S.empty
$ map fst
$ nodes_included k <> nodes_specific k
nodes_included n = take n $ sortOn snd $ toListsWithIndex ie
nodes_specific m = take m $ sortOn snd $ toListsWithIndex sg
insert as s = foldl' (\s' a -> S.insert a s') s as
k' = 2*k
k = 10
dico_nodes :: Map Int Int
dico_nodes = M.fromList $ zip [1..] nodes_kept
dico_nodes_rev = M.fromList $ zip nodes_kept [1..]
m' = matrix (length nodes_kept)
(length nodes_kept)
(\(i,j) -> getElem ((M.!) dico_nodes i) ((M.!) dico_nodes j) x')
threshold m = V.minimum $ V.map (\cId -> V.maximum $ getCol cId m) (V.enumFromTo 1 (nOf Col m))
filter t m = mapAll (\x -> filter' t x) m
where
filter' t x = case (x >= t) of
True -> x
False -> 0
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Main Functions
-- Compute the probability from axis
-- x' = x / (sum Col x)
proba :: (Num a, Fractional a) => Axis -> Matrix a -> Matrix a
proba a m = mapOn a (\c x -> x / V.sum (axis a c m)) m
---------------------------------------------------------------
-- | Compute a distance from axis
-- xs = (sum Col x') - x'
distFromSum :: (Num a, Fractional a)
=> Axis -> Matrix a -> Matrix a
distFromSum a m = mapOn a (\c x -> V.sum (axis a c m) - x) m
---------------------------------------------------------------
---------------------------------------------------------------
-- | To compute included/excluded or specific/generic scores
opWith :: (Fractional a1, Num a1)
=> (Matrix a2 -> t -> Matrix a1) -> Matrix a2 -> t -> Matrix a1
opWith op xs ys = mapAll (\x -> x / (2*n -1)) (xs `op` ys)
where
n = fromIntegral $ nOf Col xs
---------------------------------------------------------------
{-|
Module : Gargantext.Graph.Distances.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Distances.Distributional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Graph.Utils
distributional :: (Floating a, Ord a) => Matrix a -> [((Int, Int), a)]
distributional m = filter (\((x,y), d) -> foldl' (&&) True (conditions x y d) ) distriList
where
conditions x y d = [ (x /= y)
, (d > miniMax')
, ((M.lookup (x,y) distriMap) > (M.lookup (y,x) distriMap))
]
distriList = toListsWithIndex distriMatrix
distriMatrix = ri (mi m)
distriMap = M.fromList $ distriList
miniMax' = miniMax distriMatrix
ri :: (Ord a, Fractional a) => Matrix a -> Matrix a
ri m = matrix c r doRi
where
doRi (x,y) = doRi' x y m
doRi' x y mi'' = sumMin x y mi'' / (V.sum $ ax Col x y mi'')
sumMin x y mi' = V.sum $ V.map (\(a,b) -> min a b )
$ V.zip (ax Col x y mi') (ax Row x y mi')
(c,r) = (nOf Col m, nOf Row m)
mi :: (Ord a, Floating a) => Matrix a -> Matrix a
mi m = matrix c r createMat
where
(c,r) = (nOf Col m, nOf Row m)
createMat (x,y) = doMi x y m
doMi x y m = if x == y then 0 else (nonNegative $ log (doMi' x y m))
doMi' x y m = (getElem x y m) / ( cross x y m / total m )
cross x y m = (V.sum $ ax Col x y m) * (V.sum $ ax Row x y m)
ax :: Axis -> Int -> Int -> Matrix a -> Vector a
ax a i j m = dropAt j' $ axis a i' m
where
i' = div i c + 1
j' = mod r j + 1
(c,r) = (nOf Col m, nOf Row m)
nonNegative :: (Ord a, Num a) => a -> a
nonNegative x = if x > 0 then x else 0
miniMax :: (Ord a) => Matrix a -> a
miniMax m = V.minimum $ V.map (\c -> V.maximum $ getCol c m) (V.enumFromTo 1 (nOf Col m))
{-|
Module : Gargantext.Graph.Distances.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Utils
where
import Data.Matrix hiding (identity)
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import qualified Data.List as L
import Gargantext.Prelude
------------------------------------------------------------------------
type Distance = Double
type Cooc = Int
type NgramId = Int
type Index = Int
-- Type Families
--type Matrix' Index a
--type Matrix' NgramId a
data Matrice a = Matrice { matrice_fromIndex :: !(Map Index NgramId)
, matrice_toIndex :: !(Map NgramId Index)
, matrice :: !(Matrix a)
} deriving (Show)
--fromMatrice :: Matrice Double -> [(NgramId, NgramId, Double)]
--fromMatrice m = undefined
toMatrice :: [(NgramId, NgramId, Int)] -> Matrice Double
toMatrice ns = Matrice fromIndx toIndx m
where
s = cooc2set ns
(fromIndx, toIndx) = set2indexes s
n = (length (S.toList s))
idx = toIndex toIndx ns
m = matrix n n (\x -> maybe 0 identity (fromIntegral <$> M.lookup x idx))
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Map NgramId Index -> [(NgramId, NgramId, a)] -> Map (Index,Index) a
toIndex ni ns = to ni ns
fromIndex :: Map Index NgramId -> [(Index, Index, a)] -> Map (NgramId,NgramId) a
fromIndex ni ns = to ni ns
-------------------------------------------------------------------------------
to :: (Ord b, Ord k) => Map k b -> [(k, k, a)] -> Map (b, b) a
to index ns = M.fromList $ map (\(a1,a2,c) -> ( ( (M.!) index a1
, (M.!) index a2
)
, c
)
) ns
-------------------------------------------------------------------------------
cooc2set :: [(NgramId, NgramId, a)] -> Set NgramId
cooc2set cs' = foldl' (\s (a1,a2,_) -> insert [a1,a2] s ) S.empty cs'
where
insert as s = foldl' (\s' a -> S.insert a s') s as
set2indexes :: Set NgramId -> (Map Index NgramId, Map NgramId Index)
set2indexes s = (M.fromList fromIndex', M.fromList toIndex')
where
s' = S.toList s
fromIndex' = zip [1..] s'
toIndex' = zip s' [1..]
------------------------------------------------------------------------
-- Data.Vector.Additions
dropAt :: Int -> Vector a -> Vector a
dropAt n v = debut <> (V.tail fin)
where
debut = V.take n v
fin = V.drop n v
------------------------------------------------------------------------
data Axis = Col | Row
---- | Matrix Algebra
--data Algebra a = Point a | Vector a | Matrix a
--
--multiply :: Algebra a -> Matrix a -> Matrix a
--multiply (Point a) = undefined
--multiply (Vector a) = undefined
--multiply (Matrix a) = undefined
--
--div :: Fractional a => Matrix a -> Matrix a
--div m = foldl' (\m c -> divCol c m) m [1.. (ncols m)]
-- where
-- divCol c m = mapCol (\_ x -> 1/x) c m
--
--divide :: Fractional a => Matrix a -> Matrix a -> Matrix a
--divide a b = a `multStd` (div b)
total :: Num a => Matrix a -> a
total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
nOf :: Axis -> Matrix a -> Int
nOf Row = nrows
nOf Col = ncols
axis :: Axis -> AxisId -> Matrix a -> Vector a
axis Col = getCol
axis Row = getRow
--mapOn' :: Axis -> (a -> a) -> Matrix a -> Matrix a
--mapOn' a f m = foldl' (\m' aId -> mapOn a (aId f) m') m [1.. (nOf a m)]
mapOn :: Axis -> (AxisId -> a -> a) -> Matrix a -> Matrix a
mapOn a f m = V.foldl' f' m (V.enumFromTo 1 (nOf a m))
where
f' m' c = mapOnly a f c m'
mapOnly :: Axis -> (AxisId -> a -> a) -> AxisId -> Matrix a -> Matrix a
mapOnly Col = mapCol
mapOnly Row = mapRow
mapAll :: (a -> a) -> Matrix a -> Matrix a
mapAll f m = mapOn Col (\_ -> f) m
toListsWithIndex :: Matrix a -> [((Int, Int), a)]
toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
where
concat' :: [(Int, [(Int, a)])] -> [((Int, Int), a)]
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
-- | For tests only, to be removed
m1 :: Matrix Double
m1 = fromList 300 300 [1..]
......@@ -51,16 +51,16 @@ import Gargantext.Parsers.WOS (wosParser)
---- import Gargantext.Parsers.DOC (docParser)
---- import Gargantext.Parsers.ODT (odtParser)
--import Gargantext.Prelude (pm)
--import Gargantext.Types.Main (ErrorMessage(), Corpus)
-- FIXME
--type Field = Text
type ParseError = String
--
--data Corpus = Corpus { _corpusErrors :: [ParseError]
-- , _corpusMap :: Map FilePath (Map Field Text)
-- }
type Field = Text
type Document = DM.Map Field Text
type FilesParsed = DM.Map FilePath FileParsed
data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
, _fileParsed_result :: [Document]
} deriving (Show)
-- | According to the format of Input file,
......
......@@ -235,32 +235,3 @@ zipSnd f xs = zip xs (f xs)
unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- | Syntactic convention for the reader/writer coordination.
-- @Motivation@: explicit functional flux ease coordination between
-- readers and writers who are not always the same individuals. Each
-- natural languages has its own syntaxical conventions from left to
-- right or the contrary. In computer programming languages it depends
-- on context of the algorithm itself and we need some clarity since
-- both are possible, here is a proposition to get more explicitiness.
-- | (<|) is called : "Pipe rightLeft" as "from right to left". The most right
-- function sends its output to the most left function which takes it as
-- input.
(<|) :: (a -> b) -> a -> b
(<|) = ($)
-- | (|>) is called : "Pipe leftRight" as "from left to right". The most left
-- function sends its output to the most right function which takes it as
-- input. (|>) == (&) = True -- in base prelude
(|>) :: a -> (a -> c) -> c
(|>) = flip ($)
-- | Function composition orientation
(<.) :: (b -> c) -> (a -> b) -> a -> c
(<.) = (.)
-- | Function composition orientation
(.>) :: (a -> b) -> (b -> c) -> a -> c
(.>) = flip (.)
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment