Commit 03f5859a authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE] distances for Graph.

parents 7e4ad917 d7fd1875
......@@ -7,6 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
<<<<<<< HEAD
Script to start gargantext with different modes (Dev, Prod, Mock).
-}
......@@ -18,6 +19,7 @@ Script to start gargantext with different modes (Dev, Prod, Mock).
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
......@@ -27,6 +29,13 @@ import Data.Text (unpack)
import Gargantext.Prelude
import Gargantext.API (startGargantext, startGargantextMock)
--------------------------------------------------------
-- Graph Tests
--import qualified Gargantext.Graph.Utils as U
--import qualified Gargantext.Graph.Distances.Conditional as C
--import qualified Gargantext.Graph.Distances.Distributional as D
--import qualified Gargantext.Graph.Distances.Matrice as M
--------------------------------------------------------
data Mode = Dev | Mock | Prod
......@@ -45,6 +54,8 @@ data MyOptions w = MyOptions { run :: w ::: Mode <?> "Possible modes: De
instance ParseRecord (MyOptions Wrapped)
deriving instance Show (MyOptions Unwrapped)
main :: IO ()
main = do
MyOptions myMode myPort myIniFile <- unwrapRecord
......@@ -66,3 +77,8 @@ main = do
putStrLn $ "Starting Gargantext with mode: " <> show myMode
start
-- main' :: IO ()
--main' = putStrLn $ show $ M.conditional $ M.myMat 10
......@@ -18,7 +18,8 @@ library:
ghc-options:
- -Wincomplete-uni-patterns
- -Wincomplete-record-updates
- -Werror
- -Wmissing-signatures
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.Prelude
......@@ -30,6 +31,7 @@ library:
- Gargantext.API
dependencies:
- QuickCheck
- accelerate
- aeson
- aeson-lens
- aeson-pretty
......@@ -64,6 +66,7 @@ library:
- jose-jwt
- lens
- logging-effect
- matrix
- monad-logger
- mtl
- natural-transformation
......@@ -117,7 +120,12 @@ library:
executable:
main: Main.hs
source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
......@@ -155,6 +163,7 @@ tests:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -Wmissing-signatures
dependencies:
- doctest
- Glob
......
......@@ -34,7 +34,7 @@ import Gargantext.Prelude
------------------------------------------------------------------------
-- All the Database is structred like a hierarchical Tree
data Tree a = NodeT a [Tree a]
deriving (Show, Read, Eq)
deriving (Show, Read, Eq)
-- data Tree a = NodeT a [Tree a]
-- same as Data.Tree
......
......@@ -45,7 +45,7 @@ import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Data.Typeable (Typeable)
import qualified Data.ByteString.Internal as DBI
import Database.PostgreSQL.Simple (Connection)
import Opaleye
import Opaleye hiding (FromField)
-- | Types for Node Database Management
data PGTSVector
......
{-|
Module : Gargantext.Graph.Distances.Conditional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Distances.Conditional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.List (concat, sortOn)
import qualified Data.List as L
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Graph.Utils
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Optimisation issue
toBeOptimized :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
toBeOptimized m = proba Col m
------------------------------------------------------------------------
-- | Main Functions
-- Compute the probability from axis
-- x' = x / (sum Col x)
proba :: (Num a, Fractional a) => Axis -> Matrix a -> Matrix a
proba a m = mapOn a (\c x -> x / V.sum (axis a c m)) m
mapOn :: Axis -> (AxisId -> a -> a) -> Matrix a -> Matrix a
mapOn a f m = V.foldl' f' m (V.enumFromTo 1 (nOf a m))
where
f' m' c = mapOnly a f c m'
mapOnly :: Axis -> (AxisId -> a -> a) -> AxisId -> Matrix a -> Matrix a
mapOnly Col = mapCol
mapOnly Row = mapRow
mapAll :: (a -> a) -> Matrix a -> Matrix a
mapAll f m = mapOn Col (\_ -> f) m
---------------------------------------------------------------
-- | Compute a distance from axis
-- xs = (sum Col x') - x'
distFromSum :: (Num a, Fractional a)
=> Axis -> Matrix a -> Matrix a
distFromSum a m = mapOn a (\c x -> V.sum (axis a c m) - x) m
---------------------------------------------------------------
---------------------------------------------------------------
-- | To compute included/excluded or specific/generic scores
opWith :: (Fractional a1, Num a1)
=> (Matrix a2 -> t -> Matrix a1) -> Matrix a2 -> t -> Matrix a1
opWith op xs ys = mapAll (\x -> x / (2*n -1)) (xs `op` ys)
where
n = fromIntegral $ nOf Col xs
---------------------------------------------------------------
-------------------------------------------------------
conditional :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
conditional m = filter (threshold m') m'
where
------------------------------------------------------------------------
-- | Main Operations
-- x' = x / (sum Col x)
x' = proba Col m
------------------------------------------------------------------------
-- xs = (sum Col x') - x'
xs = distFromSum Col x'
-- ys = (sum Row x') - x'
ys = distFromSum Row x'
------------------------------------------------------------------------
-- | Top included or excluded
ie = opWith (+) xs ys
-- ie = ( xs + ys) / (2 * (x.shape[0] - 1))
-- | Top specific or generic
sg = opWith (-) xs ys
-- sg = ( xs - ys) / (2 * (x.shape[0] - 1))
nodes_kept :: [Int]
nodes_kept = take k' $ S.toList
$ foldl' (\s (n1,n2) -> insert [n1,n2] s) S.empty
$ map fst
$ nodes_included k <> nodes_specific k
nodes_included n = take n $ sortOn snd $ toListsWithIndex ie
nodes_specific m = take m $ sortOn snd $ toListsWithIndex sg
insert as s = foldl' (\s' a -> S.insert a s') s as
k' = 2*k
k = 10
dico_nodes :: Map Int Int
dico_nodes = M.fromList $ zip [1..] nodes_kept
dico_nodes_rev = M.fromList $ zip nodes_kept [1..]
m' = matrix (length nodes_kept)
(length nodes_kept)
(\(i,j) -> getElem ((M.!) dico_nodes i) ((M.!) dico_nodes j) x')
threshold m = V.minimum $ V.map (\cId -> V.maximum $ getCol cId m) (V.enumFromTo 1 (nOf Col m))
filter t m = mapAll (\x -> filter' t x) m
where
filter' t x = case (x >= t) of
True -> x
False -> 0
------------------------------------------------------------------------
{-|
Module : Gargantext.Graph.Distances.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Distributional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Distances.Distributional
where
import Data.Matrix hiding (identity)
import Data.String.Conversions (ConvertibleStrings(..))
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Graph.Utils
distributional :: (Floating a, Ord a) => Matrix a -> [((Int, Int), a)]
distributional m = filter (\((x,y), d) -> foldl' (&&) True (conditions x y d) ) distriList
where
conditions x y d = [ (x /= y)
, (d > miniMax')
, ((M.lookup (x,y) distriMap) > (M.lookup (y,x) distriMap))
]
distriList = toListsWithIndex distriMatrix
distriMatrix = ri (mi m)
distriMap = M.fromList $ distriList
miniMax' = miniMax distriMatrix
ri :: (Ord a, Fractional a) => Matrix a -> Matrix a
ri m = matrix c r doRi
where
doRi (x,y) = doRi' x y m
doRi' x y mi'' = sumMin x y mi'' / (V.sum $ ax Col x y mi'')
sumMin x y mi' = V.sum $ V.map (\(a,b) -> min a b )
$ V.zip (ax Col x y mi') (ax Row x y mi')
(c,r) = (nOf Col m, nOf Row m)
mi :: (Ord a, Floating a) => Matrix a -> Matrix a
mi m = matrix c r createMat
where
(c,r) = (nOf Col m, nOf Row m)
createMat (x,y) = doMi x y m
doMi x y m = if x == y then 0 else (max (log (doMi' x y m)) 0 )
doMi' x y m = (getElem x y m) / ( cross x y m / total m )
cross x y m = (V.sum $ ax Col x y m) * (V.sum $ ax Row x y m)
ax :: Axis -> Int -> Int -> Matrix a -> Vector a
ax a i j m = dropAt j' $ axis a i' m
where
i' = div i c + 1
j' = mod r j + 1
(c,r) = (nOf Col m, nOf Row m)
miniMax :: (Ord a) => Matrix a -> a
miniMax m = V.minimum $ V.map (\c -> V.maximum $ getCol c m) (V.enumFromTo 1 (nOf Col m))
{-|
Module : Gargantext.Graph.Distances.Matrix
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Conditional@ distance.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeFamilies #-}
module Gargantext.Graph.Distances.Matrice
where
import Data.Array.Accelerate.Data.Bits
import Data.Array.Accelerate.Interpreter
import Data.Array.Accelerate
import Data.Array.Accelerate.Smart
import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
import Data.Maybe (Maybe(Just))
import qualified Gargantext.Prelude as P
import qualified Data.Array.Accelerate.Array.Representation as Repr
matrix :: Elt c => Int -> [c] -> Matrix c
matrix n l = fromList (Z :. n :. n) l
myMat :: Int -> Matrix Double
myMat n = matrix n [1..]
-- | Two ways to get the rank (as documentation)
rank :: (Matrix Double) -> Int
rank m = arrayRank $ arrayShape m
rank' :: (Matrix Double) -> Int
rank' m = n
where
Z :. _ :. n = arrayShape m
-----------------------------------------------------------------------
-- | Conditional Distance
type Rank = Int
proba :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
proba r mat = zipWith (/) mat (mkSum r mat)
mkSum :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All))
$ fold (+) 0 mat
type Matrix' a = Acc (Matrix a)
conditional :: Matrix Double -> (Matrix Double, Matrix Double)
conditional m = (run $ ie (use m), run $ sg (use m))
where
r :: Rank
r = rank' m
xs :: Matrix' Double -> Matrix' Double
xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
ie :: Matrix' Double -> Matrix' Double
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
n :: Exp Double
n = P.fromIntegral r
-- filter with threshold
-----------------------------------------------------------------------
-- | Distributional Distance
distributional :: Matrix Double -> Matrix Double
distributional m = run $ filter $ ri (use m)
where
n = rank m
filter m = zipWith (\a b -> max a b) m (transpose m)
--miniMax m = fold minimum $ fold maximum m
ri mat = zipWith (/) mat1 mat2
where
mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat)
mat2 = mkSum n mat
mi m' = zipWith (\a b -> max (log $ a/b) 0) m'
$ zipWith (/) (crossProduct m') (total m')
total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
{-|
Module : Gargantext.Graph.Distances.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE Strict #-}
module Gargantext.Graph.Utils
where
import Data.Matrix hiding (identity)
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import qualified Data.List as L
import Gargantext.Prelude
------------------------------------------------------------------------
-- | Some utils to build the matrix from cooccurrence results
type Distance = Double
type Cooc = Int
type NgramId = Int
type Index = Int
-- Type Families
--type Matrix' Index a
--type Matrix' NgramId a
data Matrice a = Matrice { matrice_fromIndex :: !(Map Index NgramId)
, matrice_toIndex :: !(Map NgramId Index)
, matrice :: !(Matrix a)
} deriving (Show)
--fromMatrice :: Matrice Double -> [(NgramId, NgramId, Double)]
--fromMatrice m = undefined
toMatrice :: [(NgramId, NgramId, Int)] -> Matrice Double
toMatrice ns = Matrice fromIndx toIndx m
where
s = cooc2set ns
(fromIndx, toIndx) = set2indexes s
n = (length (S.toList s))
idx = toIndex toIndx ns
m = matrix n n (\x -> maybe 0 identity (fromIntegral <$> M.lookup x idx))
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
toIndex :: Map NgramId Index -> [(NgramId, NgramId, a)] -> Map (Index,Index) a
toIndex ni ns = to ni ns
fromIndex :: Map Index NgramId -> [(Index, Index, a)] -> Map (NgramId,NgramId) a
fromIndex ni ns = to ni ns
-------------------------------------------------------------------------------
to :: (Ord b, Ord k) => Map k b -> [(k, k, a)] -> Map (b, b) a
to index ns = M.fromList $ map (\(a1,a2,c) -> ( ( (M.!) index a1
, (M.!) index a2
)
, c
)
) ns
-------------------------------------------------------------------------------
cooc2set :: [(NgramId, NgramId, a)] -> Set NgramId
cooc2set cs' = foldl' (\s (a1,a2,_) -> insert [a1,a2] s ) S.empty cs'
where
insert as s = foldl' (\s' a -> S.insert a s') s as
set2indexes :: Set NgramId -> (Map Index NgramId, Map NgramId Index)
set2indexes s = (M.fromList fromIndex', M.fromList toIndex')
where
s' = S.toList s
fromIndex' = zip [1..] s'
toIndex' = zip s' [1..]
------------------------------------------------------------------------
-- Data.Vector.Additions
dropAt :: Int -> Vector a -> Vector a
dropAt n v = debut <> (V.tail fin)
where
debut = V.take n v
fin = V.drop n v
------------------------------------------------------------------------
data Axis = Col | Row
---- | Matrix Algebra
--data Algebra a = Point a | Vector a | Matrix a
--
--multiply :: Algebra a -> Matrix a -> Matrix a
--multiply (Point a) = undefined
--multiply (Vector a) = undefined
--multiply (Matrix a) = undefined
--
--div :: Fractional a => Matrix a -> Matrix a
--div m = foldl' (\m c -> divCol c m) m [1.. (ncols m)]
-- where
-- divCol c m = mapCol (\_ x -> 1/x) c m
--
--divide :: Fractional a => Matrix a -> Matrix a -> Matrix a
--divide a b = a `multStd` (div b)
------------------------------------------------------------------------
-- | Matrix functions
type AxisId = Int
total :: Num a => Matrix a -> a
total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
nOf :: Axis -> Matrix a -> Int
nOf Row = nrows
nOf Col = ncols
axis :: Axis -> AxisId -> Matrix a -> Vector a
axis Col = getCol
axis Row = getRow
toListsWithIndex :: Matrix a -> [((Int, Int), a)]
toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
where
concat' :: [(Int, [(Int, a)])] -> [((Int, Int), a)]
concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
-- | For tests only, to be removed
m1 :: Matrix Double
m1 = fromList 300 300 [1..]
......@@ -40,12 +40,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, sum, fromIntegral, length, fmap, foldl, foldl'
, takeWhile, sqrt, undefined, identity
, abs, min, max, maximum, minimum, return, snd, truncate
, (+), (*), (/), (-), (.), ($), (**), (^), (<), (>), log
, (+), (*), (/), (-), (.), ($), (&), (**), (^), (<), (>), log
, Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not, any
, fst, snd, toS
, elem, die, mod, div, const, either
, curry, uncurry
, curry, uncurry, repeat
, otherwise, when
, undefined
, IO()
......
......@@ -34,7 +34,7 @@ import Data.Map.Strict (Map
, insertWith, insertWithKey, unionWith
, toList
)
import Data.Set (Set)
import qualified Data.Map.Strict as DMS
import Control.Monad ((>>),(>>=))
import Data.String (String())
......
......@@ -52,16 +52,16 @@ import Gargantext.Text.Parsers.WOS (wosParser)
---- import Gargantext.Parsers.DOC (docParser)
---- import Gargantext.Parsers.ODT (odtParser)
--import Gargantext.Prelude (pm)
--import Gargantext.Types.Main (ErrorMessage(), Corpus)
-- FIXME
--type Field = Text
type ParseError = String
--
--data Corpus = Corpus { _corpusErrors :: [ParseError]
-- , _corpusMap :: Map FilePath (Map Field Text)
-- }
type Field = Text
type Document = DM.Map Field Text
type FilesParsed = DM.Map FilePath FileParsed
data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
, _fileParsed_result :: [Document]
} deriving (Show)
-- | According to the format of Input file,
......@@ -76,38 +76,38 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- TODO: to debug maybe add the filepath in error message
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-- | withParser:
-- According the format of the text, choosing the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
--parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
--parse format path = do
-- files <- case takeExtension path of
-- ".zip" -> openZip path
-- _ -> pure <$> DB.readFile path
-- (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
-- pure (as, map toText $ concat bs)
-- where
-- -- TODO : decode with bayesian inference on encodings
-- toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
--
--
---- | withParser:
---- According the format of the text, choosing the right parser.
---- TODO withParser :: FileFormat -> Parser [Document]
--withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
--withParser WOS = wosParser
----withParser DOC = docParser
----withParser ODT = odtParser
----withParser XML = xmlParser
----withParser _ = error "[ERROR] Parser not implemented yet"
--
--runParser :: FileFormat -> DB.ByteString
-- -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
--runParser format text = pure $ parseOnly (withParser format) text
--
--openZip :: FilePath -> IO [DB.ByteString]
--openZip fp = do
-- path <- resolveFile' fp
-- entries <- withArchive path (DM.keys <$> getEntries)
-- bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
-- pure bs
clean :: Text -> Text
clean txt = DT.map clean' txt
......
......@@ -3,6 +3,7 @@ extra-package-dbs: []
packages:
- .
- servant-job
#- '/home/alexandre/local/logiciels/haskell/accelerate/accelerate'
allow-newer: true
extra-deps:
......@@ -12,6 +13,8 @@ extra-deps:
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
- git: https://github.com/delanoe/accelerate.git
commit: 007fd483a4410441fb5dd1b689a5f7dab66d27ad
- aeson-1.2.4.0
- aeson-lens-0.5.0.0
- duckling-0.1.3.0
......@@ -33,4 +36,4 @@ extra-deps:
- text-1.2.3.0
- text-show-3.6.2
- servant-flatten-0.2
resolver: lts-10.6
resolver: lts-11.10
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment