Commit 4499edc1 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Order 2 regression and split of clustering

parent 3025b9f6
...@@ -5,7 +5,7 @@ cabal-version: 1.12 ...@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
name: gargantext name: gargantext
version: 0.0.6.9.4.6 version: 0.0.6.9.4.6
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
......
...@@ -159,37 +159,24 @@ reIndexWith cId lId nt lts = do ...@@ -159,37 +159,24 @@ reIndexWith cId lId nt lts = do
<$> HashMap.toList <$> HashMap.toList
<$> getTermsWith identity [lId] nt lts <$> getTermsWith identity [lId] nt lts
let orphans = ts {- List.concat
$ map (\t -> case HashMap.lookup t occs of
Nothing -> [t]
Just n -> if n <= 1 then [t] else [ ]
) ts
-}
printDebug "orphans" orphans
-- Get all documents of the corpus -- Get all documents of the corpus
docs <- selectDocNodes cId docs <- selectDocNodes cId
-- printDebug "docs length" (List.length docs)
-- Checking Text documents where orphans match
-- TODO Tests here
let let
-- fromListWith (<>) -- fromListWith (<>)
ngramsByDoc = map (HashMap.fromList) ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v))) $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (\doc -> List.zip $ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans) (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
$ Text.unlines $ catMaybes $ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title [ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract , doc ^. context_hyperdata . hd_abstract
] ]
) )
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]]) (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
) docs ) docs
printDebug "ngramsByDoc: " ngramsByDoc -- printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database -- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
...@@ -224,7 +211,7 @@ postAsync' l (WithJsonFile m _) logStatus = do ...@@ -224,7 +211,7 @@ postAsync' l (WithJsonFile m _) logStatus = do
, _scst_remaining = Just 2 , _scst_remaining = Just 2
, _scst_events = Just [] , _scst_events = Just []
} }
printDebug "New list as file" l -- printDebug "New list as file" l
_ <- setList l m _ <- setList l m
-- printDebug "Done" r -- printDebug "Done" r
...@@ -283,15 +270,15 @@ csvPost :: FlowCmdM env err m ...@@ -283,15 +270,15 @@ csvPost :: FlowCmdM env err m
-> Text -> Text
-> m Bool -> m Bool
csvPost l m = do csvPost l m = do
printDebug "[csvPost] l" l -- printDebug "[csvPost] l" l
-- printDebug "[csvPost] m" m -- printDebug "[csvPost] m" m
-- status label forms -- status label forms
let lst = readCsvText m let lst = readCsvText m
let p = parseCsvData lst let p = parseCsvData lst
--printDebug "[csvPost] lst" lst --printDebug "[csvPost] lst" lst
printDebug "[csvPost] p" p -- printDebug "[csvPost] p" p
_ <- setListNgrams l NgramsTerms p _ <- setListNgrams l NgramsTerms p
printDebug "ReIndexing List" l -- printDebug "ReIndexing List" l
corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList) corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node) let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
_ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm]) _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
...@@ -301,10 +288,10 @@ csvPost l m = do ...@@ -301,10 +288,10 @@ csvPost l m = do
------------------------------------------------------------------------ ------------------------------------------------------------------------
csvPostAsync :: ServerT CSVAPI (GargM Env GargError) csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
csvPostAsync lId = csvPostAsync lId =
serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile ft _ n) log' -> do serveJobsAPI UpdateNgramsListJobCSV $ \f@(WithTextFile _ft _ _n) log' -> do
let log'' x = do let log'' x = do
printDebug "[csvPostAsync] filetype" ft -- printDebug "[csvPostAsync] filetype" ft
printDebug "[csvPostAsync] name" n -- printDebug "[csvPostAsync] name" n
liftBase $ log' x liftBase $ log' x
csvPostAsync' lId f log'' csvPostAsync' lId f log''
......
...@@ -192,6 +192,9 @@ dim m = n ...@@ -192,6 +192,9 @@ dim m = n
matSumCol :: (Elt a, P.Num (Exp a)) => Dim -> Acc (Matrix a) -> Acc (Matrix a) matSumCol :: (Elt a, P.Num (Exp a)) => Dim -> Acc (Matrix a) -> Acc (Matrix a)
matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
matSumLin :: (Elt a, P.Num (Exp a)) => Dim -> Acc (Matrix a) -> Acc (Matrix a)
matSumLin r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
matSumCol' :: (Elt a, P.Num (Exp a)) => Matrix a -> Matrix a matSumCol' :: (Elt a, P.Num (Exp a)) => Matrix a -> Matrix a
matSumCol' m = run $ matSumCol n m' matSumCol' m = run $ matSumCol n m'
where where
...@@ -210,6 +213,8 @@ matSumCol' m = run $ matSumCol n m' ...@@ -210,6 +213,8 @@ matSumCol' m = run $ matSumCol n m'
matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double) matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
matProba d mat = zipWith (/) mat (matSumCol d mat) matProba d mat = zipWith (/) mat (matSumCol d mat)
-- | Diagonal of the matrix -- | Diagonal of the matrix
-- --
-- >>> run $ diag (use $ matrix 3 ([1..] :: [Int])) -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
...@@ -242,11 +247,21 @@ divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) ...@@ -242,11 +247,21 @@ divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All))
matMiniMax :: (Elt a, Ord a, P.Num a) matMiniMax :: (Elt a, Ord a, P.Num a)
=> Acc (Matrix a) => Acc (Matrix a)
-> Acc (Matrix a) -> Acc (Matrix a)
matMiniMax m = trace "matMiniMax" $ filterWith' miniMax' (constant 0) m matMiniMax m = filterWith' (>=) miniMax' (constant 0) m
where
miniMax' = the $ minimum $ maximum m
matMaxMini :: (Elt a, Ord a, P.Num a)
=> Acc (Matrix a)
-> Acc (Matrix a)
matMaxMini m = filterWith' (>) miniMax' (constant 0) m
where where
miniMax' = the $ maximum $ minimum m miniMax' = the $ maximum $ minimum m
-- | Filters the matrix with a constant -- | Filters the matrix with a constant
-- --
-- >>> run $ matFilter 5 $ use $ matrix 3 [1..] -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
...@@ -260,15 +275,12 @@ filter' t m = filterWith t 0 m ...@@ -260,15 +275,12 @@ filter' t m = filterWith t 0 m
filterWith :: Double -> Double -> Acc (Matrix Double) -> Acc (Matrix Double) filterWith :: Double -> Double -> Acc (Matrix Double) -> Acc (Matrix Double)
filterWith t v m = map (\x -> ifThenElse (x > (constant t)) x (constant v)) (transpose m) filterWith t v m = map (\x -> ifThenElse (x > (constant t)) x (constant v)) (transpose m)
filterWith' :: (Elt a, Ord a) => Exp a -> Exp a -> Acc (Matrix a) -> Acc (Matrix a) filterWith' :: (Elt a, Ord a) => (Exp a -> Exp a -> Exp Bool) -> Exp a -> Exp a -> Acc (Matrix a) -> Acc (Matrix a)
filterWith' t v m = map (\x -> ifThenElse (x > t) x v) m filterWith' f t v m = map (\x -> ifThenElse (f x t) x v) m
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use Lenses -- | TODO use Lenses
data Direction = MatCol (Exp Int) | MatRow (Exp Int) | Diag data Direction = MatCol (Exp Int) | MatRow (Exp Int) | Diag
......
...@@ -17,13 +17,18 @@ module Gargantext.Core.Methods.Similarities ...@@ -17,13 +17,18 @@ module Gargantext.Core.Methods.Similarities
import Data.Aeson import Data.Aeson
import Data.Array.Accelerate (Matrix) import Data.Array.Accelerate (Matrix)
import Data.Swagger import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.Core.Methods.Similarities.Accelerate.Conditional (measureConditional) import Gargantext.Core.Methods.Similarities.Accelerate.Conditional (measureConditional)
import Gargantext.Core.Methods.Similarities.Accelerate.Distributional (logDistributional) import Gargantext.Core.Methods.Similarities.Accelerate.Distributional (logDistributional)
import Gargantext.Prelude (Ord, Eq, Int, Double, Show) -- import Gargantext.Core.Text.Metrics.Count (coocOn)
-- import Gargantext.Core.Viz.Graph.Index
import Gargantext.Prelude (Ord, Eq, Int, Double, Show, map)
import Prelude (Enum, Bounded, minBound, maxBound) import Prelude (Enum, Bounded, minBound, maxBound)
import Test.QuickCheck (elements) import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary import Test.QuickCheck.Arbitrary
-- import qualified Data.Map as Map
import qualified Data.Text as Text
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Similarity = Conditional | Distributional data Similarity = Conditional | Distributional
...@@ -31,9 +36,7 @@ data Similarity = Conditional | Distributional ...@@ -31,9 +36,7 @@ data Similarity = Conditional | Distributional
measure :: Similarity -> Matrix Int -> Matrix Double measure :: Similarity -> Matrix Int -> Matrix Double
measure Conditional x = measureConditional x measure Conditional x = measureConditional x
measure Distributional x = y measure Distributional x = logDistributional x
where
y = logDistributional x
------------------------------------------------------------------------ ------------------------------------------------------------------------
withMetric :: GraphMetric -> Similarity withMetric :: GraphMetric -> Similarity
...@@ -52,3 +55,32 @@ instance Arbitrary GraphMetric where ...@@ -52,3 +55,32 @@ instance Arbitrary GraphMetric where
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Let's take 2 different forms to produce another one:
hello_words :: [[Text]]
hello_words = map (Text.splitOn "-") wrds
where
wrds = [ "bio-logie"
, "socio-logie"
-- , "ana-logie"
-- , "micro-scope"
-- , "micro-phone"
-- , "micro-cosme"
-- -- , "micro-biote"
-- , "tele-scope"
-- , "tele-phone"
-- , "tele-surveillance"
-- , "macro-scope"
-- , "macro-cosme"
-- , "macro-biote"
]
{-
hello_matrix ms' = measureConditional
$ map2mat Square 0 (Map.size ti)
$ toIndex ti ms
where
ms = coocOn identity ms'
(ti, it) = createIndices ms
-}
...@@ -45,17 +45,20 @@ import qualified Gargantext.Prelude as P ...@@ -45,17 +45,20 @@ import qualified Gargantext.Prelude as P
-- --
-- Conditional metric is an absolute metric which reflects -- Conditional metric is an absolute metric which reflects
-- interactions of 2 terms in the corpus. -- interactions of 2 terms in the corpus.
-- Filtered with MiniMax.
measureConditional :: Matrix Int -> Matrix Double measureConditional :: Matrix Int -> Matrix Double
measureConditional m = run $ zipWith (/) m' (matSumCol d m') measureConditional m = run $ x $ map fromIntegral $ use m
where where
m' = map fromIntegral (use m) x :: Acc (Matrix Double) -> Acc (Matrix Double)
d = dim m x mat = matMiniMax $ matProba r mat
r :: Dim
r = dim m
-- *** Conditional distance (advanced)
-- | Conditional distance (advanced version) -- | To filter the nodes
--
-- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called -- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
-- "confidence" , is the maximum probability between @i@ and @j@ to see -- "confidence" , is the maximum probability between @i@ and @j@ to see
-- @i@ in the same context of @j@ knowing @j@. -- @i@ in the same context of @j@ knowing @j@.
...@@ -69,19 +72,25 @@ conditional' m = ( run $ ie $ map fromIntegral $ use m ...@@ -69,19 +72,25 @@ conditional' m = ( run $ ie $ map fromIntegral $ use m
, run $ sg $ map fromIntegral $ use m , run $ sg $ map fromIntegral $ use m
) )
where where
x :: Acc (Matrix Double) -> Acc (Matrix Double)
x mat = (matProba r mat)
xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = let mat' = x mat in zipWith (-) (matSumLin r mat') mat'
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = let mat' = x mat in zipWith (-) (matSumCol r mat') mat'
ie :: Acc (Matrix Double) -> Acc (Matrix Double) ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat) ie mat = map (\x' -> x' / (2*(n-1))) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double) sg :: Acc (Matrix Double) -> Acc (Matrix Double)
sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat) sg mat = map (\x' -> x' / (2*(n-1))) $ zipWith (-) (xs mat) (ys mat)
r :: Dim
r = dim m
n :: Exp Double n :: Exp Double
n = P.fromIntegral r n = P.fromIntegral r
r :: Dim
r = dim m
xs :: Acc (Matrix Double) -> Acc (Matrix Double)
xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
ys :: Acc (Matrix Double) -> Acc (Matrix Double)
ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
...@@ -31,8 +31,54 @@ where $n_{ij}$ is the cooccurrence between term $i$ and term $j$ ...@@ -31,8 +31,54 @@ where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
* MI=(M ./ O * D(M)) .* (M / D(M) * O ) * MI=(M ./ O * D(M)) .* (M / D(M) * O )
* distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$ * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
[Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020] [Finally, we have used as convention the Distributional metric used in Legacy GarganText](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/50)
mi = defaultdict(lambda : defaultdict(int))
total_cooc = x.sum().sum()
for i in matrix.keys():
si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
for j in matrix[i].keys():
sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
if i!=j :
mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
r = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
sumMin = sum(
[
min(mi[i][k], mi[j][k])
for k in matrix.keys()
if i != j and k != i and k != j and mi[i][k] > 0
]
)
sumMi = sum(
[
mi[i][k]
for k in matrix.keys()
if k != i and k != j and mi[i][k] > 0
]
)
try:
r[i][j] = sumMin / sumMi
except Exception as error:
r[i][j] = 0
# Need to filter the weak links, automatic threshold here
minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': r[i][j]})
for i in r.keys() for j in r.keys()
if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
]
)
-} -}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
...@@ -122,10 +168,10 @@ distributional m' = result ...@@ -122,10 +168,10 @@ distributional m' = result
result = termDivNan z_1 z_2 result = termDivNan z_1 z_2
logDistributional :: Matrix Int -> Matrix Double logDistributional2 :: Matrix Int -> Matrix Double
logDistributional m = trace ("logDistributional, dim=" `mappend` show n) . run logDistributional2 m = trace ("logDistributional, dim=" `mappend` show n) . run
$ diagNull n $ diagNull n
$ matMiniMax $ matMaxMini
$ logDistributional' n m $ logDistributional' n m
where where
n = dim m n = dim m
...@@ -216,8 +262,61 @@ logDistributional' n m' = trace ("logDistributional'") result ...@@ -216,8 +262,61 @@ logDistributional' n m' = trace ("logDistributional'") result
-- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\] -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
-- --
logDistributional :: Matrix Int -> Matrix Double
logDistributional m' = run $ diagNull n result
where
m = map fromIntegral $ use m'
n = dim m'
-- Scalar. Sum of all elements of m.
to = the $ sum (flatten m)
-- Diagonal matrix with the diagonal of m.
d_m = (.*) m (matrixIdentity n)
-- Size n vector. s = [s_i]_i
s = sum ((.-) m d_m)
-- Matrix nxn. Vector s replicated as rows.
s_1 = replicate (constant (Z :. All :. n)) s
-- Matrix nxn. Vector s replicated as columns.
s_2 = replicate (constant (Z :. n :. All)) s
-- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
ss = (.*) s_1 s_2
-- Matrix nxn. mi = [m_{i,j}]_{i,j} where
-- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
-- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
mi = (.*) (matrixEye n)
(map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
-- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
w_1 = replicate (constant (Z :. All :. n :. All)) mi
-- Tensor nxnxn. Matrix mi replicated along the 1st axis.
w_2 = replicate (constant (Z :. n :. All :. All)) mi
-- Tensor nxnxn.
w' = zipWith min w_1 w_2
-- A predicate that is true when the input (i, j, k) satisfy
-- k /= i AND k /= j
k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
-- Matrix nxn.
sumMin = sum (condOrDefault k_diff_i_and_j 0 w')
-- Matrix nxn. All columns are the same.
sumM = sum (condOrDefault k_diff_i_and_j 0 w_1)
result = termDivNan sumMin sumM
distributional'' :: Matrix Int -> Matrix Double distributional'' :: Matrix Int -> Matrix Double
distributional'' m = -- run {- $ matMiniMax -} distributional'' m = -- run {- $ matMaxMini -}
run $ diagNull n run $ diagNull n
$ rIJ n $ rIJ n
$ filterWith 0 100 $ filterWith 0 100
...@@ -255,7 +354,7 @@ distributional'' m = -- run {- $ matMiniMax -} ...@@ -255,7 +354,7 @@ distributional'' m = -- run {- $ matMiniMax -}
rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a) rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
=> Dim -> Acc (Matrix a) -> Acc (Matrix a) => Dim -> Acc (Matrix a) -> Acc (Matrix a)
rIJ n m = matMiniMax $ divide a b rIJ n m = matMaxMini $ divide a b
where where
a = sumRowMin n m a = sumRowMin n m
b = sumColMin n m b = sumColMin n m
......
...@@ -37,8 +37,8 @@ conditional :: (Ord a, Hashable a, NFData a) ...@@ -37,8 +37,8 @@ conditional :: (Ord a, Hashable a, NFData a)
conditional m' = Map.fromList $ ((catMaybes results') `using` parList rdeepseq) conditional m' = Map.fromList $ ((catMaybes results') `using` parList rdeepseq)
where where
results' = [ let results' = [ let
ij = (/) <$> Map.lookup (i,j) m <*> Map.lookup (i,i) m ij = (/) <$> Map.lookup (i,j) m <*> Map.lookup (j,j) m
ji = (/) <$> Map.lookup (j,i) m <*> Map.lookup (j,j) m ji = (/) <$> Map.lookup (j,i) m <*> Map.lookup (i,i) m
in getMax (i,j) ij ji in getMax (i,j) ij ji
| i <- keys | i <- keys
...@@ -53,6 +53,3 @@ conditional m' = Map.fromList $ ((catMaybes results') `using` parList rdeepseq) ...@@ -53,6 +53,3 @@ conditional m' = Map.fromList $ ((catMaybes results') `using` parList rdeepseq)
(x,y) = unzip $ Map.keys m (x,y) = unzip $ Map.keys m
...@@ -23,19 +23,20 @@ TODO use Map LouvainNodeId (Map LouvainNodeId) ...@@ -23,19 +23,20 @@ TODO use Map LouvainNodeId (Map LouvainNodeId)
module Gargantext.Core.Viz.Graph.Bridgeness -- (bridgeness) module Gargantext.Core.Viz.Graph.Bridgeness -- (bridgeness)
where where
import Gargantext.Core.Methods.Similarities (Similarity(..))
-- import Data.IntMap (IntMap)
import Data.Map.Strict (Map, fromListWith, lookup, toList, mapWithKey, elems) import Data.Map.Strict (Map, fromListWith, lookup, toList, mapWithKey, elems)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import Data.Ord (Down(..)) import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Tuple.Extra (swap)
import Debug.Trace (trace) import Debug.Trace (trace)
import Gargantext.Core.Methods.Similarities (Similarity(..))
import Gargantext.Prelude import Gargantext.Prelude
import Graph.Types (ClusterNode(..)) import Graph.Types (ClusterNode(..))
-- import qualified Data.IntMap as IntMap import qualified Data.List as List
import qualified Data.List as List import qualified Data.Map.Strict as Map
import qualified Data.Map.Strict as Map import qualified Data.Set as Set
-- import qualified Data.Set as Set import qualified Data.Tuple.Extra as Tuple
import qualified Data.IntMap as Dico
---------------------------------------------------------------------- ----------------------------------------------------------------------
type Partitions = Map (Int, Int) Double -> IO [ClusterNode] type Partitions = Map (Int, Int) Double -> IO [ClusterNode]
...@@ -46,6 +47,43 @@ nodeId2comId (ClusterNode i1 i2) = (i1, i2) ...@@ -46,6 +47,43 @@ nodeId2comId (ClusterNode i1 i2) = (i1, i2)
type NodeId = Int type NodeId = Int
type CommunityId = Int type CommunityId = Int
----------------------------------------------------------------------
----------------------------------------------------------------------
-- recursiveClustering : get get more granularity of a given clustering
-- tested with spinglass clustering only (WIP)
recursiveClustering :: Partitions -> Map (Int, Int) Double -> IO [ClusterNode]
recursiveClustering f mp = do
let
n :: Double
n = fromIntegral $ Set.size
$ Set.unions $ List.concat
$ map (\(k1,k2) -> map Set.singleton [k1, k2])
$ Map.keys mp
t :: Int
t = round $ (n / 2) * (sqrt n) / 100
(toSplit,others) <- List.span (\a -> Set.size a > t) <$> clusterNodes2sets <$> f mp
cls' <- mapM f $ map (\s -> removeNodes s mp) toSplit
pure $ setNodes2clusterNodes $ others <> (List.concat $ map clusterNodes2sets cls')
setNodes2clusterNodes :: [Set NodeId] -> [ClusterNode]
setNodes2clusterNodes ns = List.concat $ map (\(n,ns') -> toCluster n ns') $ zip [1..] ns
where
toCluster :: CommunityId -> Set NodeId -> [ClusterNode]
toCluster cId setNodeId = map (\n -> ClusterNode n cId) (Set.toList setNodeId)
removeNodes :: Set NodeId
-> Map (NodeId, NodeId) Double
-> Map (NodeId, NodeId) Double
removeNodes s = Map.filterWithKey (\(n1,n2) _v -> Set.member n1 s && Set.member n2 s)
clusterNodes2sets :: [ClusterNode] -> [Set NodeId]
clusterNodes2sets = Dico.elems
. Dico.fromListWith (<>)
. (map ((Tuple.second Set.singleton) . swap . nodeId2comId))
---------------------------------------------------------------------- ----------------------------------------------------------------------
---------------------------------------------------------------------- ----------------------------------------------------------------------
data Bridgeness = Bridgeness_Basic { bridgeness_partitions :: [ClusterNode] data Bridgeness = Bridgeness_Basic { bridgeness_partitions :: [ClusterNode]
...@@ -116,7 +154,7 @@ filterComs _b m = Map.filter (\n -> length n > 0) $ mapWithKey filter' m ...@@ -116,7 +154,7 @@ filterComs _b m = Map.filter (\n -> length n > 0) $ mapWithKey filter' m
filter' (c1,c2) a filter' (c1,c2) a
| c1 == c2 = a | c1 == c2 = a
-- TODO use n here -- TODO use n here
| otherwise = take n $ List.sortOn (Down . snd) a | otherwise = take (2*n) $ List.sortOn (Down . snd) a
where where
n :: Int n :: Int
n = round $ 100 * a' / t n = round $ 100 * a' / t
......
...@@ -23,9 +23,9 @@ import GHC.Float (sin, cos) ...@@ -23,9 +23,9 @@ import GHC.Float (sin, cos)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.API.Ngrams.Types (NgramsTerm(..)) import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core.Methods.Similarities (Similarity(..), measure) import Gargantext.Core.Methods.Similarities (Similarity(..), measure)
import Gargantext.Core.Methods.Similarities.Conditional (conditional) -- import Gargantext.Core.Methods.Similarities.Conditional (conditional)
import Gargantext.Core.Statistics import Gargantext.Core.Statistics
import Gargantext.Core.Viz.Graph.Bridgeness (bridgeness, Bridgeness(..), Partitions, nodeId2comId) import Gargantext.Core.Viz.Graph.Bridgeness (bridgeness, Bridgeness(..), Partitions, nodeId2comId, recursiveClustering)
import Gargantext.Core.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map, Index, MatrixShape(..)) import Gargantext.Core.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map, Index, MatrixShape(..))
import Gargantext.Core.Viz.Graph.Tools.IGraph (mkGraphUfromEdges, spinglass) import Gargantext.Core.Viz.Graph.Tools.IGraph (mkGraphUfromEdges, spinglass)
import Gargantext.Core.Viz.Graph.Tools.Infomap (infomap) import Gargantext.Core.Viz.Graph.Tools.Infomap (infomap)
...@@ -48,7 +48,6 @@ import qualified Graph.BAC.ProxemyOptim as BAC ...@@ -48,7 +48,6 @@ import qualified Graph.BAC.ProxemyOptim as BAC
import qualified IGraph as Igraph import qualified IGraph as Igraph
import qualified IGraph.Algorithms.Layout as Layout import qualified IGraph.Algorithms.Layout as Layout
data PartitionMethod = Spinglass | Confluence | Infomap data PartitionMethod = Spinglass | Confluence | Infomap
deriving (Generic, Eq, Ord, Enum, Bounded, Show) deriving (Generic, Eq, Ord, Enum, Bounded, Show)
instance FromJSON PartitionMethod instance FromJSON PartitionMethod
...@@ -124,16 +123,27 @@ cooc2graphWith' doPartitions bridgenessMethod multi similarity threshold strengt ...@@ -124,16 +123,27 @@ cooc2graphWith' doPartitions bridgenessMethod multi similarity threshold strengt
distanceMap `seq` diag `seq` ti `seq` return () distanceMap `seq` diag `seq` ti `seq` return ()
--{- -- Debug --{- -- Debug
-- saveAsFileDebug "/tmp/distanceMap" distanceMap -- To Work with Igraph
-- saveAsFileDebug "/tmp/distanceMap.keys" (List.length $ Map.keys distanceMap) saveAsFileDebug "/tmp/distanceMap" ( List.intercalate ";"
$ Set.toList
$ Set.fromList
$ map (\(k1,k2) -> if k1 < k2
then show (k1+1) <> " " <> show (k2+1)
else show (k2+1) <> " " <> show (k1+1)
)
$ Map.keys
$ Map.filter (>0.005) distanceMap
)
saveAsFileDebug "/tmp/distanceMap.data" distanceMap
saveAsFileDebug "/tmp/distanceMap.cooc" myCooc
-- printDebug "similarities" similarities -- printDebug "similarities" similarities
--} --}
partitions <- if (Map.size distanceMap > 0) partitions <- if (Map.size distanceMap > 0)
then doPartitions distanceMap then recursiveClustering doPartitions distanceMap
else panic $ Text.unlines [ "[Gargantext.C.V.Graph.Tools] Similarity Matrix is empty" else panic $ Text.unlines [ "[Gargantext.C.V.Graph.Tools] Similarity Matrix is empty"
, "Maybe you should add more Map Terms in your list" , "Maybe you should add more Map Terms in your list"
, "Tutorial: link todo" , "Tutorial: TODO"
] ]
length partitions `seq` return () length partitions `seq` return ()
...@@ -155,6 +165,33 @@ doSimilarityMap :: Similarity ...@@ -155,6 +165,33 @@ doSimilarityMap :: Similarity
, Map (Index, Index) Int , Map (Index, Index) Int
, Map NgramsTerm Index , Map NgramsTerm Index
) )
doSimilarityMap Conditional threshold strength myCooc = (distanceMap, toIndex ti myCooc', ti)
where
myCooc' = Map.fromList $ HashMap.toList myCooc
(_diag, theMatrix) = Map.partitionWithKey (\(x,y) _ -> x == y)
$ Map.fromList
$ HashMap.toList myCooc
(ti, _it) = createIndices theMatrix
tiSize = Map.size ti
similarities = (\m -> m `seq` m)
$ (\m -> m `seq` measure Conditional m)
$ (\m -> m `seq` map2mat Square 0 tiSize m)
$ theMatrix `seq` toIndex ti theMatrix
links = round (let n :: Double = fromIntegral (Map.size ti) in 10 * n * (log n)^(2::Int))
distanceMap = Map.fromList
$ List.take links
$ (if strength == Weak then List.reverse else identity)
$ List.sortOn snd
$ Map.toList
$ Map.filter (> threshold)
-- $ conditional myCooc
$ similarities `seq` mat2map similarities
doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ti diag, ti) doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ti diag, ti)
where where
-- TODO remove below -- TODO remove below
...@@ -181,20 +218,6 @@ doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ...@@ -181,20 +218,6 @@ doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex
$ (\m -> m `seq` Map.filter (> threshold) m) $ (\m -> m `seq` Map.filter (> threshold) m)
$ similarities `seq` mat2map similarities $ similarities `seq` mat2map similarities
doSimilarityMap Conditional threshold strength myCooc = (distanceMap, toIndex ti myCooc', ti)
where
myCooc' = Map.fromList $ HashMap.toList myCooc
(ti, _it) = createIndices myCooc'
links = round (let n :: Double = fromIntegral (Map.size ti) in n * (log n)^(2::Int))
distanceMap = toIndex ti
$ Map.fromList
$ List.take links
$ (if strength == Weak then List.reverse else identity)
$ List.sortOn snd
$ HashMap.toList
$ HashMap.filter (> threshold)
$ conditional myCooc
---------------------------------------------------------- ----------------------------------------------------------
-- | From data to Graph -- | From data to Graph
type Occurrences = Int type Occurrences = Int
......
...@@ -21,6 +21,7 @@ import Gargantext.Core.Viz.Graph.Index ...@@ -21,6 +21,7 @@ import Gargantext.Core.Viz.Graph.Index
import Graph.Types (ClusterNode(..)) import Graph.Types (ClusterNode(..))
import IGraph hiding (mkGraph, neighbors, edges, nodes, Node, Graph) import IGraph hiding (mkGraph, neighbors, edges, nodes, Node, Graph)
import Protolude import Protolude
import Gargantext.Prelude (saveAsFileDebug)
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Map.Strict as Map import qualified Data.Map.Strict as Map
import qualified IGraph as IG import qualified IGraph as IG
...@@ -79,7 +80,11 @@ partitions_spinglass' :: (Serialize v, Serialize e) ...@@ -79,7 +80,11 @@ partitions_spinglass' :: (Serialize v, Serialize e)
=> Seed -> IG.Graph 'U v e -> IO [[Int]] => Seed -> IG.Graph 'U v e -> IO [[Int]]
partitions_spinglass' s g = do partitions_spinglass' s g = do
gen <- IG.withSeed s pure gen <- IG.withSeed s pure
IG.findCommunity g Nothing Nothing IG.spinglass gen res <- IG.findCommunity g Nothing Nothing IG.spinglass gen
-- res <- IG.findCommunity g Nothing Nothing IG.leiden gen
-- res <- IG.findCommunity g Nothing Nothing IG.infomap gen
saveAsFileDebug "/tmp/res" res
pure res
toClusterNode :: [[Int]] -> [ClusterNode] toClusterNode :: [[Int]] -> [ClusterNode]
......
...@@ -424,14 +424,15 @@ saveDocNgramsWith lId mapNgramsDocs' = do ...@@ -424,14 +424,15 @@ saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "saveDocNgramsWith" mapCgramsId --printDebug "saveDocNgramsWith" mapCgramsId
-- insertDocNgrams -- insertDocNgrams
_return <- insertContextNodeNgrams2 let ngrams2insert = catMaybes [ ContextNodeNgrams2 <$> Just nId
$ catMaybes [ ContextNodeNgrams2 <$> Just nId
<*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')) <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
<*> Just (fromIntegral w :: Double) <*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
] ]
printDebug "Ngrams2Insert" ngrams2insert
_return <- insertContextNodeNgrams2 ngrams2insert
-- to be removed -- to be removed
_ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
......
...@@ -7,8 +7,9 @@ extra-package-dbs: [] ...@@ -7,8 +7,9 @@ extra-package-dbs: []
skip-ghc-check: true skip-ghc-check: true
packages: packages:
- . - .
#- 'deps/gargantext-graph' # - 'deps/haskell-igraph'
#- 'deps/haskell-opaleye' # - 'deps/crawlers/arxiv-api'
# - 'deps/haskell-opaleye'
docker: docker:
enable: false enable: false
...@@ -78,6 +79,8 @@ extra-deps: ...@@ -78,6 +79,8 @@ extra-deps:
commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b
- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git - git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
commit: 2d7e5753cbbce248b860b571a0e9885415c846f7 commit: 2d7e5753cbbce248b860b571a0e9885415c846f7
#- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
#commit: a2d78abeaec9315be765b90d5e51a4a50c48e7b8
#- git: https://gitlab.iscpif.fr/cgenie/arxiv-api.git #- git: https://gitlab.iscpif.fr/cgenie/arxiv-api.git
#- arxiv-0.0.3@sha256:02de1114091d11f1f3ab401d104d125ad4301260806feb7f63b3dcefc7db88cf,1588 #- arxiv-0.0.3@sha256:02de1114091d11f1f3ab401d104d125ad4301260806feb7f63b3dcefc7db88cf,1588
...@@ -95,8 +98,10 @@ extra-deps: ...@@ -95,8 +98,10 @@ extra-deps:
# Graph libs # Graph libs
#- git: https://github.com/kaizhang/haskell-igraph.git #- git: https://github.com/kaizhang/haskell-igraph.git
- git: https://github.com/alpmestan/haskell-igraph.git #- git: https://github.com/alpmestan/haskell-igraph.git
commit: 9f55eb36639c8e0965c8bc539a57738869f33e9a # commit: 9f55eb36639c8e0965c8bc539a57738869f33e9a
- git: https://gitlab.iscpif.fr/gargantext/haskell-igraph.git
commit: 03c8885d0166255ed7bf0b624a07610ea68ec02c
- git: https://gitlab.iscpif.fr/gargantext/haskell-infomap.git - git: https://gitlab.iscpif.fr/gargantext/haskell-infomap.git
commit: 6d1d60b952b9b2b272b58fc5539700fd8890ac88 commit: 6d1d60b952b9b2b272b58fc5539700fd8890ac88
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment