Commit d6253948 authored by Grégoire Locqueville's avatar Grégoire Locqueville

(WIP) Add utility functions for dealing with clustering types

parent d7b909c3
Pipeline #7307 failed with stages
in 20 minutes and 42 seconds
......@@ -20,6 +20,7 @@ module Gargantext.Core.Viz.Graph.Tools
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.HashSet qualified as HashSet
import Data.IntMap qualified as IM
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
......@@ -137,24 +138,6 @@ cooc2graphWith' _doPartitions _bridgenessMethod multi similarity threshold stren
pure $ data2graph multi ti diag bridgeness' confluence' (setNodes2clusterNodes partitions)
-- | A converter from the partition type returned by `spinglass'`
-- to the partition type required by `bridgeness`
partitionsToClusterNodes :: [Set Int] -> [ClusterNode]
partitionsToClusterNodes setlist =
setlist &
-- Convert sets to lists:
fmap toList &
-- Assign an integer index to each cluster:
zip [1 ..] &
-- Attach cluster IDs to individual nodes instead to whole clusters
fmap (\(id, clusterIds) -> zip (repeat id) clusterIds) &
-- Flatten list of clusters of nodes labeled by cluster indices
-- into a list of labeled nodes:
join &
-- Turn pairs into `ClusterNode`s
fmap (\(clusterId, nodeId) -> ClusterNode nodeId clusterId)
type Reverse = Bool
doSimilarityMap :: Similarity
......@@ -384,3 +367,86 @@ generateLegend (Graph { _graph_nodes = nodes }) = List.sortBy (\(LegendField {_l
f acc (Node {node_attributes = Attributes {clust_default = i}}) = case List.find (\(LegendField {_lf_id}) -> _lf_id == i) acc of
Just _ -> acc
Nothing -> acc ++ [LegendField {_lf_id = i, _lf_label = "Cluster" <> show i, _lf_color = "#FFF"}]
--------------------------------------------------------------------------------
-- Utilities for manipulating clusterings
-- | A representation for clusterings that is convenient in some cases, e.g.
-- for determining whether two clusterings are equivalent up to their labels.
-- This is a dictionary whose values are clusters and whose keys are integer
-- labels associated to each subset. The node IDs in two distinct clusters are
-- expected to be distinct (if not, we'll say the clustering is "invalid")
type ClusterMap = IM.IntMap (Set Int)
-- | Convert between clustering types.
clusterNodes2clusterMap :: Set ClusterNode
-- ^ A set of node IDs and their associated cluster ID
-> ClusterMap -- ^ The equivalent cluster map
clusterNodes2clusterMap = foldl' (flip insertClusterNode) IM.empty
where
-- | Given a collection of sets of nodes each labeled with a cluster number,
-- insert a new node in a given cluster
insertClusterNode :: ClusterNode
-- ^ The nodeID to add, and the cluster ID it should be added to
-> ClusterMap
-- ^ The `ClusterMap` to which one should add a new node
-> ClusterMap
insertClusterNode (ClusterNode nodeId clusterId) =
IM.alter (insertMaybe nodeId) clusterId
where insertMaybe :: Int -> Maybe (Set Int) -> Maybe (Set Int)
insertMaybe n Nothing = Just $ Set.singleton n
insertMaybe n (Just s) = Just $ Set.insert n s
-- | Convert between clustering types
partitions2clusterNodes :: [Set Int] -> [ClusterNode]
partitions2clusterNodes clustering =
clustering &
-- Convert sets to lists:
fmap toList &
-- Assign an integer index to each cluster:
zip [1 ..] &
-- Attach cluster IDs to individual nodes instead of to whole clusters
fmap (\(id, clusterIds) -> zip (repeat id) clusterIds) &
-- Flatten list of clusters of nodes labeled by cluster indices
-- into a list of labeled nodes:
join &
-- Turn pairs into `ClusterNode`s
fmap (\(clusterId, nodeId) -> ClusterNode nodeId clusterId)
-- | Determines whether two clusters are equivalent when on ignores the labels.
-- May return `Nothing` when either clustering is invalid, but this is not
-- guaranteed.
equivalentClusterings :: Set ClusterNode
-> Set ClusterNode
-> Maybe Bool
equivalentClusterings c1 c2 = do -- entering the `Maybe` monad
let normalized1 = normalizeClusterMap . clusterNodes2clusterMap $ c1
let normalized2 = normalizeClusterMap . clusterNodes2clusterMap $ c2
return $ normalized1 == normalized2
-- | "Get rid" of the keys of a `ClusterMap`. More specifically, this function
-- replaces the label of each cluster with the smallest node ID contained in said
-- cluster. As a result, if two clusterings `c1` and `c2` correspond to the same
-- partition but possibly differ in the way their clusters are labeled, the
-- following will hold:
-- `normalizeClusterMap c1 == normalizeClusterMap c2`
-- May return `Nothing` if the input is not a true partition, i.e. a node
-- belongs to two clusters at once. (Some invalid partitions still result in
-- a `Just`, so don't use this to test whether the partition is valid!)
normalizeClusterMap :: ClusterMap -- ^ Input clustering
-> Maybe ClusterMap
-- ^ If the input corresponds to a valid partition of a set of
-- integers, `Just` the normalized map.
normalizeClusterMap = foldM addCluster IM.empty
where
-- | Add a cluster to a `ClusterMap` labeled by smallest elements.
addCluster :: ClusterMap -> Set Int -> Maybe ClusterMap
addCluster imap cluster = let clusterMin = minimum cluster in
if IM.member clusterMin imap
-- There's already a cluster whose label/smallest element is the same as
-- the one we want to add!
then Nothing
-- All good, we can add the cluster:
else Just $ IM.insert clusterMin cluster imap
......@@ -15,11 +15,12 @@ module Test.Graph.Clustering where
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.IntMap qualified as IM
import Data.IntMap ((?!))
import Data.List qualified as List
import Data.Set qualified as Set
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core.Methods.Similarities (Similarity(..))
import Gargantext.Core.Viz.Graph.Tools (doSimilarityMap)
import Gargantext.Core.Viz.Graph.Tools (doSimilarityMap, partitionsToClusterNodes)
import Gargantext.Core.Viz.Graph.Tools.IGraph (spinglass)
import Gargantext.Core.Viz.Graph.Types
import Gargantext.Prelude
......@@ -41,45 +42,26 @@ test = do
let
result = List.length partitions > 1
shouldBe True result
describe "Cluster type conversion" $ do
it "Empty partition" $ do
let setList = []
let clusterNodeList = []
(partitionsToClusterNodes setList `equiv` clusterNodeList) `shouldBe` True
-- | Determines whether two dictionaries of sets correspond to the same partition
-- ignoring the dictionary keys. May return `Nothing` if one of the dictionaries
-- does not correspond to an actual partition (i.e. a same integer appears in two
-- distinct sets).
equiv :: [ClusterNode]
-> [ClusterNode]
-> Maybe Bool
equiv partition1 partition2 = let
-- | We first convert the clusterings to dictionaries (`IntMap`), which we can
-- then normalize and compare the normalized version.
normalized1 = normalizePartition . clusterNodesToDict $ partition1
normalized2 = normalizePartition . clusterNodesToDict $ partition2
in (==) <$> normalized1 <*> normalized2
-- | "Get rid" of the keys of a dictionary representing a partition.
-- If `partition1` and `partition2` are the same up to changing the dictionary
-- keys, then `normalizePartition partition1 == normalizePartition partition2`.
-- May return `Nothing` if the input is not a true partition, i.e. a node
-- belongs to two clusters at once. (This is not guaranteed, so don't use this to
-- test whether the partition is a true one; however, the function still correctly
-- normalizes all values that result in `Just`, even those that are not true
-- partitions)
-- This works by relabeling each cluster by its smallest element.
normalizePartition :: IM.IntMap (Set.Set Int) -- ^ A dictionary whose values are the sets of the partition, and whose keys we want to get rid of.
-> Maybe (IM.IntMap (Set.Set Int))
normalizePartition = foldM addCluster IM.empty
where
-- | Add a cluster to a dictionary of clusters labeled by their smallest elements.
addCluster :: IM.IntMap (Set.Set Int)
-> Set.Set Int
-> Maybe (IM.IntMap (Set.Set Int))
addCluster imap cluster = let clusterMin = minimum cluster in
if IM.member clusterMin imap
-- There's already a cluster whose label/smallest element is the same as the one we want to add!
then Nothing
-- All good, we can add the cluster:
else Just $ IM.insert clusterMin cluster imap
-- | Partition a set of nodes, each labeled with a cluster number, into a
-- dictionary whose keys are cluster IDs and whose values are sets of nodes
-- belonging to the corresponding cluster
splitClusterSet :: Set.Set ClusterNode -- ^ A set of node IDs and their associated cluster ID
-> IM.IntMap (Set.Set Int)
splitClusterSet = foldl' (flip insertClusterNode) IM.empty
-- | Given a collection of sets of nodes each labeled with a cluster number,
-- insert a new node in a given cluster
insertClusterNode :: ClusterNode -- ^ The nodeID to add, and the cluster ID it should be added to
-> IM.IntMap (Set.Set Int) -- ^ The collection of clusters to which one should add a new node
-> IM.IntMap (Set.Set Int)
insertClusterNode (ClusterNode nodeId clusterId) =
IM.alter addToCluster clusterId
where addToCluster :: Maybe (Set.Set Int) -> Maybe (Set.Set Int)
addToCluster Nothing = Just $ Set.singleton nodeId
addToCluster (Just s) = Just $ Set.insert nodeId s
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment