(WIP) Add utility functions for dealing with clustering types

d6253948 · Grégoire Locqueville · d7b909c3 · d6253948 · d6253948
Commit d6253948 authored Feb 05, 2025 by Grégoire Locqueville
Hide whitespace changes
Inline Side-by-side

Showing with 106 additions and 58 deletions

Tools.hs src/Gargantext/Core/Viz/Graph/Tools.hs +84 -18

Clustering.hs test/Test/Graph/Clustering.hs +22 -40

No files found.
--- a/src/Gargantext/Core/Viz/Graph/Tools.hs
+++ b/src/Gargantext/Core/Viz/Graph/Tools.hs
@@ -20,6 +20,7 @@ module Gargantext.Core.Viz.Graph.Tools
 import Data.HashMap.Strict (HashMap)
 import Data.HashMap.Strict qualified as HashMap
 import Data.HashSet qualified as HashSet
+import Data.IntMap qualified as IM
 import Data.List qualified as List
 import Data.Map.Strict qualified as Map
 import Data.Set qualified as Set
@@ -137,24 +138,6 @@ cooc2graphWith' _doPartitions _bridgenessMethod multi similarity threshold stren
  pure $ data2graph multi ti diag bridgeness' confluence' (setNodes2clusterNodes partitions)


-- | A converter from the partition type returned by `spinglass'`
-- to the partition type required by `bridgeness`
-partitionsToClusterNodes :: [Set Int] -> [ClusterNode]
-partitionsToClusterNodes setlist =
-  setlist                                                &
-  -- Convert sets to lists:
-  fmap toList                                            &
-  -- Assign an integer index to each cluster:
-  zip [1 ..]                                             &
-  -- Attach cluster IDs to individual nodes instead to whole clusters
-  fmap (\(id, clusterIds) -> zip (repeat id) clusterIds) &
-  -- Flatten list of clusters of nodes labeled by cluster indices
-  -- into a list of labeled nodes:
-  join                                                   &
-  -- Turn pairs into `ClusterNode`s
-  fmap (\(clusterId, nodeId) -> ClusterNode nodeId clusterId)
-
-
 type Reverse = Bool

 doSimilarityMap :: Similarity
@@ -384,3 +367,86 @@ generateLegend (Graph { _graph_nodes = nodes }) = List.sortBy (\(LegendField {_l
    f acc (Node {node_attributes = Attributes {clust_default = i}}) = case List.find (\(LegendField {_lf_id}) -> _lf_id == i) acc of
      Just _  -> acc
      Nothing -> acc ++ [LegendField {_lf_id = i, _lf_label = "Cluster" <> show i, _lf_color = "#FFF"}]
+
+--------------------------------------------------------------------------------
+-- Utilities for manipulating clusterings
+
+-- | A representation for clusterings that is convenient in some cases, e.g.
+-- for determining whether two clusterings are equivalent up to their labels.
+-- This is a dictionary whose values are clusters and whose keys are integer
+-- labels associated to each subset. The node IDs in two distinct clusters are
+-- expected to be distinct (if not, we'll say the clustering is "invalid")
+type ClusterMap = IM.IntMap (Set Int)
+
+
+-- | Convert between clustering types.
+clusterNodes2clusterMap :: Set ClusterNode
+                        -- ^ A set of node IDs and their associated cluster ID
+                        -> ClusterMap -- ^ The equivalent cluster map
+clusterNodes2clusterMap = foldl' (flip insertClusterNode) IM.empty
+  where
+  -- | Given a collection of sets of nodes each labeled with a cluster number,
+  -- insert a new node in a given cluster
+  insertClusterNode :: ClusterNode
+                    -- ^ The nodeID to add, and the cluster ID it should be added to
+                    -> ClusterMap
+                    -- ^ The `ClusterMap` to which one should add a new node
+                    -> ClusterMap
+  insertClusterNode (ClusterNode nodeId clusterId) =
+    IM.alter (insertMaybe nodeId) clusterId
+    where insertMaybe :: Int -> Maybe (Set Int) -> Maybe (Set Int)
+          insertMaybe n Nothing  = Just $ Set.singleton n
+          insertMaybe n (Just s) = Just $ Set.insert n s
+
+-- | Convert between clustering types
+partitions2clusterNodes :: [Set Int] -> [ClusterNode]
+partitions2clusterNodes clustering =
+  clustering                                             &
+  -- Convert sets to lists:
+  fmap toList                                            &
+  -- Assign an integer index to each cluster:
+  zip [1 ..]                                             &
+  -- Attach cluster IDs to individual nodes instead of to whole clusters
+  fmap (\(id, clusterIds) -> zip (repeat id) clusterIds) &
+  -- Flatten list of clusters of nodes labeled by cluster indices
+  -- into a list of labeled nodes:
+  join                                                   &
+  -- Turn pairs into `ClusterNode`s
+  fmap (\(clusterId, nodeId) -> ClusterNode nodeId clusterId)
+
+-- | Determines whether two clusters are equivalent when on ignores the labels.
+-- May return `Nothing` when either clustering is invalid, but this is not
+-- guaranteed.
+equivalentClusterings :: Set ClusterNode
+                      -> Set ClusterNode
+                      -> Maybe Bool
+equivalentClusterings c1 c2 = do -- entering the `Maybe` monad
+  let normalized1 = normalizeClusterMap . clusterNodes2clusterMap $ c1
+  let normalized2 = normalizeClusterMap . clusterNodes2clusterMap $ c2
+  return $ normalized1 == normalized2
+
+-- | "Get rid" of the keys of a `ClusterMap`. More specifically, this function
+-- replaces the label of each cluster with the smallest node ID contained in said
+-- cluster. As a result, if two clusterings `c1` and `c2` correspond to the same
+-- partition but possibly differ in the way their clusters are labeled, the
+-- following will hold:
+-- `normalizeClusterMap c1 == normalizeClusterMap c2`
+-- May return `Nothing` if the input is not a true partition, i.e. a node
+-- belongs to two clusters at once. (Some invalid partitions still result in
+-- a `Just`, so don't use this to test whether the partition is valid!)
+normalizeClusterMap :: ClusterMap -- ^ Input clustering
+                    -> Maybe ClusterMap
+                    -- ^ If the input corresponds to a valid partition of a set of
+                    -- integers, `Just` the normalized map.
+normalizeClusterMap = foldM addCluster IM.empty
+  where
+    -- | Add a cluster to a `ClusterMap` labeled by smallest elements.
+    addCluster :: ClusterMap -> Set Int -> Maybe ClusterMap
+    addCluster imap cluster = let clusterMin = minimum cluster in
+      if IM.member clusterMin imap
+        -- There's already a cluster whose label/smallest element is the same as
+        -- the one we want to add!
+        then Nothing
+        -- All good, we can add the cluster:
+        else Just $ IM.insert clusterMin cluster imap
+
--- a/test/Test/Graph/Clustering.hs
+++ b/test/Test/Graph/Clustering.hs
@@ -15,11 +15,12 @@ module Test.Graph.Clustering where
 import Data.HashMap.Strict (HashMap)
 import Data.HashMap.Strict qualified as HashMap
 import Data.IntMap qualified as IM
+import Data.IntMap ((?!))
 import Data.List qualified as List
 import Data.Set  qualified as Set
 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
 import Gargantext.Core.Methods.Similarities (Similarity(..))
-import Gargantext.Core.Viz.Graph.Tools (doSimilarityMap)
+import Gargantext.Core.Viz.Graph.Tools (doSimilarityMap, partitionsToClusterNodes)
 import Gargantext.Core.Viz.Graph.Tools.IGraph (spinglass)
 import Gargantext.Core.Viz.Graph.Types
 import Gargantext.Prelude
@@ -41,45 +42,26 @@ test = do
      let
        result = List.length partitions > 1
      shouldBe True result
+  describe "Cluster type conversion" $ do
+    it "Empty partition" $ do
+      let setList = []
+      let clusterNodeList = []
+      (partitionsToClusterNodes setList `equiv` clusterNodeList) `shouldBe` True
+      
+
+-- | Determines whether two dictionaries of sets correspond to the same partition
+-- ignoring the dictionary keys. May return `Nothing` if one of the dictionaries
+-- does not correspond to an actual partition (i.e. a same integer appears in two
+-- distinct sets).
+equiv :: [ClusterNode]
+      -> [ClusterNode]
+      -> Maybe Bool
+equiv partition1 partition2 = let
+  -- | We first convert the clusterings to dictionaries (`IntMap`), which we can
+  -- then normalize and compare the normalized version.
+  normalized1 = normalizePartition . clusterNodesToDict $ partition1
+  normalized2 = normalizePartition . clusterNodesToDict $ partition2
+  in (==) <$> normalized1 <*> normalized2

-- | "Get rid" of the keys of a dictionary representing a partition.
-- If `partition1` and `partition2` are the same up to changing the dictionary
-- keys, then `normalizePartition partition1 == normalizePartition partition2`.
-- May return `Nothing` if the input is not a true partition, i.e. a node
-- belongs to two clusters at once. (This is not guaranteed, so don't use this to
-- test whether the partition is a true one; however, the function still correctly
-- normalizes all values that result in `Just`, even those that are not true
-- partitions)
-- This works by relabeling each cluster by its smallest element.
-normalizePartition :: IM.IntMap (Set.Set Int) -- ^ A dictionary whose values are the sets of the partition, and whose keys we want to get rid of.
-                   -> Maybe (IM.IntMap (Set.Set Int))
-normalizePartition = foldM addCluster IM.empty
-  where
-    -- | Add a cluster to a dictionary of clusters labeled by their smallest elements.
-    addCluster :: IM.IntMap (Set.Set Int)
-               -> Set.Set Int
-               -> Maybe (IM.IntMap (Set.Set Int))
-    addCluster imap cluster = let clusterMin = minimum cluster in
-      if IM.member clusterMin imap
-        -- There's already a cluster whose label/smallest element is the same as the one we want to add!
-        then Nothing
-        -- All good, we can add the cluster:
-        else Just $ IM.insert clusterMin cluster imap

-- | Partition a set of nodes, each labeled with a cluster number, into a
-- dictionary whose keys are cluster IDs and whose values are sets of nodes
-- belonging to the corresponding cluster
-splitClusterSet :: Set.Set ClusterNode -- ^ A set of node IDs and their associated cluster ID
-                -> IM.IntMap (Set.Set Int)
-splitClusterSet = foldl' (flip insertClusterNode) IM.empty

-- | Given a collection of sets of nodes each labeled with a cluster number,
-- insert a new node in a given cluster
-insertClusterNode :: ClusterNode -- ^ The nodeID to add, and the cluster ID it should be added to
-                  -> IM.IntMap (Set.Set Int) -- ^ The collection of clusters to which one should add a new node
-                  -> IM.IntMap (Set.Set Int)
-insertClusterNode (ClusterNode nodeId clusterId) =
-  IM.alter addToCluster clusterId
-  where addToCluster :: Maybe (Set.Set Int) -> Maybe (Set.Set Int)
-        addToCluster Nothing  = Just $ Set.singleton nodeId
-        addToCluster (Just s) = Just $ Set.insert nodeId s