Merge branch 'flouvain' of ssh://gitlab.iscpif.fr:20022/gargantext/clustering-louvain into flouvain

f0f00ffd · Alexandre Delanoë · f8d493dc · d05ffebf · f0f00ffd · f0f00ffd
Commit f0f00ffd authored Mar 26, 2020 by Alexandre Delanoë
7 changed files
--- a/clustering-louvain.cabal
+++ b/clustering-louvain.cabal
@@ -4,7 +4,7 @@ cabal-version: 1.12
 --
 -- see: https://github.com/sol/hpack
 --
-- hash: 9d2b00c4d3d099b31d6b9db84cd1172e0464481bc132080e2694e02b5587b29b
+-- hash: 1f5db6daebbb99eb2ce71fb2a61120a7128c45b5152b0e6defa3a811c85aef62

 name:           clustering-louvain
 version:        0.1.0.0
@@ -29,6 +29,7 @@ library
    , fgl
    , foldl
    , hxt
+    , parsec
    , protolude
    , text
    , turtle

--- a/package.yaml
+++ b/package.yaml
@@ -36,6 +36,7 @@ library:
    - base >= 4.7 && < 5
    - fgl
    - hxt
+    - parsec
    - turtle
    - foldl
 #tests:

--- a/src/Data/Graph/Clustering/Example.hs
+++ b/src/Data/Graph/Clustering/Example.hs
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TupleSections     #-}
+
 module Data.Graph.Clustering.Example where

+import Protolude
+
 import Control.Monad (foldM_)
-import Data.List (sort)
+import Data.List (nub, sort)
 import Data.Graph.Clustering.Louvain.Utils
 import Data.Graph.Inductive
 import Data.Graph.Clustering.FLouvain
+import qualified Data.Text as T
+import qualified Text.ParserCombinators.Parsec as P
+import Text.Parsec.Language (haskellStyle)
+import qualified Text.Parsec.Token as PT

 -- | Utility function to remap Gr () Double into FGraph () ()
 exampleRemap :: Gr () Double -> FGraph () ()
@@ -34,7 +43,7 @@ runIterations n gr = do
  let initCgr = initialCGr fgr

  putStrLn "Initial modularity: "
-  putStrLn $ show $ modularity fgr initCgr fgrWeight
+  putStrLn $ T.unpack $ show $ modularity fgr initCgr fgrWeight

  foldM_ (runIteration fgr fgrWeight) initCgr [0..n]

@@ -44,7 +53,7 @@ runIterations n gr = do
      putStrLn $ "----- ITERATION " <> show i
      putStrLn $ prettify iterNextCgr
      putStrLn $ show i <> " iteration modularity: "
-      putStrLn $ show $ modularity fgr iterNextCgr fgrWeight
+      putStrLn $ T.unpack $ show $ modularity fgr iterNextCgr fgrWeight
      return iterNextCgr

 runLouvainFirstStepIterate :: Int -> Gr () Double -> (Modularity, CGr)
@@ -54,13 +63,48 @@ runLouvainFirstStepIterate n gr = (modularity fgr cgr m, cgr)
    cgr = louvainFirstStepIterate n fgr
    m = graphWeight fgr

+-- | egr <- readPythonGraph "<file-path>"
+-- let gr = head $ Data.Either.rights [egr]
+readPythonGraph :: FilePath -> IO (Either P.ParseError (Gr () Double))
+readPythonGraph src = do
+  contents <- readFile src
+  let eEdges = P.parse edgeParser "(unknown)" $ T.unpack contents
+  case eEdges of
+    Left err -> do
+      return $ Left err
+    Right edges -> do
+      let nodes =  map (, ()) $ nub $ map (\(s, _, _) -> s) edges
+      return $ Right $ mkGraph nodes edges
+  where
+    lexer = PT.makeTokenParser haskellStyle
+    edgeParser :: P.GenParser Char st [(Node, Node, Double)]
+    edgeParser = P.many edgeLine
+    edgeLine :: P.GenParser Char st (Node, Node, Double)
+    edgeLine = do
+      source <- edgeSource
+      target <- edgeTarget
+      weight <- edgeWeight
+      return (source, target, weight)
+    edgeSource :: P.GenParser Char st Node
+    edgeSource = do
+      v <- PT.integer lexer
+      return $ fromIntegral v
+    edgeTarget :: P.GenParser Char st Node
+    edgeTarget = do
+      v <- PT.integer lexer
+      return $ fromIntegral v
+    edgeWeight :: P.GenParser Char st Double
+    edgeWeight = do
+      v <- PT.float lexer
+      return v
+
 karate :: Gr () Double
 -- karate =  mkGraph' <$> importGraphFromGexf "src/Data/karate.gexf"
 karate = mkGraph [(1,()),(2,()),(3,()),(4,()),(5,()),(6,()),(7,()),(8,()),(9,()),(10,()),(11,()),(12,()),(13,()),(14,()),(15,()),(16,()),(17,()),(18,()),(19,()),(20,()),(21,()),(22,()),(23,()),(24,()),(25,()),(26,()),(27,()),(28,()),(29,()),(30,()),(31,()),(32,()),(33,()),(34,())] [(1,2,1.0),(1,3,1.0),(1,4,1.0),(1,5,1.0),(1,6,1.0),(1,7,1.0),(1,8,1.0),(1,9,1.0),(1,11,1.0),(1,12,1.0),(1,13,1.0),(1,14,1.0),(1,18,1.0),(1,20,1.0),(1,22,1.0),(1,32,1.0),(2,3,1.0),(2,4,1.0),(2,8,1.0),(2,14,1.0),(2,18,1.0),(2,20,1.0),(2,22,1.0),(2,31,1.0),(3,4,1.0),(3,8,1.0),(3,9,1.0),(3,10,1.0),(3,14,1.0),(3,28,1.0),(3,29,1.0),(3,33,1.0),(4,8,1.0),(4,13,1.0),(4,14,1.0),(5,7,1.0),(5,11,1.0),(6,7,1.0),(6,11,1.0),(6,17,1.0),(7,17,1.0),(9,31,1.0),(9,33,1.0),(9,34,1.0),(10,34,1.0),(14,34,1.0),(15,33,1.0),(15,34,1.0),(16,33,1.0),(16,34,1.0),(19,33,1.0),(19,34,1.0),(20,34,1.0),(21,33,1.0),(21,34,1.0),(23,33,1.0),(23,34,1.0),(24,26,1.0),(24,28,1.0),(24,30,1.0),(24,33,1.0),(24,34,1.0),(25,26,1.0),(25,28,1.0),(25,32,1.0),(26,32,1.0),(27,30,1.0),(27,34,1.0),(28,34,1.0),(29,32,1.0),(29,34,1.0),(30,33,1.0),(30,34,1.0),(31,33,1.0),(31,34,1.0),(32,33,1.0),(32,34,1.0),(33,34,1.0)]


 karate2com :: [[Node]]
-karate2com = sort $ Prelude.map (sort) [[10, 29, 32, 25, 28, 26, 24, 30, 27, 34, 31, 33, 23, 15, 16, 21, 19], [3, 9, 8, 4, 14, 20, 2, 13, 22, 1, 18, 12, 5, 7, 6, 17]]
+karate2com = sort $ map (sort) [[10, 29, 32, 25, 28, 26, 24, 30, 27, 34, 31, 33, 23, 15, 16, 21, 19], [3, 9, 8, 4, 14, 20, 2, 13, 22, 1, 18, 12, 5, 7, 6, 17]]


 eU :: [LEdge Double]

--- a/src/Data/Graph/Clustering/python/.envrc
+++ b/src/Data/Graph/Clustering/python/.envrc
+use nix
--- a/src/Data/Graph/Clustering/python/clustered.graph
+++ b/src/Data/Graph/Clustering/python/clustered.graph
--- a/src/Data/Graph/Clustering/python/default.nix
+++ b/src/Data/Graph/Clustering/python/default.nix
+with import <nixpkgs> {};
+stdenv.mkDerivation rec {
+  name = "env";
+  env = buildEnv {
+    name = name;
+    paths = buildInputs;
+  };
+  buildInputs = [
+    python3
+    python3Packages.ipython
+    python3Packages.numpy
+    python3Packages.pandas
+    python3Packages.sklearn-deap
+  ];
+  builder = builtins.toFile "builder.sh" ''
+    source $stdenv/setup
+    touch $out
+  '';
+}
+
--- a/src/Data/Graph/Clustering/python/example.py
+++ b/src/Data/Graph/Clustering/python/example.py
+#!/usr/bin/env python3
+#
+# In ipython:
+# %load "src/Data/Graph/Clustering/example.py"
+
+import numpy as np
+import numpy.linalg as npl
+import pandas as pd
+import random
+from sklearn.datasets.samples_generator import make_blobs
+
+SAME_CLUSTER_PROB = 0.8
+DIFFERENT_CLUSTER_PROB = 0.1
+
+X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
+# Norm with be the edge weight
+norms = [npl.norm(xx) for xx in X]
+
+# Our graph will consist of pt (0.0, 0.0) with edges to all nodes in X
+#node0 = (0, np.array([0.0, 0.0]))
+graph = {
+    'nodes': [],
+    'edges': [],
+}
+
+for i, (node, norm, cluster) in enumerate(zip(X, norms, y)):
+    graph['nodes'].append((i, node))
+    for j, (node2, cluster2) in enumerate(zip(X, y)):
+        # If same cluster -- higher probability of edge creation
+        # If different cluster -- low probability of an edge
+        prob = DIFFERENT_CLUSTER_PROB
+        if cluster == cluster2:
+            prob = SAME_CLUSTER_PROB
+        if random.random() < prob:
+            graph['edges'].append((i, j, norm))
+        # If no edge, try backwards one
+        elif random.random() < prob:
+            graph['edges'].append((j, i, norm))
+
+# Custom graph file format:
+# <edge source> <edge target> <edge weight>
+with open('clustered.graph', 'w') as f:
+    for edge in graph['edges']:
+        f.write('{} {} {}\n'.format(*edge))