[SnapCSV] parser

baab1af6 · Alexandre Delanoë · 367e6b2c · baab1af6 · baab1af6 · baab1af6
Commit baab1af6 authored Oct 08, 2021 by Alexandre Delanoë
7 changed files
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -15,8 +15,8 @@ import Data.Text (Text)
 import Data.Vector.Unboxed (Vector)
 import Graph.BAC.Clustering
 import Graph.BAC.ProxemyOptim
-import Graph.BAC.ProxemyOptim
 import Graph.FGL
+import Graph.Tools.CSV
 import Graph.Tools.Import
 import Graph.Types
 import Prelude (String)
@@ -41,7 +41,7 @@ import qualified Data.IntSet as IntSet
 import qualified Data.Text as Text

 setupEnv :: FilePath -> IO (Graph [Text] Double)
-setupEnv fp = getGraph (WithFile fp)
+setupEnv fp = getGraph SnapGraph (WithFile fp)

 main :: IO ()
 main = do
@@ -58,8 +58,9 @@ main = do
        clusts' = Prelude.map (sort . Prelude.map (lkp dico) . IntSet.toList)
                $ sortBy (\a b -> flipOrd $ comparing IntSet.size a b)
                $ IntMap.elems clusts
-    putStrLn $ "#clusters: " ++ show (IntMap.size clusts)
+    putStrLn $ "#clusters: "        ++ show (IntMap.size clusts)
    putStrLn $ "max cluster size: " ++ show (length (clusts' Prelude.!! 0))
+    putStrLn $ "min cluster size: " ++ show (length (Prelude.last clusts'))
    putStrLn $ "Clustering score: " ++ show score
    withFile fpout WriteMode $ \hndl ->
      forM_ clusts' $ \clust ->

--- a/data/snap/get
+++ b/data/snap/get
+#!/bin/bash
+
+cat urls.txt | while read url; do lynx -dump $url | grep -oP "http.*gz" | while read gz; do wget $gz; done; done
+
+
--- a/data/snap/urls.txt
+++ b/data/snap/urls.txt
+https://snap.stanford.edu/data/com-DBLP.html
+https://snap.stanford.edu/data/com-Amazon.html
--- a/run
+++ b/run
 #!/bin/bash

-~/.local/bin/gargantext-graph-exe $1 +RTS -p # -sstderr
+#~/.local/bin/gargantext-graph-exe $1 +RTS -p # -sstderr
 #time ~/.local/bin/gargantext-graph-exe +RTS -p # -sstderr
+
+INPUT=$1
+OUTPUT1=${1%csv}out1
+OUTPUT2=${1%csv}out2
+
+touch $OUTPUT1
+touch $OUTPUT2
+
+#for beta in -0.6 -0.5 -0.3 -0.2 -0.1 0.0 0.3 0.4 0.5 0.6 0.7; do
+for beta in 0.0 ; do
+  for gc in "gc" "nogc"; do
+    echo "$gc,$beta" >> $OUTPUT1 ;
+    echo "$gc,$beta" >> $OUTPUT2 ;
+    time ~/.local/bin/gargantext-graph-exe $1 $OUTPUT2 $gc $beta >> $OUTPUT1
+  done
+done
+
--- a/src/Graph/BAC/ProxemyOptim.hs
+++ b/src/Graph/BAC/ProxemyOptim.hs
@@ -11,7 +11,7 @@ Portability : POSIX
 publication it will be integrated to the backend with usual license --
 see above)

-Article: Confluence for Graph Clustering, B. Gaume and A. Delanoë, Journal:TBA
+Article: Confluence for Graph Clustering, B. Gaume and A. Delanoë, A. Mestanogullari

 Code written in Haskell by A. Delanoë from first Python Specifications by B.
 Gaume.

--- a/src/Graph/Tools/CSV.hs
+++ b/src/Graph/Tools/CSV.hs
@@ -54,11 +54,12 @@ uniq' ns = map (\(n1,n2) -> (n1,n2,1))

 ------------------------------------------------------------------------

-data FileGraph = CillexGraph | TestGraph
+data FileGraph = CillexGraph | TestGraph | SnapGraph

 readFileGraph :: FileGraph -> FilePath -> IO (DGIP.Gr [Text] Double)
 readFileGraph CillexGraph fp = toGraph <$> snd <$> readFileCsvCillex fp
 readFileGraph TestGraph   fp = toGraph <$> snd <$> readFileCsvTest   fp
+readFileGraph SnapGraph   fp = toGraph <$> snd <$> readFileSnapCsv   fp


 toGraph :: ToNode a => Vector a -> (DGIP.Gr [Text] Double)
@@ -90,6 +91,11 @@ instance ToNode TestCsv
        es = List.zip3 (cycle [n]) ns (cycle [1])
        ns = map (read . unpack) $ splitOn "," v

+instance ToNode SnapCsv
+  where
+    toNode :: SnapCsv -> ((Int, [Text]), [(Int,Int,Double)])
+    toNode (SnapCsv f t) = ((f, []), [(f,t,1)])
+
 toNodeUnlabelled :: TestCsv -> ((Int, ()), [(Int,Int,())])
 toNodeUnlabelled (TestCsv n v l) = (ln, es)
  where
@@ -100,22 +106,37 @@ toNodeUnlabelled (TestCsv n v l) = (ln, es)

 ------------------------------------------------------------------------
 readFileCsvCillex :: FilePath -> IO (Header, Vector CillexCsv)
-readFileCsvCillex = fmap readWith . BL.readFile
+readFileCsvCillex = fmap (readWith ',') . BL.readFile

 readFileCsvTest :: FilePath -> IO (Header, Vector TestCsv)
-readFileCsvTest = fmap readWith . BL.readFile
+readFileCsvTest = fmap (readWith ',') . BL.readFile

-csvDecodeOptions :: DecodeOptions
-csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
-  where
-    delimiter :: Word8
-    delimiter = fromIntegral $ ord ','
+readFileSnapCsv :: FilePath -> IO (Header, Vector SnapCsv)
+readFileSnapCsv = fmap (readWith '\t') . BL.readFile
+------------------------------------------------------------------------

-readWith :: FromNamedRecord a => BL.ByteString -> (Header, Vector a)
-readWith bs = case decodeByNameWith csvDecodeOptions bs of
+readWith :: FromNamedRecord a => Char -> BL.ByteString -> (Header, Vector a)
+readWith x bs = case decodeByNameWith (csvDecodeOptions x) bs of
      Left e        -> panic (pack e)
      Right csvDocs -> csvDocs

+csvDecodeOptions :: Char -> DecodeOptions
+csvDecodeOptions x = defaultDecodeOptions {decDelimiter = delimiter}
+  where
+    delimiter :: Word8
+    delimiter = fromIntegral $ ord x
+
+------------------------------------------------------------------------
+data SnapCsv = 
+  SnapCsv { sc_fromNodeId :: Int
+          , sc_toNodeId   :: Int
+	  }
+    deriving (Show)
+
+instance FromNamedRecord SnapCsv where
+  parseNamedRecord r = SnapCsv <$> r .: "FromNodeId"
+                               <*> r .: "ToNodeId"
+
 ------------------------------------------------------------------------
 data TestCsv =
  TestCsv { rc_node    :: Int

--- a/src/Graph/Tools/Import.hs
+++ b/src/Graph/Tools/Import.hs
@@ -51,21 +51,21 @@ data GraphData a b where
    :: { labelledGraph :: Graph [Text] Double } -> GraphData [Text] Double


-getGraph' :: GetGraph a b-> IO (GraphData a b)
-getGraph' (Random n) = reifyNat (fromIntegral n) $ \(pn :: Proxy n) ->
+getGraph' :: FileGraph -> GetGraph a b-> IO (GraphData a b)
+getGraph' _ (Random n) = reifyNat (fromIntegral n) $ \(pn :: Proxy n) ->
  randomAdjacency @n
                >>= \m -> pure $ LightGraph
                               $ mkGraphUfromEdges
                               $ List.map (\(x,y,_) -> (x,y))
                               $ SMatrix.toList m

-getGraph' (WithFile fp) = do
-  g <- readFileGraph CillexGraph fp
+getGraph' fg (WithFile fp) = do
+  g <- readFileGraph fg fp
  pure $ LabelledGraph g


-getGraph :: GetGraph a b -> IO (Graph a b)
-getGraph gg = toGraph' <$> getGraph' gg
+getGraph :: FileGraph -> GetGraph a b -> IO (Graph a b)
+getGraph fg gg = toGraph' <$> getGraph' fg gg

 toGraph' :: GraphData a b -> Graph a b
 toGraph' (LightGraph    g) = g