{-| Module : Graph.Tools Description : Copyright : (c) CNRS, Alexandre Delanoƫ License : AGPL + CECILL v3 Maintainer : alexandre+dev@delanoe.org Stability : experimental Portability : POSIX -} {-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE InstanceSigs #-} {-# LANGUAGE NoImplicitPrelude #-} module Graph.Tools.CSV where import Data.Csv import Data.Text (pack, splitOn, unpack) import Data.Vector hiding (map, uniq) import Prelude (read) import Protolude import Graph.Types import qualified Data.Map as Map import qualified Data.ByteString.Lazy as BL import qualified Data.Graph.Inductive as DGI import qualified Data.Graph.Inductive.PatriciaTree as DGIP import qualified Data.List as List import qualified Data.Set as Set import qualified Data.Vector as Vector import Data.Semigroup import qualified Data.IntMap.Strict as IntMap import Data.Maybe (fromJust) ------------------------------------------------------------------------ -- | Clean tools uniq :: [DGI.Edge] -> [DGI.Edge] uniq ns = Set.toList $ Set.fromList $ map (\(n1,n2) -> if n1 <= n2 then (n1,n2) else (n2,n1) ) ns uniq' :: [(Int,Int,Double)] -> [(Int,Int,Double)] uniq' ns = map (\(n1,n2) -> (n1,n2,1)) $ Map.keys $ Map.fromListWith (+) $ map (\(n1,n2,w) -> if n1 <= n2 then ((n1,n2), w) else ((n2,n1), w) ) ns ------------------------------------------------------------------------ data FileGraph = CillexGraph | TestGraph | SnapGraph readFileGraph :: FileGraph -> FilePath -> IO (DGIP.Gr [Text] Double) readFileGraph CillexGraph fp = toGraph <$> snd <$> readFileCsvCillex fp readFileGraph TestGraph fp = toGraph <$> snd <$> readFileCsvTest fp -- readFileGraph SnapGraph fp = toGraph <$> snd <$> readFileSnapCsv fp readFileGraph SnapGraph fp = toGraph'' <$> snd <$> readFileSnapCsv fp toGraph'' :: Vector SnapCsv -> DGIP.Gr [Text] Double toGraph'' v = DGI.undir (DGI.mkGraph ns es') where es = List.map (\(SnapCsv f t) -> (f, t)) $ Vector.toList v (_, nodeDict) = List.foldl' (\(nextIdx, dict) (from, to) -> let (nextIdx1, dict1) = smartInsert nextIdx from dict in smartInsert nextIdx1 to dict1 ) (0, mempty) es es' = List.map (\(f, t) -> ( fromJust (IntMap.lookup f nodeDict) , fromJust (IntMap.lookup t nodeDict) , 1 ) ) es ns = List.map (\(a, b) -> (b, [Protolude.show a])) (IntMap.toList nodeDict) smartInsert nextI v dict = case IntMap.lookup v dict of Nothing -> (nextI+1, IntMap.insert v nextI dict) Just i -> (nextI, dict) toGraph :: ToNode a => Vector a -> (DGIP.Gr [Text] Double) toGraph vs = g where (ns,es) = List.unzip $ Vector.toList $ Vector.map toNode vs g = DGI.mkGraph ns (uniq' $ List.concat es) ------------------------------------------------------------------------ class ToNode a where toNode :: a -> ((Int, [Text]), [(Int,Int,Double)]) instance ToNode CillexCsv where toNode :: CillexCsv -> ((Int, [Text]), [(Int,Int,Double)]) toNode (CillexCsv n v _ _ l) = (ln, es) where ln = (n, [l]) es = List.zip3 (cycle [n]) ns (cycle [1]) ns = map (read . unpack) $ splitOn "," v instance ToNode TestCsv where toNode :: TestCsv -> ((Int, [Text]), [(Int,Int,Double)]) toNode (TestCsv n v l) = (ln, es) where ln = (n, map show [l]) es = List.zip3 (cycle [n]) ns (cycle [1]) ns = map (read . unpack) $ splitOn "," v instance ToNode SnapCsv where toNode :: SnapCsv -> ((Int, [Text]), [(Int,Int,Double)]) toNode (SnapCsv f t) = ((f, []), [(f,t,1)]) toNodeUnlabelled :: TestCsv -> ((Int, ()), [(Int,Int,())]) toNodeUnlabelled (TestCsv n v l) = (ln, es) where ln = (n, ()) es = List.zip3 (cycle [n]) ns (cycle [()]) ns = map (read . unpack) $ splitOn "," v ------------------------------------------------------------------------ readFileCsvCillex :: FilePath -> IO (Header, Vector CillexCsv) readFileCsvCillex = fmap (readWith ',') . BL.readFile readFileCsvTest :: FilePath -> IO (Header, Vector TestCsv) readFileCsvTest = fmap (readWith ',') . BL.readFile readFileSnapCsv :: FilePath -> IO (Header, Vector SnapCsv) readFileSnapCsv = fmap (readWith '\t') . BL.readFile ------------------------------------------------------------------------ readWith :: FromNamedRecord a => Char -> BL.ByteString -> (Header, Vector a) readWith x bs = case decodeByNameWith (csvDecodeOptions x) bs of Left e -> panic (pack e) Right csvDocs -> csvDocs csvDecodeOptions :: Char -> DecodeOptions csvDecodeOptions x = defaultDecodeOptions {decDelimiter = delimiter} where delimiter :: Word8 delimiter = fromIntegral $ ord x ------------------------------------------------------------------------ data SnapCsv = SnapCsv { sc_fromNodeId :: Int , sc_toNodeId :: Int } deriving (Show) instance FromNamedRecord SnapCsv where parseNamedRecord r = SnapCsv <$> r .: "FromNodeId" <*> r .: "ToNodeId" ------------------------------------------------------------------------ data TestCsv = TestCsv { rc_node :: Int , rc_voisins :: Text , rc_id :: Double } deriving (Show) instance FromNamedRecord TestCsv where parseNamedRecord r = TestCsv <$> r .: "@node: #id_pdg" <*> r .: "%+voisins" <*> r .: "label" ------------------------------------------------------------------------ data CillexCsv = CillexCsv { node :: Int , voisins :: Text , lemme :: Text , id :: Double , label :: Text } deriving (Show) instance FromNamedRecord CillexCsv where parseNamedRecord r = CillexCsv <$> r .: "@node: #id_pdg" <*> r .: "%+voisins" <*> r .: "lemme" <*> r .: "id" <*> r .: "label" {- cillexCsvHeader = header [ "@node: #id_pdg" , "%+voisins" , "id" , "lemme" , "label" ] -}