Commit de549c2f authored by Alexandre Delanoë

[MERGE] bashql + pipeline.

parents 5c858e52 37c3a450
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-|
Module : Data.ByteString.Extended
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Example showing how to extend existing base libraries.
-}
module Data.ByteString.Extended ( module Data.ByteString
, replace
) where
import Data.ByteString
replace :: ByteString -> ByteString -> ByteString -> ByteString
replace = undefined
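-- A minimal sketch of 'replace' (an assumption, not part of this commit),
-- built on Data.ByteString.breakSubstring and written against a qualified
-- import to avoid clashes with the re-exported module:
--
-- import qualified Data.ByteString as B
--
-- replace :: B.ByteString -> B.ByteString -> B.ByteString -> B.ByteString
-- replace pat sub hay
--   | B.null pat = hay
--   | otherwise  = case B.breakSubstring pat hay of
--       (pre, rest)
--         | B.null rest -> pre
--         | otherwise   -> pre <> sub <> replace pat sub (B.drop (B.length pat) rest)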
-- instance (Binary k, Binary v) => Binary (HaskMap k v) where
-- ...
@@ -42,18 +42,16 @@ main = do
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
@@ -24,6 +24,7 @@ library:
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.TextFlow
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Types
@@ -68,6 +69,7 @@ library:
- hlcm
- ini
- jose-jwt
- kmeans-vector
- lens
- logging-effect
- matrix
@@ -121,25 +123,38 @@ library:
- zlib
# - utc
executable:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
executables:
gargantext:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
gargantext-workflow:
main: Main.hs
source-dirs: app-workflow
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
@@ -265,6 +265,9 @@ data NodePoly id typename userId parentId name date hyperdata = Node { node_id
} deriving (Show, Generic)
$(deriveJSON (unPrefix "node_") ''NodePoly)
instance Arbitrary (NodePoly NodeId NodeTypeId (Maybe NodeUserId) NodeParentId NodeName UTCTime Value) where
arbitrary = elements [Node 1 1 (Just 1) 1 "name" (jour 2018 01 01) (toJSON ("{}"::Text))]
{-|
Module : Gargantext.Database
Description :
Description : Main commands of BASHQL a Domain Specific Language to deal with Gargantext Database.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
* BASHQL = functional (Bash * SQL)
* Which language to choose when working with a database? To keep it
simple: instead of the common Object Relational Mapping (ORM) [1]
strategies used nowadays, which are inspired more by object logic than
functional logic, the semantics of BASHQL put the function first.
* BASHQL focuses on the function, i.e. it uses Bash function names
and implements them with SQL behind the scenes. BASHQL is thus inspired
more by the Bash language [2] than by SQL, and follows Bash's main
commands as its specification and documentation.
* Main arguments:
1. Theoretical: databases and file systems can each be thought of as a
single category, an assumption based on theoretical work on databases by
David Spivak [0].
2. Practical: basic Bash commands are a daily practice in the developer
community.
* How to help?
1. Choose a command you like in Bash
2. Implement it in Haskell-SQL according to the Gargantext Schema (tree-like
filesystem)
3. Translate it into BASHQL (follow previous implementations)
4. Make a pull request (enjoy the community)
* Implementation strategy: functional adaptations are made to the
Gargantext language options, and SQL optimizations are done continuously
during the project. For the Haskellish part, you may take inspiration from
the Turtle implementation written by Gabriel Gonzalez [3], which shows how
to write Bash translations in Haskell.
* Semantics
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References
[0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM.
[1] https://en.wikipedia.org/wiki/Object-relational_mapping
[2] https://en.wikipedia.org/wiki/Bash_(Unix_shell)
[3] https://github.com/Gabriel439/Haskell-Turtle-Library
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Database (
module Gargantext.Database.Utils
-- , module Gargantext.Database.Instances
, module Gargantext.Database.User
, module Gargantext.Database.Node
, module Gargantext.Database.NodeNode
-- , module Gargantext.Database.Ngram
, module Gargantext.Database.NodeNgram
, module Gargantext.Database.NodeNodeNgram
, module Gargantext.Database.NodeNgramNgram
-- , module Gargantext.Database.Gargandb
-- , module Gargantext.Database.Simple
-- , module Gargantext.Database.InsertNode
-- , module Gargantext.Database.NodeType
) where
import Gargantext.Database.Utils
--import Gargantext.Database.Gargandb
import Gargantext.Database.User
module Gargantext.Database ( module Gargantext.Database.Utils
, get
, ls , ls'
, home, home'
, post, post', postR'
, del , del'
, tree, tree'
)
where
import Gargantext.Core.Types
import Gargantext.Core.Types.Node
import Gargantext.Database.Utils (connectGargandb)
import Gargantext.Database.Node
import Gargantext.Database.NodeNode
--import Gargantext.Database.Ngram
import Gargantext.Database.NodeNgram
import Gargantext.Database.NodeNodeNgram
import Gargantext.Database.NodeNgramNgram
--import Gargantext.Database.Simple
--import Gargantext.Database.NodeType
--import Gargantext.Database.InsertNode
import Gargantext.Prelude
import Database.PostgreSQL.Simple (Connection)
import Data.Text (Text)
import Opaleye hiding (FromField)
import Data.Aeson
import Data.ByteString (ByteString)
import Data.List (last, concat)
type UserId = Int
--type NodeId = Int
-- List of NodeId
-- type PWD a = PWD UserId [a]
type PWD = [NodeId]
--data PWD' a = a | PWD' [a]
-- | TODO get Children or Node
get :: Connection -> PWD -> IO [Node Value]
get _ [] = pure []
get conn pwd = runQuery conn $ selectNodesWithParentID (last pwd)
-- | Home, need to filter with UserId
home :: Connection -> IO PWD
home c = map node_id <$> getNodesWithParentId c 0 Nothing
-- | ls == get Children
ls :: Connection -> PWD -> IO [Node Value]
ls = get
tree :: Connection -> PWD -> IO [Node Value]
tree c p = do
ns <- get c p
cs <- mapM (\p' -> get c [p']) $ map node_id ns
pure $ ns <> (concat cs)
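-- NOTE: as written, tree reaches only two levels (the nodes at the path and
-- their direct children); a fully recursive variant would call tree, not get,
-- on each child.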
-- | TODO
post :: Connection -> PWD -> [NodeWrite'] -> IO Int64
post _ [] _ = pure 0
post _ _ [] = pure 0
post c pth ns = mkNode c (last pth) ns
postR :: Connection -> PWD -> [NodeWrite'] -> IO [Int]
postR _ [] _ = pure [0]
postR _ _ [] = pure [0]
postR c pth ns = mkNodeR c (last pth) ns
rm :: Connection -> PWD -> [NodeId] -> IO Int
rm = del
del :: Connection -> PWD -> [NodeId] -> IO Int
del _ [] _ = pure 0
del _ _ [] = pure 0
del c pth ns = deleteNodes c ns
put :: Connection -> PWD -> [a] -> IO Int64
put = undefined
-- | TODO
-- cd (Home UserId) | (Node NodeId)
-- cd Path
-- jump NodeId
-- touch Dir
--------------------------------------------------------------
-- Tests
--------------------------------------------------------------
home' :: IO PWD
home' = do
c <- connectGargandb "gargantext.ini"
home c
ls' :: IO [Node Value]
ls' = do
c <- connectGargandb "gargantext.ini"
h <- home c
ls c h
tree' :: IO [Node Value]
tree' = do
c <- connectGargandb "gargantext.ini"
h <- home c
tree c h
post' :: IO [Int]
post' = do
c <- connectGargandb "gargantext.ini"
pid <- last <$> home c
let uid = 1
postNode c uid pid (Node' Corpus "Premier corpus" "{}" [ Node' Document "Doc1" "{}" []
, Node' Document "Doc2" "{}" []
, Node' Document "Doc3" "{}" []
]
)
del' :: [NodeId] -> IO Int
del' ns = do
c <- connectGargandb "gargantext.ini"
h <- home c
del c h ns
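-- A hedged usage sketch (assuming a database reachable via "gargantext.ini"):
-- the BASHQL commands compose much like their Bash namesakes.
-- >>> c <- connectGargandb "gargantext.ini"
-- >>> h <- home c   -- the user's root path (PWD)
-- >>> ls c h        -- children of that path
-- >>> tree c h      -- children and grandchildren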
@@ -21,6 +21,10 @@ Portability : POSIX
module Gargantext.Database.Node where
import GHC.Int (Int64)
import Data.Maybe
import Data.Time (UTCTime)
import Database.PostgreSQL.Simple.FromField ( Conversion
, ResultError(ConversionFailed)
, FromField
@@ -28,6 +32,7 @@ import Database.PostgreSQL.Simple.FromField ( Conversion
, returnError
)
import Prelude hiding (null, id, map, sum)
import Data.Time.Segment (jour, timesAfter, Granularity(D))
import Gargantext.Core.Types
import Gargantext.Core.Types.Node (NodeType)
@@ -43,10 +48,15 @@ import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Data.Typeable (Typeable)
import qualified Data.ByteString.Internal as DBI
import qualified Data.ByteString as DB
import qualified Data.ByteString.Lazy as DBL
import Data.ByteString (ByteString)
import Database.PostgreSQL.Simple (Connection)
import Opaleye hiding (FromField)
import Opaleye.Internal.QueryArr (Query(..))
import qualified Data.Profunctor.Product as PP
-- | Types for Node Database Management
data PGTSVector
@@ -78,7 +88,7 @@ instance QueryRunnerColumnDefault PGJsonb HyperdataUser where
fromField' :: (Typeable b, FromJSON b) => Field -> Maybe DBI.ByteString -> Conversion b
fromField' :: (Typeable b, FromJSON b) => Field -> Maybe DB.ByteString -> Conversion b
fromField' field mb = do
v <- fromField field mb
valueToHyperdata v
@@ -89,7 +99,7 @@ fromField' field mb = do
$(makeAdaptorAndInstance "pNode" ''NodePoly)
$(makeLensesWith abbreviatedFields ''NodePoly)
nodeTable :: Table NodeWrite NodeRead
@@ -105,12 +115,40 @@ nodeTable = Table "nodes" (pNode Node { node_id = optional "id"
)
nodeTable' :: Table (Maybe (Column PGInt4)
, Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGText
, Maybe (Column PGTimestamptz)
, Column PGJsonb
)
( Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGText
, Column PGTimestamptz
, Column PGJsonb
)
nodeTable' = Table "nodes" (PP.p7 ( optional "id"
, required "typename"
, required "user_id"
, required "parent_id"
, required "name"
, optional "date"
, required "hyperdata"
)
)
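-- nodeTable' mirrors nodeTable but is declared over a raw 7-tuple (via
-- profunctor-product's p7), so rows can be inserted without the NodePoly
-- wrapper; see node2write and mkNode below.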
queryNodeTable :: Query NodeRead
queryNodeTable = queryTable nodeTable
selectNodes :: Column PGInt4 -> Query NodeRead
selectNodes id = proc () -> do
selectNode :: Column PGInt4 -> Query NodeRead
selectNode id = proc () -> do
row <- queryNodeTable -< ()
restrict -< node_id row .== id
returnA -< row
@@ -142,13 +180,11 @@ selectNodesWith' parentId maybeNodeType = proc () -> do
deleteNode :: Connection -> Int -> IO Int
deleteNode conn n = fromIntegral
<$> runDelete conn nodeTable
deleteNode conn n = fromIntegral <$> runDelete conn nodeTable
(\(Node n_id _ _ _ _ _ _) -> n_id .== pgInt4 n)
deleteNodes :: Connection -> [Int] -> IO Int
deleteNodes conn ns = fromIntegral
<$> runDelete conn nodeTable
deleteNodes conn ns = fromIntegral <$> runDelete conn nodeTable
(\(Node n_id _ _ _ _ _ _) -> in_ ((map pgInt4 ns)) n_id)
@@ -164,6 +200,11 @@ getNodesWithParentId :: Connection -> Int
-> Maybe Text -> IO [Node HyperdataDocument]
getNodesWithParentId conn n _ = runQuery conn $ selectNodesWithParentID n
getNodesWithParentId' :: Connection -> Int
-> Maybe Text -> IO [Node Value]
getNodesWithParentId' conn n _ = runQuery conn $ selectNodesWithParentID n
selectNodesWithParentID :: Int -> Query NodeRead
selectNodesWithParentID n = proc () -> do
row@(Node _ _ _ parent_id _ _ _) <- queryNodeTable -< ()
@@ -181,12 +222,103 @@ selectNodesWithType type_id = proc () -> do
restrict -< tn .== type_id
returnA -< row
getNode' :: Connection -> Int -> IO (Node Value)
getNode' c id = do
fromMaybe (error "TODO: 404") . headMay <$> runQuery c (limit 1 $ selectNode (pgInt4 id))
getNode :: Connection -> Int -> IO (Node HyperdataDocument)
getNode conn id = do
fromMaybe (error "TODO: 404") . headMay <$> runQuery conn (limit 1 $ selectNodes (pgInt4 id))
fromMaybe (error "TODO: 404") . headMay <$> runQuery conn (limit 1 $ selectNode (pgInt4 id))
getNodesWithType :: Connection -> Column PGInt4 -> IO [Node HyperdataDocument]
getNodesWithType conn type_id = do
runQuery conn $ selectNodesWithType type_id
type UserId = NodeId
type TypeId = Int
------------------------------------------------------------------------
-- Quick and dirty
------------------------------------------------------------------------
type NodeWrite' = NodePoly (Maybe Int) Int Int (ParentId) Text (Maybe UTCTime) ByteString
--node :: UserId -> ParentId -> NodeType -> Text -> Value -> NodeWrite'
node :: UserId -> ParentId -> NodeType -> Text -> Value -> NodeWrite'
node userId parentId nodeType name nodeData = Node Nothing typeId userId parentId name Nothing byteData
where
typeId = nodeTypeId nodeType
byteData = DB.pack $ DBL.unpack $ encode nodeData
node2write pid (Node id tn ud _ nm dt hp) = ((pgInt4 <$> id)
,(pgInt4 tn)
,(pgInt4 ud)
,(pgInt4 pid)
,(pgStrictText nm)
,(pgUTCTime <$> dt)
,(pgStrictJSONB hp)
)
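-- A plausible signature, inferred from NodeWriteT below (not in the commit):
-- node2write :: ParentId -> NodeWrite' -> NodeWriteT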
mkNode :: Connection -> ParentId -> [NodeWrite'] -> IO Int64
mkNode conn pid ns = runInsertMany conn nodeTable' $ map (node2write pid) ns
mkNodeR :: Connection -> ParentId -> [NodeWrite'] -> IO [Int]
mkNodeR conn pid ns = runInsertManyReturning conn nodeTable' (map (node2write pid) ns) (\(i,_,_,_,_,_,_) -> i)
------------------------------------------------------------------------
-- TODO Hierarchy of Nodes
-- post and get the same Node' types, and update if changes occur
{- TODO semantics to achieve
post c uid pid [ Node' Corpus "name" "{}" []
, Node' Folder "name" "{}" [Node' Corpus "test 2" "" [ Node' Document "title" "metaData" []
, Node' Document "title" "jsonData" []
]
]
]
-}
------------------------------------------------------------------------
-- TODO
-- currently this function removes the child relation
-- needs a Temporary type between Node' and NodeWriteT
node2table :: UserId -> ParentId -> Node' -> [NodeWriteT]
node2table uid pid (Node' nt txt v []) = [( Nothing, (pgInt4$ nodeTypeId nt), (pgInt4 uid), (pgInt4 pid)
, pgStrictText txt, Nothing, pgStrictJSONB $ DB.pack $ DBL.unpack $ encode v)]
node2table _ _ (Node' _ _ _ _) = panic "node2table: should not happen, Tree insert not implemented yet"
data Node' = Node' { _n_type :: NodeType
, _n_name :: Text
, _n_data :: Value
, _n_children :: [Node']
} deriving (Show)
type NodeWriteT = ( Maybe (Column PGInt4)
, Column PGInt4, Column PGInt4
, Column PGInt4, Column PGText
, Maybe (Column PGTimestamptz)
, Column PGJsonb
)
mkNode' :: Connection -> [NodeWriteT] -> IO Int64
mkNode' conn ns = runInsertMany conn nodeTable' ns
mkNodeR' :: Connection -> [NodeWriteT] -> IO [Int]
mkNodeR' conn ns = runInsertManyReturning conn nodeTable' ns (\(i,_,_,_,_,_,_) -> i)
postNode :: Connection -> UserId -> ParentId -> Node' -> IO [Int]
postNode c uid pid (Node' nt txt v []) = mkNodeR' c (node2table uid pid (Node' nt txt v []))
postNode c uid pid (Node' Corpus txt v ns) = do
[pid'] <- postNode c uid pid (Node' Corpus txt v [])
pids <- mkNodeR' c $ concat $ (map (\(Node' Document txt v _) -> node2table uid pid' $ Node' Document txt v []) ns)
pure (pids)
postNode c uid pid (Node' _ _ _ _) = panic "postNode for this type not implemented yet"
{-|
Module : Gargantext.Pipeline
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Pipeline
where
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
|___/
-}
pipeline path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
-- Cooc -> Matrix
-- -- filter by spec/gen (dynamic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
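-- A plausible signature for the work-in-progress pipeline above (inferred,
-- not in the commit): pipeline :: FilePath -> IO Bool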
@@ -19,20 +19,23 @@ commentary with @some markup@.
module Gargantext.Prelude
( module Gargantext.Prelude
, module Protolude
, headMay
, headMay, lastMay
, module Text.Show
, module Text.Read
, cs
, module Data.Maybe
, sortWith
)
where
import GHC.Exts (sortWith)
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Double, Integer
import Protolude ( Bool(True, False), Int, Int64, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
, Enum, Bounded, Float
, Floating, Char, IO
, pure, (<*>), (<$>), panic
, pure, (>>=), (=<<), (<*>), (<$>), panic
, putStrLn
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter
@@ -50,6 +53,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, undefined
, IO()
, compare
, on
)
-- TODO import functions optimized in Utils.Count
@@ -63,7 +67,7 @@ import qualified Data.Map as M
import Data.Map.Strict (insertWith)
import qualified Data.Vector as V
import Safe (headMay)
import Safe (headMay, lastMay)
import Text.Show (Show(), show)
import Text.Read (Read())
import Data.String.Conversions (cs)
@@ -109,7 +113,7 @@ ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery _ [] = []
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
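-- e.g. splitEvery 2 [1,2,3,4,5] == [[1,2],[3,4],[5]]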
@@ -235,5 +239,5 @@ unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
maximumWith f = L.maximumBy (compare `on` f)
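-- e.g. maximumWith snd [(1,'a'),(3,'c'),(2,'b')] == (3,'c')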
@@ -37,9 +37,6 @@ type Context = Text -> [Text]
data Viz = Graph | Phylo | Chart
pipeline :: Config -> Text -> Viz
pipeline = undefined
-----------------------------------------------------------------
-------------------------------------------------------------------
@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
@@ -23,16 +24,19 @@ module Gargantext.Text.Metrics
where
import Data.Text (Text, pack)
import Data.Ord (comparing, Down(..))
import Data.Map (Map)
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
import Data.Array.Accelerate (toList)
import Math.KMeans (kmeans, euclidSq, elements)
import Gargantext.Prelude
@@ -46,26 +50,99 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))
import GHC.Real (round)
import Debug.Trace
import Prelude (seq)
-- order by relevance: the top n most-included terms
-- sampling over genericity
--
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
import Data.Array.Accelerate (Matrix)
filterMat :: Matrix Int -> [(Index, Index)]
filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts
, y <- ts
-- , x >= y
]
-- | Map list creation
-- KMeans splits into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score),
-- samples the main cluster, ordered by Specificity/Genericity, into (SampleBins::Double) parts,
-- each part is then ordered by Inclusion/Exclusion,
-- and n scored terms are taken in each part, where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
$ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores
-- $ splitKmeans k scores
where
(incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (conditional' m)
n = nIe + nSg
nIe = 30
nSg = 70
-- TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n)
$ map (sortWith (Down . _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split into s heterogeneous parts
-- without a homogeneous-order hypothesis
$ splitEvery m
$ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
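-- coocScored indexes the cooccurrence map, computes Inclusion/Exclusion and
-- Specificity/Genericity on the Accelerate matrix, and pairs each term with
-- its two scores.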
@@ -73,8 +150,7 @@ incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
(ti,fi) = createIndices m
ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
@@ -106,7 +182,7 @@ metrics_sentences_Test = metrics_sentences == metrics_sentences'
-}
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
-- | Occurrences
{-
@@ -75,14 +75,15 @@ type Grouped = Stems
type Occs = Int
type Coocs = Int
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
removeApax t = DMS.filter (> t)
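-- e.g. removeApax 1 reproduces the previous behaviour (keep pairs seen more
-- than once).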
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
label_policy = mkLabelPolicy terms_occs
coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label
useLabelPolicy m g = case DMS.lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
@@ -25,7 +25,7 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length)
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
@@ -68,9 +68,8 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000
then
if (mod (round m) docSize) >= 10
@@ -101,18 +100,18 @@ type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
---------------------------------------------------------------
data CsvDoc = CsvDoc
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
{ csv_title :: !Text
, csv_source :: !Text
, csv_publication_year :: !Int
, csv_publication_month :: !Int
, csv_publication_day :: !Int
, csv_abstract :: !Text
, csv_authors :: !Text
}
deriving (Show)
@@ -147,12 +146,19 @@ csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readCsv fp
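-- e.g. readCsvOn [csv_title, csv_abstract] fp (as used by TextFlow's CSV
-- branch) joins the selected fields of each row with spaces into one Text.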
------------------------------------------------------------------------
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
@@ -42,23 +42,23 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi | MonoMulti
data TermType lang = Mono lang | Multi lang | MonoMulti lang
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiding mapM from the end user).
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms termTypeLang = mapM (terms termTypeLang)
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
terms MonoMulti lang txt = terms Multi lang txt
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
------------------------------------------------------------------------
{-|
Module : Gargantext.TextFlow
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
From text to viz, all the flow of texts in Gargantext.
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.TextFlow
where
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Core.Types (Label)
import Gargantext.Prelude
import Prelude (print, seq)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
|___/
-}
data TextFlow = CSV | FullText
-- workflow :: Lang (EN|FR) -> FilePath -> Graph
textflow termsLang workType path = do
-- Text <- IO Text <- FilePath
contexts <- case workType of
FullText -> splitBy (Sentences 5) <$> readFile path
CSV -> readCsvOn [csv_title, csv_abstract] path
-- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n
myterms <- extractTerms (Mono FR) contexts
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList|Ontology)
printDebug "myterms" (sum $ map length myterms)
-- Building the map list
-- compute co-presences of terms
-- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
-- Remove apax: terms that appear only once => lightening the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
(InclusionSize 4000 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4
--printDebug "matCooc" matCooc
-- Matrix -> Clustering
let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
--printDebug "distanceMat" distanceMat
--
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
printDebug "partitions" $ length partitions
--printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double
-> [LouvainNode]
-> Graph
data2graph labels coocs distance partitions = Graph nodes edges
where
community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
, n_type = Terms -- or Unknown
, n_id = cs (show n)
, n_label = T.unwords l
, n_attributes =
Attributes { clust_default = maybe 0 identity
(M.lookup n community_id_by_node_id) } }
| (l, n) <- labels ]
edges = [ Edge { e_source = s
, e_target = t
, e_weight = w
, e_id = i }
| (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
@@ -19,11 +19,14 @@ module Gargantext.Viz.Graph
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Map (Map)
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix)
-----------------------------------------------------------
import Data.Graph.Clustering.Louvain.CplusPlus (LouvainNode)
data TypeNode = Terms | Unknown
deriving (Show, Generic)
@@ -55,8 +58,6 @@ data Graph = Graph { g_nodes :: [Node]
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph)
-----------------------------------------------------------
@@ -109,7 +109,6 @@ conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where
ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
@@ -149,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
-----------------------------------------------------------------------
-----------------------------------------------------------------------
@@ -24,6 +24,8 @@ extra-deps:
- fullstop-0.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- kmeans-vector-0.3.2
- probable-0.1.3
- protolude-0.2
- servant-0.13
- servant-auth-0.3.0.1