Commit de549c2f authored by Alexandre Delanoë

[MERGE] bashql + pipeline.

parents 5c858e52 37c3a450
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-|
Module : Data.ByteString.Extended
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Example showing how to extend existing base libraries.
-}
module Data.ByteString.Extended ( module Data.ByteString
, replace
) where
import Data.ByteString
replace :: ByteString -> ByteString -> ByteString -> ByteString
replace = undefined
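-- A minimal sketch of 'replace' (an assumption, not part of this commit),
-- built on Data.ByteString.breakSubstring and written against a qualified
-- import to avoid clashes with the re-exported module:
--
-- import qualified Data.ByteString as B
--
-- replace :: B.ByteString -> B.ByteString -> B.ByteString -> B.ByteString
-- replace pat sub hay
--   | B.null pat = hay
--   | otherwise  = case B.breakSubstring pat hay of
--       (pre, rest)
--         | B.null rest -> pre
--         | otherwise   -> pre <> sub <> replace pat sub (B.drop (B.length pat) rest)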
-- instance (Binary k, Binary v) => Binary (HaskMap k v) where
-- ...
@@ -42,18 +42,16 @@ main = do
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
@@ -24,6 +24,7 @@ library:
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.TextFlow
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Types
@@ -68,6 +69,7 @@ library:
- hlcm
- ini
- jose-jwt
- kmeans-vector
- lens
- logging-effect
- matrix
@@ -121,25 +123,38 @@ library:
- zlib
# - utc
executable:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
executables:
gargantext:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
gargantext-workflow:
main: Main.hs
source-dirs: app-workflow
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
@@ -265,6 +265,9 @@ data NodePoly id typename userId parentId name date hyperdata = Node { node_id
} deriving (Show, Generic)
$(deriveJSON (unPrefix "node_") ''NodePoly)
instance Arbitrary (NodePoly NodeId NodeTypeId (Maybe NodeUserId) NodeParentId NodeName UTCTime Value) where
arbitrary = elements [Node 1 1 (Just 1) 1 "name" (jour 2018 01 01) (toJSON ("{}"::Text))]
{-|
Module : Gargantext.Database
Description :
Description : Main commands of BASHQL a Domain Specific Language to deal with Gargantext Database.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
* BASHQL = functional (Bash * SQL)
* Which language to choose when working with a database? To keep it
simple: instead of the common Object Relational Mapping (ORM) [1]
strategies used nowadays, which are inspired more by object logic than
functional logic, the semantics of BASHQL put the function first.
* BASHQL focuses on the function, i.e. it uses Bash function names
and implements them with SQL behind the scenes. BASHQL is thus inspired
more by the Bash language [2] than by SQL, and follows Bash's main
commands as its specification and documentation.
* Main arguments:
1. Theoretical: databases and file systems can each be thought of as a
single category, an assumption based on theoretical work on databases by
David Spivak [0].
2. Practical: basic Bash commands are a daily practice in the developer
community.
* How to help?
1. Choose a command you like in Bash
2. Implement it in Haskell-SQL according to the Gargantext Schema (tree-like
filesystem)
3. Translate it into BASHQL (follow previous implementations)
4. Make a pull request (enjoy the community)
* Implementation strategy: functional adaptations are made to the
Gargantext language options, and SQL optimizations are done continuously
during the project. For the Haskellish part, you may take inspiration from
the Turtle implementation written by Gabriel Gonzalez [3], which shows how
to write Bash translations in Haskell.
* Semantics
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References
[0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM.
[1] https://en.wikipedia.org/wiki/Object-relational_mapping
[2] https://en.wikipedia.org/wiki/Bash_(Unix_shell)
[3] https://github.com/Gabriel439/Haskell-Turtle-Library
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Database (
module Gargantext.Database.Utils
-- , module Gargantext.Database.Instances
, module Gargantext.Database.User
, module Gargantext.Database.Node
, module Gargantext.Database.NodeNode
-- , module Gargantext.Database.Ngram
, module Gargantext.Database.NodeNgram
, module Gargantext.Database.NodeNodeNgram
, module Gargantext.Database.NodeNgramNgram
-- , module Gargantext.Database.Gargandb
-- , module Gargantext.Database.Simple
-- , module Gargantext.Database.InsertNode
-- , module Gargantext.Database.NodeType
) where
import Gargantext.Database.Utils
--import Gargantext.Database.Gargandb
import Gargantext.Database.User
module Gargantext.Database ( module Gargantext.Database.Utils
, get
, ls , ls'
, home, home'
, post, post', postR'
, del , del'
, tree, tree'
)
where
import Gargantext.Core.Types
import Gargantext.Core.Types.Node
import Gargantext.Database.Utils (connectGargandb)
import Gargantext.Database.Node
import Gargantext.Database.NodeNode
--import Gargantext.Database.Ngram
import Gargantext.Database.NodeNgram
import Gargantext.Database.NodeNodeNgram
import Gargantext.Database.NodeNgramNgram
--import Gargantext.Database.Simple
--import Gargantext.Database.NodeType
--import Gargantext.Database.InsertNode
import Gargantext.Prelude
import Database.PostgreSQL.Simple (Connection)
import Data.Text (Text)
import Opaleye hiding (FromField)
import Data.Aeson
import Data.ByteString (ByteString)
import Data.List (last, concat)
type UserId = Int
--type NodeId = Int
-- List of NodeId
-- type PWD a = PWD UserId [a]
type PWD = [NodeId]
--data PWD' a = a | PWD' [a]
-- | TODO get Children or Node
get :: Connection -> PWD -> IO [Node Value]
get _ [] = pure []
get conn pwd = runQuery conn $ selectNodesWithParentID (last pwd)
-- | Home, need to filter with UserId
home :: Connection -> IO PWD
home c = map node_id <$> getNodesWithParentId c 0 Nothing
-- | ls == get Children
ls :: Connection -> PWD -> IO [Node Value]
ls = get
tree :: Connection -> PWD -> IO [Node Value]
tree c p = do
ns <- get c p
cs <- mapM (\p' -> get c [p']) $ map node_id ns
pure $ ns <> (concat cs)
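-- NOTE: as written, tree reaches only two levels (the nodes at the path and
-- their direct children); a fully recursive variant would call tree, not get,
-- on each child.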
-- | TODO
post :: Connection -> PWD -> [NodeWrite'] -> IO Int64
post _ [] _ = pure 0
post _ _ [] = pure 0
post c pth ns = mkNode c (last pth) ns
postR :: Connection -> PWD -> [NodeWrite'] -> IO [Int]
postR _ [] _ = pure [0]
postR _ _ [] = pure [0]
postR c pth ns = mkNodeR c (last pth) ns
rm :: Connection -> PWD -> [NodeId] -> IO Int
rm = del
del :: Connection -> PWD -> [NodeId] -> IO Int
del _ [] _ = pure 0
del _ _ [] = pure 0
del c pth ns = deleteNodes c ns
put :: Connection -> PWD -> [a] -> IO Int64
put = undefined
-- | TODO
-- cd (Home UserId) | (Node NodeId)
-- cd Path
-- jump NodeId
-- touch Dir
--------------------------------------------------------------
-- Tests
--------------------------------------------------------------
home' :: IO PWD
home' = do
c <- connectGargandb "gargantext.ini"
home c
ls' :: IO [Node Value]
ls' = do
c <- connectGargandb "gargantext.ini"
h <- home c
ls c h
tree' :: IO [Node Value]
tree' = do
c <- connectGargandb "gargantext.ini"
h <- home c
tree c h
post' :: IO [Int]
post' = do
c <- connectGargandb "gargantext.ini"
pid <- last <$> home c
let uid = 1
postNode c uid pid (Node' Corpus "Premier corpus" "{}" [ Node' Document "Doc1" "{}" []
, Node' Document "Doc2" "{}" []
, Node' Document "Doc3" "{}" []
]
)
del' :: [NodeId] -> IO Int
del' ns = do
c <- connectGargandb "gargantext.ini"
h <- home c
del c h ns
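-- A hedged usage sketch (assuming a database reachable via "gargantext.ini"):
-- the BASHQL commands compose much like their Bash namesakes.
-- >>> c <- connectGargandb "gargantext.ini"
-- >>> h <- home c   -- the user's root path (PWD)
-- >>> ls c h        -- children of that path
-- >>> tree c h      -- children and grandchildren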
@@ -21,6 +21,10 @@ Portability : POSIX
module Gargantext.Database.Node where
import GHC.Int (Int64)
import Data.Maybe
import Data.Time (UTCTime)
import Database.PostgreSQL.Simple.FromField ( Conversion
, ResultError(ConversionFailed)
, FromField
@@ -28,6 +32,7 @@ import Database.PostgreSQL.Simple.FromField ( Conversion
, returnError
)
import Prelude hiding (null, id, map, sum)
import Data.Time.Segment (jour, timesAfter, Granularity(D))
import Gargantext.Core.Types
import Gargantext.Core.Types.Node (NodeType)
@@ -43,10 +48,15 @@ import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Data.Typeable (Typeable)
import qualified Data.ByteString.Internal as DBI
import qualified Data.ByteString as DB
import qualified Data.ByteString.Lazy as DBL
import Data.ByteString (ByteString)
import Database.PostgreSQL.Simple (Connection)
import Opaleye hiding (FromField)
import Opaleye.Internal.QueryArr (Query(..))
import qualified Data.Profunctor.Product as PP
-- | Types for Node Database Management
data PGTSVector
@@ -78,7 +88,7 @@ instance QueryRunnerColumnDefault PGJsonb HyperdataUser where
fromField' :: (Typeable b, FromJSON b) => Field -> Maybe DBI.ByteString -> Conversion b
fromField' :: (Typeable b, FromJSON b) => Field -> Maybe DB.ByteString -> Conversion b
fromField' field mb = do
v <- fromField field mb
valueToHyperdata v
@@ -89,7 +99,7 @@ fromField' field mb = do
$(makeAdaptorAndInstance "pNode" ''NodePoly)
$(makeLensesWith abbreviatedFields ''NodePoly)
nodeTable :: Table NodeWrite NodeRead
@@ -105,12 +115,40 @@ nodeTable = Table "nodes" (pNode Node { node_id = optional "id"
)
nodeTable' :: Table (Maybe (Column PGInt4)
, Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGText
, Maybe (Column PGTimestamptz)
, Column PGJsonb
)
( Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGInt4
, Column PGText
, Column PGTimestamptz
, Column PGJsonb
)
nodeTable' = Table "nodes" (PP.p7 ( optional "id"
, required "typename"
, required "user_id"
, required "parent_id"
, required "name"
, optional "date"
, required "hyperdata"
)
)
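-- nodeTable' mirrors nodeTable but is declared over a raw 7-tuple (via
-- profunctor-product's p7), so rows can be inserted without the NodePoly
-- wrapper; see node2write and mkNode below.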
queryNodeTable :: Query NodeRead
queryNodeTable = queryTable nodeTable
selectNodes :: Column PGInt4 -> Query NodeRead
selectNodes id = proc () -> do
selectNode :: Column PGInt4 -> Query NodeRead
selectNode id = proc () -> do
row <- queryNodeTable -< ()
restrict -< node_id row .== id
returnA -< row
@@ -142,13 +180,11 @@ selectNodesWith' parentId maybeNodeType = proc () -> do
deleteNode :: Connection -> Int -> IO Int
deleteNode conn n = fromIntegral
<$> runDelete conn nodeTable
deleteNode conn n = fromIntegral <$> runDelete conn nodeTable
(\(Node n_id _ _ _ _ _ _) -> n_id .== pgInt4 n)
deleteNodes :: Connection -> [Int] -> IO Int
deleteNodes conn ns = fromIntegral
<$> runDelete conn nodeTable
deleteNodes conn ns = fromIntegral <$> runDelete conn nodeTable
(\(Node n_id _ _ _ _ _ _) -> in_ ((map pgInt4 ns)) n_id)
@@ -164,6 +200,11 @@ getNodesWithParentId :: Connection -> Int
-> Maybe Text -> IO [Node HyperdataDocument]
getNodesWithParentId conn n _ = runQuery conn $ selectNodesWithParentID n
getNodesWithParentId' :: Connection -> Int
-> Maybe Text -> IO [Node Value]
getNodesWithParentId' conn n _ = runQuery conn $ selectNodesWithParentID n
selectNodesWithParentID :: Int -> Query NodeRead
selectNodesWithParentID n = proc () -> do
row@(Node _ _ _ parent_id _ _ _) <- queryNodeTable -< ()
@@ -181,12 +222,103 @@ selectNodesWithType type_id = proc () -> do
restrict -< tn .== type_id
returnA -< row
getNode' :: Connection -> Int -> IO (Node Value)
getNode' c id = do
fromMaybe (error "TODO: 404") . headMay <$> runQuery c (limit 1 $ selectNode (pgInt4 id))
getNode :: Connection -> Int -> IO (Node HyperdataDocument)
getNode conn id = do
fromMaybe (error "TODO: 404") . headMay <$> runQuery conn (limit 1 $ selectNodes (pgInt4 id))
fromMaybe (error "TODO: 404") . headMay <$> runQuery conn (limit 1 $ selectNode (pgInt4 id))
getNodesWithType :: Connection -> Column PGInt4 -> IO [Node HyperdataDocument]
getNodesWithType conn type_id = do
runQuery conn $ selectNodesWithType type_id
type UserId = NodeId
type TypeId = Int
------------------------------------------------------------------------
-- Quick and dirty
------------------------------------------------------------------------
type NodeWrite' = NodePoly (Maybe Int) Int Int (ParentId) Text (Maybe UTCTime) ByteString
--node :: UserId -> ParentId -> NodeType -> Text -> Value -> NodeWrite'
node :: UserId -> ParentId -> NodeType -> Text -> Value -> NodeWrite'
node userId parentId nodeType name nodeData = Node Nothing typeId userId parentId name Nothing byteData
where
typeId = nodeTypeId nodeType
byteData = DB.pack $ DBL.unpack $ encode nodeData
node2write pid (Node id tn ud _ nm dt hp) = ((pgInt4 <$> id)
,(pgInt4 tn)
,(pgInt4 ud)
,(pgInt4 pid)
,(pgStrictText nm)
,(pgUTCTime <$> dt)
,(pgStrictJSONB hp)
)
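-- A plausible signature, inferred from NodeWriteT below (not in the commit):
-- node2write :: ParentId -> NodeWrite' -> NodeWriteT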
mkNode :: Connection -> ParentId -> [NodeWrite'] -> IO Int64
mkNode conn pid ns = runInsertMany conn nodeTable' $ map (node2write pid) ns
mkNodeR :: Connection -> ParentId -> [NodeWrite'] -> IO [Int]
mkNodeR conn pid ns = runInsertManyReturning conn nodeTable' (map (node2write pid) ns) (\(i,_,_,_,_,_,_) -> i)
------------------------------------------------------------------------
-- TODO Hierarchy of Nodes
-- post and get the same Node' types, and update if changes occur
{- TODO semantics to achieve
post c uid pid [ Node' Corpus "name" "{}" []
, Node' Folder "name" "{}" [Node' Corpus "test 2" "" [ Node' Document "title" "metaData" []
, Node' Document "title" "jsonData" []
]
]
]
-}
------------------------------------------------------------------------
-- TODO
-- currently this function removes the child relation
-- needs a Temporary type between Node' and NodeWriteT
node2table :: UserId -> ParentId -> Node' -> [NodeWriteT]
node2table uid pid (Node' nt txt v []) = [( Nothing, (pgInt4$ nodeTypeId nt), (pgInt4 uid), (pgInt4 pid)
, pgStrictText txt, Nothing, pgStrictJSONB $ DB.pack $ DBL.unpack $ encode v)]
node2table _ _ (Node' _ _ _ _) = panic "node2table: should not happen, Tree insert not implemented yet"
data Node' = Node' { _n_type :: NodeType
, _n_name :: Text
, _n_data :: Value
, _n_children :: [Node']
} deriving (Show)
type NodeWriteT = ( Maybe (Column PGInt4)
, Column PGInt4, Column PGInt4
, Column PGInt4, Column PGText
, Maybe (Column PGTimestamptz)
, Column PGJsonb
)
mkNode' :: Connection -> [NodeWriteT] -> IO Int64
mkNode' conn ns = runInsertMany conn nodeTable' ns
mkNodeR' :: Connection -> [NodeWriteT] -> IO [Int]
mkNodeR' conn ns = runInsertManyReturning conn nodeTable' ns (\(i,_,_,_,_,_,_) -> i)
postNode :: Connection -> UserId -> ParentId -> Node' -> IO [Int]
postNode c uid pid (Node' nt txt v []) = mkNodeR' c (node2table uid pid (Node' nt txt v []))
postNode c uid pid (Node' Corpus txt v ns) = do
[pid'] <- postNode c uid pid (Node' Corpus txt v [])
pids <- mkNodeR' c $ concat $ (map (\(Node' Document txt v _) -> node2table uid pid' $ Node' Document txt v []) ns)
pure (pids)
postNode c uid pid (Node' _ _ _ _) = panic "postNode for this type not implemented yet"
{-|
Module : Gargantext.Pipeline
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Pipeline
where
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
|___/
-}
pipeline path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
-- Cooc -> Matrix
-- -- filter by spec/gen (dynamic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
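-- A plausible signature for the work-in-progress pipeline above (inferred,
-- not in the commit): pipeline :: FilePath -> IO Bool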
@@ -19,20 +19,23 @@ commentary with @some markup@.
module Gargantext.Prelude
( module Gargantext.Prelude
, module Protolude
, headMay
, headMay, lastMay
, module Text.Show
, module Text.Read
, cs
, module Data.Maybe
, sortWith
)
where
import GHC.Exts (sortWith)
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Double, Integer
import Protolude ( Bool(True, False), Int, Int64, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
, Enum, Bounded, Float
, Floating, Char, IO
, pure, (<*>), (<$>), panic
, pure, (>>=), (=<<), (<*>), (<$>), panic
, putStrLn
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter
@@ -50,6 +53,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, undefined
, IO()
, compare
, on
)
-- TODO import functions optimized in Utils.Count
@@ -63,7 +67,7 @@ import qualified Data.Map as M
import Data.Map.Strict (insertWith)
import qualified Data.Vector as V
import Safe (headMay)
import Safe (headMay, lastMay)
import Text.Show (Show(), show)
import Text.Read (Read())
import Data.String.Conversions (cs)
@@ -109,7 +113,7 @@ ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery _ [] = []
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
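-- e.g. splitEvery 2 [1,2,3,4,5] == [[1,2],[3,4],[5]]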
@@ -235,5 +239,5 @@ unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
maximumWith f = L.maximumBy (compare `on` f)
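-- e.g. maximumWith snd [(1,'a'),(3,'c'),(2,'b')] == (3,'c')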
@@ -37,9 +37,6 @@ type Context = Text -> [Text]
data Viz = Graph | Phylo | Chart
pipeline :: Config -> Text -> Viz
pipeline = undefined
-----------------------------------------------------------------
-------------------------------------------------------------------
@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
@@ -23,16 +24,19 @@ module Gargantext.Text.Metrics
where
import Data.Text (Text, pack)
import Data.Ord (comparing, Down(..))
import Data.Map (Map)
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import Data.Tuple.Extra (both)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
import Data.Array.Accelerate (toList)
import Math.KMeans (kmeans, euclidSq, elements)
import Gargantext.Prelude
@@ -46,26 +50,99 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Viz.Graph.Distances.Matrice
import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))
import GHC.Real (round)
import Debug.Trace
import Prelude (seq)
-- order by relevance: the top n most-included terms
-- sampling over genericity
--
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
import Data.Array.Accelerate (Matrix)
filterMat :: Matrix Int -> [(Index, Index)]
filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts
, y <- ts
-- , x >= y
]
-- | Map list creation
-- KMeans splits into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score),
-- samples the main cluster, ordered by Specificity/Genericity, into (SampleBins::Double) parts,
-- each part is then ordered by Inclusion/Exclusion,
-- and n scored terms are taken in each part, where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
$ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores
-- $ splitKmeans k scores
where
(incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (conditional' m)
n = nIe + nSg
nIe = 30
nSg = 70
-- TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n)
$ map (sortWith (Down . _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split into s heterogeneous parts
-- without a homogeneous-order hypothesis
$ splitEvery m
$ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
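-- coocScored indexes the cooccurrence map, computes Inclusion/Exclusion and
-- Specificity/Genericity on the Accelerate matrix, and pairs each term with
-- its two scores.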
@@ -73,8 +150,7 @@ incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
(ti,fi) = createIndices m
ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
@@ -106,7 +182,7 @@ metrics_sentences_Test = metrics_sentences == metrics_sentences'
-}
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
-- | Occurrences
{-
@@ -75,14 +75,15 @@ type Grouped = Stems
type Occs = Int
type Coocs = Int
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
removeApax t = DMS.filter (> t)
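-- e.g. removeApax 1 reproduces the previous behaviour (keep pairs seen more
-- than once).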
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
label_policy = mkLabelPolicy terms_occs
coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label
useLabelPolicy m g = case DMS.lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
@@ -25,7 +25,7 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length)
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
@@ -68,9 +68,8 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
if docSize > 1000
then
if (mod (round m) docSize) >= 10
@@ -101,18 +100,18 @@ type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
---------------------------------------------------------------
data CsvDoc = CsvDoc
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
{ csv_title :: !Text
, csv_source :: !Text
, csv_publication_year :: !Int
, csv_publication_month :: !Int
, csv_publication_day :: !Int
, csv_abstract :: !Text
, csv_authors :: !Text
}
deriving (Show)
@@ -147,12 +146,19 @@ csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readCsv fp
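-- e.g. readCsvOn [csv_title, csv_abstract] fp (as used by TextFlow's CSV
-- branch) joins the selected fields of each row with spaces into one Text.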
------------------------------------------------------------------------
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
@@ -42,23 +42,23 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi | MonoMulti
data TermType lang = Mono lang | Multi lang | MonoMulti lang
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiding mapM from the end user).
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms termTypeLang = mapM (terms termTypeLang)
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
terms MonoMulti lang txt = terms Multi lang txt
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
------------------------------------------------------------------------
{-|
Module : Gargantext.TextFlow
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
From text to viz, all the flow of texts in Gargantext.
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.TextFlow
where
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Core.Types (Label)
import Gargantext.Prelude
import Prelude (print, seq)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
|___/
-}
data TextFlow = CSV | FullText
-- workflow :: Lang (EN|FR) -> FilePath -> Graph
textflow termsLang workType path = do
-- Text <- IO Text <- FilePath
contexts <- case workType of
FullText -> splitBy (Sentences 5) <$> readFile path
CSV -> readCsvOn [csv_title, csv_abstract] path
-- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n
myterms <- extractTerms (Mono FR) contexts
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList|Ontology)
printDebug "myterms" (sum $ map length myterms)
-- Building the map list
-- compute co-presences of terms
-- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
-- Remove apax: terms that appear only once => lightening the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
(InclusionSize 4000 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4
--printDebug "matCooc" matCooc
-- Matrix -> Clustering
let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
--printDebug "distanceMat" distanceMat
--
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
printDebug "partitions" $ length partitions
--printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double
-> [LouvainNode]
-> Graph
data2graph labels coocs distance partitions = Graph nodes edges
where
community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
, n_type = Terms -- or Unknown
, n_id = cs (show n)
, n_label = T.unwords l
, n_attributes =
Attributes { clust_default = maybe 0 identity
(M.lookup n community_id_by_node_id) } }
| (l, n) <- labels ]
edges = [ Edge { e_source = s
, e_target = t
, e_weight = w
, e_id = i }
| (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
@@ -19,11 +19,14 @@ module Gargantext.Viz.Graph
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Map (Map)
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix)
-----------------------------------------------------------
import Data.Graph.Clustering.Louvain.CplusPlus (LouvainNode)
data TypeNode = Terms | Unknown
deriving (Show, Generic)
@@ -55,8 +58,6 @@ data Graph = Graph { g_nodes :: [Node]
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph)
-----------------------------------------------------------
@@ -109,7 +109,6 @@ conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where
ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
@@ -149,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
-----------------------------------------------------------------------
-----------------------------------------------------------------------
@@ -24,6 +24,8 @@ extra-deps:
- fullstop-0.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- kmeans-vector-0.3.2
- probable-0.1.3
- protolude-0.2
- servant-0.13
- servant-auth-0.3.0.1