Some progress on the workflow; data2graph fails because of unknown nodes (see the TODO)

parent b56988d8
@@ -24,6 +24,7 @@ library:
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.Pipeline
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Types
@@ -122,25 +123,38 @@ library:
- zlib
# - utc
executable:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
executables:
gargantext:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
gargantext-workflow:
main: Main.hs
source-dirs: app-workflow
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
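For reference, a minimal sketch of what the new app-workflow/Main.hs entry point could look like. The file itself is not part of this diff, so the contents below are an assumption that it simply hands a corpus path to Gargantext.Pipeline.workflow (language hard-coded to FR here; the real Main.hs may well use optparse-generic, which is listed in the executable's dependencies).

module Main where

import Gargantext.Core (Lang(FR))
import Gargantext.Pipeline (workflow)
import System.Environment (getArgs)

-- Hypothetical entry point: read the corpus path from the command line
-- and run the workflow on it.
main :: IO ()
main = do
  [path]  <- getArgs
  _graph  <- workflow FR path
  pure ()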
@@ -14,27 +14,31 @@ Portability : POSIX
module Gargantext.Pipeline
where
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Core.Types (Label)
import Gargantext.Prelude
import Prelude (print, seq)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Viz.Graph (Graph)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode)
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
{-
@@ -48,30 +52,62 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode)
-}
-----------------------------------------------------------
data2graph :: Map (Int, Int) Int -> Map (Int, Int) Double -> [LouvainNode] -> Graph
data2graph = undefined
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int -> Map (Int, Int) Double -> [LouvainNode] -> Graph
data2graph labels coocs distance partitions = Graph nodes edges
where
community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
nodes = [ Node { n_size = coocs M.! (n, n) -- TODO lookup with default ?
, n_type = Terms -- or Unknown
, n_id = cs (show n)
, n_label = T.unwords l
, n_attributes =
-- TODO lookup with default ?
Attributes { clust_default = community_id_by_node_id M.! n } }
| (l, n) <- labels ]
edges = [ Edge { e_source = s
, e_target = t
, e_weight = w
, e_id = i }
| (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
-- printDebug msg x = putStrLn $ msg <> " " <> show x
printDebug _ _ = pure ()
workflow lang path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi lang contexts
myterms <- extractTerms Mono lang contexts
printDebug "myterms" $ sum $ map length myterms
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
-- @np FIXME optimization issue of filterCooc (too much memory consumed)
let myCooc = filterCooc $ removeApax $ cooc myterms
--pure myCooc
let myCooc1 = cooc myterms
printDebug "myCooc1" $ M.size myCooc1
let myCooc2 = removeApax myCooc1
printDebug "myCooc2" $ M.size myCooc2
let myCooc3 = filterCooc myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, _) = createIndices myCooc
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat 0 (M.size ti) myCooc4
-- Matrix -> Clustering
let distance = score conditional $ toIndex ti myCooc
partitions <- cLouvain distance
--pure partitions
let distanceMat = conditional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
{-
let distance = fromIndex fi distanceMap
printDebug "distance" $ M.size distance
-}
partitions <- cLouvain distanceMap
---- | Building : -> Graph -> JSON
pure partitions
--pure $ data2graph myCooc distance partitions
printDebug "partitions" $ length partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
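The commit message notes that data2graph fails on unknown nodes; the two "TODO lookup with default ?" comments in data2graph above mark the partial (M.!) lookups that crash when a label has no diagonal entry in coocs or no cluster in the Louvain partition. A hedged sketch of how those lookups could be made total with Data.Map.Strict.findWithDefault; the fallback values (size 1, cluster 0) are assumptions, not the author's fix.

import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M

-- Sketch: total replacements for the two (M.!) lookups in data2graph.
nodeSize :: Map (Int, Int) Int -> Int -> Int
nodeSize coocs n = M.findWithDefault 1 (n, n) coocs
  -- assume a minimal size of 1 when the diagonal co-occurrence is missing

nodeCluster :: Map Int Int -> Int -> Int
nodeCluster communityByNode n = M.findWithDefault 0 n communityByNode
  -- assume cluster 0 for nodes the Louvain partition did not return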
@@ -50,6 +50,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, undefined
, IO()
, compare
, on
)
-- TODO import functions optimized in Utils.Count
@@ -235,5 +236,5 @@ unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
maximumWith f = L.maximumBy (compare `on` f)
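The maximumWith rewrite above is behaviour-preserving: compare `on` f compares two elements by first applying f to each. A small self-contained sketch with a toy example (the type signature is added here for clarity; the original definition has none):

import Data.Function (on)
import qualified Data.List as L

-- Pick the element whose projection under f is largest.
maximumWith :: Ord b => (a -> b) -> [a] -> a
maximumWith f = L.maximumBy (compare `on` f)

-- >>> maximumWith snd [("a", 1), ("b", 3), ("c", 2)]
-- ("b", 3)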
@@ -51,16 +51,22 @@ import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))
import GHC.Real (round)
import Debug.Trace
import Prelude (seq)
filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
filterCooc cc = filterCooc' ts cc
where
ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
filterCooc' ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m')
M.empty selection
where
errMessage = panic "Filter cooc: no key"
selection = [(x,y) | x <- ts, y <- ts, x > y]
@@ -87,7 +93,8 @@ takeSome l s k scores = L.take l
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = L.concat $ map (L.take n)
takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n)
$ L.reverse $ map (L.sortOn _scored_incExc)
-- TODO use kmeans s instead of splitEvery
-- in order to split into s heterogeneous parts
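filterCooc' above enumerates every ordered pair of selected terms and falls back to panic when a pair is absent from the co-occurrence map. A hedged sketch of a panic-free variant (hypothetical name filterCoocSafe, not part of this commit) that simply drops absent pairs instead:

import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M

-- Sketch: keep only the selected pairs that actually occur in the map,
-- instead of panicking on a missing key.
filterCoocSafe :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
filterCoocSafe ts m = M.fromList
  [ (k, v) | k <- selection, Just v <- [M.lookup k m] ]
  where
    selection = [ (x, y) | x <- ts, y <- ts, x > y ]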
@@ -80,9 +80,10 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
label_policy = mkLabelPolicy terms_occs
coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label
useLabelPolicy m g = case DMS.lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
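The change above precomputes the label policy once (mkLabelPolicy) and reduces each per-group call to a plain Map lookup (useLabelPolicy), instead of rescanning the occurrence map every time. A toy sketch of the same idea with plain types; mkPolicy and usePolicy are hypothetical stand-ins for the Gargantext-specific versions:

import Data.Function (on)
import qualified Data.List as L
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M

-- For each group, pick the label with the highest occurrence count,
-- precomputed once as a Map from group to label.
mkPolicy :: Ord g => Map g (Map label Int) -> Map g label
mkPolicy = M.map (fst . L.maximumBy (compare `on` snd) . M.toList)

-- Per-group lookup; the real useLabelPolicy calls panic instead of error.
usePolicy :: (Ord g, Show g) => Map g label -> g -> label
usePolicy m g = case M.lookup g m of
  Just label -> label
  Nothing    -> error ("Label of group not found: " <> show g)

-- >>> mkPolicy (M.fromList [("dog", M.fromList [("dog", 3), ("dogs", 1)])])
-- fromList [("dog","dog")]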