[WORKFLOW] clean, issue in map2mat: diagonal == 0.

15511563 · Alexandre Delanoë · e079af35 · 15511563 · 15511563 · 15511563
Commit 15511563 authored Jun 12, 2018 by Alexandre Delanoë
4 changed files
--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -30,10 +30,10 @@ import Gargantext.Prelude
 import Prelude (print, seq)

 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
-import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
+import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
 import Gargantext.Viz.Graph.Index (Index)
 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
-import Gargantext.Text.Metrics.Count (cooc, removeApax)
+import Gargantext.Text.Metrics.Count (cooc)
 import Gargantext.Text.Metrics
 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
@@ -51,19 +51,80 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))

 -}

+workflow lang path = do
+  -- Text  <- IO Text <- FilePath
+  text     <- readFile path
+
+  -- context :: Text -> [Text]
+  let contexts = splitBy (Sentences 5) text
+
+  myterms <- extractTerms Mono lang contexts
+  -- myterms <- extractTerms (Mono lang) contexts # filter (\t -> not . elem t stopList)
+  --                                              # groupBy (Stem|GroupList)
+  printDebug "myterms" (sum $ map length myterms)
+
+  -- Bulding the map list
+  let myCooc1 = cooc myterms
+  printDebug "myCooc1" (M.size myCooc1)
+
+  -- Remove Apax: appears one time only => lighting the matrix
+  let myCooc2 = M.filter (>1) myCooc1
+  printDebug "myCooc2" (M.size myCooc2)
+  
+  -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize     20 )
+                                          (InclusionSize 1000 )
+                                          (SampleBins      10 )
+                                          (Clusters         3 )
+                                          (DefaultValue   (-1))
+                           ) myCooc2
+  printDebug "myCooc3" $ M.size myCooc3
+
+  -- Cooc -> Matrix
+  let (ti, fi) = createIndices myCooc3
+  printDebug "ti" $ M.size ti
+
+  let myCooc4 = toIndex ti myCooc3
+  printDebug "myCooc4" $ M.size myCooc4
+
+  let matCooc = map2mat (-2) (M.size ti) myCooc4
+  printDebug "matCooc" matCooc
+  pure matCooc
+  -- Matrix -> Clustering
+  --let distanceMat = conditional matCooc
+--  let distanceMat = distributional matCooc
+--  printDebug "distanceMat" $ A.arrayShape distanceMat
+--  printDebug "distanceMat" distanceMat
+-- 
+--  let distanceMap = mat2map distanceMat
+--  printDebug "distanceMap" $ M.size distanceMap
+--{-
+--  let distance = fromIndex fi distanceMap
+--  printDebug "distance" $ M.size distance
+---}
+--  partitions <- cLouvain distanceMap
+------ | Building : -> Graph -> JSON
+--  printDebug "partitions" $ length partitions
+--  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
+
+
+
 -----------------------------------------------------------
 -- distance should not be a map since we just "toList" it (same as cLouvain)
-data2graph :: [(Label, Int)] -> Map (Int, Int) Int -> Map (Int, Int) Double -> [LouvainNode] -> Graph
+data2graph :: [(Label, Int)] -> Map (Int, Int) Int
+                             -> Map (Int, Int) Double
+                             -> [LouvainNode]
+              -> Graph
 data2graph labels coocs distance partitions = Graph nodes edges
  where
    community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
-    nodes = [ Node { n_size = coocs M.! (n, n) -- TODO lookup with default ?
+    nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
                   , n_type = Terms -- or Unknown
                   , n_id = cs (show n)
                   , n_label = T.unwords l
-                   , n_attributes =
-                       -- TODO lookup with default ?
-                       Attributes { clust_default = community_id_by_node_id M.! n } }
+                   , n_attributes = 
+                     Attributes { clust_default = maybe 0 identity 
+                                (M.lookup n community_id_by_node_id) } }
            | (l, n) <- labels ]
    edges = [ Edge { e_source = s
                   , e_target = t
@@ -72,42 +133,9 @@ data2graph labels coocs distance partitions = Graph nodes edges
            | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
 -----------------------------------------------------------

-- printDebug msg x = putStrLn $ msg <> " " <> show x
-printDebug _ _ = pure ()
+printDebug msg x = putStrLn $ msg <> " " <> show x
+--printDebug _ _ = pure ()
+
+

-workflow lang path = do
-  -- Text  <- IO Text <- FilePath
-  text     <- readFile path
-  let contexts = splitBy (Sentences 5) text
-  myterms <- extractTerms Mono lang contexts
-  printDebug "myterms" $ sum $ map length myterms
-  
-  -- TODO    filter (\t -> not . elem t stopList) myterms
-  -- TODO    groupBy (Stem | GroupList)
-  
-  let myCooc1 = cooc myterms
-  printDebug "myCooc1" $ M.size myCooc1
-  let myCooc2 = removeApax myCooc1
-  printDebug "myCooc2" $ M.size myCooc2
-  let myCooc3 = filterCooc myCooc2
-  printDebug "myCooc3" $ M.size myCooc3
-  -- Cooc -> Matrix
-  let (ti, fi) = createIndices myCooc3
-  printDebug "ti" $ M.size ti
-  let myCooc4 = toIndex ti myCooc3
-  printDebug "myCooc4" $ M.size myCooc4
-  let matCooc = map2mat 0 (M.size ti) myCooc4
-  -- Matrix -> Clustering
-  let distanceMat = conditional matCooc
-  printDebug "distanceMat" $ A.arrayShape distanceMat
-  let distanceMap = mat2map distanceMat
-  printDebug "distanceMap" $ M.size distanceMap
-{-
-  let distance = fromIndex fi distanceMap
-  printDebug "distance" $ M.size distance
-}
-  partitions <- cLouvain distanceMap
---- | Building : -> Graph -> JSON
-  printDebug "partitions" $ length partitions
-  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -58,48 +58,58 @@ import GHC.Real (round)
 import Debug.Trace
 import Prelude (seq)

-filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
-filterCooc cc = filterCooc' ts cc
+data MapListSize   = MapListSize   Int
+data InclusionSize = InclusionSize Int
+data SampleBins    = SampleBins    Double
+data Clusters      = Clusters      Int
+data DefaultValue  = DefaultValue  Int
+
+data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
+                                 , fc_inclusionSize :: InclusionSize
+                                 , fc_sampleBins    :: SampleBins
+                                 , fc_clusters      :: Clusters
+                                 , fc_defaultValue  :: DefaultValue
+                             }
+
+filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
+filterCooc fc cc = (filterCooc' fc) ts cc
  where
-    ts     = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
+    ts     = map _scored_terms $ takeSome fc $ coocScored cc

-filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' ts m = -- trace ("coocScored " <> show (length ts)) $
-  foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m')
+
+filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
+filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
+  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
    M.empty selection
  where
-    errMessage = panic "Filter cooc: no key"
    selection  = [(x,y) | x <- ts, y <- ts, x > y]


-type MapListSize = Int
-type SampleBins  = Double
-type Clusters    = Int
-
 -- | Map list creation
 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
 -- each parts is then ordered by Inclusion/Exclusion
 -- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
-takeSome l s k scores = L.take l
+takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
+takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
                    $ takeSample n m
-                    $ splitKmeans k scores
+                    $ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
+                    -- $ splitKmeans k scores
  where
    -- TODO: benchmark with accelerate-example kmeans version
-    splitKmeans x xs = elements
-                     $ V.head
+    splitKmeans x xs = L.concat $ map elements
+                     $ V.take (k-1)
                     $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
                              euclidSq x xs
    n = round ((fromIntegral l)/s)
    m = round $ (fromIntegral $ length scores) / (s)
    takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
                        L.concat $ map (L.take n)
-                                 $ L.reverse $ map (L.sortOn _scored_incExc)
+                                 $ map (reverse . (L.sortOn _scored_incExc))
                                 -- TODO use kmeans s instead of splitEvery
                                 -- in order to split in s heteregenous parts
                                 -- without homogeneous order hypothesis
-                                 $ splitEvery m 
+                                 $ splitEvery m
                                 $ L.reverse $ L.sortOn _scored_speGen xs



--- a/src/Gargantext/Text/Metrics/Count.hs
+++ b/src/Gargantext/Text/Metrics/Count.hs
@@ -75,9 +75,9 @@ type Grouped = Stems

 type Occs = Int
 type Coocs = Int
-
-removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
-removeApax = DMS.filter (> 1)
+type Threshold = Int
+removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
+removeApax t = DMS.filter (> t)

 cooc :: [[Terms]] -> Map (Label, Label) Int
 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss

--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -148,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
    crossProduct m = zipWith (*) (cross m  ) (cross (transpose m))
    cross mat      = zipWith (-) (mkSum n mat) (mat)

-
-
 -----------------------------------------------------------------------
 -----------------------------------------------------------------------