Merge branch 'pipeline'

8a838c7f · Alexandre Delanoë · a630946f · 45c3bb43 · a630946f · 8a838c7f
Commit 8a838c7f authored Jun 12, 2018 by Alexandre Delanoë
10 changed files
--- a/Data.ByteString/Extended.hs
+++ b/Data.ByteString/Extended.hs
-{-# OPTIONS_GHC -fno-warn-orphans #-}
-
-{-|
-Module      : Data.ByteString.Extended
-Description : Short description
-Copyright   : (c) CNRS, 2017
-License     : AGPL + CECILL v3
-Maintainer  : team@gargantext.org
-Stability   : experimental
-Portability : POSIX
-
-Example showing how to extend existing base libraries.
-}
-
-
-module Data.ByteString.Extended ( module Data.ByteString
-                                , replace
-                                ) where
-import Data.ByteString
-
-replace :: ByteString -> ByteString -> ByteString -> ByteString
-replace = undefined
-
-- instance (Binary k, Binary v) => Binary (HaskMap k v) where
-- ...
--- a/app/CleanCsvCorpus.hs
+++ b/app/CleanCsvCorpus.hs
@@ -42,18 +42,16 @@ main = do
  let q = ["gratuit", "gratuité", "culture", "culturel"]

  (h,csvDocs) <- readCsv rPath
-  
+
  putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
  putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
-  
+
  let docs   = toDocs csvDocs
  let engine = insertDocs docs initialDocSearchEngine
  let docIds = S.query engine (map pack q)
  let docs'  = fromDocs $ filterDocs docIds (V.fromList docs)
-  
+
  putStrLn $ "Number of documents after:" <> show (V.length docs')
  putStrLn $ "Mean size of docs:" <> show (docsSize docs')
-  
-  writeCsv wPath (h, docs')
-

+  writeCsv wPath (h, docs')
--- a/package.yaml
+++ b/package.yaml
@@ -24,6 +24,7 @@ library:
    #  - -Werror
  exposed-modules:
  - Gargantext
+  - Gargantext.Pipeline
  - Gargantext.Prelude
  - Gargantext.Core
  - Gargantext.Core.Types
@@ -122,25 +123,38 @@ library:
  - zlib
 #  - utc

-executable:
-  main: Main.hs
-  source-dirs: app
-  ghc-options: 
-  - -threaded 
-  - -rtsopts 
-  - -with-rtsopts=-N 
-  - -O2 
-  - -Wmissing-signatures
-  dependencies:
+executables:
+  gargantext:
+    main: Main.hs
+    source-dirs: app
+    ghc-options:
+    - -threaded
+    - -rtsopts
+    - -with-rtsopts=-N
+    - -O2
+    - -Wmissing-signatures
+    dependencies:
+      - base
+      - containers
+      - gargantext
+      - vector
+      - cassava
+      - ini
+      - optparse-generic
+      - unordered-containers
+      - full-text-search
+  gargantext-workflow:
+    main: Main.hs
+    source-dirs: app-workflow
+    ghc-options:
+    - -threaded
+    - -rtsopts
+    - -with-rtsopts=-N
+    - -O2
+    - -Wmissing-signatures
+    dependencies:
    - base
-    - containers
    - gargantext
-    - vector
-    - cassava
-    - ini
-    - optparse-generic
-    - unordered-containers
-    - full-text-search

 tests:
  garg-test:

--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -14,26 +14,31 @@ Portability : POSIX
 module Gargantext.Pipeline
  where

+import qualified Data.Text as T
 import Data.Text.IO (readFile)

 import Control.Arrow ((***))
 import Data.Map.Strict (Map)
+import qualified Data.Array.Accelerate as A
 import qualified Data.Map.Strict as M
 import qualified Data.List       as L
 import Data.Tuple.Extra (both)
 ----------------------------------------------
 import Gargantext.Core (Lang(FR))
+import Gargantext.Core.Types (Label)
 import Gargantext.Prelude
+import Prelude (print, seq)

-import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
-import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
+import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
+import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
 import Gargantext.Viz.Graph.Index (Index)
-import Gargantext.Text.Metrics.Count (cooc, removeApax)
+import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
+import Gargantext.Text.Metrics.Count (cooc)
 import Gargantext.Text.Metrics
 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

-import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
+import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))


 {-
@@ -46,27 +51,94 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)

 -}

-pipeline path = do
+workflow lang path = do
  -- Text  <- IO Text <- FilePath
  text     <- readFile path
+
  let contexts = splitBy (Sentences 5) text
-  myterms <- extractTerms Multi FR contexts
-  
-  -- TODO    filter (\t -> not . elem t stopList) myterms
-  -- TODO    groupBy (Stem | GroupList)
+  -- Context :: Text -> [Text]
+  -- Contexts = Paragraphs n | Sentences n | Chars n
+
+  myterms <- extractTerms (Mono lang) contexts
+  -- myterms # filter (\t -> not . elem t stopList)
+  --         # groupBy (Stem|GroupList)
+  printDebug "myterms" (sum $ map length myterms)
+
+  -- Building the map list
+  -- compute copresences of terms
+  -- Cooc = Map (Term, Term) Int
+  let myCooc1 = cooc myterms
+  printDebug "myCooc1" (M.size myCooc1)
+
+  -- Remove Apax (terms appearing only once) => lightens the matrix
+  let myCooc2 = M.filter (>1) myCooc1
+  printDebug "myCooc2" (M.size myCooc2)
  
-  let myCooc = removeApax $ cooc myterms
-  --let (ti, fi) = createIndices myCooc
-  pure True
-  --pure $ incExcSpeGen myCooc
+  -- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize     20 )
+                                          (InclusionSize 1000 )
+                                          (SampleBins      10 )
+                                          (Clusters         3 )
+                                          (DefaultValue   (-1))
+                           ) myCooc2
+  printDebug "myCooc3" $ M.size myCooc3
+
  -- Cooc -> Matrix
-  
--  -- filter by spec/gen (dynmaic programming)
--  let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------  -- Matrix -> Clustering
------  pure $ bestpartition False $ map2graph $ toIndex ti theScores
--  partitions <- cLouvain theScores
--  pure partitions
---- | Building : -> Graph -> JSON
+  let (ti, fi) = createIndices myCooc3
+  printDebug "ti" $ M.size ti
+
+  let myCooc4 = toIndex ti myCooc3
+  printDebug "myCooc4" $ M.size myCooc4
+
+  let matCooc = map2mat (-2) (M.size ti) myCooc4
+  printDebug "matCooc" matCooc
+  pure matCooc
+  -- Matrix -> Clustering
+  --let distanceMat = conditional matCooc
+--  let distanceMat = distributional matCooc
+--  printDebug "distanceMat" $ A.arrayShape distanceMat
+--  printDebug "distanceMat" distanceMat
+-- 
+--  let distanceMap = mat2map distanceMat
+--  printDebug "distanceMap" $ M.size distanceMap
+--{-
+--  let distance = fromIndex fi distanceMap
+--  printDebug "distance" $ M.size distance
+---}
+--  partitions <- cLouvain distanceMap
+------ | Building : -> Graph -> JSON
+--  printDebug "partitions" $ length partitions
+--  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
+
+
+
+-----------------------------------------------------------
+-- | Assemble the 'Graph' from indexed terms, cooccurrences, distances and
+-- Louvain partitions.
+--
+-- One 'Node' is emitted per @(label, index)@ pair: its size is the diagonal
+-- entry @(index, index)@ of @coocs@ and its @clust_default@ is the community
+-- paired with that index in @partitions@; both fall back to 0 when absent.
+-- One 'Edge' is emitted per entry of @distance@, numbered from 0 in the key
+-- order of the map.
+--
+-- NOTE(review): 'filterCooc'' only produces keys @(x, y)@ with @x > y@, so a
+-- map coming from it has no diagonal and every node size would be 0 --
+-- confirm what is meant to feed @coocs@.
+-- TODO: distance should not be a map since we just "toList" it (same as
+-- cLouvain).
+data2graph :: [(Label, Int)] -> Map (Int, Int) Int
+                             -> Map (Int, Int) Double
+                             -> [LouvainNode]
+              -> Graph
+data2graph labels coocs distance partitions = Graph nodes edges
+  where
+    communityOf = M.fromList [ (nodeId, community) | LouvainNode nodeId community <- partitions ]
+    nodes       = map mkNode labels
+    edges       = zipWith mkEdge [0..] (M.toList distance)
+    mkNode (termLabel, nodeId) =
+      Node { n_size       = M.findWithDefault 0 (nodeId, nodeId) coocs
+           , n_type       = Terms -- or Unknown
+           , n_id         = cs (show nodeId)
+           , n_label      = T.unwords termLabel
+           , n_attributes = Attributes
+               { clust_default = M.findWithDefault 0 nodeId communityOf }
+           }
+    mkEdge edgeId ((source, target), weight) =
+      Edge { e_source = source
+           , e_target = target
+           , e_weight = weight
+           , e_id     = edgeId
+           }
+-----------------------------------------------------------
+
+printDebug msg x = putStrLn $ msg <> " " <> show x
+--printDebug _ _ = pure ()
+
+
+

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -50,6 +50,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , undefined
                 , IO()
                 , compare
+                 , on
                 )

 -- TODO import functions optimized in Utils.Count
@@ -109,7 +110,7 @@ ma = movingAverage 3

 -- | splitEvery n == chunkAlong n n
 splitEvery :: Int -> [a] -> [[a]]
-splitEvery _ [] = L.cycle [[]]
+splitEvery _ [] = []
 splitEvery n xs =
  let (h,t) = L.splitAt n xs
  in  h : splitEvery n t
@@ -235,5 +236,5 @@ unMaybe :: [Maybe a] -> [a]
 unMaybe = map fromJust . L.filter isJust

 -- maximumWith
-maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
+maximumWith f = L.maximumBy (compare `on` f)

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m

 -}

+{-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

@@ -50,55 +51,99 @@ import Gargantext.Viz.Graph.Index

 import qualified Data.Array.Accelerate.Interpreter as DAA
 import qualified Data.Array.Accelerate as DAA
+-- import Data.Array.Accelerate ((:.)(..), Z(..))

 import GHC.Real (round)

--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m = 
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi)  = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
+import Debug.Trace
+import Prelude (seq)

+data MapListSize   = MapListSize   Int
+data InclusionSize = InclusionSize Int
+data SampleBins    = SampleBins    Double
+data Clusters      = Clusters      Int
+data DefaultValue  = DefaultValue  Int

-type ListSize  = Int
-type BinSize = Double
+data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
+                                 , fc_inclusionSize :: InclusionSize
+                                 , fc_sampleBins    :: SampleBins
+                                 , fc_clusters      :: Clusters
+                                 , fc_defaultValue  :: DefaultValue
+                             }

-- Map list creation
-- Kmean split into 2 main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in s parts
+-- | Restrict a cooccurrence map to the terms kept by 'takeSome'.
+--
+-- The map is scored with 'coocScored', a subset of the scored terms is
+-- selected by 'takeSome' according to the 'FilterConfig', and the result is
+-- rebuilt by 'filterCooc'' over that subset of terms.
+filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
+filterCooc fc cc = filterCooc' fc selectedTerms cc
+  where
+    scoredTerms   = coocScored cc
+    selectedTerms = map _scored_terms (takeSome fc scoredTerms)
+
+
+-- | Build the cooccurrence map over every ordered pair of the given terms.
+--
+-- For each pair @(x, y)@ of terms from @ts@ with @x > y@ the result holds the
+-- count found in @m@, or the 'DefaultValue' of the config when @m@ has no
+-- entry for that pair. Pairs with @x <= y@ (the diagonal included) are never
+-- produced. The other fields of the 'FilterConfig' are ignored here.
+filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
+filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
+  M.fromList [ (pair, countOf pair) | pair <- pairs ]
+  where
+    pairs        = [ (x, y) | x <- ts, y <- ts, x > y ]
+    countOf pair = maybe dv identity (M.lookup pair m)
+
+
+-- | Map list creation
+-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
+-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
 -- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * s = l
-takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
-takeSome l s scores = L.take l
+-- take n scored terms in each part, where n * SampleBins = MapListSize.
+takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
+takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
                    $ takeSample n m
-                    $ splitKmeans 2 scores
+                    $ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
+                    -- $ splitKmeans k scores
  where
-    -- (TODO: benchmark with accelerate-example kmeans version)
-    splitKmeans x xs = elements
-                     $ V.head
+    -- TODO: benchmark with accelerate-example kmeans version
+    splitKmeans x xs = L.concat $ map elements
+                     $ V.take (k-1)
                     $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
                              euclidSq x xs
    n = round ((fromIntegral l)/s)
    m = round $ (fromIntegral $ length scores) / (s)
-    takeSample n m xs = L.concat $ map (L.take n)
-                                 $ L.reverse $ map (L.sortOn _scored_incExc)
+    takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
+                        L.concat $ map (L.take n)
+                                 $ map (reverse . (L.sortOn _scored_incExc))
+                                 -- TODO use kmeans s instead of splitEvery
+                                 -- in order to split in s heterogeneous parts
+                                 -- without homogeneous order hypothesis
                                 $ splitEvery m
                                 $ L.reverse $ L.sortOn _scored_speGen xs


-data Scored t = Scored { _scored_terms  :: t
-                       , _scored_incExc :: InclusionExclusion
-                       , _scored_speGen :: SpecificityGenericity
+data Scored t = Scored { _scored_terms  :: !t
+                       , _scored_incExc :: !InclusionExclusion
+                       , _scored_speGen :: !SpecificityGenericity
                     } deriving (Show)

-incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
-incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+coocScored :: Ord t => Map (t,t) Int -> [Scored t]
+coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi) = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)


+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
@@ -107,7 +152,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)



-
 metrics_text :: Text
 metrics_text = T.intercalate " " metrics_sentences

@@ -136,7 +180,7 @@ metrics_sentences_Test = metrics_sentences == metrics_sentences'
 -}

 metrics_terms :: IO [[Terms]]
-metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
+metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text

 -- | Occurrences
 {-

--- a/src/Gargantext/Text/Metrics/Count.hs
+++ b/src/Gargantext/Text/Metrics/Count.hs
@@ -75,14 +75,15 @@ type Grouped = Stems

 type Occs = Int
 type Coocs = Int
-
-removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
-removeApax = DMS.filter (> 1)
+-- | Exclusive lower bound on a cooccurrence count, as used by 'removeApax'.
+type Threshold = Int
+
+-- | Drop the pairs whose cooccurrence count does not exceed the threshold:
+-- only entries strictly greater than @t@ are kept.
+removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
+removeApax t coocs = DMS.filter (\count -> count > t) coocs

 cooc :: [[Terms]] -> Map (Label, Label) Int
-cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
+cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
  where
    terms_occs = occurrencesOn _terms_stem (List.concat tss)
+    label_policy = mkLabelPolicy terms_occs


 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
    delta f = f *** f


+mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
+mkLabelPolicy = DMS.map f where
+  f = _terms_label . fst . maximumWith snd . DMS.toList
+     -- TODO use the Foldable instance of Map instead of building a list
+
+-- | Look up the label chosen for a group of terms.
+--
+-- Panics with a message naming the group when the policy map has no entry
+-- for it.
+useLabelPolicy :: Map Grouped Label -> Grouped -> Label
+useLabelPolicy m g = resolve (DMS.lookup g m)
+  where
+    resolve (Just label) = label
+    resolve Nothing      = panic ("Label of Grouped not found: " <> pack (show g))
+{-
 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
 labelPolicy m g =  case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
                     Just label -> label
                     Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)
+-}

 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
 coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as

--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -42,23 +42,23 @@ import Gargantext.Core.Types
 import Gargantext.Text.Terms.Multi (multiterms)
 import Gargantext.Text.Terms.Mono  (monoterms')

-data TermType = Mono | Multi | MonoMulti
+data TermType lang = Mono lang | Multi lang | MonoMulti lang

 -- remove Stop Words
 -- map (filter (\t -> not . elem t)) $ 
 ------------------------------------------------------------------------
 -- | Sugar to extract terms from text (hiddeng mapM from end user).
-extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
-extractTerms termType lang = mapM (terms termType lang)
+extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
+extractTerms termTypeLang = mapM (terms termTypeLang)
 ------------------------------------------------------------------------
 -- | Terms from Text
 -- Mono : mono terms
 -- Multi : multi terms
 -- MonoMulti : mono and multi
 -- TODO : multi terms should exclude mono (intersection is not empty yet)
-terms :: TermType -> Lang -> Text -> IO [Terms]
-terms Mono  lang txt     = pure $ monoterms' lang txt
-terms Multi lang txt     = multiterms lang txt
-terms MonoMulti lang txt = terms Multi lang txt
+terms :: TermType Lang -> Text -> IO [Terms]
+terms termType txt = case termType of
+  Mono      lang -> pure (monoterms' lang txt)
+  Multi     lang -> multiterms lang txt
+  -- MonoMulti is not distinguished yet: it extracts multi terms only.
+  MonoMulti lang -> multiterms lang txt
 ------------------------------------------------------------------------

--- a/src/Gargantext/Viz/Graph.hs
+++ b/src/Gargantext/Viz/Graph.hs
@@ -19,11 +19,14 @@ module Gargantext.Viz.Graph
 import GHC.Generics (Generic)
 import Data.Aeson.TH (deriveJSON)
 import Data.Text (Text)
+import Data.Map (Map)

 import Gargantext.Prelude
 import Gargantext.Core.Utils.Prefix (unPrefix)

-----------------------------------------------------------
+import Data.Graph.Clustering.Louvain.CplusPlus (LouvainNode)
+
+
 data TypeNode = Terms | Unknown
  deriving (Show, Generic)

@@ -55,8 +58,6 @@ data Graph = Graph { g_nodes :: [Node]
                   }
  deriving (Show, Generic)
 $(deriveJSON (unPrefix "g_") ''Graph)
-
 -----------------------------------------------------------


-
--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -109,7 +109,6 @@ conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
  where
-
    ie :: Acc (Matrix Double) -> Acc (Matrix Double)
    ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
    sg :: Acc (Matrix Double) -> Acc (Matrix Double)
@@ -149,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
    crossProduct m = zipWith (*) (cross m  ) (cross (transpose m))
    cross mat      = zipWith (-) (mkSum n mat) (mat)

-
-
 -----------------------------------------------------------------------
 -----------------------------------------------------------------------