[COOC FILTERING] workflow done but optimization issue.

c44c07c7 · Alexandre Delanoë · e5419936 · c44c07c7 · c44c07c7 · c44c07c7
Commit c44c07c7 authored Jun 09, 2018 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 31 deletions

Pipeline.hs src/Gargantext/Pipeline.hs +10 -14

Metrics.hs src/Gargantext/Text/Metrics.hs +41 -15

Graph.hs src/Gargantext/Viz/Graph.hs +0 -2

No files found.
--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
@@ -46,27 +46,23 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
 -}
-pipeline path = do
+workflow lang path = do
  -- Text  <- IO Text <- FilePath
  text     <- readFile path
  let contexts = splitBy (Sentences 5) text
-  myterms <- extractTerms Multi FR contexts
+  myterms <- extractTerms Multi lang contexts
  -- TODO    filter (\t -> not . elem t stopList) myterms
  -- TODO    groupBy (Stem | GroupList)
-  let myCooc = removeApax $ cooc myterms
+  let myCooc = filterCooc $ removeApax $ cooc myterms
-  --let (ti, fi) = createIndices myCooc
-  pure True
-  --pure $ incExcSpeGen myCooc
  -- Cooc -> Matrix
+  --let (ti, fi) = createIndices myCooc
--  -- filter by spec/gen (dynmaic programming)
+  -- @np FIXME optimization issue of filterCooc (too much memory consumed)
--  let theScores = M.filter (>0) $ score conditional myCoocFiltered
+  pure myCooc
----
+  -- Matrix -> Clustering
------  -- Matrix -> Clustering
+-- pure $ bestpartition False $ map2graph $ toIndex ti myCooc
------  pure $ bestpartition False $ map2graph $ toIndex ti theScores
+  --partitions <- cLouvain $ toIndex ti $ M.map (\v -> (fromIntegral v) :: Double) myCooc
--  partitions <- cLouvain theScores
+  --pure partitions
--  pure partitions
 ---- | Building : -> Graph -> JSON
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
 -}
+{-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
@@ -53,16 +54,21 @@ import qualified Data.Array.Accelerate as DAA
 import GHC.Real (round)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
+filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m = 
+filterCooc cc = filterCooc' ts cc
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
+  where
----(ti, fi)  = createIndices m
+    ts     = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
-- . fromIndex fi $ filterMat $ cooc2mat ti m
+filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
+filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
+  where
+    errMessage = panic "Filter cooc: no key"
+    selection  = [(x,y) | x <- ts, y <- ts, x > y]
-type MapListSize  = Int
+type MapListSize = Int
-type SampleBins = Double
+type SampleBins  = Double
-type Clusters = Int
+type Clusters    = Int
 -- | Map list creation
 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
@@ -83,23 +89,44 @@ takeSome l s k scores = L.take l
    m = round $ (fromIntegral $ length scores) / (s)
    takeSample n m xs = L.concat $ map (L.take n)
                                 $ L.reverse $ map (L.sortOn _scored_incExc)
-                                 $ splitEvery m
+                                 -- TODO use kmeans s instead of splitEvery
+                                 -- in order to split in s heteregenous parts
+                                 -- without homogeneous order hypothesis
+                                 $ splitEvery m 
                                 $ L.reverse $ L.sortOn _scored_speGen xs
-data Scored t = Scored { _scored_terms  :: t
+data Scored t = Scored { _scored_terms  :: !t
-                       , _scored_incExc :: InclusionExclusion
+                       , _scored_incExc :: !InclusionExclusion
-                       , _scored_speGen :: SpecificityGenericity
+                       , _scored_speGen :: !SpecificityGenericity
                     } deriving (Show)
-incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
+coocScored :: Ord t => Map (t,t) Int -> [Scored t]
-incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti,fi) = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
  where
@@ -108,7 +135,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 metrics_text :: Text
 metrics_text = T.intercalate " " metrics_sentences

--- a/src/Gargantext/Viz/Graph.hs
+++ b/src/Gargantext/Viz/Graph.hs
@@ -55,8 +55,6 @@ data Graph = Graph { g_nodes :: [Node]
                   }
  deriving (Show, Generic)
 $(deriveJSON (unPrefix "g_") ''Graph)
 -----------------------------------------------------------