From 9643c219f2fb65e2600407c996ed136856d62aee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org>
Date: Fri, 24 May 2019 16:06:54 +0200
Subject: [PATCH] [ELEVE] Ngrams, still NaN.

---
 package.yaml                 |   1 +
 src/Gargantext/Prelude.hs    |  14 +++--
 src/Gargantext/Text/Eleve.hs | 115 +++++++++++++++++++++++++----------
 3 files changed, 91 insertions(+), 39 deletions(-)

diff --git a/package.yaml b/package.yaml
index 373c42f2..7463a120 100644
--- a/package.yaml
+++ b/package.yaml
@@ -151,6 +151,7 @@ library:
   - protolude
   - pureMD5
   - SHA
+  - simple-reflect
   - random
   - rake
   - regex-compat
diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs
index 34dd4902..0f5655a4 100644
--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -98,18 +98,20 @@ eavg [] = 0
 
 -- Simple Average
 mean :: Fractional a => [a] -> a
-mean xs = if L.null xs then 0.0
-                       else sum xs / fromIntegral (length xs)
+mean xs = sum xs / fromIntegral (length xs)
 
 
 sumMaybe :: Num a => [Maybe a] -> Maybe a
 sumMaybe = fmap sum . M.sequence
 
 variance :: Floating a => [a] -> a
-variance xs = mean $ map (\x -> (x - m) ** 2) xs where
+variance xs = sum ys  / (fromIntegral (length xs) - 1)
+  where
     m = mean xs
+    ys = map (\x -> (x - m) ** 2) xs
+
 
-deviation :: [Double] -> Double
+deviation :: Floating a => [a] -> a
 deviation = sqrt . variance
 
 movingAverage :: (Eq b, Fractional b) => Int -> [b] -> [b]
@@ -242,8 +244,8 @@ scaleMinMax xs = map (\x -> (x - mi / (ma - mi + 1) )) xs'
 scaleNormalize :: [Double] -> [Double]
 scaleNormalize xs = map (\x -> (x - v / (m + 1))) xs'
     where
-        v = variance xs'
-        m = mean     xs'
+        v   = variance  xs'
+        m   = mean      xs'
         xs' = map abs xs
 
 normalize :: [Double] -> [Double]
diff --git a/src/Gargantext/Text/Eleve.hs b/src/Gargantext/Text/Eleve.hs
index bd30c118..64a7268a 100644
--- a/src/Gargantext/Text/Eleve.hs
+++ b/src/Gargantext/Text/Eleve.hs
@@ -8,7 +8,10 @@ Implementation of EleVe Python version of papers:
 -}
 module Gargantext.Text.Eleve where
 
+import Debug.Trace (trace)
+import Debug.SimpleReflect
 
+import Control.Monad (foldM)
 import Data.Ord (Ord)
 import qualified Data.List as L
 import Data.Monoid
@@ -18,34 +21,60 @@ import Data.Map (Map)
 import Data.Maybe (fromMaybe)
 import qualified Data.Map as Map
 import Gargantext.Prelude
+import qualified Data.Tree as Tree
+import Data.Tree (Tree)
+import qualified Prelude as P (putStrLn, logBase, String)
 
 -- prop (Node c _e f) = c == Map.size f
--- TODO remove Leaf
+-- TODO maybe add Leaf
 --   NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
 
-example :: [[Token]]
-example = map token
-        $ chunkAlong 3 1
-        $ T.words "New York and New York is a big apple"
+--test = split t ts
+test n example = do
+  let
+    ex = toToken n example
+    t  = buildTrie $ chunkAlong n 1 ex
 
-data Token = NonTerminal Text | Terminal
+  P.putStrLn $ Tree.drawTree
+               $ fmap show
+               $ toTree (NonTerminal "") t
+
+  pure $ map unToken $ split t t [] ex
+
+
+example'  =  T.words "New York and New York"
+example'' =  map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
+
+
+data Token = NonTerminal Text
+           | Terminal
   deriving (Ord, Eq, Show)
 
-token :: [Text] -> [Token]
-token xs = (NonTerminal <$> xs) <> [Terminal]
+toToken :: Int -> [Text] -> [Token]
+toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
+
+unToken :: [Token] -> [Text]
+unToken = map f
+  where
+    f (NonTerminal x) = x
+    f Terminal = ""
+
 
 data Trie k e
   = Node { _node_count    :: Int
          , _node_entropy  :: e
          , _node_children :: Map k (Trie k e)
          }
--- | Leaf { _node_count    :: Int }
+ | Leaf { _node_count    :: Int }
   deriving (Show)
 
+toTree :: k -> Trie k e -> Tree (k,Int,e)
+toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs)
+
 -- emptyTrie :: Trie k e
--- emptyTrie = Leaf 0
-emptyTrie :: (Ord k, Monoid e) => Trie k e
-emptyTrie = Node 0 mempty mempty
+emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e
+--emptyTrie n = Node n mempty mempty
+emptyTrie   = Leaf
 
 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
 mkTrie c children
@@ -53,39 +82,40 @@ mkTrie c children
   | otherwise -}        = Node c mempty children
 
 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
-insertTrie []     n                    = n
+insertTrie []     n                    = n { _node_count = _node_count n +1}
 -- insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
   where
-    f = Just . insertTrie xs . fromMaybe emptyTrie
+    f = Just . insertTrie xs . fromMaybe (emptyTrie 0)
 
 insertTries :: Ord k => [[k]] -> Trie k ()
-insertTries = L.foldr insertTrie emptyTrie
+insertTries = L.foldr insertTrie (emptyTrie 1)
 
-entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double
+entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
 -- entropyTrie _    (Leaf c)             = Leaf c
-entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children)
+entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
   where
-    e = sum $ f <$> Map.toList children
-    f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc
+    e = sum $ map f $ Map.toList children
+    f (k, child) = if pred k then   cfc * P.logBase 2 (fromIntegral c)
+                             else - cfc * P.logBase 2 cfc
       where
         cfc = fromIntegral (_node_count child) / fromIntegral c
 
-normalizeEntropy :: Trie k Double -> Trie k Double
+normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e
 -- normalizeEntropy (Leaf c)            = Leaf c
 normalizeEntropy (Node c e children) =
-    Node c e $ normalizeLevel m v . normalizeEntropy <$> children
+    trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children
   where
-    es = _node_entropy <$> Map.elems children
-    m  = mean es
-    v  = variance es
+    es = map _node_entropy $ Map.elems children
+    m  = mean      es
+    v  = deviation es
 
-normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double
+normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e
 -- normalizeLevel _ _ (Leaf c)            = Leaf c
--- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children
-normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v }
+--normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) }
+normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v}
 
-buildTrie :: [[Token]] -> Trie Token Double
+buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
 buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
 
 subForest :: Trie k e -> [Trie k e]
@@ -98,17 +128,36 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 entropyLevels :: Trie k e -> [[e]]
 entropyLevels = fmap (fmap _node_entropy) . levels
 
-normalizeEntropy' :: Trie k Double -> Trie k Double
+normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e
 normalizeEntropy' t = go (entropyLevels t) t
   where
-    go :: [[Double]] -> Trie k Double -> Trie k Double
+    go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e
     go [] _ = panic "normalizeEntropy' empty levels"
     -- go _          (Leaf c)            = Leaf c
     go (es : ess) (Node c e children) =
         Node c e (normalizeLevel m v . go ess <$> children)
       where
-        m  = mean es
-        v  = variance es
+        m  = mean      es
+        v  = deviation es
 
-buildTrie' :: [[Token]] -> Trie Token Double
+buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e
 buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries
+
+------------------------------------------------------------------------
+
+autonomie :: Trie Token e -> Token -> e
+autonomie trie t = case (Map.lookup t (_node_children trie)) of
+  Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t)
+  Just  a -> _node_entropy a
+
+------------------------------------------------------------------------
+
+split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]]
+split _ _ pref [] = [reverse pref]
+split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of
+  Nothing -> reverse pref : split t0 t0 [x] xs
+  Just a  -> case Map.lookup x $ _node_children t0 of
+    Nothing  -> panic "TODO" -- reverse pref : split t0 t0 [] xs
+    Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of
+      True  -> split t0 a (x:pref) xs
+      False -> reverse pref : split t0 xt0 [x] xs
-- 
2.21.0