From 9643c219f2fb65e2600407c996ed136856d62aee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org> Date: Fri, 24 May 2019 16:06:54 +0200 Subject: [PATCH] [ELEVE] Ngrams, still NaN. --- package.yaml | 1 + src/Gargantext/Prelude.hs | 14 +++-- src/Gargantext/Text/Eleve.hs | 115 +++++++++++++++++++++++++---------- 3 files changed, 91 insertions(+), 39 deletions(-) diff --git a/package.yaml b/package.yaml index 373c42f2..7463a120 100644 --- a/package.yaml +++ b/package.yaml @@ -151,6 +151,7 @@ library: - protolude - pureMD5 - SHA + - simple-reflect - random - rake - regex-compat diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index 34dd4902..0f5655a4 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -98,18 +98,20 @@ eavg [] = 0 -- Simple Average mean :: Fractional a => [a] -> a -mean xs = if L.null xs then 0.0 - else sum xs / fromIntegral (length xs) +mean xs = sum xs / fromIntegral (length xs) sumMaybe :: Num a => [Maybe a] -> Maybe a sumMaybe = fmap sum . M.sequence variance :: Floating a => [a] -> a -variance xs = mean $ map (\x -> (x - m) ** 2) xs where +variance xs = sum ys / (fromIntegral (length xs) - 1) + where m = mean xs + ys = map (\x -> (x - m) ** 2) xs + -deviation :: [Double] -> Double +deviation :: Floating a => [a] -> a deviation = sqrt . variance movingAverage :: (Eq b, Fractional b) => Int -> [b] -> [b] @@ -242,8 +244,8 @@ scaleMinMax xs = map (\x -> (x - mi / (ma - mi + 1) )) xs' scaleNormalize :: [Double] -> [Double] scaleNormalize xs = map (\x -> (x - v / (m + 1))) xs' where - v = variance xs' - m = mean xs' + v = variance xs' + m = mean xs' xs' = map abs xs normalize :: [Double] -> [Double] diff --git a/src/Gargantext/Text/Eleve.hs b/src/Gargantext/Text/Eleve.hs index bd30c118..64a7268a 100644 --- a/src/Gargantext/Text/Eleve.hs +++ b/src/Gargantext/Text/Eleve.hs @@ -8,7 +8,10 @@ Implementation of EleVe Python version of papers: -} module Gargantext.Text.Eleve where +import Debug.Trace (trace) +import Debug.SimpleReflect +import Control.Monad (foldM) import Data.Ord (Ord) import qualified Data.List as L import Data.Monoid @@ -18,34 +21,60 @@ import Data.Map (Map) import Data.Maybe (fromMaybe) import qualified Data.Map as Map import Gargantext.Prelude +import qualified Data.Tree as Tree +import Data.Tree (Tree) +import qualified Prelude as P (putStrLn, logBase, String) -- prop (Node c _e f) = c == Map.size f --- TODO remove Leaf +-- TODO maybe add Leaf -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time) -example :: [[Token]] -example = map token - $ chunkAlong 3 1 - $ T.words "New York and New York is a big apple" +--test = split t ts +test n example = do + let + ex = toToken n example + t = buildTrie $ chunkAlong n 1 ex -data Token = NonTerminal Text | Terminal + P.putStrLn $ Tree.drawTree + $ fmap show + $ toTree (NonTerminal "") t + + pure $ map unToken $ split t t [] ex + + +example' = T.words "New York and New York" +example'' = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String) + + +data Token = NonTerminal Text + | Terminal deriving (Ord, Eq, Show) -token :: [Text] -> [Token] -token xs = (NonTerminal <$> xs) <> [Terminal] +toToken :: Int -> [Text] -> [Token] +toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal) + +unToken :: [Token] -> [Text] +unToken = map f + where + f (NonTerminal x) = x + f Terminal = "" + data Trie k e = Node { _node_count :: Int , _node_entropy :: e , _node_children :: Map k (Trie k e) } --- | Leaf { _node_count :: Int } + | Leaf { _node_count :: Int } deriving (Show) +toTree :: k -> Trie k e -> Tree (k,Int,e) +toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs) + -- emptyTrie :: Trie k e --- emptyTrie = Leaf 0 -emptyTrie :: (Ord k, Monoid e) => Trie k e -emptyTrie = Node 0 mempty mempty +emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e +--emptyTrie n = Node n mempty mempty +emptyTrie = Leaf mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e mkTrie c children @@ -53,39 +82,40 @@ mkTrie c children | otherwise -} = Node c mempty children insertTrie :: Ord k => [k] -> Trie k () -> Trie k () -insertTrie [] n = n +insertTrie [] n = n { _node_count = _node_count n +1} -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie) insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children where - f = Just . insertTrie xs . fromMaybe emptyTrie + f = Just . insertTrie xs . fromMaybe (emptyTrie 0) insertTries :: Ord k => [[k]] -> Trie k () -insertTries = L.foldr insertTrie emptyTrie +insertTries = L.foldr insertTrie (emptyTrie 1) -entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double +entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e -- entropyTrie _ (Leaf c) = Leaf c -entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children) +entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children) where - e = sum $ f <$> Map.toList children - f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc + e = sum $ map f $ Map.toList children + f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c) + else - cfc * P.logBase 2 cfc where cfc = fromIntegral (_node_count child) / fromIntegral c -normalizeEntropy :: Trie k Double -> Trie k Double +normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e -- normalizeEntropy (Leaf c) = Leaf c normalizeEntropy (Node c e children) = - Node c e $ normalizeLevel m v . normalizeEntropy <$> children + trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children where - es = _node_entropy <$> Map.elems children - m = mean es - v = variance es + es = map _node_entropy $ Map.elems children + m = mean es + v = deviation es -normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double +normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e -- normalizeLevel _ _ (Leaf c) = Leaf c --- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children -normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v } +--normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) } +normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v} -buildTrie :: [[Token]] -> Trie Token Double +buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries subForest :: Trie k e -> [Trie k e] @@ -98,17 +128,36 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure entropyLevels :: Trie k e -> [[e]] entropyLevels = fmap (fmap _node_entropy) . levels -normalizeEntropy' :: Trie k Double -> Trie k Double +normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e normalizeEntropy' t = go (entropyLevels t) t where - go :: [[Double]] -> Trie k Double -> Trie k Double + go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e go [] _ = panic "normalizeEntropy' empty levels" -- go _ (Leaf c) = Leaf c go (es : ess) (Node c e children) = Node c e (normalizeLevel m v . go ess <$> children) where - m = mean es - v = variance es + m = mean es + v = deviation es -buildTrie' :: [[Token]] -> Trie Token Double +buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries + +------------------------------------------------------------------------ + +autonomie :: Trie Token e -> Token -> e +autonomie trie t = case (Map.lookup t (_node_children trie)) of + Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t) + Just a -> _node_entropy a + +------------------------------------------------------------------------ + +split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]] +split _ _ pref [] = [reverse pref] +split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of + Nothing -> reverse pref : split t0 t0 [x] xs + Just a -> case Map.lookup x $ _node_children t0 of + Nothing -> panic "TODO" -- reverse pref : split t0 t0 [] xs + Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of + True -> split t0 a (x:pref) xs + False -> reverse pref : split t0 xt0 [x] xs -- 2.21.0