Commit 9643c219 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[ELEVE] Ngrams, still NaN.

parent 7b3d0ab6
......@@ -151,6 +151,7 @@ library:
- protolude
- pureMD5
- SHA
- simple-reflect
- random
- rake
- regex-compat
......
......@@ -98,18 +98,20 @@ eavg [] = 0
-- Simple Average
mean :: Fractional a => [a] -> a
mean xs = if L.null xs then 0.0
else sum xs / fromIntegral (length xs)
mean xs = sum xs / fromIntegral (length xs)
sumMaybe :: Num a => [Maybe a] -> Maybe a
sumMaybe = fmap sum . M.sequence
variance :: Floating a => [a] -> a
variance xs = mean $ map (\x -> (x - m) ** 2) xs where
variance xs = sum ys / (fromIntegral (length xs) - 1)
where
m = mean xs
ys = map (\x -> (x - m) ** 2) xs
deviation :: [Double] -> Double
deviation :: Floating a => [a] -> a
deviation = sqrt . variance
movingAverage :: (Eq b, Fractional b) => Int -> [b] -> [b]
......@@ -242,8 +244,8 @@ scaleMinMax xs = map (\x -> (x - mi / (ma - mi + 1) )) xs'
scaleNormalize :: [Double] -> [Double]
scaleNormalize xs = map (\x -> (x - v / (m + 1))) xs'
where
v = variance xs'
m = mean xs'
v = variance xs'
m = mean xs'
xs' = map abs xs
normalize :: [Double] -> [Double]
......
......@@ -8,7 +8,10 @@ Implementation of EleVe Python version of papers:
-}
module Gargantext.Text.Eleve where
import Debug.Trace (trace)
import Debug.SimpleReflect
import Control.Monad (foldM)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
......@@ -18,34 +21,60 @@ import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)
-- prop (Node c _e f) = c == Map.size f
-- TODO remove Leaf
-- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
example :: [[Token]]
example = map token
$ chunkAlong 3 1
$ T.words "New York and New York is a big apple"
--test = split t ts
test n example = do
let
ex = toToken n example
t = buildTrie $ chunkAlong n 1 ex
data Token = NonTerminal Text | Terminal
P.putStrLn $ Tree.drawTree
$ fmap show
$ toTree (NonTerminal "") t
pure $ map unToken $ split t t [] ex
example' = T.words "New York and New York"
example'' = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
data Token = NonTerminal Text
| Terminal
deriving (Ord, Eq, Show)
token :: [Text] -> [Token]
token xs = (NonTerminal <$> xs) <> [Terminal]
toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
unToken :: [Token] -> [Text]
unToken = map f
where
f (NonTerminal x) = x
f Terminal = ""
data Trie k e
= Node { _node_count :: Int
, _node_entropy :: e
, _node_children :: Map k (Trie k e)
}
-- | Leaf { _node_count :: Int }
| Leaf { _node_count :: Int }
deriving (Show)
toTree :: k -> Trie k e -> Tree (k,Int,e)
toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs)
-- emptyTrie :: Trie k e
-- emptyTrie = Leaf 0
emptyTrie :: (Ord k, Monoid e) => Trie k e
emptyTrie = Node 0 mempty mempty
emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e
--emptyTrie n = Node n mempty mempty
emptyTrie = Leaf
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
......@@ -53,39 +82,40 @@ mkTrie c children
| otherwise -} = Node c mempty children
insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie [] n = n
insertTrie [] n = n { _node_count = _node_count n +1}
-- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
where
f = Just . insertTrie xs . fromMaybe emptyTrie
f = Just . insertTrie xs . fromMaybe (emptyTrie 0)
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie
insertTries = L.foldr insertTrie (emptyTrie 1)
entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
-- entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children)
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
where
e = sum $ f <$> Map.toList children
f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc
e = sum $ map f $ Map.toList children
f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c)
else - cfc * P.logBase 2 cfc
where
cfc = fromIntegral (_node_count child) / fromIntegral c
normalizeEntropy :: Trie k Double -> Trie k Double
normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e
-- normalizeEntropy (Leaf c) = Leaf c
normalizeEntropy (Node c e children) =
Node c e $ normalizeLevel m v . normalizeEntropy <$> children
trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children
where
es = _node_entropy <$> Map.elems children
m = mean es
v = variance es
es = map _node_entropy $ Map.elems children
m = mean es
v = deviation es
normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double
normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e
-- normalizeLevel _ _ (Leaf c) = Leaf c
-- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children
normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v }
--normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) }
normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v}
buildTrie :: [[Token]] -> Trie Token Double
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
subForest :: Trie k e -> [Trie k e]
......@@ -98,17 +128,36 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
entropyLevels :: Trie k e -> [[e]]
entropyLevels = fmap (fmap _node_entropy) . levels
normalizeEntropy' :: Trie k Double -> Trie k Double
normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e
normalizeEntropy' t = go (entropyLevels t) t
where
go :: [[Double]] -> Trie k Double -> Trie k Double
go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e
go [] _ = panic "normalizeEntropy' empty levels"
-- go _ (Leaf c) = Leaf c
go (es : ess) (Node c e children) =
Node c e (normalizeLevel m v . go ess <$> children)
where
m = mean es
v = variance es
m = mean es
v = deviation es
buildTrie' :: [[Token]] -> Trie Token Double
buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries
------------------------------------------------------------------------
autonomie :: Trie Token e -> Token -> e
autonomie trie t = case (Map.lookup t (_node_children trie)) of
Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t)
Just a -> _node_entropy a
------------------------------------------------------------------------
split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]]
split _ _ pref [] = [reverse pref]
split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of
Nothing -> reverse pref : split t0 t0 [x] xs
Just a -> case Map.lookup x $ _node_children t0 of
Nothing -> panic "TODO" -- reverse pref : split t0 t0 [] xs
Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of
True -> split t0 a (x:pref) xs
False -> reverse pref : split t0 xt0 [x] xs
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment