Commit 00787609 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Ngrams] Entropy Variation OK in tests.

parent 86d4a8dd
......@@ -45,7 +45,7 @@ import Debug.Trace (trace)
-- import Debug.SimpleReflect
import Data.Functor.Reverse
import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~))
import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
import Control.Monad (forM_)
import Data.Ord (Ord)
import qualified Data.List as L
......@@ -56,6 +56,7 @@ import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Gargantext.Prelude as GP
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
......@@ -71,18 +72,23 @@ type Entropy e =
-- | Example and tests for development
data I e = I
{ _info_entropy :: e
, _info_entropy_var :: e
, _info_autonomy :: e
}
instance Show e => Show (I e) where
show (I e n) = show (e, n)
show (I e v n) = show (e, v, n)
makeLenses ''I
type ModEntropy i o e = (e -> e) -> i -> o
set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e (f e)
set_autonomy f e = I e e (f e)
set_entropy_var :: ModEntropy e (I e) e
set_entropy_var f e = I e (f e) e
data StartStop = Start | Stop
deriving (Ord, Eq, Show)
......@@ -91,6 +97,9 @@ data Token = NonTerminal Text
| Terminal StartStop
deriving (Ord, Eq, Show)
rootTrie :: Token
rootTrie = NonTerminal ""
isTerminal :: Token -> Bool
isTerminal (Terminal _) = True
isTerminal (NonTerminal _) = False
......@@ -100,6 +109,23 @@ parseToken "<start>" = Terminal Start
parseToken "<stop>" = Terminal Stop
parseToken t = NonTerminal t
-- |
-- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
-- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
reverseTokens :: [Token] -> [Token]
reverseTokens xs = case lastMay xs of
Nothing -> []
Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
_ -> reverseTokens' xs
reverseTokens' :: [Token] -> [Token]
reverseTokens' [] = []
reverseTokens' [Terminal Stop] = []
reverseTokens' [x] = [x]
reverseTokens' (x:xs) = case x of
Terminal Start -> [Terminal Start] <> reverseTokens' xs
_ -> reverseTokens' xs <> [x]
toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
......@@ -110,7 +136,6 @@ printToken = f
f (NonTerminal x) = x
f (Terminal Start) = "<start>"
f (Terminal Stop) = "<stop>"
------------------------------------------------------------------------
data Trie k e
......@@ -144,7 +169,6 @@ mkTrie c children
| otherwise = Node c mempty children
-----------------------------
-- | Trie to Tree since Tree as nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
......@@ -152,7 +176,6 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t
------------------------------------------------------------------------
------------------------------------------------------------------------
nan :: Floating e => e
nan = 0 / 0
......@@ -180,7 +203,6 @@ entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) childre
where
chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . noNaNs)
......@@ -234,8 +256,9 @@ instance IsTrie Trie where
findTrie ks t = L.foldl (flip nodeChild) t ks
normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
normalizeEntropy inE modE t = trace (show level) $ go (modE identity) level t
where
level = (entropyLevels inE t)
go _ [] _ = panic "normalizeEntropy' empty levels"
go _ _ (Leaf c) = Leaf c
-- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
......@@ -282,6 +305,8 @@ data Tries k e = Tries
makeLenses ''Tries
instance IsTrie Tries where
buildTrie to n tts = Tries { _fwd = buildTrie to n tts
, _bwd = buildTrie to n (map reverse $ tts)
......@@ -362,6 +387,43 @@ chunkAlongEleve n xs = L.take n <$> L.tails xs
toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
---------------------------------------------
set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
set_entropy_vars inE modE tries@(Tries fwd _bwd) =
mapTree (\k -> modE $ entropy_var'' inE tries k) [rootTrie] fwd
mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
mapTree f k t = go f k t
where
go _ _ (Leaf c) = Leaf c
go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (filter (/= rootTrie) $ k <> [k'])) children)
entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
where
fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
---------------------------------------------
-- | TODO remove function below after following bug fixed
-- | TODO entropy_var' /= entropy_var on "<start> token.."
entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
, (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
]
)
entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
, (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
]
)
where
ntf = parseToken <$> ng
ntb = parseToken <$> reverse ng
---------------------------------------------
testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
let
......@@ -401,12 +463,25 @@ testEleve debug n output checks = do
expected = fmap (T.splitOn "-") <$> out
input = (T.splitOn "-" =<<) <$> out
inp = toToken <$> input
t :: Tries Token Double
t = buildTrie toToken' n input
& bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
-- NP: this is a hack to set the bwd entropy of Start at NaN.
t'' :: Trie Token Double
t'' = set_entropy_vars identity (\e _i -> e) t
-- keeping nt for fwd and bwd checks
-- it has no sense to calculate entropy_var on fwd and bwd each
nt :: Tries Token (I Double)
nt = normalizeEntropy identity set_autonomy t
nt' :: Trie Token (I Double)
nt' = normalizeEntropy identity set_autonomy t''
-- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
-- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
nt = normalizeEntropy identity set_autonomy t
check f msg ref my =
if f ref my
......@@ -416,23 +491,28 @@ testEleve debug n output checks = do
checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
let ns = parseToken <$> T.words ngram
nsb = parseToken <$> (reverse $ T.words ngram)
t' = findTrie ns nt
tb' = findTrie nsb nt
-- TODO put this Variation Entropy at VETODO mark above maybe in nodeEntropy ?
ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
tvar = findTrie ns t''
P.putStrLn $ " " <> T.unpack ngram <> ":"
check (==) "count" count (_node_count (_fwd t'))
check sim "entropy" entropy ev
check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
check (==) "count" count (_node_count tvar)
check sim "entropy_var" entropy (nodeEntropy identity tvar)
--check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
--check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
{- ^ FIXME 2 fun above should have same results (error in reverseToken):
<start> New York:
PASS count 1
FAIL entropy ref=NaN my=0.0
-}
check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
printTrie =
P.putStrLn . Tree.drawTree
. fmap show
. toTree (NonTerminal "")
. toTree rootTrie
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
......@@ -460,7 +540,7 @@ checks0 =
--,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
-- need to count it.
--{-
{-
,("<start> New", 1, nan, nan, nan, nan, 0.0)
,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
,("York is", 1, 0.0, nan, nan, nan, 0.0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment