ElEve many changes

parent f65383e8
......@@ -44,8 +44,7 @@ module Gargantext.Text.Eleve where
-- import Debug.Trace (trace)
-- import Debug.SimpleReflect
import Data.Functor.Reverse
import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
import Control.Lens hiding (levels, children)
import Control.Monad (forM_)
import Data.Ord (Ord)
import qualified Data.List as L
......@@ -56,11 +55,28 @@ import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Gargantext.Prelude as GP
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
nan :: Floating e => e
nan = 0 / 0
noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)
updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
| otherwise = e
sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)
subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
| otherwise = x
------------------------------------------------------------------------
type Entropy e =
( Fractional e
, Floating e
......@@ -77,18 +93,17 @@ data I e = I
}
instance Show e => Show (I e) where
show (I e v n) = show (e, v, n)
show (I e ev a) = show (e, ev, a)
makeLenses ''I
type ModEntropy i o e = (e -> e) -> i -> o
set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e e (f e)
set_entropy_var :: ModEntropy e (I e) e
set_entropy_var f e = I e (f e) e
set_autonomy :: Entropy e => ModEntropy e (I e) e
set_autonomy f e = I e nan (f e)
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = (\ev -> I e ev nan) <$> f e
data StartStop = Start | Stop
deriving (Ord, Eq, Show)
......@@ -155,23 +170,6 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t
------------------------------------------------------------------------
------------------------------------------------------------------------
nan :: Floating e => e
nan = 0 / 0
noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)
updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
| otherwise = e
sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)
subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
| otherwise = x
entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
......@@ -182,8 +180,8 @@ entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) childre
where
chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------
normalizeLevel :: Entropy e => e -> e -> e -> e -> e
normalizeLevel prev m v e = ((e - prev) - m) / v
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
{- Unused
......@@ -199,7 +197,9 @@ class IsTrie trie where
nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
nodeChild :: Ord k => k -> trie k e -> trie k e
findTrie :: Ord k => [k] -> trie k e -> trie k e
findTrieR :: Ord k => [k] -> trie k e -> trie k e
printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
normalizeEntropy :: Entropy e
=> Getting e i e -> ModEntropy i o e
-> trie k i -> trie k o
......@@ -218,6 +218,7 @@ instance IsTrie Trie where
nodeChild _ (Leaf _) = emptyTrie
findTrie ks t = L.foldl (flip nodeChild) t ks
findTrieR = findTrie
printTrie inE t = do
P.putStrLn . Tree.drawTree
......@@ -227,29 +228,21 @@ instance IsTrie Trie where
forM_ (normalizationLevels inE t) $ \level ->
P.putStrLn $ " " <> show level
evTrie inE setEV = go nan
where
go _ (Leaf c) = Leaf c
go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
where e1 = i ^. inE
ev 0 0 = nan
ev i0 i1 = i1 - i0
normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
where
go _ _ (Leaf c) = Leaf c
go _ [] _ = panic "normalizeEntropy' empty levels"
go f ((m, v, _) : ess) (Node c i children)
= Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) m v) ess <$> children
{-
This is only normalizing a node with respect to its brothers (unlike all the
nodes of the same level).
normalizeEntropy inE modE = go $ modE identity
where
go _ (Leaf c) = Leaf c
go f (Node c i children)
| Map.null children =
panic "normalizeEntropy: impossible"
| otherwise =
Node c (f i) $ go (modE $ normalizeLevel es) <$> children
where
es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
= Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
------------------------------------------------------------------------
levels :: Trie k e -> [[Trie k e]]
......@@ -276,38 +269,39 @@ data Tries k e = Tries
makeLenses ''Tries
nodeEntropySafe :: Entropy e => Getting e i e -> Tries k i -> e
nodeEntropySafe inE (Tries f b) =
mean $ noNaNs [nodeEntropy inE f, nodeEntropy inE b]
nodeEntropyBwdOpt :: Entropy e => Getting e i e -> Tries k i -> e
nodeEntropyBwdOpt inE (Tries f b) =
mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
instance IsTrie Tries where
buildTrie tts = Tries { _fwd = buildTrie tts
, _bwd = buildTrie (reverse <$> tts)
}
nodeEntropy inE (Tries fwd bwd) =
-- VETODO reverse the query for bwd here
-- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
findTrie ks = onTries (findTrie ks)
findTrieR ks (Tries f b) = Tries (findTrieR ks f) (findTrieR (reverse ks) b)
findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
-- ^^
-- TODO: here this is tempting to reverse but this is not always what we
-- want. See also nodeAutonomy.
-- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
-- since recursivity of the function makes the reverse multiple times (I guess)
nodeChild = onTries . nodeChild
nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
evTrie inE setEV = onTries $ evTrie inE setEV
normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
normalizeEntropy inE = onTries . normalizeEntropy inE
printTrie inE (Tries fwd bwd) = do
printTrie inE (Tries f b) = do
P.putStrLn "Forward:"
printTrie inE fwd
printTrie inE f
P.putStrLn ""
P.putStrLn "Backward:"
printTrie inE bwd
printTrie inE b
onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
onTries h (Tries f b) = Tries (h f) (h b)
------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
......@@ -366,43 +360,23 @@ toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
---------------------------------------------
{-
set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
set_entropy_vars inE modE tries@(Tries fwd _bwd) =
mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
mapTree (\k -> modE $ nodeEntropy inE (findTrieR k tries)) [] fwd
mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
mapTree f k t = go f k t
where
go _ _ (Leaf c) = Leaf c
go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
where
fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
bwd = (nodeEntropy inE (_bwd $ findTrie (reverse ng) tries))
-}
---------------------------------------------
-- | TODO remove function below after following bug fixed
-- | TODO entropy_var' /= entropy_var on "<start> token.."
entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
, (nodeEntropy info_entropy (_bwd $ findTrie (reverse ng) tries))
]
)
entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
, (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
]
)
where
ntf = parseToken <$> ng
ntb = parseToken <$> reverse ng
---------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
testEleve debug n output checks = do
let
{-
......@@ -422,9 +396,9 @@ testEleve debug n output checks = do
-- forM_ pss (P.putStrLn . show)
P.putStrLn ""
printTrie info_entropy nt
P.putStrLn ""
P.putStrLn "Entropy Var:"
printTrie identity t''
-- P.putStrLn ""
-- P.putStrLn "Entropy Var:"
-- printTrie identity t''
P.putStrLn ""
P.putStrLn "Splitting:"
P.putStrLn $ show res
......@@ -438,45 +412,43 @@ testEleve debug n output checks = do
inp = toToken <$> input
t :: Tries Token Double
t = buildTrie (toToken' n input)
& bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
-- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
t = buildTrie (toToken' n input)
evt :: Tries Token (I Double)
evt = evTrie identity set_entropy_var t
nt :: Tries Token (I Double)
nt = normalizeEntropy identity set_autonomy t
nt = normalizeEntropy info_entropy_var (\fe i -> i & info_autonomy .~ fe (i ^. info_entropy_var)) evt
t'' :: Trie Token Double
t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
-- t'' :: Trie Token Double
-- t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
-- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
-- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
check f msg ref my =
if f ref my
then P.putStrLn $ " PASS " <> msg <> " " <> show ref
else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
let ns = parseToken <$> T.words ngram
nsb = parseToken <$> (reverse $ T.words ngram)
t' = findTrie ns t
tvar = findTrie ns t''
-- tvar = findTrie ns t''
-- my_entropy_var = nodeEntropy identity tvar
nt' = findTrie ns nt
P.putStrLn $ " " <> T.unpack ngram <> ":"
check (==) "count" count (_node_count tvar)
check sim "entropy_var" entropy (nodeEntropy identity tvar)
--check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
--check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
{- ^ FIXME 2 fun above should have same results (error in reverseToken):
<start> New York:
PASS count 1
FAIL entropy ref=NaN my=0.0
-}
check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
check (==) "count" count (_node_count (_fwd t'))
check sim "entropy" entropy (nodeEntropyBwdOpt info_entropy nt')
check sim "ev" ev (nodeEntropy info_entropy_var nt')
check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
......@@ -493,40 +465,37 @@ example6 = ["le-petit chat"
,"le gros rat"
]
checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
checks0, checks2 :: Checks Double
checks0 =
[("<start>", 1, nan, nan, nan, nan, 0.0)
,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
--,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
-- need to count it.
--{-
,("<start> New", 1, nan, nan, nan, nan, 0.0)
,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
,("York is", 1, 0.0, nan, nan, nan, 0.0)
,("is New", 1, 0.0, nan, nan, nan, 0.0)
,("York and", 1, 0.0, nan, nan, nan, 0.0)
,("and New", 1, 0.0, nan, nan, nan, 0.0)
,("York <stop>", 1, nan, nan, nan, nan, nan)
,("<start> New York", 1, nan, nan, nan, nan, 0.0)
,("New York is", 1, 0.0, nan, nan, nan, 0.0)
,("York is New", 1, 0.0, nan, nan, nan, 0.0)
,("is New York", 1, 0.0, nan, nan, nan, 0.0)
,("New York and", 1, 0.0, nan, nan, nan, 0.0)
,("York and New", 1, 0.0, nan, nan, nan, 0.0)
,("and New York", 1, 0.0, nan, nan, nan, 0.0)
,("New York <stop>", 1, nan, nan, nan, nan, nan)
--}
[ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
, ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
, ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
, ("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
, ("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
--, ("<stop>", 0.0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
-- Since it is not in the trie it no, need to count it.
, ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, nan, nan)
, ("York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
, ("is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
, ("and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
, ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("New York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
, ("York is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("is New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("New York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
, ("York and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("and New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
, ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
]
checks2 =
checks2 = []
{-
[("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
,("be or", 2, 0.5, nan, nan, nan, 1.0)
,("or not", 1, 0.0, nan, nan, nan, 0.0)
......@@ -535,7 +504,7 @@ checks2 =
,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
,("be and", 1, 0.0, nan, nan, nan, 0.0)
]
-}
runTests :: IO ()
runTests =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment