WIP ElEve

parent 82e82799
Pipeline #434 canceled with stage
...@@ -32,6 +32,7 @@ Notes for current implementation: ...@@ -32,6 +32,7 @@ Notes for current implementation:
$ Gargantext.map _hyperdataDocument_abstract docs $ Gargantext.map _hyperdataDocument_abstract docs
-} -}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
...@@ -39,10 +40,10 @@ Notes for current implementation: ...@@ -39,10 +40,10 @@ Notes for current implementation:
module Gargantext.Text.Eleve where module Gargantext.Text.Eleve where
-- import Debug.Trace (trace) import Debug.Trace (trace)
-- import Debug.SimpleReflect -- import Debug.SimpleReflect
import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just) import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_) import Control.Monad (foldM, mapM_, forM_)
import Data.Ord (Ord) import Data.Ord (Ord)
import qualified Data.List as L import qualified Data.List as L
...@@ -55,38 +56,51 @@ import qualified Data.Map as Map ...@@ -55,38 +56,51 @@ import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs) import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree import qualified Data.Tree as Tree
import Data.Tree (Tree) import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase) import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
type Entropy e =
( Fractional e
, Floating e
, P.RealFloat e
, Show e
-- ^ TODO: only used for debugging
)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Example and tests for development -- | Example and tests for development
data I e = I data I e = I
{ _info_entropy :: e { _info_entropy :: e
, _info_norm_entropy :: e , _info_norm_entropy :: e
, _info_norm_entropy' :: e
} }
instance Show e => Show (I e) where instance Show e => Show (I e) where
show (I e n n') = show (e, n, n') show (I e n) = show (e, n)
makeLenses ''I makeLenses ''I
type ModEntropy i o e = (e -> e) -> i -> o type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy :: ModEntropy e (I e) e setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy") setNormEntropy f e = I e (f e)
data StartStop = Start | Stop
deriving (Ord, Eq, Show)
data Token = NonTerminal Text data Token = NonTerminal Text
| Terminal | Terminal StartStop
deriving (Ord, Eq, Show) deriving (Ord, Eq, Show)
isTerminal :: Token -> Bool
isTerminal (Terminal _) = True
isTerminal (NonTerminal _) = False
toToken :: Int -> [Text] -> [Token] toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal) toToken n xs = Terminal Start : (NonTerminal <$> xs) <> L.take n (repeat $ Terminal Stop)
unToken :: [Token] -> [Text] unToken :: [Token] -> [Text]
unToken = map f unToken = map f
where where
f (NonTerminal x) = x f (NonTerminal x) = x
f Terminal = "" f (Terminal _) = ""
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -130,114 +144,179 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t ...@@ -130,114 +144,179 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e nan :: Floating e => e
nan = 0 / 0
updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
| otherwise = e
entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _ (Leaf c) = Leaf c entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children) entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
where where
e = sum $ map f $ Map.toList children e = sum $ map f $ Map.toList children
f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c) f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
else - chc * P.logBase 2 chc else - chc * P.logBase 2 chc
where where
chc = fromIntegral (_node_count child) / fromIntegral c chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))
normalizeEntropy :: (Fractional e, Floating e, Show e)
=> Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy inE modE = go $ modE identity
where where
go _ (Leaf c) = Leaf c -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
go f (Node c i children) checkDiff = identity
| Map.null children = go [] = panic "normalizeLevel: impossible"
panic "normalizeEntropy: impossible" -- trace "normalizeLevel"
| otherwise = -- go [_] = identity
-- trace (show $ L.length es) $ go es = \e -> (e - m) / v
Node c (f i) $ go (modE $ normalizeLevel m v) <$> children {-
in if P.isNaN e'
then trace ("normalizeLevel " <> show (e,m,v,es))
e
else e'
-}
where where
es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
m = mean es m = mean es
v = deviation es v = deviation es
------------------------------------------------------------------------
normalizeLevel :: (Fractional e, Floating e, Show e)
=> e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e {- Unused
buildTrie = entropyTrie (== Terminal) . insertTries
nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _) = Nothing
nodeChildren :: Trie k e -> Map k (Trie k e) nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _) = Map.empty nodeChildren (Leaf _) = Map.empty
nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e) -}
nodeChild k (Node _ _ cs) = Map.lookup k cs
nodeChild _ (Leaf _) = Nothing
findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e) class IsTrie trie where
findTrie ks t = foldM (flip nodeChild) t ks buildTrie :: Floating e => [[Token]] -> trie Token e
nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
nodeChild :: Ord k => k -> trie k e -> trie k e
findTrie :: Ord k => [k] -> trie k e -> trie k e
normalizeEntropy :: Entropy e
=> Getting e i e -> ModEntropy i o e
-> trie k i -> trie k o
levels :: Trie k e -> [[Trie k e]] nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
where
subForest :: Trie k e -> [Trie k e] instance IsTrie Trie where
subForest (Leaf _) = [] buildTrie = entropyTrie isTerminal . insertTries
subForest (Node _ _ children) = Map.elems children
entropyLevels :: Getting e i e -> Trie k i -> [[e]] nodeEntropy inE (Node _ e _) = e ^. inE
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
nan
--fwd :: Getting a s a -> ASetter s t u3 a -> s -> t nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
--fwd inE outE s = s & outE .~ (s ^. inE) nodeChild _ (Leaf _) = emptyTrie
normalizeEntropy' :: (Fractional e, Floating e, Show e) findTrie ks t = L.foldl (flip nodeChild) t ks
=> Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
where where
go _ [] _ = panic "normalizeEntropy' empty levels" go _ [] _ = panic "normalizeEntropy' empty levels"
go _ _ (Leaf c) = Leaf c go _ _ (Leaf c) = Leaf c
go _ ([] : _) _ = panic "normalizeEntropy': empty level" go _ ([] : _) _ = panic "normalizeEntropy': empty level"
go f (es : ess) (Node c i children) = go f (es : ess) (Node c i children) =
Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
{-
This is only normalizing a node with respect to its brothers (unlike all the
nodes of the same level).
normalizeEntropy inE modE = go $ modE identity
where where
m = mean es go _ (Leaf c) = Leaf c
v = deviation es go f (Node c i children)
| Map.null children =
panic "normalizeEntropy: impossible"
| otherwise =
Node c (f i) $ go (modE $ normalizeLevel es) <$> children
where
es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
where
subForest :: Trie k e -> [Trie k e]
subForest (Leaf _) = []
subForest (Node _ _ children) = Map.elems children
entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Tries k e = Tries
{ _fwd :: Trie k e
, _bwd :: Trie k e
}
instance IsTrie Tries where
buildTrie tts = Tries { _fwd = buildTrie tts
, _bwd = buildTrie (reverse <$> tts)
}
nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]
findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
------------------------------------------------------------------------ ------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]] split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 [] split _ _ [] = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
where where
consRev [] xss = xss consRev [] xss = xss
consRev xs xss = reverse xs : xss consRev xs xss = reverse xs : xss
go _ pref [] = [reverse pref] go _ pref [] = [reverse pref]
go _ pref (Terminal:_) = [reverse pref] go _ pref (Terminal Stop:_) = [reverse pref]
go t pref (x:xs) = case nodeChild x t of go t pref (Terminal Start:xs) = go t pref xs
Nothing -> consRev pref $ go t0 [x] xs go t pref (x:xs) =
Just xt -> case nodeChild x t0 of -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
Nothing -> panic $ "TODO" if acc
Just xt0 -> then go xt (x:pref) xs
let et = ne (panic "t") t else consRev pref $ go xt0 [x] xs
where
xt = nodeChild x t
xt0 = nodeChild x t0
et = ne 0 t
-- ^ entropy of the current prefix -- ^ entropy of the current prefix
ext0 = ne (panic "xt0") xt0 ext0 = ne 0 xt0
-- ^ entropy of [x] -- ^ entropy of [x]
ext = ne 0 xt ext = ne 0 xt
-- ^ entropy of the current prefix plus x -- ^ entropy of the current prefix plus x
in acc = ext > et + ext0
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $ -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
if ext + ext0 > et
then go xt (x:pref) xs
else consRev pref $ go xt0 [x] xs
ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE) ne d t = if P.isNaN e then d else e
where e = nodeEntropy inE t
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]] mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
where where
inp = toToken (n - 1) <$> input inp = toToken (n - 1) <$> input
...@@ -246,6 +325,7 @@ mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> in ...@@ -246,6 +325,7 @@ mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> in
-- instead we should use either: -- instead we should use either:
-- info_norm_entropy or info_norm_entropy' -- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed. -- However they should first be fixed.
-}
testEleve :: Bool -> Int -> [Text] -> IO Bool testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do testEleve debug n output = do
...@@ -255,27 +335,42 @@ testEleve debug n output = do ...@@ -255,27 +335,42 @@ testEleve debug n output = do
input = (T.splitOn "-" =<<) <$> out input = (T.splitOn "-" =<<) <$> out
inp = toToken (n - 1) <$> input inp = toToken (n - 1) <$> input
t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double) -- nt = normalizeEntropy identity setNormEntropy (fwd :: Trie Token Double)
nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
pss = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy) nt = normalizeEntropy identity setNormEntropy
(t :: Trie Token Double)
{-
pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
| ps <- L.nub $ [ c | ps <- L.nub $ [ c
| m <- [1..n] | m <- [1..n]
, cs <- chunkAlong m 1 <$> inp , cs <- chunkAlong m 1 <$> inp
, c <- cs , c <- cs
] ]
] ]
res = map unToken . split identity t <$> inp -}
--res = map unToken . split identity fwd <$> inp
--res = map unToken . split info_norm_entropy' nt' <$> inp
res = map unToken . split info_norm_entropy nt <$> inp
when debug $ do when debug $ do
P.putStrLn (show input) P.putStrLn (show input)
mapM_ (P.putStrLn . show) pss -- mapM_ (P.putStrLn . show) pss
P.putStrLn $ Tree.drawTree P.putStrLn ""
$ fmap show printTrie nt
$ toTree (NonTerminal "") nt' {-
printTrie (_fwd nt)
printTrie (_bwd nt)
-}
P.putStrLn $ show res P.putStrLn $ show res
pure $ expected == res pure $ expected == res
where
printTrie =
P.putStrLn . Tree.drawTree
. fmap show
. toTree (NonTerminal "")
-- | TODO real data is a list of tokenized sentences -- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5 :: [Text] example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"] example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"] example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"] example2 = ["to-be-or not to-be-or NOT to-be and"]
...@@ -283,6 +378,11 @@ example3 = example0 <> example0 ...@@ -283,6 +378,11 @@ example3 = example0 <> example0
-- > TEST: Should not have York New in the trie -- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"] example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"] example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
,"le-petit chien"
,"le-petit rat"
,"le gros rat"
]
runTests :: IO () runTests :: IO ()
runTests = runTests =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment