Commit a311ae32 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] bwd_entropy : bwd tokens.

parent 8c5e48a2
...@@ -20,9 +20,7 @@ References: ...@@ -20,9 +20,7 @@ References:
, pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075) , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation: Notes for current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see paper above, viterbi algo can be used) - TODO extract longer ngrams (see paper above, viterbi algo can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f
- AD: Real ngrams extraction test - AD: Real ngrams extraction test
from Gargantext.Text.Terms import extractTermsUnsupervised from Gargantext.Text.Terms import extractTermsUnsupervised
...@@ -41,7 +39,7 @@ Notes for current implementation: ...@@ -41,7 +39,7 @@ Notes for current implementation:
module Gargantext.Text.Eleve where module Gargantext.Text.Eleve where
-- import Debug.Trace (trace) import Debug.Trace (trace)
-- import Debug.SimpleReflect -- import Debug.SimpleReflect
import Control.Lens hiding (levels, children) import Control.Lens hiding (levels, children)
...@@ -65,9 +63,9 @@ nan = 0 / 0 ...@@ -65,9 +63,9 @@ nan = 0 / 0
noNaNs :: P.RealFloat e => [e] -> [e] noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN) noNaNs = filter (not . P.isNaN)
updateIfDefined :: P.RealFloat e => e -> e -> e updateIfNaN :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0 updateIfNaN e0 e | P.isNaN e = e0
| otherwise = e | otherwise = e
sim :: Entropy e => e -> e -> Bool sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y) sim x y = x == y || (P.isNaN x && P.isNaN y)
...@@ -75,6 +73,9 @@ sim x y = x == y || (P.isNaN x && P.isNaN y) ...@@ -75,6 +73,9 @@ sim x y = x == y || (P.isNaN x && P.isNaN y)
subst :: Entropy e => (e, e) -> e -> e subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst subst (src, dst) x | sim src x = dst
| otherwise = x | otherwise = x
------------------------------------------------------------------------ ------------------------------------------------------------------------
type Entropy e = type Entropy e =
...@@ -87,9 +88,9 @@ type Entropy e = ...@@ -87,9 +88,9 @@ type Entropy e =
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Example and tests for development -- | Example and tests for development
data I e = I data I e = I
{ _info_entropy :: e { _info_entropy :: e
, _info_entropy_var :: e , _info_entropy_var :: e
, _info_autonomy :: e , _info_autonomy :: e
} }
instance Show e => Show (I e) where instance Show e => Show (I e) where
...@@ -116,14 +117,22 @@ isTerminal :: Token -> Bool ...@@ -116,14 +117,22 @@ isTerminal :: Token -> Bool
isTerminal (Terminal _) = True isTerminal (Terminal _) = True
isTerminal (NonTerminal _) = False isTerminal (NonTerminal _) = False
chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
toToken' :: Int -> [Text] -> [[Token]]
toToken' n input = (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2))
$ toToken input
toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
parseToken :: Text -> Token parseToken :: Text -> Token
parseToken "<start>" = Terminal Start parseToken "<start>" = Terminal Start
parseToken "<stop>" = Terminal Stop parseToken "<stop>" = Terminal Stop
parseToken t = NonTerminal t parseToken t = NonTerminal t
toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
printToken :: Token -> Text printToken :: Token -> Text
printToken = f printToken = f
where where
...@@ -183,20 +192,20 @@ entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) childre ...@@ -183,20 +192,20 @@ entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) childre
where where
chc = fromIntegral (_node_count child) / fromIntegral c chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO if stdv == 0 then not defined (NaN)
normalizeLevel :: Entropy e => e -> e -> e -> e normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v normalizeLevel m v e = (e - m) / v
--normalizeLevel m v e = if v == 0 then nan else (e - m) / v
{- Unused {- Unused
nodeChildren :: Trie k e -> Map k (Trie k e) nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _) = Map.empty nodeChildren (Leaf _) = Map.empty
-} -}
class IsTrie trie where class IsTrie trie where
buildTrie :: Entropy e => [[Token]] -> trie Token e buildTrie :: Entropy e => Int -> [Text] -> trie Token e
nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
nodeChild :: Ord k => k -> trie k e -> trie k e nodeChild :: Ord k => k -> trie k e -> trie k e
findTrie :: Ord k => [k] -> trie k e -> trie k e findTrie :: Ord k => [k] -> trie k e -> trie k e
...@@ -212,7 +221,7 @@ class IsTrie trie where ...@@ -212,7 +221,7 @@ class IsTrie trie where
--nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
instance IsTrie Trie where instance IsTrie Trie where
buildTrie ts = entropyTrie isTerminal $ insertTries ts buildTrie n ts = trace (show ts) $ entropyTrie isTerminal $ insertTries $ toToken' n ts
nodeEntropy inE (Node _ e _) = e ^. inE nodeEntropy inE (Node _ e _) = e ^. inE
nodeEntropy _ (Leaf _) = nan nodeEntropy _ (Leaf _) = nan
...@@ -281,9 +290,9 @@ nodeEntropyBwdOpt inE (Tries f b) = ...@@ -281,9 +290,9 @@ nodeEntropyBwdOpt inE (Tries f b) =
mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b] mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
instance IsTrie Tries where instance IsTrie Tries where
buildTrie tts = Tries { _fwd = buildTrie tts buildTrie n tts = Tries { _fwd = buildTrie n tts
, _bwd = buildTrie (reverse <$> tts) , _bwd = buildTrie n (reverse tts)
} }
nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b] nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
...@@ -356,25 +365,6 @@ mainEleve n input = map (map printToken) . split identity (t :: Trie Token Doubl ...@@ -356,25 +365,6 @@ mainEleve n input = map (map printToken) . split identity (t :: Trie Token Doubl
t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-} -}
chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
---------------------------------------------
{-
set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
set_entropy_vars inE modE tries@(Tries fwd _bwd) =
mapTree (\k -> modE $ nodeEntropy inE (findTrieR k tries)) [] fwd
mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
mapTree f k t = go f k t
where
go _ _ (Leaf c) = Leaf c
go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
-}
--------------------------------------------- ---------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)] type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
...@@ -415,7 +405,7 @@ testEleve debug n output checks = do ...@@ -415,7 +405,7 @@ testEleve debug n output checks = do
inp = toToken <$> input inp = toToken <$> input
t :: Tries Token Double t :: Tries Token Double
t = buildTrie (toToken' n input) t = buildTrie n (L.concat input)
evt :: Tries Token (I Double) evt :: Tries Token (I Double)
evt = evTrie identity set_entropy_var t evt = evTrie identity set_entropy_var t
...@@ -442,16 +432,19 @@ testEleve debug n output checks = do ...@@ -442,16 +432,19 @@ testEleve debug n output checks = do
nt' = findTrie ns nt nt' = findTrie ns nt
P.putStrLn $ " " <> T.unpack ngram <> ":" P.putStrLn $ " " <> T.unpack ngram <> ":"
check (==) "count" count (_node_count (_fwd t')) check (==) "count" count (_node_count (_fwd t'))
check sim "entropy" entropy (nodeEntropyBwdOpt info_entropy nt')
check sim "ev" ev (nodeEntropy info_entropy_var nt') check sim "entropy" entropy (nodeEntropyBwdOpt info_entropy nt')
check sim "autonomy" autonomy (nodeEntropy info_autonomy nt') check sim "ev" ev (nodeEntropy info_entropy_var nt')
check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt')) check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt')) check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt')) check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt')) check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt')) check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
-- | TODO real data is a list of tokenized sentences -- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text] example0, example1, example2, example3, example4, example5, example6 :: [Text]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment