Commit fed5f0b8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX][Ngrams] order of tokens.

parent 71dce4c8
...@@ -170,7 +170,7 @@ entropyTrie _ (Leaf c) = Leaf c ...@@ -170,7 +170,7 @@ entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children) entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
where where
e = sum $ map f $ Map.toList children e = sum $ map f $ Map.toList children
f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c) f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
else - chc * P.logBase 2 chc else - chc * P.logBase 2 chc
where where
chc = fromIntegral (_node_count child) / fromIntegral c chc = fromIntegral (_node_count child) / fromIntegral c
...@@ -204,11 +204,13 @@ nodeChildren (Leaf _) = Map.empty ...@@ -204,11 +204,13 @@ nodeChildren (Leaf _) = Map.empty
-} -}
data Ward = ForWard | BackWard
class IsTrie trie where class IsTrie trie where
buildTrie :: Entropy e => [[Token]] -> trie Token e buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
nodeChild :: Ord k => k -> trie k e -> trie k e nodeChild :: Ord k => k -> trie k e -> trie k e
findTrie :: Ord k => [k] -> trie k e -> trie k e findTrie :: Ord k => [k] -> trie k e -> trie k e
normalizeEntropy :: Entropy e normalizeEntropy :: Entropy e
=> Getting e i e -> ModEntropy i o e => Getting e i e -> ModEntropy i o e
-> trie k i -> trie k o -> trie k i -> trie k o
...@@ -218,7 +220,7 @@ class IsTrie trie where ...@@ -218,7 +220,7 @@ class IsTrie trie where
--nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
instance IsTrie Trie where instance IsTrie Trie where
buildTrie = entropyTrie isTerminal . insertTries buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
nodeEntropy inE (Node _ e _) = e ^. inE nodeEntropy inE (Node _ e _) = e ^. inE
nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $ nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
...@@ -275,10 +277,15 @@ data Tries k e = Tries ...@@ -275,10 +277,15 @@ data Tries k e = Tries
, _bwd :: Trie k e , _bwd :: Trie k e
} }
toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
instance IsTrie Tries where instance IsTrie Tries where
buildTrie tts = Tries { _fwd = buildTrie tts buildTrie to n tts = Tries { _fwd = buildTrie to n tts
, _bwd = buildTrie (reverse <$> tts) , _bwd = buildTrie to n (map reverse $ tts)
} }
nodeEntropy inE (Tries fwd bwd) = nodeEntropy inE (Tries fwd bwd) =
mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd] mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
...@@ -390,7 +397,7 @@ testEleve debug n output checks = do ...@@ -390,7 +397,7 @@ testEleve debug n output checks = do
expected = fmap (T.splitOn "-") <$> out expected = fmap (T.splitOn "-") <$> out
input = (T.splitOn "-" =<<) <$> out input = (T.splitOn "-" =<<) <$> out
inp = toToken <$> input inp = toToken <$> input
t = buildTrie $ L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> inp t = buildTrie toToken' n input
-- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double) -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
-- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
nt = normalizeEntropy identity set_autonomy t nt = normalizeEntropy identity set_autonomy t
...@@ -440,6 +447,7 @@ checks0 = ...@@ -440,6 +447,7 @@ checks0 =
,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0) ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
,("<stop>", 0, nan, nan, nan, 0.0, nan) ,("<stop>", 0, nan, nan, nan, 0.0, nan)
{-
,("<start> New", 1, nan, nan, nan, nan, 0.0) ,("<start> New", 1, nan, nan, nan, nan, 0.0)
,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156) ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
,("York is", 1, 0.0, nan, nan, nan, 0.0) ,("York is", 1, 0.0, nan, nan, nan, 0.0)
...@@ -456,6 +464,7 @@ checks0 = ...@@ -456,6 +464,7 @@ checks0 =
,("York and New", 1, 0.0, nan, nan, nan, 0.0) ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
,("and New York", 1, 0.0, nan, nan, nan, 0.0) ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
,("New York <stop>", 1, nan, nan, nan, nan, nan) ,("New York <stop>", 1, nan, nan, nan, nan, nan)
-}
] ]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment