Commit 9643c219 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[ELEVE] Ngrams, still NaN.

parent 7b3d0ab6
...@@ -151,6 +151,7 @@ library: ...@@ -151,6 +151,7 @@ library:
- protolude - protolude
- pureMD5 - pureMD5
- SHA - SHA
- simple-reflect
- random - random
- rake - rake
- regex-compat - regex-compat
......
...@@ -98,18 +98,20 @@ eavg [] = 0 ...@@ -98,18 +98,20 @@ eavg [] = 0
-- Simple Average -- Simple Average
mean :: Fractional a => [a] -> a mean :: Fractional a => [a] -> a
mean xs = if L.null xs then 0.0 mean xs = sum xs / fromIntegral (length xs)
else sum xs / fromIntegral (length xs)
sumMaybe :: Num a => [Maybe a] -> Maybe a sumMaybe :: Num a => [Maybe a] -> Maybe a
sumMaybe = fmap sum . M.sequence sumMaybe = fmap sum . M.sequence
variance :: Floating a => [a] -> a variance :: Floating a => [a] -> a
variance xs = mean $ map (\x -> (x - m) ** 2) xs where variance xs = sum ys / (fromIntegral (length xs) - 1)
where
m = mean xs m = mean xs
ys = map (\x -> (x - m) ** 2) xs
deviation :: [Double] -> Double deviation :: Floating a => [a] -> a
deviation = sqrt . variance deviation = sqrt . variance
movingAverage :: (Eq b, Fractional b) => Int -> [b] -> [b] movingAverage :: (Eq b, Fractional b) => Int -> [b] -> [b]
...@@ -242,8 +244,8 @@ scaleMinMax xs = map (\x -> (x - mi / (ma - mi + 1) )) xs' ...@@ -242,8 +244,8 @@ scaleMinMax xs = map (\x -> (x - mi / (ma - mi + 1) )) xs'
scaleNormalize :: [Double] -> [Double] scaleNormalize :: [Double] -> [Double]
scaleNormalize xs = map (\x -> (x - v / (m + 1))) xs' scaleNormalize xs = map (\x -> (x - v / (m + 1))) xs'
where where
v = variance xs' v = variance xs'
m = mean xs' m = mean xs'
xs' = map abs xs xs' = map abs xs
normalize :: [Double] -> [Double] normalize :: [Double] -> [Double]
......
...@@ -8,7 +8,10 @@ Implementation of EleVe Python version of papers: ...@@ -8,7 +8,10 @@ Implementation of EleVe Python version of papers:
-} -}
module Gargantext.Text.Eleve where module Gargantext.Text.Eleve where
import Debug.Trace (trace)
import Debug.SimpleReflect
import Control.Monad (foldM)
import Data.Ord (Ord) import Data.Ord (Ord)
import qualified Data.List as L import qualified Data.List as L
import Data.Monoid import Data.Monoid
...@@ -18,34 +21,60 @@ import Data.Map (Map) ...@@ -18,34 +21,60 @@ import Data.Map (Map)
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import qualified Data.Map as Map import qualified Data.Map as Map
import Gargantext.Prelude import Gargantext.Prelude
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)
-- prop (Node c _e f) = c == Map.size f -- prop (Node c _e f) = c == Map.size f
-- TODO remove Leaf -- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time) -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
example :: [[Token]] --test = split t ts
example = map token test n example = do
$ chunkAlong 3 1 let
$ T.words "New York and New York is a big apple" ex = toToken n example
t = buildTrie $ chunkAlong n 1 ex
data Token = NonTerminal Text | Terminal P.putStrLn $ Tree.drawTree
$ fmap show
$ toTree (NonTerminal "") t
pure $ map unToken $ split t t [] ex
example' = T.words "New York and New York"
example'' = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
data Token = NonTerminal Text
| Terminal
deriving (Ord, Eq, Show) deriving (Ord, Eq, Show)
token :: [Text] -> [Token] toToken :: Int -> [Text] -> [Token]
token xs = (NonTerminal <$> xs) <> [Terminal] toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
unToken :: [Token] -> [Text]
unToken = map f
where
f (NonTerminal x) = x
f Terminal = ""
data Trie k e data Trie k e
= Node { _node_count :: Int = Node { _node_count :: Int
, _node_entropy :: e , _node_entropy :: e
, _node_children :: Map k (Trie k e) , _node_children :: Map k (Trie k e)
} }
-- | Leaf { _node_count :: Int } | Leaf { _node_count :: Int }
deriving (Show) deriving (Show)
toTree :: k -> Trie k e -> Tree (k,Int,e)
toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs)
-- emptyTrie :: Trie k e -- emptyTrie :: Trie k e
-- emptyTrie = Leaf 0 emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e
emptyTrie :: (Ord k, Monoid e) => Trie k e --emptyTrie n = Node n mempty mempty
emptyTrie = Node 0 mempty mempty emptyTrie = Leaf
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children mkTrie c children
...@@ -53,39 +82,40 @@ mkTrie c children ...@@ -53,39 +82,40 @@ mkTrie c children
| otherwise -} = Node c mempty children | otherwise -} = Node c mempty children
insertTrie :: Ord k => [k] -> Trie k () -> Trie k () insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie [] n = n insertTrie [] n = n { _node_count = _node_count n +1}
-- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie) -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
where where
f = Just . insertTrie xs . fromMaybe emptyTrie f = Just . insertTrie xs . fromMaybe (emptyTrie 0)
insertTries :: Ord k => [[k]] -> Trie k () insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie insertTries = L.foldr insertTrie (emptyTrie 1)
entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
-- entropyTrie _ (Leaf c) = Leaf c -- entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children) entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
where where
e = sum $ f <$> Map.toList children e = sum $ map f $ Map.toList children
f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c)
else - cfc * P.logBase 2 cfc
where where
cfc = fromIntegral (_node_count child) / fromIntegral c cfc = fromIntegral (_node_count child) / fromIntegral c
normalizeEntropy :: Trie k Double -> Trie k Double normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e
-- normalizeEntropy (Leaf c) = Leaf c -- normalizeEntropy (Leaf c) = Leaf c
normalizeEntropy (Node c e children) = normalizeEntropy (Node c e children) =
Node c e $ normalizeLevel m v . normalizeEntropy <$> children trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children
where where
es = _node_entropy <$> Map.elems children es = map _node_entropy $ Map.elems children
m = mean es m = mean es
v = variance es v = deviation es
normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e
-- normalizeLevel _ _ (Leaf c) = Leaf c -- normalizeLevel _ _ (Leaf c) = Leaf c
-- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children --normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) }
normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v } normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v}
buildTrie :: [[Token]] -> Trie Token Double buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
subForest :: Trie k e -> [Trie k e] subForest :: Trie k e -> [Trie k e]
...@@ -98,17 +128,36 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure ...@@ -98,17 +128,36 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
entropyLevels :: Trie k e -> [[e]] entropyLevels :: Trie k e -> [[e]]
entropyLevels = fmap (fmap _node_entropy) . levels entropyLevels = fmap (fmap _node_entropy) . levels
normalizeEntropy' :: Trie k Double -> Trie k Double normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e
normalizeEntropy' t = go (entropyLevels t) t normalizeEntropy' t = go (entropyLevels t) t
where where
go :: [[Double]] -> Trie k Double -> Trie k Double go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e
go [] _ = panic "normalizeEntropy' empty levels" go [] _ = panic "normalizeEntropy' empty levels"
-- go _ (Leaf c) = Leaf c -- go _ (Leaf c) = Leaf c
go (es : ess) (Node c e children) = go (es : ess) (Node c e children) =
Node c e (normalizeLevel m v . go ess <$> children) Node c e (normalizeLevel m v . go ess <$> children)
where where
m = mean es m = mean es
v = variance es v = deviation es
buildTrie' :: [[Token]] -> Trie Token Double buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries
------------------------------------------------------------------------
autonomie :: Trie Token e -> Token -> e
autonomie trie t = case (Map.lookup t (_node_children trie)) of
Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t)
Just a -> _node_entropy a
------------------------------------------------------------------------
split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]]
split _ _ pref [] = [reverse pref]
split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of
Nothing -> reverse pref : split t0 t0 [x] xs
Just a -> case Map.lookup x $ _node_children t0 of
Nothing -> panic "TODO" -- reverse pref : split t0 t0 [] xs
Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of
True -> split t0 a (x:pref) xs
False -> reverse pref : split t0 xt0 [x] xs
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment