Eleve refactor

parent a6cc86a9
...@@ -10,89 +10,103 @@ module Gargantext.Text.Eleve where ...@@ -10,89 +10,103 @@ module Gargantext.Text.Eleve where
import Data.Ord (Ord) import Data.Ord (Ord)
import qualified Data.List as List import qualified Data.List as L
import Data.Monoid import Data.Monoid
import Data.Text hiding (map) import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map) import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map import qualified Data.Map as Map
import Gargantext.Prelude import Gargantext.Prelude
-- prop (Noeud c _e f) = c == Map.size f -- prop (Node c _e f) = c == Map.size f
-- TODO remove Feuille -- TODO remove Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
example :: [[Terminal]] example :: [[Token]]
example = map terminal example = map token
$ chunkAlong 3 1 $ chunkAlong 3 1
$ words "New York and New York is a big apple" $ T.words "New York and New York is a big apple"
data Terminal = Terminal Text | Fin data Token = NonTerminal Text | Fin
deriving (Ord, Eq, Show) deriving (Ord, Eq, Show)
isFin :: Terminal -> Bool isFin :: Token -> Bool
isFin x = case x of isFin x = case x of
Fin -> True Fin -> True
_ -> False _ -> False
terminal :: [Text] -> [Terminal] token :: [Text] -> [Token]
terminal xs = (map Terminal xs) <> [Fin] token xs = (NonTerminal <$> xs) <> [Fin]
data Trie k e
= Node { _node_count :: Int
data Arbre k e = Noeud { _noeud_count :: Double , _node_entropy :: e
, _noeud_entropy :: e , _node_children :: Map k (Trie k e)
, _noeud_fils :: Map k (Arbre k e) }
} -- | Leaf { _node_count :: Int }
| Feuille { _noeud_count :: Double } deriving (Show)
deriving (Show)
-- emptyTrie :: Trie k e
arbreVide :: Arbre k e -- emptyTrie = Leaf 0
arbreVide = Feuille 0 emptyTrie :: (Ord k, Monoid e) => Trie k e
emptyTrie = Node 0 mempty mempty
mkArbre :: Monoid e => Double -> Map Terminal (Arbre Terminal e) -> Arbre Terminal e
mkArbre c fils mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
| Map.null fils = Feuille c mkTrie c children
| otherwise = Noeud c mempty fils {-| Map.null children = Leaf c
| otherwise -} = Node c mempty children
insertArbre :: [Terminal] -> Arbre Terminal () -> Arbre Terminal () insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertArbre [] n = n insertTrie [] n = n
insertArbre (x:xs) (Feuille c) = mkArbre (c+1) (Map.singleton x $ insertArbre xs arbreVide) -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
insertArbre (x:xs) (Noeud c _e f) = mkArbre (c+1) (case Map.lookup x f of insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
Nothing -> Map.insert x (insertArbre xs arbreVide) f where
Just arbre -> Map.insert x (insertArbre xs arbre ) f f = Just . insertTrie xs . fromMaybe emptyTrie
)
insertArbres :: [[Terminal]] -> Arbre Terminal () insertTries :: Ord k => [[k]] -> Trie k ()
insertArbres = List.foldr insertArbre arbreVide insertTries = L.foldr insertTrie emptyTrie
entropyArbre :: Arbre Terminal () -> Arbre Terminal Double entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double
entropyArbre (Feuille c) = Feuille c -- entropyTrie _ (Leaf c) = Leaf c
entropyArbre (Noeud c _e fils) = (Noeud c e (map entropyArbre fils)) entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children)
where where
e = sum $ map (\(k, f) -> case isFin k of e = sum $ f <$> Map.toList children
True -> (_noeud_count f) / c * log c f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc
False -> - c' * log c' where
where cfc = fromIntegral (_node_count child) / fromIntegral c
c' = (_noeud_count f) / c
) normalizeEntropy :: Trie k Double -> Trie k Double
$ Map.toList fils -- normalizeEntropy (Leaf c) = Leaf c
normalizeEntropy (Node c e children) =
normalizeArbre :: Arbre Terminal Double -> Arbre Terminal Double Node c e $ normalizeLevel m v . normalizeEntropy <$> children
normalizeArbre (Feuille c) = Feuille c
normalizeArbre (Noeud c e f) = Noeud c e (Map.map (\a -> normalizeLevel a $ Map.elems f) f)
normalizeLevel :: Arbre Terminal Double -> [Arbre Terminal Double] -> Arbre Terminal Double
normalizeLevel (Feuille c) _ = Feuille c
normalizeLevel (Noeud c e f) ns = Noeud c ( (e-m) / v) f
where where
es = map _noeud_entropy ns es = _node_entropy <$> Map.elems children
m = mean es m = mean es
v = variance es v = variance es
buildArbre :: [[Terminal]] -> Arbre Terminal Double normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double
buildArbre = normalizeArbre . entropyArbre . insertArbres -- normalizeLevel _ _ (Leaf c) = Leaf c
normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children
buildTrie :: [[Token]] -> Trie Token Double
buildTrie = normalizeEntropy . entropyTrie isFin . insertTries
subForest :: Trie k e -> [Trie k e]
-- subForest (Leaf _) = []
subForest (Node _ _ children) = Map.elems children
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
entropyLevels :: Trie k e -> [[e]]
entropyLevels = fmap (fmap _node_entropy) . levels
normalizeEntropy' :: [[Double]] -> Trie k Double -> Trie k Double
normalizeEntropy' [] _ = panic "normalizeEntropy' empty levels"
-- normalizeEntropy' _ (Leaf c) = Leaf c
normalizeEntropy' (es : ess) (Node c e children) =
Node c e (normalizeLevel m v . normalizeEntropy' ess <$> children)
where
m = mean es
v = variance es
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment