Commit ec5bb213 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Ngrams extraction unsupervized (doc, type, function).

parent d1b3f47c
...@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire ...@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire
mk n h p u = insertNodesR [nodeAnnuaireW n h p u] mk n h p u = insertNodesR [nodeAnnuaireW n h p u]
getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId
getOrMkList pId uId = getOrMkList pId uId =
maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId
......
...@@ -3,12 +3,31 @@ ...@@ -3,12 +3,31 @@
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{- {-
# Implementation of Unsupervized Word Segmentation
Implementation of EleVe Python version of papers: References:
- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
[git repo](https://github.com/kodexlab/eleve.git)
- Unsupervized Word Segmentation: the case for Mandarin Chinese Pierre
Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
the 50th Annual Meeting of the Association for Computational Linguistics
, pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation:
- The node count is correct; TODO add tests to keep track of it
- NP fix normalization
- NP extract longer ngrams (see paper above, viterbi algo can be used)
- TODO TEST: prop (Node c _e f) = c == Map.size f
- AD: Real ngrams extraction test
from Gargantext.Text.Terms import extractTermsUnsupervised
docs <- runCmdRepl $ selectDocs 1004
extractTermsUnsupervised 3 $ DT.intercalate " "
$ catMaybes
$ Gargantext.map _hyperdataDocument_abstract docs
NP:
* The node count is correct and we should not regress on this front.
-} -}
module Gargantext.Text.Eleve where module Gargantext.Text.Eleve where
...@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree ...@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree
import Data.Tree (Tree) import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String) import qualified Prelude as P (putStrLn, logBase, String)
-- prop (Node c _e f) = c == Map.size f ------------------------------------------------------------------------
-- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
data I e = I data I e = I
{ _info_entropy :: e { _info_entropy :: e
, _info_norm_entropy :: e , _info_norm_entropy :: e
...@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o ...@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy :: ModEntropy e (I e) e setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy") setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
test n example = do testEleve n example = do
let let
ex = toToken n example ex = toToken n <$> example
t = buildTrie $ chunkAlong n 1 ex t = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double) nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
{-
P.putStrLn $ Tree.drawTree P.putStrLn $ Tree.drawTree
$ fmap show $ fmap show
$ toTree (NonTerminal "") nt' $ toTree (NonTerminal "") nt'
--}
pure $ map unToken $ split info_entropy nt' ex pure $ map unToken $ split info_entropy nt' $ L.concat ex
-- NP: here we use the entropy to split -- NP: here we use the entropy to split
-- instead we should use either: -- instead we should use either:
-- info_norm_entropy or info_norm_entropy' -- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed. -- However they should first be fixed.
-- | TODO real data is a list of tokenized sentences
example0 = [T.words "New York is New York and New York"]
example1 = [T.words "to be or not to be"]
example2 = [T.words "to be or not to be or"]
example3 = example0 <> example0 -- > TEST: Should not have York New in the trie
example4 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
example0 = T.words "New York is New York and New York" ------------------------------------------------------------------------
example1 = T.words "to be or not to be" ------------------------------------------------------------------------
example2 = T.words "to be or not to be or"
example3 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
data Token = NonTerminal Text data Token = NonTerminal Text
| Terminal | Terminal
deriving (Ord, Eq, Show) deriving (Ord, Eq, Show)
...@@ -85,8 +103,9 @@ unToken :: [Token] -> [Text] ...@@ -85,8 +103,9 @@ unToken :: [Token] -> [Text]
unToken = map f unToken = map f
where where
f (NonTerminal x) = x f (NonTerminal x) = x
f Terminal = "" f Terminal = ""
------------------------------------------------------------------------
data Trie k e data Trie k e
= Node { _node_count :: Int = Node { _node_count :: Int
...@@ -96,39 +115,46 @@ data Trie k e ...@@ -96,39 +115,46 @@ data Trie k e
| Leaf { _node_count :: Int } | Leaf { _node_count :: Int }
deriving (Show) deriving (Show)
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c) = Tree.Node (k, c, Nothing) [] insertTries :: Ord k => [[k]] -> Trie k ()
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs) insertTries = L.foldr insertTrie emptyTrie
insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie [] n = n { _node_count = _node_count n +1}
insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
where
f = Just . insertTrie xs . fromMaybe emptyTrie
-- emptyTrie :: (Ord k, Monoid e) => Trie k e -- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty -- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e emptyTrie :: Trie k e
emptyTrie = Leaf 0 emptyTrie = Leaf 0
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children mkTrie c children
| Map.null children = Leaf c | Map.null children = Leaf c
| otherwise = Node c mempty children | otherwise = Node c mempty children
insertTrie :: Ord k => [k] -> Trie k () -> Trie k () -----------------------------
insertTrie [] n = n { _node_count = _node_count n +1}
insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
where
f = Just . insertTrie xs . fromMaybe emptyTrie
insertTries :: Ord k => [[k]] -> Trie k () -- | Trie to Tree since Tree as nice print function
insertTries = L.foldr insertTrie emptyTrie toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
------------------------------------------------------------------------
------------------------------------------------------------------------
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _ (Leaf c) = Leaf c entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children) entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
where where
e = sum $ map f $ Map.toList children e = sum $ map f $ Map.toList children
f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c) f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
else - cfc * P.logBase 2 cfc else - chc * P.logBase 2 chc
where where
cfc = fromIntegral (_node_count child) / fromIntegral c chc = fromIntegral (_node_count child) / fromIntegral c
normalizeEntropy :: (Fractional e, Floating e, Show e) normalizeEntropy :: (Fractional e, Floating e, Show e)
=> Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
...@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity ...@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity
es = [ i' ^. inE | Node _ i' _ <- Map.elems children ] es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
m = mean es m = mean es
v = deviation es v = deviation es
------------------------------------------------------------------------
normalizeLevel :: (Fractional e, Floating e, Show e) normalizeLevel :: (Fractional e, Floating e, Show e)
=> e -> e -> e -> e => e -> e -> e -> e
...@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v ...@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries buildTrie = entropyTrie (== Terminal) . insertTries
subForest :: Trie k e -> [Trie k e]
subForest (Leaf _) = []
subForest (Node _ _ children) = Map.elems children
nodeEntropy :: Trie k e -> Maybe e nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _) = Nothing nodeEntropy (Leaf _) = Nothing
...@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing ...@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing
levels :: Trie k e -> [[Trie k e]] levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
where
subForest :: Trie k e -> [Trie k e]
subForest (Leaf _) = []
subForest (Node _ _ children) = Map.elems children
entropyLevels :: Getting e i e -> Trie k i -> [[e]] entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
...@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t ...@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
v = deviation es v = deviation es
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]] split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 [] split inE t0 = go t0 []
where where
ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
consRev [] xss = xss consRev [] xss = xss
consRev xs xss = reverse xs : xss consRev xs xss = reverse xs : xss
...@@ -203,16 +229,18 @@ split inE t0 = go t0 [] ...@@ -203,16 +229,18 @@ split inE t0 = go t0 []
Just xt -> case nodeChild x t0 of Just xt -> case nodeChild x t0 of
Nothing -> panic "TODO" Nothing -> panic "TODO"
Just xt0 -> Just xt0 ->
let et = ne (panic "t") t let et = ne (panic "t") t
ext0 = ne (panic "xt0") xt0 ext0 = ne (panic "xt0") xt0
ext = ne 0 xt ext = ne 0 xt
in in
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $ -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
case et {-+ ext0-} < ext of case et {-+ ext0-} < ext of
-- NP: here we must take ext0 in account howover currently it -- NP: here we must take ext0 in account however currently it
-- makes it worse. -- makes it worse.
-- For instance it currently works well to 2-grams but not more. -- For instance it currently works well to 2-grams but not more.
-- PASS: test 4 example1 -- PASS: test 4 example1
-- FAIL: test 4 example2 -- FAIL: test 4 example2
True -> go xt (x:pref) xs True -> go xt (x:pref) xs
False -> consRev pref $ go xt0 [x] xs False -> consRev pref $ go xt0 [x] xs
ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
...@@ -37,6 +37,7 @@ module Gargantext.Text.Terms ...@@ -37,6 +37,7 @@ module Gargantext.Text.Terms
import Control.Lens import Control.Lens
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable import Data.Traversable
import GHC.Base (String)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core import Gargantext.Core
...@@ -44,6 +45,11 @@ import Gargantext.Core.Types ...@@ -44,6 +45,11 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms) import Gargantext.Text.Terms.Mono (monoTerms)
import qualified Data.List as List
import qualified Data.Text as Text
import Gargantext.Text (sentences)
import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Eleve (testEleve)
data TermType lang data TermType lang
= Mono { _tt_lang :: lang } = Mono { _tt_lang :: lang }
...@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt ...@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: BlockText
extractTermsUnsupervised :: Int -> Text -> IO [[Text]]
extractTermsUnsupervised n =
fmap List.nub
. fmap (List.filter (\l -> List.length l > 1))
. testEleve n
. map (map Text.toLower)
. map (List.filter (not . isPunctuation))
. map tokenize
. sentences
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment