Commit ec5bb213 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Ngrams extraction unsupervized (doc, type, function).

parent d1b3f47c
......@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire
mk n h p u = insertNodesR [nodeAnnuaireW n h p u]
getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId
getOrMkList pId uId =
maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId
......
......@@ -3,12 +3,31 @@
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}
{-
# Implementation of Unsupervized Word Segmentation
Implementation of the EleVe approach to unsupervized word segmentation,
following its Python version and the papers below.
References:
- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
[git repo](https://github.com/kodexlab/eleve.git)
- Unsupervized Word Segmentation: the case for Mandarin Chinese.
  Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7),
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes on the current implementation:
- The node count is correct; TODO: add tests to keep track of it
- NP: fix normalization
- NP: extract longer ngrams (see the paper above; the Viterbi algorithm can be used)
- TODO TEST: prop (Node c _e f) = c == Map.size f
- AD: real ngrams extraction test:
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ map _hyperdataDocument_abstract docs
NP:
* The node count is correct and we should not regress on this front.
-}
module Gargantext.Text.Eleve where
......@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)
-- prop (Node c _e f) = c == Map.size f
-- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
------------------------------------------------------------------------
data I e = I
{ _info_entropy :: e
, _info_norm_entropy :: e
......@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
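-- An illustrative sketch of setNormEntropy (values made up): with the
-- definition above, setNormEntropy (*2) 1.5 == I 1.5 3.0 1.5, i.e. the
-- raw entropy is kept and its transformed value is stored alongside it.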
testEleve n example = do
let
    ex = toToken n <$> example
    t  = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
  {-
  P.putStrLn $ Tree.drawTree
             $ fmap show
             $ toTree (NonTerminal "") nt'
  --}
  pure $ map unToken $ split info_entropy nt' $ L.concat ex
-- NP: here we use the raw entropy to split;
-- instead we should use either info_norm_entropy or info_norm_entropy'.
-- However, they should first be fixed.
-- | TODO real data is a list of tokenized sentences
example0 = [T.words "New York is New York and New York"]
example1 = [T.words "to be or not to be"]
example2 = [T.words "to be or not to be or"]
example3 = example0 <> example0 -- TEST: should not have "York New" in the trie
example4 = [map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)] -- wrapped in a list so the type matches the other examples
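-- A hedged GHCi sketch of testEleve (the grouping shown is what we
-- expect, not a recorded output):
--
-- >>> testEleve 2 example0
-- [["New","York"],["is"],["New","York"],["and"],["New","York"]]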
------------------------------------------------------------------------
------------------------------------------------------------------------
data Token = NonTerminal Text
| Terminal
deriving (Ord, Eq, Show)
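-- Assumption (toToken is defined outside this hunk): it presumably wraps
-- each word in NonTerminal and appends Terminal marker(s), e.g.
-- toToken 2 ["New","York"] ~ [NonTerminal "New", NonTerminal "York", Terminal],
-- which is what entropyTrie's (== Terminal) predicate below relies on.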
......@@ -85,8 +103,9 @@ unToken :: [Token] -> [Text]
unToken = map f
where
f (NonTerminal x) = x
f Terminal        = ""
------------------------------------------------------------------------
data Trie k e
= Node { _node_count :: Int
......@@ -96,39 +115,46 @@ data Trie k e
| Leaf { _node_count :: Int }
deriving (Show)
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
| Map.null children = Leaf c
| otherwise = Node c mempty children
insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie [] n = n { _node_count = _node_count n +1}
insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
where
f = Just . insertTrie xs . fromMaybe emptyTrie
-----------------------------
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie
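-- A worked sketch of the counts produced by insertTries (checked against
-- the definitions above):
--
-- >>> insertTries [["a","b"], ["a","c"]]
-- root: count 2; child "a": count 2; its children "b" and "c": count 1 each.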
-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
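-- Pretty-printing sketch, mirroring the commented-out code in testEleve
-- (t is any trie):
--
-- P.putStrLn $ Tree.drawTree $ fmap show $ toTree (NonTerminal "") t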
------------------------------------------------------------------------
------------------------------------------------------------------------
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _ (Leaf c) = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
where
e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
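-- Worked sketch of e: for a node with count 4 and three non-terminal
-- children with counts 2, 1 and 1:
--   e = -(2/4) * logBase 2 (2/4) - (1/4) * logBase 2 (1/4) - (1/4) * logBase 2 (1/4)
--     = 0.5 + 0.5 + 0.5 = 1.5 bits
-- A child matching pred (here Terminal) contributes chc * logBase 2 c instead.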
normalizeEntropy :: (Fractional e, Floating e, Show e)
=> Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
......@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity
es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
m = mean es
v = deviation es
------------------------------------------------------------------------
normalizeLevel :: (Fractional e, Floating e, Show e)
=> e -> e -> e -> e
......@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v
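-- Worked sketch of normalizeLevel (assuming deviation is the sample
-- standard deviation): for sibling entropies [1.0, 2.0, 3.0], m = 2.0
-- and v = 1.0, so the normalized values are [-1.0, 0.0, 1.0].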
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries
nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _) = Nothing
......@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
where
subForest :: Trie k e -> [Trie k e]
subForest (Leaf _) = []
subForest (Node _ _ children) = Map.elems children
entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
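-- Sketch: for the trie of [["a","b"], ["a","c"]] above, levels yields
-- [[root], [node "a"], [leaves "b", "c"]]; entropyLevels then keeps the
-- Node entropies only, so leaves contribute nothing at their depth.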
......@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
v = deviation es
------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
where
consRev [] xss = xss
consRev xs xss = reverse xs : xss
......@@ -203,16 +229,18 @@ split inE t0 = go t0 []
Just xt -> case nodeChild x t0 of
Nothing -> panic "TODO"
Just xt0 ->
          let et   = ne (panic "t") t
              ext0 = ne (panic "xt0") xt0
              ext  = ne 0 xt
in
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
case et {-+ ext0-} < ext of
-- NP: here we must take ext0 into account; however, currently it
-- makes the results worse.
-- For instance it currently works well for 2-grams but not more.
-- PASS: testEleve 4 example1
-- FAIL: testEleve 4 example2
True -> go xt (x:pref) xs
False -> consRev pref $ go xt0 [x] xs
ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
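-- Illustrative reading of the et < ext test above: et is the entropy at
-- the current prefix and ext at the prefix extended with x; while the
-- extension's entropy stays higher, x is kept in the current group,
-- otherwise a new group starts. E.g. we expect "New York" in example0 to
-- stay grouped (a sketch of the intent, not a recorded run).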
......@@ -37,6 +37,7 @@ module Gargantext.Text.Terms
import Control.Lens
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import Gargantext.Prelude
import Gargantext.Core
......@@ -44,6 +45,11 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms)
import qualified Data.List as List
import qualified Data.Text as Text
import Gargantext.Text (sentences)
import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Eleve (testEleve)
data TermType lang
= Mono { _tt_lang :: lang }
......@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
-- | Unsupervised, language-agnostic ngrams extraction
-- TODO: remove IO
-- TODO: BlockText
extractTermsUnsupervised :: Int -> Text -> IO [[Text]]
extractTermsUnsupervised n =
fmap List.nub
. fmap (List.filter (\l -> List.length l > 1))
. testEleve n
. map (map Text.toLower)
. map (List.filter (not . isPunctuation))
. map tokenize
. sentences
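-- A hedged usage sketch (output illustrative, not a recorded run):
--
-- >>> extractTermsUnsupervised 2 "New York is big. New York is new."
-- [["new","york"]]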