[FEAT] Ngrams extraction unsupervized (doc, type, function).

ec5bb213 · Alexandre Delanoë · d1b3f47c · ec5bb213 · ec5bb213 · ec5bb213
Commit ec5bb213 authored May 26, 2019 by Alexandre Delanoë
Show whitespace changes
Inline Side-by-side

Showing with 98 additions and 46 deletions

Node.hs src/Gargantext/Database/Schema/Node.hs +0 -2

Eleve.hs src/Gargantext/Text/Eleve.hs +72 -44

Terms.hs src/Gargantext/Text/Terms.hs +26 -0

No files found.
--- a/src/Gargantext/Database/Schema/Node.hs
+++ b/src/Gargantext/Database/Schema/Node.hs
@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire
    mk n h p u = insertNodesR [nodeAnnuaireW n h p u]
 getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId
 getOrMkList pId uId =
  maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId

--- a/src/Gargantext/Text/Eleve.hs
+++ b/src/Gargantext/Text/Eleve.hs
@@ -3,12 +3,31 @@
 {-# LANGUAGE RankNTypes        #-}
 {-# LANGUAGE TemplateHaskell   #-}
 {-
+# Implementation of Unsupervized Word Segmentation
-Implementation of EleVe Python version of papers:
+References:
+- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
+  [git repo](https://github.com/kodexlab/eleve.git)
+- Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
+  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
+  the 50th Annual Meeting of the Association for Computational Linguistics
+  , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
+Notes for current implementation:
+- The node count is correct; TODO add tests to keep track of it
+- NP fix normalization
+- NP extract longer ngrams (see paper above, viterbi algo can be used)
+- TODO TEST: prop (Node c _e f) = c == Map.size f
+- AD: Real ngrams extraction test
+  from Gargantext.Text.Terms import extractTermsUnsupervised
+  docs <- runCmdRepl $ selectDocs 1004
+  extractTermsUnsupervised 3 $ DT.intercalate " "
+                        $ catMaybes
+                        $ Gargantext.map _hyperdataDocument_abstract docs
-NP:
-  * The node count is correct and we should not regress on this front.
 -}
 module Gargantext.Text.Eleve where
@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree
 import Data.Tree (Tree)
 import qualified Prelude as P (putStrLn, logBase, String)
-- prop (Node c _e f) = c == Map.size f
+------------------------------------------------------------------------
-- TODO maybe add Leaf
--   NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
 data I e = I
  { _info_entropy       :: e
  , _info_norm_entropy  :: e
@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o
 setNormEntropy :: ModEntropy e (I e) e
 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
-test n example = do
+testEleve n example = do
  let
-    ex  = toToken n example
+    ex  = toToken n <$> example
-    t   = buildTrie $ chunkAlong n 1 ex
+    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
    nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
    nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
+  {-
  P.putStrLn $ Tree.drawTree
               $ fmap show
               $ toTree (NonTerminal "") nt'
+  --}
-  pure $ map unToken $ split info_entropy nt' ex
+  pure $ map unToken $ split info_entropy nt' $ L.concat ex
                          -- NP: here we use the entropy to split
                          -- instead we should use either:
                          --   info_norm_entropy or info_norm_entropy'
                          -- However they should first be fixed.
+-- | TODO real data is a list of tokenized sentences
+example0 =  [T.words "New York is New York and New York"]
+example1 =  [T.words "to be or not to be"]
+example2 =  [T.words "to be or not to be or"]
+example3 =  example0 <> example0 -- > TEST: Should not have York New in the trie
+example4 =  map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
-example0 =  T.words "New York is New York and New York"
+------------------------------------------------------------------------
-example1 =  T.words "to be or not to be"
+------------------------------------------------------------------------
-example2 =  T.words "to be or not to be or"
-example3 =  map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
 data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)
@@ -87,6 +105,7 @@ unToken = map f
    f (NonTerminal x) = x
    f Terminal        = ""
+------------------------------------------------------------------------
 data Trie k e
  = Node { _node_count    :: Int
@@ -96,9 +115,16 @@ data Trie k e
 | Leaf { _node_count    :: Int }
  deriving (Show)
-toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
-toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
+insertTries :: Ord k => [[k]] -> Trie k ()
-toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
+insertTries = L.foldr insertTrie emptyTrie
+insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
+insertTrie []     n                    = n { _node_count = _node_count n +1}
+insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
+insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
+  where
+    f = Just . insertTrie xs . fromMaybe emptyTrie
 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
 -- emptyTrie = Node 0 mempty mempty
@@ -110,25 +136,25 @@ mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
-insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
+                        -----------------------------
-insertTrie []     n                    = n { _node_count = _node_count n +1}
-insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
-insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
-  where
-    f = Just . insertTrie xs . fromMaybe emptyTrie
-insertTries :: Ord k => [[k]] -> Trie k ()
+-- | Trie to Tree since Tree as nice print function
-insertTries = L.foldr insertTrie emptyTrie
+toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
+toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
+toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
+------------------------------------------------------------------------
+------------------------------------------------------------------------
 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
 entropyTrie _    (Leaf c)             = Leaf c
 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
-    f (k, child) = if pred k then   cfc * P.logBase 2 (fromIntegral c)
+    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
-                             else - cfc * P.logBase 2 cfc
+                             else - chc * P.logBase 2 chc
      where
-        cfc = fromIntegral (_node_count child) / fromIntegral c
+        chc = fromIntegral (_node_count child) / fromIntegral c
 normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
        m  = mean      es
        v  = deviation es
+------------------------------------------------------------------------
 normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v
 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
 buildTrie = entropyTrie (== Terminal) . insertTries
-subForest :: Trie k e -> [Trie k e]
-subForest (Leaf _)            = []
-subForest (Node _ _ children) = Map.elems children
 nodeEntropy :: Trie k e -> Maybe e
 nodeEntropy (Node _ e _) = Just e
 nodeEntropy (Leaf _)     = Nothing
@@ -168,6 +191,10 @@ nodeChild _ (Leaf _)      = Nothing
 levels :: Trie k e -> [[Trie k e]]
 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
+  where
+    subForest :: Trie k e -> [Trie k e]
+    subForest (Leaf _)            = []
+    subForest (Node _ _ children) = Map.elems children
 entropyLevels :: Getting e i e -> Trie k i -> [[e]]
 entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
        v  = deviation es
 ------------------------------------------------------------------------
+------------------------------------------------------------------------
 split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
 split inE t0 = go t0 []
  where
-    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss
@@ -209,10 +235,12 @@ split inE t0 = go t0 []
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          case et {-+ ext0-} < ext of
-            -- NP: here we must take ext0 in account howover currently it
+            -- NP: here we must take ext0 in account however currently it
            -- makes it worse.
            -- For instance it currently works well to 2-grams but not more.
            -- PASS: test 4 example1
            -- FAIL: test 4 example2
            True  -> go xt (x:pref) xs
            False -> consRev pref $ go xt0 [x] xs
+    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -37,6 +37,7 @@ module Gargantext.Text.Terms
 import Control.Lens
 import Data.Text (Text)
 import Data.Traversable
+import GHC.Base (String)
 import Gargantext.Prelude
 import Gargantext.Core
@@ -44,6 +45,11 @@ import Gargantext.Core.Types
 import Gargantext.Text.Terms.Multi (multiterms)
 import Gargantext.Text.Terms.Mono  (monoTerms)
+import qualified Data.List as List
+import qualified Data.Text as Text
+import Gargantext.Text (sentences)
+import Gargantext.Text.Terms.Mono.Token.En (tokenize)
+import Gargantext.Text.Eleve (testEleve)
 data TermType lang
  = Mono      { _tt_lang :: lang }
@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt
 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 ------------------------------------------------------------------------
+isPunctuation :: Text -> Bool
+isPunctuation x = List.elem x $  (Text.pack . pure)
+                             <$> ("!?(),;." :: String)
+-- | Unsupervised ngrams extraction
+-- language agnostic extraction
+-- TODO: remove IO
+-- TODO: BlockText 
+extractTermsUnsupervised :: Int -> Text -> IO [[Text]]
+extractTermsUnsupervised n = 
+               fmap List.nub
+             . fmap (List.filter (\l -> List.length l > 1))
+             . testEleve n
+             . map (map Text.toLower)
+             . map (List.filter (not . isPunctuation))
+             . map tokenize
+             . sentences