Merge branch 'dev' into features/acp-init-graph

8e40d912 · Alexandre Delanoë · 5863fee7 · bd4e8f10 · 8e40d912 · 8e40d912
Commit 8e40d912 authored May 29, 2019 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 106 additions and 62 deletions

Eleve.hs src/Gargantext/Text/Eleve.hs +100 -57

Terms.hs src/Gargantext/Text/Terms.hs +6 -5

No files found.
--- a/src/Gargantext/Text/Eleve.hs
+++ b/src/Gargantext/Text/Eleve.hs
@@ -10,8 +10,9 @@ Portability : POSIX
 # Implementation of Unsupervized Word Segmentation
 References:
- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
-  [git repo](https://github.com/kodexlab/eleve.git)
+- Python implementation (Korantin August, Emmanuel Navarro):
+  [EleVe](https://github.com/kodexlab/eleve.git)
 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
@@ -19,9 +20,8 @@ References:
  , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
 Notes for current implementation:
- The node count is correct; TODO AD add tests to keep track of it
+- TODO fix normalization
- NP fix normalization
+- TODO extract longer ngrams (see paper above, viterbi algo can be used)
- NP extract longer ngrams (see paper above, viterbi algo can be used)
 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
 - AD: Real ngrams extraction test
@@ -31,7 +31,6 @@ Notes for current implementation:
                        $ catMaybes
                        $ Gargantext.map _hyperdataDocument_abstract docs
 -}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
@@ -40,11 +39,11 @@ Notes for current implementation:
 module Gargantext.Text.Eleve where
-import Debug.Trace (trace)
+-- import Debug.Trace (trace)
 -- import Debug.SimpleReflect
-import Control.Lens (Lens, Lens', ASetter, Getting, (^.), (^?), (&), (.~), (%~), view, makeLenses, _Just)
+import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
-import Control.Monad (foldM)
+import Control.Monad (foldM, mapM_, forM_)
 import Data.Ord (Ord)
 import qualified Data.List as L
 import Data.Monoid
@@ -56,7 +55,7 @@ import qualified Data.Map as Map
 import Gargantext.Prelude hiding (cs)
 import qualified Data.Tree as Tree
 import Data.Tree (Tree)
-import qualified Prelude as P (putStrLn, logBase, String)
+import qualified Prelude as P (putStrLn, logBase)
 ------------------------------------------------------------------------
 -- | Example and tests for development
@@ -76,32 +75,6 @@ type ModEntropy i o e = (e -> e) -> i -> o
 setNormEntropy :: ModEntropy e (I e) e
 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
-testEleve n example = do
-  let
-    ex  = toToken n <$> example
-    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
-    nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
-    nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
-  {-
-  P.putStrLn $ Tree.drawTree
-               $ fmap show
-               $ toTree (NonTerminal "") nt'
-  --}
-  pure $ map unToken $ split info_entropy nt' $ L.concat ex
-                          -- NP: here we use the entropy to split
-                          -- instead we should use either:
-                          --   info_norm_entropy or info_norm_entropy'
-                          -- However they should first be fixed.
-- | TODO real data is a list of tokenized sentences
-example0 =  [T.words "New York is New York and New York"]
-example1 =  [T.words "to be or not to be"]
-example2 =  [T.words "to be or not to be or"]
-example3 =  example0 <> example0 -- > TEST: Should not have York New in the trie
-example4 =  map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
------------------------------------------------------------------------
------------------------------------------------------------------------
 data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)
@@ -125,6 +98,7 @@ data Trie k e
 | Leaf { _node_count    :: Int }
  deriving (Show)
+makeLenses ''Trie
 insertTries :: Ord k => [[k]] -> Trie k ()
 insertTries = L.foldr insertTrie emptyTrie
@@ -171,13 +145,16 @@ normalizeEntropy :: (Fractional e, Floating e, Show e)
 normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
-    go f (Node c i children) | not (Map.null children) =
+    go f (Node c i children)
-        -- trace (show $ L.length es) $
+      | Map.null children =
-        Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
+          panic "normalizeEntropy: impossible"
-      where
+      | otherwise         =
-        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
+          -- trace (show $ L.length es) $
-        m  = mean      es
+          Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
-        v  = deviation es
+        where
+          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
+          m  = mean      es
+          v  = deviation es
 ------------------------------------------------------------------------
 normalizeLevel :: (Fractional e, Floating e, Show e)
@@ -195,6 +172,13 @@ nodeChildren :: Trie k e -> Map k (Trie k e)
 nodeChildren (Node _ _ cs) = cs
 nodeChildren (Leaf _)      = Map.empty
+nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
+nodeChild k (Node _ _ cs) = Map.lookup k cs
+nodeChild _ (Leaf _)      = Nothing
+findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
+findTrie ks t = foldM (flip nodeChild) t ks
 levels :: Trie k e -> [[Trie k e]]
 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
@@ -229,29 +213,88 @@ split inE t0 = go t0 []
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss
-    go _ pref [] = [reverse pref]
+    go _ pref []           = [reverse pref]
+    go _ pref (Terminal:_) = [reverse pref]
    go t pref (x:xs) = case nodeChild x t of
      Nothing -> consRev pref $ go t0 [x] xs
      Just xt -> case nodeChild x t0 of
-        Nothing  -> panic "TODO"
+        Nothing  -> panic $ "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
+          --  ^ entropy of the current prefix
              ext0 = ne (panic "xt0") xt0
+          --  ^ entropy of [x]
              ext  = ne 0 xt
+          --  ^ entropy of the current prefix plus x
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
-          case et {-+ ext0-} < ext of
+          if ext + ext0 > et
-            -- NP: here we must take ext0 in account however currently it
+            then go xt (x:pref) xs
-            -- makes it worse.
+            else consRev pref $ go xt0 [x] xs
-            -- For instance it currently works well to 2-grams but not more.
-            -- PASS: test 4 example1
-            -- FAIL: test 4 example2
-            True  -> go xt (x:pref) xs
-            False -> consRev pref $ go xt0 [x] xs
-    nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
+    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
-    nodeChild k (Node _ _ cs) = Map.lookup k cs
-    nodeChild _ (Leaf _)      = Nothing
+------------------------------------------------------------------------
+------------------------------------------------------------------------
-    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
+mainEleve :: Int -> [[Text]] -> [[[Text]]]
+mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
+  where
+    inp = toToken (n - 1) <$> input
+    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
+                          -- NP: here we use the entropy to split
+                          -- instead we should use either:
+                          --   info_norm_entropy or info_norm_entropy'
+                          -- However they should first be fixed.
+testEleve :: Bool -> Int -> [Text] -> IO Bool
+testEleve debug n output = do
+  let
+    out = T.words <$> output
+    expected = fmap (T.splitOn "-") <$> out
+    input = (T.splitOn "-" =<<) <$> out
+    inp = toToken (n - 1) <$> input
+    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
+    nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
+    nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
+    pss = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
+          | ps <- L.nub $ [ c
+                          | m <- [1..n]
+                          , cs <- chunkAlong m 1 <$> inp
+                          , c <- cs
+                          ]
+          ]
+    res = map unToken . split identity t <$> inp
+  when debug $ do
+    P.putStrLn (show input)
+    mapM_ (P.putStrLn . show) pss
+    P.putStrLn $ Tree.drawTree
+              $ fmap show
+              $ toTree (NonTerminal "") nt'
+    P.putStrLn $ show res
+  pure $ expected == res
+-- | TODO real data is a list of tokenized sentences
+example0, example1, example2, example3, example4, example5 :: [Text]
+example0 =  ["New-York is New-York and New-York"]
+example1 =  ["to-be or not to-be"]
+example2 =  ["to-be-or not to-be-or NOT to-be and"]
+example3 =  example0 <> example0
+       -- > TEST: Should not have York New in the trie
+example4 =  ["a-b-c-d e a-b-c-d f"]
+example5 =  ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
+runTests :: IO ()
+runTests =
+  forM_
+    [("example0", 2, example0)
+    ,("example1", 2, example1)
+    ,("example2", 3, example2)
+    ,("example3", 2, example3)
+    ,("example4", 4, example4)
+    ,("example5", 5, example5)
+    ]
+    (\(name, n, ex) -> do
+      b <- testEleve False n ex
+      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
+    )
--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -49,7 +49,7 @@ import qualified Data.List as List
 import qualified Data.Text as Text
 import Gargantext.Text (sentences)
 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
-import Gargantext.Text.Eleve (testEleve)
+import Gargantext.Text.Eleve (mainEleve)
 data TermType lang
  = Mono      { _tt_lang :: lang }
@@ -89,11 +89,12 @@ isPunctuation x = List.elem x $  (Text.pack . pure)
 -- language agnostic extraction
 -- TODO: remove IO
 -- TODO: BlockText 
-extractTermsUnsupervised :: Int -> Text -> IO [[Text]]
+extractTermsUnsupervised :: Int -> Text -> [[Text]]
 extractTermsUnsupervised n = 
-               fmap List.nub
+               List.nub
-             . fmap (List.filter (\l -> List.length l > 1))
+             . (List.filter (\l -> List.length l > 1))
-             . testEleve n
+             . List.concat
+             . mainEleve n
             . map (map Text.toLower)
             . map (List.filter (not . isPunctuation))
             . map tokenize