[NGRAMS] Unsupervised extraction OK.

a5bd188c · Alexandre Delanoë · 3706928e · a5bd188c · a5bd188c · a5bd188c
Commit a5bd188c authored Jun 21, 2019 by Alexandre Delanoë
Showing with 48 additions and 26 deletions

Flow.hs src/Gargantext/Database/Flow.hs +0 -2

Terms.hs src/Gargantext/Text/Terms.hs +35 -12

Eleve.hs src/Gargantext/Text/Terms/Eleve.hs +12 -9

Mono.hs src/Gargantext/Text/Terms/Mono.hs +1 -3

No files found.
--- a/src/Gargantext/Database/Flow.hs
+++ b/src/Gargantext/Database/Flow.hs
@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
 instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT = extractNgramsT'
@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
      pure $ DocumentIdWithNgrams d e
 -- FLOW LIST
 -- | TODO check optimization
 mapNodeIdNgrams :: [DocumentIdWithNgrams a]

--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -44,18 +44,23 @@ import Gargantext.Core
 import Gargantext.Core.Types
 import Gargantext.Text.Terms.Multi (multiterms)
 import Gargantext.Text.Terms.Mono  (monoTerms)
+import Gargantext.Text.Terms.Mono.Stem (stem)
+import qualified Data.Set  as Set
 import qualified Data.List as List
 import qualified Data.Text as Text
 import Gargantext.Text (sentences)
 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
-import Gargantext.Text.Terms.Eleve (mainEleve)
+import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
 data TermType lang
  = Mono      { _tt_lang :: lang }
  | Multi     { _tt_lang :: lang }
  | MonoMulti { _tt_lang :: lang }
+  | Unsupervised { _tt_lang  :: lang
+                 , _tt_size  :: Int
+                 , _tt_model :: Maybe (Tries Token ())
+  }
 makeLenses ''TermType
 --group :: [Text] -> [Text]
@@ -67,7 +72,10 @@ makeLenses ''TermType
 -- | Sugar to extract terms from text (hiddeng mapM from end user).
 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
-extractTerms termTypeLang = mapM (terms termTypeLang)
+extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n m')) xs
+  where
+    m' = maybe (Just $ newTries n (Text.intercalate " " xs)) Just m
+extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
 ------------------------------------------------------------------------
 -- | Terms from Text
 -- Mono : mono terms
@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
 terms (Mono      lang) txt = pure $ monoTerms lang txt
 terms (Multi     lang) txt = multiterms lang txt
 terms (MonoMulti lang) txt = terms (Multi lang) txt
+terms (Unsupervised lang n m) txt = termsUnsupervised m' n lang txt
+  where
+    m' = maybe (newTries n txt) identity m
 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 ------------------------------------------------------------------------
+text2term :: Lang -> [Text] -> Terms
+text2term _ [] = Terms [] Set.empty
+text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 isPunctuation :: Text -> Bool
 isPunctuation x = List.elem x $  (Text.pack . pure)
                             <$> ("!?(),;." :: String)
@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $  (Text.pack . pure)
 -- language agnostic extraction
 -- TODO: remove IO
 -- TODO: newtype BlockText
-extractTermsUnsupervised :: Int -> Text -> [[Text]]
+termsUnsupervised :: Tries Token () -> Int -> Lang -> Text -> IO [Terms]
-extractTermsUnsupervised n = 
+termsUnsupervised m n l = 
-               List.nub
+               pure
-             . (List.filter (\l -> List.length l > 1))
+             . map (text2term l)
+             . List.nub
+             . (List.filter (\l' -> List.length l' > 1))
             . List.concat
-             . mainEleve n
+             . mainEleveWith m n
-             . map (map Text.toLower)
+             . uniText
-             . map (List.filter (not . isPunctuation))
-             . map tokenize
+newTries :: Int -> Text -> Tries Token ()
-             . sentences
+newTries n t = buildTries n (fmap toToken $ uniText t)
+uniText :: Text -> [[Text]]
+uniText = map (List.filter (not . isPunctuation))
+        . map tokenize
+        . sentences   -- | TODO get sentences according to lang
--- a/src/Gargantext/Text/Terms/Eleve.hs
+++ b/src/Gargantext/Text/Terms/Eleve.hs
@@ -367,16 +367,19 @@ split inE t0 ts =
 ------------------------------------------------------------------------
 mainEleve :: Int -> [[Text]] -> [[[Text]]]
-mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
+mainEleve n i = mainEleveWith m n i
  where
-    inp = toToken <$> input
+    m = buildTries n (fmap toToken i)
-    t   = normalizeEntropy info_entropy_var set_autonomy
-        . evTrie identity set_entropy_var
-        . entropyTrie isTerminal
-        $ buildTries n inp
+mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
+mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
+  where
+    t :: Tries Token (I Double)
+    t = normalizeEntropy info_entropy_var set_autonomy
+      $ evTrie identity set_entropy_var
+      $ entropyTrie isTerminal m
---------------------------------------------
+------------------------------------------------------------------------
 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
@@ -484,8 +487,8 @@ checks2 = []
  ]
 -}
-runTests :: IO ()
+runTestsEleve :: IO ()
-runTests =
+runTestsEleve =
  forM_
    [("example0", 3, example0, checks0)
    ,("example0", 2, example0, [])

--- a/src/Gargantext/Text/Terms/Mono.hs
+++ b/src/Gargantext/Text/Terms/Mono.hs
@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
 monoTexts :: Text -> [Text]
 monoTexts = L.concat . monoTextsBySentence
+-- | TODO use text2term only
 monoText2term :: Lang -> Text -> Terms
 monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
 monoTextsBySentence = map T.words
                    . T.split isSep
                    . T.toLower