Commit a5bd188c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[NGRAMS] Unsupervised extraction OK.

parent 3706928e
...@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact ...@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
instance ExtractNgramsT HyperdataDocument instance ExtractNgramsT HyperdataDocument
where where
extractNgramsT = extractNgramsT' extractNgramsT = extractNgramsT'
...@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams ...@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
pure $ DocumentIdWithNgrams d e pure $ DocumentIdWithNgrams d e
-- FLOW LIST -- FLOW LIST
-- | TODO check optimization -- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams a] mapNodeIdNgrams :: [DocumentIdWithNgrams a]
......
...@@ -44,18 +44,23 @@ import Gargantext.Core ...@@ -44,18 +44,23 @@ import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms) import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.Mono.Stem (stem)
import qualified Data.Set as Set
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Text as Text import qualified Data.Text as Text
import Gargantext.Text (sentences) import Gargantext.Text (sentences)
import Gargantext.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Terms.Eleve (mainEleve) import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
data TermType lang data TermType lang
= Mono { _tt_lang :: lang } = Mono { _tt_lang :: lang }
| Multi { _tt_lang :: lang } | Multi { _tt_lang :: lang }
| MonoMulti { _tt_lang :: lang } | MonoMulti { _tt_lang :: lang }
| Unsupervised { _tt_lang :: lang
, _tt_size :: Int
, _tt_model :: Maybe (Tries Token ())
}
makeLenses ''TermType makeLenses ''TermType
--group :: [Text] -> [Text] --group :: [Text] -> [Text]
...@@ -67,7 +72,10 @@ makeLenses ''TermType ...@@ -67,7 +72,10 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user). -- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms termTypeLang = mapM (terms termTypeLang) extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n m')) xs
where
m' = maybe (Just $ newTries n (Text.intercalate " " xs)) Just m
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Terms from Text -- | Terms from Text
-- Mono : mono terms -- Mono : mono terms
...@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms] ...@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised lang n m) txt = termsUnsupervised m' n lang txt
where
m' = maybe (newTries n txt) identity m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure) isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String) <$> ("!?(),;." :: String)
...@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure) ...@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction -- language agnostic extraction
-- TODO: remove IO -- TODO: remove IO
-- TODO: newtype BlockText -- TODO: newtype BlockText
extractTermsUnsupervised :: Int -> Text -> [[Text]] termsUnsupervised :: Tries Token () -> Int -> Lang -> Text -> IO [Terms]
extractTermsUnsupervised n = termsUnsupervised m n l =
List.nub pure
. (List.filter (\l -> List.length l > 1)) . map (text2term l)
. List.nub
. (List.filter (\l' -> List.length l' > 1))
. List.concat . List.concat
. mainEleve n . mainEleveWith m n
. map (map Text.toLower) . uniText
. map (List.filter (not . isPunctuation))
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
uniText :: Text -> [[Text]]
uniText = map (List.filter (not . isPunctuation))
. map tokenize . map tokenize
. sentences . sentences -- | TODO get sentences according to lang
...@@ -367,16 +367,19 @@ split inE t0 ts = ...@@ -367,16 +367,19 @@ split inE t0 ts =
------------------------------------------------------------------------ ------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]] mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp mainEleve n i = mainEleveWith m n i
where where
inp = toToken <$> input m = buildTries n (fmap toToken i)
t = normalizeEntropy info_entropy_var set_autonomy
. evTrie identity set_entropy_var
. entropyTrie isTerminal
$ buildTries n inp
mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
where
t :: Tries Token (I Double)
t = normalizeEntropy info_entropy_var set_autonomy
$ evTrie identity set_entropy_var
$ entropyTrie isTerminal m
--------------------------------------------- ------------------------------------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)] type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
...@@ -484,8 +487,8 @@ checks2 = [] ...@@ -484,8 +487,8 @@ checks2 = []
] ]
-} -}
runTests :: IO () runTestsEleve :: IO ()
runTests = runTestsEleve =
forM_ forM_
[("example0", 3, example0, checks0) [("example0", 3, example0, checks0)
,("example0", 2, example0, []) ,("example0", 2, example0, [])
......
...@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt ...@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts :: Text -> [Text] monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt) monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
...@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]] ...@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words monoTextsBySentence = map T.words
. T.split isSep . T.split isSep
. T.toLower . T.toLower
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment