Commit a5bd188c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[NGRAMS] Unsupervised extraction OK.

parent 3706928e
......@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT = extractNgramsT'
......@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
pure $ DocumentIdWithNgrams d e
-- FLOW LIST
-- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams a]
......
......@@ -44,18 +44,23 @@ import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.Mono.Stem (stem)
import qualified Data.Set as Set
import qualified Data.List as List
import qualified Data.Text as Text
import Gargantext.Text (sentences)
import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Terms.Eleve (mainEleve)
import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
data TermType lang
= Mono { _tt_lang :: lang }
| Multi { _tt_lang :: lang }
| MonoMulti { _tt_lang :: lang }
| Unsupervised { _tt_lang :: lang
, _tt_size :: Int
, _tt_model :: Maybe (Tries Token ())
}
makeLenses ''TermType
--group :: [Text] -> [Text]
......@@ -67,7 +72,10 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms termTypeLang = mapM (terms termTypeLang)
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n m')) xs
where
m' = maybe (Just $ newTries n (Text.intercalate " " xs)) Just m
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
......@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised lang n m) txt = termsUnsupervised m' n lang txt
where
m' = maybe (newTries n txt) identity m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
......@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
extractTermsUnsupervised :: Int -> Text -> [[Text]]
extractTermsUnsupervised n =
List.nub
. (List.filter (\l -> List.length l > 1))
termsUnsupervised :: Tries Token () -> Int -> Lang -> Text -> IO [Terms]
termsUnsupervised m n l =
pure
. map (text2term l)
. List.nub
. (List.filter (\l' -> List.length l' > 1))
. List.concat
. mainEleve n
. map (map Text.toLower)
. map (List.filter (not . isPunctuation))
. mainEleveWith m n
. uniText
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
uniText :: Text -> [[Text]]
uniText = map (List.filter (not . isPunctuation))
. map tokenize
. sentences
. sentences -- | TODO get sentences according to lang
......@@ -367,16 +367,19 @@ split inE t0 ts =
------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
mainEleve n i = mainEleveWith m n i
where
inp = toToken <$> input
t = normalizeEntropy info_entropy_var set_autonomy
. evTrie identity set_entropy_var
. entropyTrie isTerminal
$ buildTries n inp
m = buildTries n (fmap toToken i)
mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
where
t :: Tries Token (I Double)
t = normalizeEntropy info_entropy_var set_autonomy
$ evTrie identity set_entropy_var
$ entropyTrie isTerminal m
---------------------------------------------
------------------------------------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
......@@ -484,8 +487,8 @@ checks2 = []
]
-}
runTests :: IO ()
runTests =
runTestsEleve :: IO ()
runTestsEleve =
forM_
[("example0", 3, example0, checks0)
,("example0", 2, example0, [])
......
......@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
......@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words
. T.split isSep
. T.toLower
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment