[ngrams] brutal removal of all TermType constructors except for Multi

parent 01848abe
......@@ -47,13 +47,11 @@ import GHC.Base (String)
import Gargantext.Core ( Lang, NLPServerConfig, PosTagAlgo )
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms)
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Eleve (Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types ( TermsCount, POS, Terms(Terms), TermsWithCount )
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
......@@ -61,14 +59,7 @@ import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId)
import Gargantext.Prelude
data TermType lang
= Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang }
| MonoMulti { _tt_lang :: !lang }
| Unsupervised { _tt_lang :: !lang
, _tt_windowSize :: !Int
, _tt_ngramsSize :: !Int
, _tt_model :: !(Maybe (Tries Token ()))
}
= Multi { _tt_lang :: !lang }
deriving (Generic)
deriving instance (Show lang) => Show (TermType lang)
......@@ -82,31 +73,9 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hidding 'mapM' from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: NLPServerConfig -> TermType Lang -> [Text] -> IO [[TermsWithCount]]
extractTerms ncs (Unsupervised {..}) xs = mapM (terms ncs (Unsupervised { _tt_model = Just m', .. })) xs
where
m' = case _tt_model of
Just m''-> m''
Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
extractTerms ncs termTypeLang xs = mapM (terms ncs termTypeLang) xs
------------------------------------------------------------------------
withLang :: (Foldable t, Functor t, HasText h)
=> TermType Lang
-> t h
-> TermType Lang
withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
where
m' = case _tt_model of
Nothing -> -- trace ("buildTries here" :: String)
Just $ buildTries _tt_ngramsSize
$ fmap toToken
$ uniText
$ Text.intercalate " . "
$ concatMap hasText ns
just_m -> just_m
withLang l _ = l
------------------------------------------------------------------------
data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
| EnrichedNgrams { unEnrichedNgrams :: NgramsPostag }
......@@ -172,12 +141,7 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount]
terms _ (Mono lang) txt = pure $ monoTerms lang txt
terms ncs (Multi lang) txt = multiterms ncs lang txt
terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt
terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......@@ -185,21 +149,6 @@ terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _
type WindowSize = Int
type MinNgramSize = Int
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model"
termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
map (first (text2term _tt_lang))
. groupWithCounts
-- . List.nub
. List.filter (\l' -> List.length l' >= _tt_windowSize)
. List.concat
. mainEleveWith _tt_model _tt_ngramsSize
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (toToken <$> uniText t)
......
......@@ -421,7 +421,7 @@ insertMasterDocs ncs c lang hs = do
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT ncs $ withLang lang documentsWithId)
(extractNgramsT ncs lang)
(map (B.first contextId2NodeId) documentsWithId)
lId <- getOrMkList masterCorpusId masterUserId
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment