{-| Module : Gargantext.Core.Text.Ngrams Description : Ngrams definition and tools Copyright : (c) CNRS, 2017 - present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX An @n-gram@ is a contiguous sequence of n items from a given sample of text. In Gargantext application the items are words, n is a non negative integer. Using Latin numerical prefixes, an n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size 3 is a "trigram". English cardinal numbers are sometimes used, e.g., "four-gram", "five-gram", and so on. Source: https://en.wikipedia.org/wiki/Ngrams TODO group Ngrams -> Tree compute occ by node of Tree group occs according groups compute cooccurrences compute graph -} {-# OPTIONS_GHC -fno-warn-deprecations #-} {-# LANGUAGE ConstrainedClassMethods #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TemplateHaskell #-} module Gargantext.Core.Text.Terms where import Control.Lens ( view, over ) import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict qualified as HashMap import Data.List qualified as List import Data.Set qualified as Set import Data.Text qualified as Text import GHC.Base (String) import Gargantext.Core ( Lang, NLPServerConfig, PosTagAlgo ) import Gargantext.Core.Text (sentences, HasText(..)) import Gargantext.Core.Text.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms) import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken) import Gargantext.Core.Text.Terms.Mono (monoTerms) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..)) import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Core.Text.Terms.Multi (multiterms) import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount ) import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Database.Prelude (DBCmd) import Gargantext.Database.Query.Table.Ngrams (insertNgrams) import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem) import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId) import Gargantext.Prelude data TermType lang = Mono { _tt_lang :: !lang } | Multi { _tt_lang :: !lang } | MonoMulti { _tt_lang :: !lang } | Unsupervised { _tt_lang :: !lang , _tt_windowSize :: !Int , _tt_ngramsSize :: !Int , _tt_model :: !(Maybe (Tries Token ())) } deriving (Generic) deriving instance (Show lang) => Show (TermType lang) makeLenses ''TermType --group :: [Text] -> [Text] --group = undefined -- remove Stop Words -- map (filter (\t -> not . elem t)) $ ------------------------------------------------------------------------ -- | Sugar to extract terms from text (hidding 'mapM' from end user). --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) extractTerms :: NLPServerConfig -> TermType Lang -> [Text] -> IO [[TermsWithCount]] extractTerms ncs (Unsupervised {..}) xs = mapM (terms ncs (Unsupervised { _tt_model = Just m', .. })) xs where m' = case _tt_model of Just m''-> m'' Nothing -> newTries _tt_windowSize (Text.unwords xs) extractTerms ncs termTypeLang xs = mapM (terms ncs termTypeLang) xs ------------------------------------------------------------------------ withLang :: (Foldable t, Functor t, HasText h) => TermType Lang -> t h -> TermType Lang withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. } where m' = case _tt_model of Nothing -> -- trace ("buildTries here" :: String) Just $ buildTries _tt_ngramsSize $ fmap toToken $ uniText $ Text.intercalate " . " $ concatMap hasText ns just_m -> just_m withLang l _ = l ------------------------------------------------------------------------ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams } | EnrichedNgrams { unEnrichedNgrams :: NgramsPostag } deriving (Eq, Ord, Generic, Show) instance Hashable ExtractedNgrams -- | A typeclass that represents extracting ngrams from an entity. class ExtractNgramsT h where extractNgramsT :: HasText h => NLPServerConfig -> TermType Lang -> h -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) ------------------------------------------------------------------------ enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms l pa po (Terms { .. }) = NgramsPostag { _np_lang = l , _np_algo = pa , _np_postag = po , _np_form = form , _np_lem = lem } where form = text2ngrams $ Text.unwords _terms_label lem = text2ngrams $ Text.unwords $ Set.toList _terms_stem ------------------------------------------------------------------------ cleanNgrams :: Int -> Ngrams -> Ngrams cleanNgrams s ng | Text.length (ng ^. ngramsTerms) < s = ng | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms)) cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ cleanNgrams s ng cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s) $ over np_lem (cleanNgrams s) ng extracted2ngrams :: ExtractedNgrams -> Ngrams extracted2ngrams (SimpleNgrams ng) = ng extracted2ngrams (EnrichedNgrams ng) = view np_form ng --------------------------- insertExtractedNgrams :: [ ExtractedNgrams ] -> DBCmd err (HashMap Text NgramsId) insertExtractedNgrams ngs = do let (s, e) = List.partition isSimpleNgrams ngs m1 <- insertNgrams (map unSimpleNgrams s) --printDebug "others" m1 m2 <- insertNgramsPostag (map unEnrichedNgrams e) --printDebug "terms" m2 pure $ HashMap.union m1 m2 isSimpleNgrams :: ExtractedNgrams -> Bool isSimpleNgrams (SimpleNgrams _) = True isSimpleNgrams _ = False ------------------------------------------------------------------------ -- | Terms from 'Text' -- 'Mono' : mono terms -- 'Multi' : multi terms -- 'MonoMulti' : mono and multi -- TODO : multi terms should exclude mono (intersection is not empty yet) terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount] terms _ (Mono lang) txt = pure $ monoTerms lang txt terms ncs (Multi lang) txt = multiterms ncs lang txt terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt where m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt ------------------------------------------------------------------------ type WindowSize = Int type MinNgramSize = Int -- | Unsupervised ngrams extraction -- language agnostic extraction -- TODO: newtype BlockText termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount] termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panicTrace "[termsUnsupervised] no model" termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) = map (first (text2term _tt_lang)) . groupWithCounts -- . List.nub . List.filter (\l' -> List.length l' >= _tt_windowSize) . List.concat . mainEleveWith _tt_model _tt_ngramsSize . uniText termsUnsupervised _ = undefined newTries :: Int -> Text -> Tries Token () newTries n t = buildTries n (toToken <$> uniText t) -- | TODO removing long terms > 24 uniText :: Text -> [[Text]] uniText = map (List.filter (not . isPunctuation) . tokenize) . sentences -- TODO get sentences according to lang . Text.toLower text2term :: Lang -> [Text] -> Terms text2term _ [] = Terms [] Set.empty text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt) isPunctuation :: Text -> Bool isPunctuation x = List.elem x $ Text.pack . pure <$> ("!?(),;.:" :: String)