{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# LANGUAGE BangPatterns            #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE DerivingStrategies      #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE LambdaCase              #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}
{-# LANGUAGE TupleSections           #-}
{-# LANGUAGE TypeApplications        #-}
{-# LANGUAGE TypeOperators           #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , getDataText_Debug
  , flowDataText
  , flow
  , flowCorpusFile
  , flowCorpus
  , flowCorpusUser
  , flowAnnuaire
  , buildSocialList
  , addDocumentsToHyperCorpus
  , reIndexWith
  , ngramsByDoc
  , getOrMkRoot
  , getOrMkRootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins
  , do_api
  )
  where

import Conduit
import Control.Lens ( to, view )
import Control.Exception.Safe (catch, MonadCatch)
import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as CL
import Data.Conduit.List qualified as CList
import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (NgramsTerm)
import Gargantext.Core (Lang(..), withDefaultLanguage, NLPServerConfig)
import Gargantext.Core.Config (GargConfig(..), hasConfig)
import Gargantext.Core.Config.Types (APIsConfig(..))
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.NLP (HasNLPServer, nlpServerGet)
import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStoryEnv(..))
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Text (HasText)
import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem (GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' )
import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus, DocumentIdWithNgrams(..) )
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, insertDocs, mkNodeIdNgramsMap, ngramsByDoc, documentIdWithNgrams)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument )
import Gargantext.Database.Admin.Types.Node hiding (ERROR, DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 )
import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotExists, getOrMkList, getNodeWith )
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode), UniqParameters(..), newUniqIdHash ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
import Gargantext.Database.Schema.Node
import Gargantext.Database.Types
import Gargantext.Prelude hiding (catch, onException, to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) )
------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Error ( HasTreeError )
import Gargantext.Core.Text.Terms.Multi (MultitermsExtractionException)

------------------------------------------------------------------------
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs <> map ExternalOrigin API.externalAPIs

---------------
-- Show instance is not possible because of IO
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = putText $ show xs
printDataText (DataNew (maybeInt, conduitData)) = do
  res <- C.runConduit (conduitData .| CL.consume)
  putText $ show (maybeInt, res)

-- TODO use the split parameter in config file
getDataText :: (HasNodeError err)
            => DataOrigin
            -> TermType Lang
            -> API.RawQuery
            -> Maybe API.Limit
            -> DBCmdWithEnv env err (Either API.GetCorpusError DataText)
getDataText (ExternalOrigin api) la q li = do
  cfg <- view hasConfig
  eRes <- liftBase $ API.get api (_tt_lang la) q (_ac_epo_api_url $ _gc_apis cfg) li
  pure $ DataNew <$> eRes
getDataText (InternalOrigin _) la q _li = do
  cfg <- view hasConfig
  runDBTx $ do
    (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster (Nothing :: Maybe HyperdataCorpus)
    ids <- map fst <$> searchDocInDatabase cId (stem (_tt_lang la) GargPorterAlgorithm $ API.getRawQuery q)
    pure $ Right $ DataOld ids

getDataText_Debug :: (HasNodeError err)
                  => DataOrigin
                  -> TermType Lang
                  -> API.RawQuery
                  -> Maybe API.Limit
                  -> DBCmdWithEnv env err ()
getDataText_Debug a l q li = do
  result <- getDataText a l q li
  case result of
    Left err  -> liftBase $ putText $ show err
    Right res -> liftBase $ printDataText res
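-- Hypothetical usage sketch for 'getDataText' (the language, query string and
-- limit are made-up values; the origin is taken from 'allDataOrigins' rather
-- than spelled out, since the set of external backends lives in "API"):
--
-- > case allDataOrigins of
-- >   (origin:_) -> getDataText origin (Multi EN) (API.RawQuery "haskell") (Just 100)
-- >   []         -> pure (Right (DataOld []))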
-------------------------------------------------------------------------------
flowDataText :: forall env err m.
                ( IsDBCmd env err m
                , HasNodeStory env err m
                , MonadLogger m
                , HasNLPServer env
                , HasTreeError err
                , HasValidationError err
                , MonadJobStatus m
                , MonadCatch m
                , HasCentralExchangeNotification env )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> JobHandle m
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = do
  $(logLocM) DEBUG $ T.pack $ "Found " <> show (length ids) <> " old node IDs"
  cfg <- view hasConfig
  (_userId, userCorpusId, listId, msgs) <- runDBTx $ do
    (a, user_corpus_id, c, d) <- createNodes cfg (MkCorpusUserNormalCorpusIds u [cid]) corpusType
    _ <- Doc.add user_corpus_id (map nodeId2ContextId ids)
    pure (a, user_corpus_id, c, d)
  forM_ msgs ce_notify
  flowCorpusUser (_tt_lang tt) u userCorpusId listId corpusType mfslw
  where
    corpusType = Nothing :: Maybe HyperdataCorpus
flowDataText u (DataNew (mLen, txtC)) tt cid mfslw jobHandle = do
  $(logLocM) DEBUG $ T.pack $ "Found " <> show mLen <> " new documents to process"
  for_ (mLen <&> fromInteger) (`addMoreSteps` jobHandle)
  flowCorpus (MkCorpusUserNormalCorpusIds u [cid]) tt mfslw (fromMaybe 0 mLen, transPipe liftBase txtC) jobHandle

------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: ( IsDBCmd env err m
                , HasNodeStory env err m
                , MonadLogger m
                , HasNLPServer env
                , HasTreeError err
                , HasValidationError err
                , MonadJobStatus m
                , MonadCatch m
                , HasCentralExchangeNotification env )
             => MkCorpusUser
             -> TermType Lang
             -> FilePath
             -> JobHandle m
             -> m AnnuaireId
flowAnnuaire mkCorpusUser l filePath jobHandle = do
  -- TODO Conduit for file
  docs <- liftBase (readFile_Annuaire filePath :: IO [HyperdataContact])
  flow (Nothing :: Maybe HyperdataAnnuaire) mkCorpusUser l Nothing (fromIntegral $ length docs, yieldMany docs) jobHandle

------------------------------------------------------------------------
flowCorpusFile :: ( IsDBCmd env err m
                  , HasNodeStory env err m
                  , MonadLogger m
                  , HasNLPServer env
                  , HasTreeError err
                  , HasValidationError err
                  , MonadJobStatus m
                  , MonadCatch m
                  , HasCentralExchangeNotification env )
               => MkCorpusUser
               -> TermType Lang
               -> FileType
               -> FileFormat
               -> FilePath
               -> Maybe FlowSocialListWith
               -> JobHandle m
               -> m CorpusId
flowCorpusFile mkCorpusUser la ft ff fp mfslw jobHandle = do
  eParsed <- liftBase $ parseFile ft ff fp
  case eParsed of
    Right parsed -> do
      flowCorpus mkCorpusUser la mfslw (fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) jobHandle
      --let docs = splitEvery 500 $ take l parsed
      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
    Left e -> panicTrace $ "Error: " <> e

------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: ( IsDBCmd env err m
              , HasNodeStory env err m
              , MonadLogger m
              , HasNLPServer env
              , HasTreeError err
              , HasValidationError err
              , FlowCorpus a
              , ExtractNgrams m a
              , MonadJobStatus m
              , MonadCatch m
              , HasCentralExchangeNotification env )
           => MkCorpusUser
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> (Integer, ConduitT () a m ())
           -> JobHandle m
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
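-- Hypothetical invocation of 'flowCorpusFile' (the user, corpus id, file
-- arguments and job handle are made-up, and 'CsvHal' / 'Plain' are
-- illustrative parser settings, not a recommendation):
--
-- > flowCorpusFile (MkCorpusUserNormalCorpusIds user [corpusId]) (Multi EN)
-- >                CsvHal Plain "docs.csv" Nothing jobHandle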
flow :: forall env err m a c.
        ( IsDBCmd env err m
        , HasNodeStory env err m
        , MonadLogger m
        , HasNLPServer env
        , HasTreeError err
        , HasValidationError err
        , FlowCorpus a
        , ExtractNgrams m a
        , MkCorpus c
        , MonadJobStatus m
        , HasCentralExchangeNotification env
        , MonadCatch m )
     => Maybe c
     -> MkCorpusUser
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> (Integer, ConduitT () a m ())
     -> JobHandle m
     -> m CorpusId
flow c mkCorpusUser la mfslw (count, docsC) jobHandle = do
  cfg <- view hasConfig
  (_userId, userCorpusId, listId, msgs) <- runDBTx $ createNodes cfg mkCorpusUser c
  forM_ msgs ce_notify
  -- TODO if public insertMasterDocs else insertUserDocs
  runConduit $ zipSources (yieldMany ([1..] :: [Int])) docsC
            .| CList.chunksOf 5
            .| mapM_C (addDocumentsWithProgress userCorpusId)
            .| sinkNull
  let u = userFromMkCorpusUser mkCorpusUser
  $(logLocM) DEBUG "Calling flowCorpusUser"
  flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
  where
    addDocumentsWithProgress :: CorpusId -> [(Int, a)] -> m ()
    addDocumentsWithProgress userCorpusId docsChunk = do
      $(logLocM) DEBUG $ T.pack $ "calling insertDoc, ([idx], mLength) = " <> show (fst <$> docsChunk, count)
      docs <- addDocumentsToHyperCorpus c la userCorpusId (map snd docsChunk)
      markProgress (length docs) jobHandle
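-- The pipeline above is the usual conduit idiom for batched ingestion with
-- progress reporting: tag each document with an increasing index, split the
-- stream into chunks of five, and run a monadic action per chunk. The same
-- skeleton in isolation ('docs' and 'process' are hypothetical):
--
-- > runConduit $ zipSources (yieldMany ([1..] :: [Int])) (yieldMany docs)
-- >           .| CList.chunksOf 5
-- >           .| mapM_C process
-- >           .| sinkNull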
-- | Given a list of corpus documents and a 'NodeId' identifying the 'CorpusId', adds
-- the given documents to the corpus. Returns the Ids of the inserted documents.
addDocumentsToHyperCorpus :: ( IsDBCmd env err m
                             , HasNodeError err
                             , HasNLPServer env
                             , FlowCorpus document
                             , ExtractNgrams m document
                             , MkCorpus corpus
                             , MonadLogger m
                             , MonadCatch m )
                          => Maybe corpus
                          -> TermType Lang
                          -> CorpusId
                          -> [document]
                          -> m [DocId]
addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
  cfg <- view hasConfig
  nlp <- view (nlpServerGet $ _tt_lang la)
  -- First extract all the ngrams for the input documents via the NLP server,
  -- collect errors (if any) and pass to 'insertMasterDocs' only the documents
  -- for which the ngrams extraction succeeded. At the moment errors are just
  -- logged, but in the future they could be returned upstream so that we can
  -- display a final result of how many were skipped, how many succeeded, etc.
  uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
  ids <- runDBTx $ insertMasterDocs cfg uncommittedNgrams mb_hyper docs
  runDBTx $ do
    void $ Doc.add corpusId (map nodeId2ContextId ids)
    pure ids

------------------------------------------------------------------------
createNodes :: ( HasNodeError err
               , MkCorpus c )
            => GargConfig
            -> MkCorpusUser
            -> Maybe c
            -> DBUpdate err (UserId, CorpusId, ListId, [CEMessage])
createNodes cfg mkCorpusUser ctype = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus cfg mkCorpusUser ctype
  -- NodeTexts is first
  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId

  -- User Graph Flow
  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
  -- _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId

  let msgs = [UpdateTreeFirstLevel listId, UpdateTreeFirstLevel userCorpusId]
  pure (userId, userCorpusId, listId, msgs)

flowCorpusUser :: ( HasNodeError err
                  , HasValidationError err
                  , HasNLPServer env
                  , HasTreeError err
                  , HasNodeStory env err m
                  , MkCorpus c )
               => Lang
               -> User
               -> CorpusId
               -> ListId
               -> Maybe c
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user userCorpusId listId ctype mfslw = do
  env <- view hasNodeStory
  buildSocialList l user userCorpusId listId ctype mfslw
  runDBTx $ do
    -- _ <- insertOccsUpdates userCorpusId mastListId
    -- _ <- mkPhylo userCorpusId userId
    -- Annuaire Flow
    -- _ <- mkAnnuaire rootUserId userId
    _ <- reIndexWith env userCorpusId listId NgramsTerms (Set.singleton MapTerm)
    _ <- updateContextScore env userCorpusId listId
    _ <- updateNgramsOccurrences env userCorpusId listId
    pure userCorpusId

-- | This function is responsible for constructing the social lists of terms.
buildSocialList :: ( HasNodeError err
                   , HasValidationError err
                   , HasTreeError err
                   , HasNodeStory env err m
                   , MkCorpus c
                   , HasNLPServer env )
                => Lang
                -> User
                -> CorpusId
                -> ListId
                -> Maybe c
                -> Maybe FlowSocialListWith
                -> m ()
buildSocialList l user userCorpusId listId ctype = \case
  Just (NoList _) -> pure ()
  mfslw -> do
    cfg <- view hasConfig
    nlpServer <- view (nlpServerGet l)
    (masterUserId, masterCorpusId, ngs) <- runDBTx $ do
      -- User List Flow
      (master_user_id, _masterRootId, master_corpus_id) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster ctype
      let gp = GroupWithPosTag l nlpServer HashMap.empty
      (master_user_id, master_corpus_id,) <$> buildNgramsLists user userCorpusId master_corpus_id mfslw gp

    -- printDebug "flowCorpusUser:ngs" ngs
    _userListId <- flowList_DbRepo listId ngs
    _mastListId <- runDBTx $ getOrMkList masterCorpusId masterUserId
    pure ()

-------------------------------------------------------------------------------
--
-- Splitting Ngrams extraction from document creation
--
-------------------------------------------------------------------------------
--
-- There is a bit of tension between extracting the Ngrams and creating the documents:
-- We need to produce a map between a given 'NodeId' and the ngrams associated with it, where
-- the latter are extracted via the NLP server. However, each ngram has to be matched to the
-- 'NodeId' associated with the new resource being created as part of 'insertMasterDocs'. This
-- creates a bit of a chicken-and-egg problem in trying to make 'insertMasterDocs' a 'DBUpdate'
-- function: we need a 'NodeId' to exist by the time we call 'extractNgrams', but the latter
-- can't be executed in a pure fashion without a 'NodeId'.
--
-- To fix this, we need a data structure which would index the ngrams by some other notion of
-- index, and later have a transformation function which would re-index these ngrams to the
-- actual 'NodeId' created during the DB transaction.
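-- A minimal, self-contained sketch of that re-indexing idea (hypothetical
-- types and values, not part of this module): ngrams are first keyed by a
-- content hash, and only re-keyed once the transaction has allocated real ids.
--
-- > type Hash = Text
-- >
-- > -- Phase 1 (pure, outside the transaction): key by content hash.
-- > uncommitted :: Map.Map Hash [Text]
-- > uncommitted = Map.fromList [("deadbeef", ["flow", "corpus"])]
-- >
-- > -- Phase 2 (inside the transaction): re-key by the allocated NodeId.
-- > commitWith :: Map.Map Hash NodeId -> Map.Map Hash [Text] -> Map.Map NodeId [Text]
-- > commitWith allocated = Map.foldrWithKey step Map.empty
-- >   where
-- >     step h ts acc = maybe acc (\nid -> Map.insert nid ts acc) (Map.lookup h allocated)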
-- | Ngrams that have been fully \"committed\", i.e. associated to the respective document
-- where the latter has been persisted (i.e. committed) on secondary storage.
type CommittedNgrams =
  HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))

newtype DocumentHashId = DocumentHashId { _DocumentHashId :: T.Text }
  deriving stock (Show, Eq)
  deriving newtype Ord

-- | Ngrams that have been extracted from the input 'doc' but not fully associated with
-- a persisted entity on the database.
newtype UncommittedNgrams doc = UncommittedNgrams
  { _UncommittedNgrams :: Map.Map DocumentHashId (DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams) }
  deriving stock Show
  deriving newtype (Semigroup, Monoid)

data InsertDocError
  = NgramsNotFound !(Maybe DocumentHashId) !DocId
  deriving Show

extractNgramsFromDocument :: ( UniqParameters doc
                             , HasText doc
                             , ExtractNgrams m doc
                             , IsDBCmd err env m
                             , MonadLogger m
                             , MonadCatch m )
                          => NLPServerConfig
                          -> TermType Lang
                          -> doc
                          -> m (UncommittedNgrams doc)
extractNgramsFromDocument nlpServer lang doc =
  -- In case of an exception from the NLP server, treat this as having no ngrams,
  -- but still index it in the final map, so that later reconciliation still works.
  -- Practically speaking it means this won't have any ngrams associated, but the document
  -- will still be added to the corpus and we can try to regenerate the ngrams at a later stage.
  UncommittedNgrams . Map.singleton docId <$>
    (documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
       `catch` \(e :: MultitermsExtractionException) -> do
         $(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId
                                  <> " failed ngrams extraction due to an exception: "
                                  <> displayException e
         pure $ DocumentIdWithNgrams (Indexed docId doc) mempty)
  where
    docId = DocumentHashId $ newUniqIdHash doc

commitNgramsForDocument :: UniqParameters doc
                        => UncommittedNgrams doc
                        -> Indexed ContextId (Node doc)
                        -> Either InsertDocError CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
  docId <- mb_docId
  case Map.lookup docId ng of
    Nothing  -> Left $ NgramsNotFound (Just docId) (_node_id node)
    Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
  where
    mb_docId = case DocumentHashId <$> _node_hash_id node of
      Nothing  -> Left $ NgramsNotFound Nothing (_node_id node)
      Just dId -> Right dId

    reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams
            -> DocumentIdWithNgrams NodeId doc ExtractedNgrams
    reIndex did =
      let (Indexed _ a) = documentWithId did
      in did { documentWithId = Indexed (contextId2NodeId oldIx) a }
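-- How a caller consumes 'commitNgramsForDocument' (a sketch; the branch
-- bodies are placeholders): a node without a '_node_hash_id', or whose hash
-- was never seen during extraction, surfaces as 'NgramsNotFound', while a hit
-- is re-keyed to the freshly allocated 'NodeId'.
--
-- > case commitNgramsForDocument uncommitted indexedNode of
-- >   Left (NgramsNotFound mbHash dId) -> {- log and skip -} pure ()
-- >   Right committed                  -> {- feed 'saveDocNgramsWith' -} pure ()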
extractNgramsFromDocuments :: forall doc env err m.
                              ( HasText doc
                              , UniqParameters doc
                              , ExtractNgrams m doc
                              , IsDBCmd env err m
                              , MonadLogger m
                              , MonadCatch m )
                           => NLPServerConfig
                           -> TermType Lang
                           -> [doc]
                           -> m (UncommittedNgrams doc)
extractNgramsFromDocuments nlpServer lang docs =
  foldlM go mempty docs
  where
    go :: UncommittedNgrams doc -> doc -> m (UncommittedNgrams doc)
    go !acc inputDoc = do
      ngrams <- extractNgramsFromDocument nlpServer lang inputDoc
      pure $ acc <> ngrams

commitNgramsForDocuments :: UniqParameters doc
                         => UncommittedNgrams doc
                         -> [Indexed ContextId (Node doc)]
                         -> ([InsertDocError], CommittedNgrams)
commitNgramsForDocuments ng nodes =
  let (errs, successes) = partitionEithers $ map (commitNgramsForDocument ng) nodes
  in (errs, mconcat successes)

insertMasterDocs :: ( HasNodeError err
                    , UniqParameters doc
                    , FlowCorpus doc
                    , MkCorpus c )
                 => GargConfig
                 -> UncommittedNgrams doc
                 -- ^ The ngrams extracted for /all/ the documents
                 -- and indexed by the hash of the given document.
                 -- We can use this map to associate the document
                 -- with the node being created.
                 -> Maybe c
                 -> [doc]
                 -> DBUpdate err [DocId]
insertMasterDocs cfg uncommittedNgrams c hs = do
  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId ngramsDocsMap
  pure $ map contextId2NodeId ids'

saveDocNgramsWith :: ListId
                  -> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
                  -> DBUpdate err ()
saveDocNgramsWith lId mapNgramsDocs' = do
  (terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'

  let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
      mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (bimap _ngramsTerms Map.keys)
               $ HashMap.toList mapNgramsDocs

  let ngrams2insert =
        catMaybes [ ContextNodeNgrams2 (nodeId2ContextId nId)
                      <$> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                      <*> Just (fromIntegral $ unTermsWeight w :: Double)
                  | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                  , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                  , (nId, (w, _cnt))               <- Map.toList mapNodeIdWeight
                  ]
  _return <- insertContextNodeNgrams2 ngrams2insert

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()
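-- The list comprehension above flattens the two-level map into one row per
-- (ngram, ngrams type, node) triple. The same shape in miniature (standalone,
-- made-up values):
--
-- > let nested = Map.fromList [("cat", Map.fromList [(1 :: Int, 0.5 :: Double)])]
-- > in [ (t, n, w) | (t, byNode) <- Map.toList nested
-- >                , (n, w)      <- Map.toList byNode ]
-- > -- [("cat", 1, 0.5)]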
------------------------------------------------------------------------
-- | Re-index documents of a corpus with ngrams in the list
reIndexWith :: HasNodeError err
            => NodeStoryEnv err
            -> CorpusId
            -> ListId
            -> NgramsType
            -> Set ListType
            -> DBUpdate err ()
reIndexWith env cId lId nt lts = do
  -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
  corpus_node <- getNodeWith cId (Proxy @HyperdataCorpus)
  -- NOTE: This assumes a single language for the whole corpus
  let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node

  -- Getting [NgramsTerm]
  (ts :: [NgramsTerm]) <- List.concat
                      <$> map (\(k, vs) -> k:vs) -- this concatenates parents with their children, 1st level only
                      <$> HashMap.toList
                      <$> getTermsWith env identity [lId] nt lts

  -- Get all documents of the corpus
  (docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId

  -- Saving the indexation in database
  mapM_ (saveDocNgramsWith lId . ngramsByDoc corpusLang nt ts) docs
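-- Typical call site (see 'flowCorpusUser' above): re-index the corpus against
-- the terms currently marked as 'MapTerm' in its list.
--
-- > _ <- reIndexWith env userCorpusId listId NgramsTerms (Set.singleton MapTerm)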