[flow] refactoring of docs flow so that no ngrams extraction is done for existing docs
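
Documents are now committed to the master corpus first, via insertDocs, and
extractNgramsFromDocuments runs only on the documentsWithId that the insertion
actually returns, so documents that already exist are never re-processed. The
now-redundant insertMasterDocs is deleted and its remaining steps (getOrMkList,
saveDocNgramsWith) are inlined into the second transaction. Since
addDocumentsToHyperCorpus may consequently return fewer documents than it was
given, addDocumentsWithProgress now reports progress per input chunk.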

parent e7ce7bcb
Pipeline #7944 passed with stages in 63 minutes and 5 seconds
@@ -112,7 +112,7 @@ import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
 import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
 import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
 import Gargantext.Database.Schema.Node ( NodePoly(_node_id, _node_hash_id), node_hyperdata )
-import Gargantext.Database.Types ( Indexed(Indexed) )
+import Gargantext.Database.Types ( Indexed(Indexed), unIndex )
 import Gargantext.Prelude hiding (catch, onException, to)
 import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
 import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..), markFailureNoErr )
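
The newly imported unIndex is what lets the code further down recover the
document payload from the 'Indexed ContextId (Node document)' values returned
by insertDocs. As a reading aid, a minimal sketch of the assumed shape of
Indexed (the real definition lives in Gargantext.Database.Types):

    {-# LANGUAGE TemplateHaskell #-}
    -- Sketch only: assumes Indexed is an index/payload pair with
    -- TemplateHaskell-generated lenses, as suggested by its usage below.
    import Control.Lens (makeLenses, view)

    data Indexed i a = Indexed
      { _index   :: i  -- e.g. a ContextId
      , _unIndex :: a  -- the payload, e.g. a Node document
      }

    makeLenses ''Indexed

    -- Projecting payloads out of indexed values, as the diff does with
    -- 'view (unIndex . node_hyperdata) <$> documentsWithId':
    payloads :: [Indexed i a] -> [a]
    payloads = fmap (view unIndex)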
@@ -320,7 +320,7 @@ flow c mkCorpusUser la mfslw (count, docsC) jobHandle = do
            ]
   let u = userFromMkCorpusUser mkCorpusUser
   $(logLocM) DEBUG "Calling flowCorpusUser"
   flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
@@ -328,8 +328,8 @@ flow c mkCorpusUser la mfslw (count, docsC) jobHandle = do
     addDocumentsWithProgress :: CorpusId -> [(Int, a)] -> m ()
     addDocumentsWithProgress userCorpusId docsChunk = do
       $(logLocM) DEBUG $ T.pack $ "calling insertDoc, ([idx], mLength) = " <> show (fst <$> docsChunk, count)
-      docs <- addDocumentsToHyperCorpus c la userCorpusId (map snd docsChunk)
-      markProgress (length docs) jobHandle
+      _docs <- addDocumentsToHyperCorpus c la userCorpusId (map snd docsChunk)
+      markProgress (length docsChunk) jobHandle

 -- | Given a list of corpus documents and a 'NodeId' identifying the 'CorpusId', adds
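
A side effect of the refactoring, visible in the hunk above: because
addDocumentsToHyperCorpus may now return fewer documents than it was given,
progress is counted per input chunk so that it stays consistent with the
total 'count'. A generic sketch of that pattern, with stand-in names rather
than the Gargantext API:

    -- Sketch only: report progress by the size of the chunk we attempted,
    -- not by how many documents the insertion returned, since deduplicated
    -- (already-existing) docs would otherwise make the job stall below 100%.
    import Data.IORef (IORef, modifyIORef')
    import Data.List.Split (chunksOf)  -- from the 'split' package

    processInChunks :: IORef Int            -- stand-in progress counter
                    -> ([doc] -> IO [doc])  -- insertion; may drop existing docs
                    -> [doc]
                    -> IO ()
    processInChunks progress insert docs =
      mapM_ step (chunksOf 100 docs)
      where
        step chunk = do
          _inserted <- insert chunk              -- possibly shorter than chunk
          modifyIORef' progress (+ length chunk) -- count what we attempted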
@@ -356,9 +356,27 @@ addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
   -- for which the ngrams extraction succeeded. At the moment errors are just
   -- logged, but in the future they could be returned upstream so that we can
   -- display a final result of how many were skipped, how many succeeded etc.
-  uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
+  (masterUserId, masterCorpusId, ids', documentsWithId) <- runDBTx $ do
+    (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster mb_hyper
+    (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) docs )
+    _ <- Doc.add masterCorpusId ids'
+    pure (masterUserId, masterCorpusId, ids', documentsWithId)
+  -- uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
+  -- Since 'documentsWithId' is of type '[Indexed ContextId (Node document)]',
+  -- we have to 'unIndex' and get the hyperdata back again, for newly added docs
+  uncommittedNgrams <- extractNgramsFromDocuments nlp la (view (unIndex . node_hyperdata) <$> documentsWithId)
+  let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId
   runDBTx $ do
-    ids <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
+    lId <- getOrMkList masterCorpusId masterUserId
+    _ <- saveDocNgramsWith lId ngramsDocsMap
+    let ids = map contextId2NodeId ids'
+    -- ids <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
     void $ Doc.add corpusId (map nodeId2ContextId ids)
     pure ids
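
To summarize the new shape of addDocumentsToHyperCorpus: one transaction
inserts (and deduplicates) the documents, ngrams extraction then runs only on
what that insertion returned, and a second transaction commits the extracted
ngrams. A minimal, self-contained analog of that flow, with stand-in names
rather than the Gargantext API:

    -- Sketch only: a toy "database" stands in for the master corpus.
    import           Data.IORef
    import qualified Data.Map.Strict as Map

    type Doc   = String
    type DocId = Int

    -- Analog of insertDocs: returns ids and payloads of *new* docs only,
    -- skipping documents that are already present.
    insertNew :: IORef (Map.Map Doc DocId) -> [Doc] -> IO [(DocId, Doc)]
    insertNew db docs = do
      seen <- readIORef db
      let news = [ d | d <- docs, Map.notMember d seen ]
          ids  = zip [Map.size seen ..] news
      modifyIORef' db (Map.union (Map.fromList [ (d, i) | (i, d) <- ids ]))
      pure ids

    -- Analog of extractNgramsFromDocuments: a toy "extraction".
    extractNgrams :: Doc -> [String]
    extractNgrams = words

    main :: IO ()
    main = do
      db <- newIORef Map.empty
      _       <- insertNew db ["old doc"]               -- pre-existing doc
      newDocs <- insertNew db ["old doc", "brand new"]  -- only one is new
      -- Extraction runs on the returned payloads only, so "old doc" is
      -- never re-processed:
      mapM_ (print . extractNgrams . snd) newDocs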
@@ -571,36 +589,6 @@ commitNgramsForDocuments ng nodes =
   let (errs, successes) = partitionEithers $ map (commitNgramsForDocument ng) nodes
   in (errs, mconcat successes)

-insertMasterDocs :: ( HasNodeError err
-                    , UniqParameters doc
-                    , FlowCorpus doc
-                    , MkCorpus c
-                    )
-                 => GargConfig
-                 -> UncommittedNgrams doc
-                 -- ^ The ngrams extracted for /all/ the documents
-                 -- and indexed by the hash of the given document.
-                 -- We can use this map to associate the document
-                 -- with the node being created.
-                 -> Maybe c
-                 -> [doc]
-                 -> DBUpdate err [DocId]
-insertMasterDocs cfg uncommittedNgrams c hs = do
-  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
-  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
-  _ <- Doc.add masterCorpusId ids'
-  -- TODO
-  -- create a corpus with database name (CSV or PubMed)
-  -- add documents to the corpus (create node_node link)
-  -- this will enable global database monitoring
-  let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId
-  lId <- getOrMkList masterCorpusId masterUserId
-  _ <- saveDocNgramsWith lId ngramsDocsMap
-  pure $ map contextId2NodeId ids'

 saveDocNgramsWith :: ListId
                   -> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
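
The surviving commitNgramsForDocuments (context at the top of the hunk above)
uses a pattern worth noting: each document's ngrams are committed to an
Either, failures and successes are split with partitionEithers, and the
successful maps are merged with mconcat. A generic sketch of that pattern,
with stand-in types rather than the real Gargantext signatures:

    -- Sketch only: Err, Committed and commitOne are stand-ins.
    import           Data.Either     (partitionEithers)
    import qualified Data.Map.Strict as Map

    type Err       = String
    type Committed = Map.Map String Int  -- a Monoid, so results can merge

    commitOne :: Int -> Either Err Committed
    commitOne n
      | even n    = Right (Map.singleton (show n) n)
      | otherwise = Left ("no ngrams for doc " <> show n)

    commitAll :: [Int] -> ([Err], Committed)
    commitAll docs =
      let (errs, successes) = partitionEithers (map commitOne docs)
      in  (errs, mconcat successes)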