[flow] refactoring of docs flow so that no ngrams extraction is done for existing docs
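
Documents are now committed to the master corpus first, via insertDocs, and
extractNgramsFromDocuments runs only on the documentsWithId that the insertion
actually returns, so documents that already exist are never re-processed. The
now-redundant insertMasterDocs is deleted and its remaining steps (getOrMkList,
saveDocNgramsWith) are inlined into the second transaction. Since
addDocumentsToHyperCorpus may consequently return fewer documents than it was
given, addDocumentsWithProgress now reports progress per input chunk.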

parent e7ce7bcb
Pipeline #7944 passed with stages in 63 minutes and 5 seconds
@@ -112,7 +112,7 @@ import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
 import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
 import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
 import Gargantext.Database.Schema.Node ( NodePoly(_node_id, _node_hash_id), node_hyperdata )
-import Gargantext.Database.Types ( Indexed(Indexed) )
+import Gargantext.Database.Types ( Indexed(Indexed), unIndex )
 import Gargantext.Prelude hiding (catch, onException, to)
 import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
 import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..), markFailureNoErr )
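
The newly imported unIndex is what lets the code further down recover the
document payload from the 'Indexed ContextId (Node document)' values returned
by insertDocs. As a reading aid, a minimal sketch of the assumed shape of
Indexed (the real definition lives in Gargantext.Database.Types):

    {-# LANGUAGE TemplateHaskell #-}
    -- Sketch only: assumes Indexed is an index/payload pair with
    -- TemplateHaskell-generated lenses, as suggested by its usage below.
    import Control.Lens (makeLenses, view)

    data Indexed i a = Indexed
      { _index   :: i  -- e.g. a ContextId
      , _unIndex :: a  -- the payload, e.g. a Node document
      }

    makeLenses ''Indexed

    -- Projecting payloads out of indexed values, as the diff does with
    -- 'view (unIndex . node_hyperdata) <$> documentsWithId':
    payloads :: [Indexed i a] -> [a]
    payloads = fmap (view unIndex)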
@@ -320,7 +320,7 @@ flow c mkCorpusUser la mfslw (count, docsC) jobHandle = do
            ]
   let u = userFromMkCorpusUser mkCorpusUser
   $(logLocM) DEBUG "Calling flowCorpusUser"
   flowCorpusUser (la ^. tt_lang) u userCorpusId listId c mfslw
@@ -328,8 +328,8 @@ flow c mkCorpusUser la mfslw (count, docsC) jobHandle = do
     addDocumentsWithProgress :: CorpusId -> [(Int, a)] -> m ()
     addDocumentsWithProgress userCorpusId docsChunk = do
       $(logLocM) DEBUG $ T.pack $ "calling insertDoc, ([idx], mLength) = " <> show (fst <$> docsChunk, count)
-      docs <- addDocumentsToHyperCorpus c la userCorpusId (map snd docsChunk)
-      markProgress (length docs) jobHandle
+      _docs <- addDocumentsToHyperCorpus c la userCorpusId (map snd docsChunk)
+      markProgress (length docsChunk) jobHandle

 -- | Given a list of corpus documents and a 'NodeId' identifying the 'CorpusId', adds
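
A side effect of the refactoring, visible in the hunk above: because
addDocumentsToHyperCorpus may now return fewer documents than it was given,
progress is counted per input chunk so that it stays consistent with the
total 'count'. A generic sketch of that pattern, with stand-in names rather
than the Gargantext API:

    -- Sketch only: report progress by the size of the chunk we attempted,
    -- not by how many documents the insertion returned, since deduplicated
    -- (already-existing) docs would otherwise make the job stall below 100%.
    import Data.IORef (IORef, modifyIORef')
    import Data.List.Split (chunksOf)  -- from the 'split' package

    processInChunks :: IORef Int            -- stand-in progress counter
                    -> ([doc] -> IO [doc])  -- insertion; may drop existing docs
                    -> [doc]
                    -> IO ()
    processInChunks progress insert docs =
      mapM_ step (chunksOf 100 docs)
      where
        step chunk = do
          _inserted <- insert chunk              -- possibly shorter than chunk
          modifyIORef' progress (+ length chunk) -- count what we attempted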
@@ -356,9 +356,27 @@ addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
   -- for which the ngrams extraction succeeded. At the moment errors are just
   -- logged, but in the future they could be returned upstream so that we can
   -- display a final result of how many were skipped, how many succeeded etc.
-  uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
+  (masterUserId, masterCorpusId, ids', documentsWithId) <- runDBTx $ do
+    (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster mb_hyper
+    (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) docs )
+    _ <- Doc.add masterCorpusId ids'
+    pure (masterUserId, masterCorpusId, ids', documentsWithId)
+  -- uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
+  -- Since 'documentsWithId' is of type '[Indexed ContextId (Node document)]',
+  -- we have to 'unIndex' and get the hyperdata back again, for newly added docs
+  uncommittedNgrams <- extractNgramsFromDocuments nlp la (view (unIndex . node_hyperdata) <$> documentsWithId)
+  let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId
   runDBTx $ do
-    ids <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
+    lId <- getOrMkList masterCorpusId masterUserId
+    _ <- saveDocNgramsWith lId ngramsDocsMap
+    let ids = map contextId2NodeId ids'
+    -- ids <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
     void $ Doc.add corpusId (map nodeId2ContextId ids)
     pure ids
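
To summarize the new shape of addDocumentsToHyperCorpus: one transaction
inserts (and deduplicates) the documents, ngrams extraction then runs only on
what that insertion returned, and a second transaction commits the extracted
ngrams. A minimal, self-contained analog of that flow, with stand-in names
rather than the Gargantext API:

    -- Sketch only: a toy "database" stands in for the master corpus.
    import           Data.IORef
    import qualified Data.Map.Strict as Map

    type Doc   = String
    type DocId = Int

    -- Analog of insertDocs: returns ids and payloads of *new* docs only,
    -- skipping documents that are already present.
    insertNew :: IORef (Map.Map Doc DocId) -> [Doc] -> IO [(DocId, Doc)]
    insertNew db docs = do
      seen <- readIORef db
      let news = [ d | d <- docs, Map.notMember d seen ]
          ids  = zip [Map.size seen ..] news
      modifyIORef' db (Map.union (Map.fromList [ (d, i) | (i, d) <- ids ]))
      pure ids

    -- Analog of extractNgramsFromDocuments: a toy "extraction".
    extractNgrams :: Doc -> [String]
    extractNgrams = words

    main :: IO ()
    main = do
      db <- newIORef Map.empty
      _       <- insertNew db ["old doc"]               -- pre-existing doc
      newDocs <- insertNew db ["old doc", "brand new"]  -- only one is new
      -- Extraction runs on the returned payloads only, so "old doc" is
      -- never re-processed:
      mapM_ (print . extractNgrams . snd) newDocs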
@@ -571,36 +589,6 @@ commitNgramsForDocuments ng nodes =
   let (errs, successes) = partitionEithers $ map (commitNgramsForDocument ng) nodes
   in (errs, mconcat successes)

-insertMasterDocs :: ( HasNodeError err
-                    , UniqParameters doc
-                    , FlowCorpus doc
-                    , MkCorpus c
-                    )
-                 => GargConfig
-                 -> UncommittedNgrams doc
-                 -- ^ The ngrams extracted for /all/ the documents
-                 -- and indexed by the hash of the given document.
-                 -- We can use this map to associate the document
-                 -- with the node being created.
-                 -> Maybe c
-                 -> [doc]
-                 -> DBUpdate err [DocId]
-insertMasterDocs cfg uncommittedNgrams c hs = do
-  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
-  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
-  _ <- Doc.add masterCorpusId ids'
-  -- TODO
-  -- create a corpus with database name (CSV or PubMed)
-  -- add documents to the corpus (create node_node link)
-  -- this will enable global database monitoring
-  let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId
-  lId <- getOrMkList masterCorpusId masterUserId
-  _ <- saveDocNgramsWith lId ngramsDocsMap
-  pure $ map contextId2NodeId ids'

 saveDocNgramsWith :: ListId
                   -> HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
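
The surviving commitNgramsForDocuments (context at the top of the hunk above)
uses a pattern worth noting: each document's ngrams are committed to an
Either, failures and successes are split with partitionEithers, and the
successful maps are merged with mconcat. A generic sketch of that pattern,
with stand-in types rather than the real Gargantext signatures:

    -- Sketch only: Err, Committed and commitOne are stand-ins.
    import           Data.Either     (partitionEithers)
    import qualified Data.Map.Strict as Map

    type Err       = String
    type Committed = Map.Map String Int  -- a Monoid, so results can merge

    commitOne :: Int -> Either Err Committed
    commitOne n
      | even n    = Right (Map.singleton (show n) n)
      | otherwise = Left ("no ngrams for doc " <> show n)

    commitAll :: [Int] -> ([Err], Committed)
    commitAll docs =
      let (errs, successes) = partitionEithers (map commitOne docs)
      in  (errs, mconcat successes)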