insertMasterDoc return richer type, no ngrams do not abort insertion

After dwelling a bit on the solution space, I concluded that no ngrams shouldn't result into the document being skipped. The rationale would be that if, for example, the NLP server is temporarily unavailable, we should still insert the doc and regenerate the ngrams at a later stage.

insertMasterDoc return richer type, no ngrams do not abort insertion
After dwelling a bit on the solution space, I concluded that no ngrams shouldn't result into the document being skipped. The rationale would be that if, for example, the NLP server is temporarily unavailable, we should still insert the doc and regenerate the ngrams at a later stage.
9006d1a9 · Alfredo Di Napoli · ec5cdd78 · 9006d1a9 · 9006d1a9 · 9006d1a9
Commit 9006d1a9 authored Jun 16, 2025 by Alfredo Di Napoli
5 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -745,6 +745,7 @@ common commonTestDependencies
    , cryptohash
    , directory ^>= 1.3.7.1
    , epo-api-client
+    , exceptions
    , fast-logger ^>= 3.2.2
    , filepath ^>= 1.4.2.2
    , fmt

--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -40,6 +40,13 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  , flowAnnuaire
  , buildSocialList
  , addDocumentsToHyperCorpus
+  , extractNgramsFromDocument
+  , insertMasterDoc
+  , insertMasterDocs
+  , noUncommittedNgrams
+  , InsertDocResult(..)
+  , InsertDocWarning(..)
+  , DocumentHashId(..)
  , reIndexWith
  , ngramsByDoc
@@ -354,21 +361,10 @@ addDocumentsToHyperCorpus jobHandle mb_hyper la corpusId docs = do
  -- First extract all the ngrams for the input documents via the nlp server,
  -- log errors (if any) and pass the final result to 'insertMasterDocs'.
  uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
-  (failures, ids) <- runDBTx $ do
+  ids <- insertMasterDocs jobHandle cfg uncommittedNgrams mb_hyper docs
-    (f,i) <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
+  runDBTx $ do
-    void $ Doc.add corpusId (map nodeId2ContextId i)
+    void $ Doc.add corpusId (map nodeId2ContextId ids)
-    pure (f,i)
+    pure ids
-  forM_ failures $ \failure -> do
-    case failure of
-      NgramsNotFound _mb_hashId docId -> do
-        let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
-        emitWarning msg jobHandle
-      DocumentInsertionError err -> do
-        -- FIXME(adn) we should give some kind of identifier to help the user
-        $(logLocM) ERROR $ "Error in document insertion: " <> T.pack (displayException err)
-        let msg = UnsafeMkHumanFriendlyErrorText $ "Skipping document insertion due to an iternal database error"
-        emitWarning msg jobHandle
-  pure ids
 ------------------------------------------------------------------------
 createNodes :: ( HasNodeError err
@@ -495,10 +491,14 @@ newtype UncommittedNgrams doc = UncommittedNgrams
  deriving stock Show
  deriving newtype (Semigroup, Monoid)
-data InsertDocError
+data InsertDocResult
+  = -- | If available, some warnings
+    InsertDocSucceded !DocId !(Maybe InsertDocWarning)
+    deriving (Show, Eq)
+data InsertDocWarning
  = NgramsNotFound !(Maybe DocumentHashId) !DocId
-  | DocumentInsertionError !SomeException
+  deriving (Show, Eq)
-  deriving Show
 extractNgramsFromDocument :: ( UniqParameters doc
                             , HasText doc
@@ -528,22 +528,25 @@ extractNgramsFromDocument nlpServer lang doc =
 commitNgramsForDocument :: UniqParameters doc
                        => UncommittedNgrams doc
                        -> Indexed ContextId (Node doc)
-                        -> Either InsertDocError CommittedNgrams
+                        -> Either InsertDocWarning CommittedNgrams
-commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
+commitNgramsForDocument (UncommittedNgrams ng) (Indexed cIx node) = do
  docId <- mb_docId
  case Map.lookup docId ng of
-    Nothing  -> Left  $ NgramsNotFound (Just docId) (_node_id node)
+    Nothing  -> Left  $ NgramsNotFound (Just docId) (contextId2NodeId cIx)
    Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
  where
    mb_docId = case DocumentHashId <$> _node_hash_id node of
-      Nothing  -> Left $ NgramsNotFound Nothing (_node_id node)
+      Nothing  -> Left $ NgramsNotFound Nothing (contextId2NodeId cIx)
      Just dId -> Right dId
    reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams
            -> DocumentIdWithNgrams NodeId doc ExtractedNgrams
    reIndex did =
      let (Indexed _ a) = documentWithId did
-      in did { documentWithId = Indexed (contextId2NodeId oldIx) a }
+      in did { documentWithId = Indexed (contextId2NodeId cIx) a }
+noUncommittedNgrams :: UncommittedNgrams docs
+noUncommittedNgrams = UncommittedNgrams mempty
 extractNgramsFromDocuments :: forall doc env err m.
                           ( HasText doc
@@ -570,9 +573,10 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
                    , FlowCorpus doc
                    , MkCorpus   c
                    , IsDBCmd env err m
-                    , MonadCatch m
+                    , MonadJobStatus m
                    )
-                 => GargConfig
+                 => JobHandle m
+                 -> GargConfig
                 -> UncommittedNgrams doc
                 -- ^ The ngrams extracted for /all/ the documents
                 -- and indexed by the hash of the given document.
@@ -580,20 +584,22 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
                 -- with the node being created.
                 -> Maybe c
                 -> [doc]
-                 -> m ([InsertDocError], [DocId])
+                 -> m [DocId]
-insertMasterDocs cfg uncommittedNgrams c docs =
+insertMasterDocs jobHandle cfg uncommittedNgrams c docs =
-  bimap reverse reverse <$> foldlM go (mempty, mempty) docs
+  reverse <$> foldlM go mempty docs
  where
-    go :: ([InsertDocError], [DocId]) -> doc -> m ([InsertDocError], [DocId])
+    go :: [DocId] -> doc -> m [DocId]
-    go (!errs, !documents) doc = do
+    go !documents doc = do
-      res <- try $ runDBTx (insertMasterDoc cfg uncommittedNgrams c doc)
+      res <- runDBTx (insertMasterDoc cfg uncommittedNgrams c doc)
      case res of
-        Left err
+        InsertDocSucceded d Nothing ->
-          -> pure (DocumentInsertionError err : errs, documents)
+          pure (d : documents)
-        Right (Left err)
+        InsertDocSucceded d (Just warning) -> do
-          -> pure (err : errs, documents)
+          case warning of
-        Right (Right d)
+             NgramsNotFound _mb_hashId docId -> do
-          -> pure (errs, d : documents)
+               let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
+               emitWarning msg jobHandle
+          pure (d : documents)
 insertMasterDoc :: ( HasNodeError err
                    , UniqParameters doc
@@ -608,7 +614,7 @@ insertMasterDoc :: ( HasNodeError err
                 -- with the node being created.
                 -> Maybe c
                 -> doc
-                 -> DBUpdate err (Either InsertDocError DocId)
+                 -> DBUpdate err InsertDocResult
 insertMasterDoc cfg uncommittedNgrams c h = do
  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
  documentWithId <- insertDoc masterUserId masterCorpusId (toNode masterUserId Nothing h)
@@ -620,11 +626,11 @@ insertMasterDoc cfg uncommittedNgrams c h = do
  -- this will enable global database monitoring
  case commitNgramsForDocument uncommittedNgrams documentWithId of
-    Left failed -> pure $ Left failed
+    Left failed -> pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) (Just failed)
    Right ngramsDocsMap -> do
     lId <- getOrMkList masterCorpusId masterUserId
     _ <- saveDocNgramsWith lId ngramsDocsMap
-     pure $ Right (contextId2NodeId $ _index documentWithId)
+     pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) Nothing
 saveDocNgramsWith :: ListId

--- a/test/Test/Database/Operations.hs
+++ b/test/Test/Database/Operations.hs
@@ -66,6 +66,9 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
        it "Can add language to Corpus" corpusAddLanguage
        describe "With test documents" $ beforeWith addCorpusDocuments $ do
          it "Can add documents to a Corpus" corpusAddDocuments
+          it "Correctly return an error if uncommitted ngrams map is empty" corpusAddDocNgramsEmpty
+          it "Correctly return the same Id if a document is inserted twice" corpusAddDocAlreadyExisting
+          it "Can survive failures in a batch" corpusAddDocBatchSurvival
          describe "Corpus search" $ do
            it "Can stem query terms" stemmingTest
            it "Can return the number of docs in a corpus" corpusReturnCount
@@ -81,7 +84,7 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
        it "Publishes the root and its first level children"  testPublishRecursiveFirstLevel
        it "Publishes the root and its recursive children"  testPublishRecursiveNLevel
        it "Publishes in a lenient way but it's still considered read-only"  testPublishLenientWorks
 nodeStoryTests :: Spec
 nodeStoryTests = sequential $
  -- run 'withTestDB' before _every_ test item

--- a/test/Test/Database/Operations/DocumentSearch.hs
+++ b/test/Test/Database/Operations/DocumentSearch.hs
@@ -9,14 +9,19 @@ Portability : POSIX
 -}
 {-# LANGUAGE QuasiQuotes       #-}
+{-# LANGUAGE ScopedTypeVariables #-}
 module Test.Database.Operations.DocumentSearch where
+import Control.Lens
 import Control.Monad.Reader
 import Data.Aeson.QQ.Simple
 import Data.Aeson.Types
+import Data.Maybe (fromJust)
 import Data.Text qualified as T
 import Gargantext.Core
+import Gargantext.Core.Config
+import Gargantext.Core.NLP (HasNLPServer(..))
 import Gargantext.Core.Text.Corpus.Query qualified as API
 import Gargantext.Core.Text.Terms.Mono.Stem
 import Gargantext.Core.Types
@@ -30,15 +35,15 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document
 import Gargantext.Database.Prelude
 import Gargantext.Database.Query.Facet
 import Gargantext.Database.Query.Table.Node
-import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
+import Gargantext.Database.Query.Table.Node.Error
-import Gargantext.Database.Query.Table.Node.Error (nodeErrorWith)
-import Gargantext.Database.Query.Tree.Root
 import Gargantext.Database.Query.Table.NodeContext (selectCountDocs)
+import Gargantext.Database.Query.Tree.Root
 import Gargantext.Database.Schema.Node (NodePoly(..))
+import Network.URI (parseURI)
 import Prelude
 import Test.Database.Types
-import Test.Hspec.Expectations
 import Test.HUnit hiding (Node)
+import Test.Hspec.Expectations
 exampleDocument_01 :: HyperdataDocument
@@ -114,6 +119,31 @@ exampleDocument_04 = either error id $ parseEither parseJSON $ [aesonQQ|
 }
 |]
+exampleDocument_05 :: HyperdataDocument
+exampleDocument_05 = either error id $ parseEither parseJSON $ [aesonQQ|
+{
+    "bdd": "Arxiv"
+  , "doi": ""
+  , "url": "http://arxiv.org/pdf/1407.5670v1"
+  , "title": "GQL for Dummies"
+  , "source": ""
+  , "authors": "Ennio Annio"
+  , "abstract": "  This article provides an introduction to GraphQL" , "institutes": ""
+  , "language_iso2": "EN"
+  , "publication_date": "2014-07-21T21:20:31Z"
+  , "publication_year": 2014
+}
+|]
+exampleDocument_06 :: HyperdataDocument
+exampleDocument_06 = exampleDocument_05 { _hd_title = Just "GQL Part 1" }
+exampleDocument_07 :: HyperdataDocument
+exampleDocument_07 = exampleDocument_05 { _hd_title = Just "GQL Part 2" }
+exampleDocument_08 :: HyperdataDocument
+exampleDocument_08 = exampleDocument_05 { _hd_title = Just "GQL Part 3" }
 getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus)
 getCorporaWithParentIdOrFail parentId = do
  xs <- getCorporaWithParentId parentId
@@ -131,13 +161,74 @@ addCorpusDocuments env = runTestMonad env $ do
  let lang = EN
  let docs = [exampleDocument_01, exampleDocument_02, exampleDocument_03, exampleDocument_04]
-  _ <- addDocumentsToHyperCorpus (test_job_handle env)
+  ids <- addDocumentsToHyperCorpus (test_job_handle env)
                                 (Just $ _node_hyperdata $ corpus)
                                 (Multi lang)
                                 corpusId
                                 docs
+  liftIO $ ids `shouldBe` [ UnsafeMkNodeId 1, UnsafeMkNodeId 2, UnsafeMkNodeId 3, UnsafeMkNodeId 4 ]
  pure env
+corpusAddDocNgramsEmpty :: TestEnv -> Assertion
+corpusAddDocNgramsEmpty env = runTestMonad env $ do
+  cfg <- view hasConfig
+  parentId <- runDBQuery $ getRootId (UserName userMaster)
+  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
+  let doc = exampleDocument_05
+  res <- runDBTx $ insertMasterDoc cfg noUncommittedNgrams (Just $ _node_hyperdata corpus) doc
+  let wrn = NgramsNotFound (Just (DocumentHashId "\\x0b41caa65782b9570530910d1942a21bb1859386ee4c976e7b4cd0624ee966f3")) (UnsafeMkNodeId 5)
+  liftIO $ res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) (Just wrn)
+corpusAddDocAlreadyExisting :: TestEnv -> Assertion
+corpusAddDocAlreadyExisting env = runTestMonad env $ do
+  cfg <- view hasConfig
+  parentId <- runDBQuery $ getRootId (UserName userMaster)
+  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
+  let doc = exampleDocument_05
+  let la  = Mono EN
+  -- Let's use a bogus NLP server port
+  let nlp = NLPServerConfig CoreNLP (fromJust $ parseURI "http://localhost:9999")
+  uncommittedNgrams <- extractNgramsFromDocument nlp la doc
+  oldCount <- docsInCorpusCount
+  res <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
+  interimCount <- docsInCorpusCount
+  liftIO $ do
+    interimCount `shouldBe` oldCount + 1
+    res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
+  -- The second time the node gets cached and returned.
+  res1 <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
+  newCount <- docsInCorpusCount
+  liftIO $ do
+    newCount `shouldBe` interimCount
+    res1 `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
+-- | We test that if we add 3 documents, out of which the middle one doesn't have any keyed ngrams
+-- the other two are still persisted.
+corpusAddDocBatchSurvival :: TestEnv -> Assertion
+corpusAddDocBatchSurvival env = runTestMonad env $ do
+  cfg <- view hasConfig
+  parentId <- runDBQuery $ getRootId (UserName userMaster)
+  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
+  let la  = Mono EN
+  nlp <- view (nlpServerGet (_tt_lang la))
+  oldCount <- docsInCorpusCount
+  let docs = [exampleDocument_06, exampleDocument_07, exampleDocument_08 ]
+  uNgrams1 <- extractNgramsFromDocument nlp la exampleDocument_06
+  uNgrams2 <- extractNgramsFromDocument nlp la exampleDocument_08
+  savedDocs <- insertMasterDocs (test_job_handle env) cfg (uNgrams1 <> uNgrams2) (Just $ _node_hyperdata corpus) docs
+  c' <- docsInCorpusCount
+  liftIO $ do
+    -- failures  `shouldBe` [DocumentInsertionError "InternalNodeError Cannot make node due to: Couldn't find the associated ngrams for document with hash \\x666b8e7bfd7c0af37d630e1097791f7ba438a669ecb6d1cb38014edd0b7a2977, therefore the added document won't have any ngrams."]
+    c' `shouldBe` oldCount + 3
+    savedDocs `shouldBe` [ UnsafeMkNodeId 5, UnsafeMkNodeId 6, UnsafeMkNodeId 7 ]
 corpusAddDocuments :: TestEnv -> Assertion
 corpusAddDocuments env = runTestMonad env $ do
@@ -247,11 +338,13 @@ corpusSearchDB01 env = do
    liftIO $ do
      length results `shouldBe` 0 -- doesn't exist, we just check that proper to_tsquery is called
+docsInCorpusCount :: TestMonad Int
+docsInCorpusCount = runDBQuery $ do
+  parentId <- getRootId (UserName userMaster)
+  corpus   <- getCorporaWithParentIdOrFail parentId
+  selectCountDocs (_node_id corpus)
 corpusReturnCount :: TestEnv -> Assertion
 corpusReturnCount env = do
-  runTestMonad env $ do
+  count <- runTestMonad env $ docsInCorpusCount
-    count <- runDBQuery $ do
+  liftIO $ count @?= 4
-      parentId <- getRootId (UserName userMaster)
-      corpus   <- getCorporaWithParentIdOrFail parentId
-      selectCountDocs (_node_id corpus)
-    liftIO $ count @?= 4
--- a/test/Test/Database/Types.hs
+++ b/test/Test/Database/Types.hs
@@ -94,7 +94,7 @@ runTestMonadM :: Show err => env -> TestMonadM env err a -> IO a
 runTestMonadM env m = do
  res <- flip runReaderT env . runExceptT . _TestMonad $ m
  case res of
-    Left err -> throwIO $ userError (show err)
+    Left err -> throwIO $ userError ("runTestMonadM: " <> show err)
    Right x  -> pure x
 runTestMonad :: TestEnv -> TestMonadM TestEnv BackendInternalError a -> IO a