[worker] fix an unfortunate coincidence of various async issues

This is described in this comment: #477 (comment 14458). I am pasting it again here, for history:

- the job timeout was only 30 seconds and this is a big zip file, so the job timed out in the worker
- however, a catch-all exception handler was recently added (https://gitlab.iscpif.fr/gargantext/haskell-gargantext/blame/dev/src/Gargantext/Database/Action/Flow.hs#L490); it swallowed the timeout exception, so the timeout never propagated and the worker continued happily
- the job finished normally (most probably)
- the job was then restarted, because the default strategy for timeouts is to restart a job
- for sending files, we use postgres large objects because that keeps our JSONs small
- when the job finishes, it permanently deletes the large object so that we don't leave large, unused blob data behind
- however, the job was restarted and there was no longer a large object to work on
- you got some SQL error, but that wasn't the root cause

Solution:

- don't swallow every exception: explicitly handle (re-throw) `Timeout` and `KillWorkerSafely`
- increase the job timeout for file upload
- change the timeout strategy for file upload to `TSDelete`, i.e. don't retry that job anymore

[worker] fix an unfortunate coincidence of various async issues
This is described in this comment: #477 (comment 14458). I am pasting it again here, for history:

- the job timeout was only 30 seconds and this is a big zip file, so the job timed out in the worker
- however, a catch-all exception handler was recently added (https://gitlab.iscpif.fr/gargantext/haskell-gargantext/blame/dev/src/Gargantext/Database/Action/Flow.hs#L490); it swallowed the timeout exception, so the timeout never propagated and the worker continued happily
- the job finished normally (most probably)
- the job was then restarted, because the default strategy for timeouts is to restart a job
- for sending files, we use postgres large objects because that keeps our JSONs small
- when the job finishes, it permanently deletes the large object so that we don't leave large, unused blob data behind
- however, the job was restarted and there was no longer a large object to work on
- you got some SQL error, but that wasn't the root cause

Solution:

- don't swallow every exception: explicitly handle (re-throw) `Timeout` and `KillWorkerSafely`
- increase the job timeout for file upload
- change the timeout strategy for file upload to `TSDelete`, i.e. don't retry that job anymore
406cd89e · Przemyslaw Kaminski · 7c42b188 · 406cd89e · 406cd89e · 406cd89e
Verified Commit 406cd89e authored Jun 13, 2025 by Przemyslaw Kaminski
Showing with 32 additions and 16 deletions

cabal.project cabal.project +1 -1

gargantext.cabal gargantext.cabal +1 -0

Jobs.hs src/Gargantext/Core/Worker/Jobs.hs +10 -3

Flow.hs src/Gargantext/Database/Action/Flow.hs +20 -12

No files found.
--- a/cabal.project
+++ b/cabal.project
@@ -156,7 +156,7 @@ source-repository-package
 source-repository-package
    type: git
    location: https://gitlab.iscpif.fr/gargantext/haskell-bee
-    tag: 4a9c709613554eed0189b486de2126c18797088c
+    tag: eb559a29212ae5bb27dc138f1060494b785e1afb
    subdir: haskell-bee/
            haskell-bee-pgmq/
            haskell-bee-tests/

--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -647,6 +647,7 @@ library
    , transformers-base ^>= 0.4.6
    , tree-diff
    , tuple ^>= 0.3.0.2
+    , unbounded-delays >= 0.1.1 && < 0.2
    , unicode-collation >= 0.1.3.5
    , unordered-containers ^>= 0.2.16.0
    -- needed for Worker / System.Posix.Signals

--- a/src/Gargantext/Core/Worker/Jobs.hs
+++ b/src/Gargantext/Core/Worker/Jobs.hs
@@ -15,6 +15,7 @@ module Gargantext.Core.Worker.Jobs where


 import Async.Worker qualified as W
+import Async.Worker.Types qualified as WT
 import Control.Lens (view)
 import Gargantext.Core.Config (gc_database_config, gc_worker, HasConfig(..), GargConfig, gc_logging)
 import Gargantext.Core.Config.Worker (WorkerSettings(..), WorkerDefinition(..))
@@ -51,13 +52,19 @@ sendJobWithCfg gcConfig job = do

 -- | We want to fine-tune job metadata parameters, for each job type
 updateJobData :: Job -> SendJob -> SendJob
-updateJobData (AddCorpusTempFileAsync {}) sj = sj { W.timeout = 3000 }
+updateJobData (AddCorpusTempFileAsync {}) sj = sj { W.timeout = 3000
+                                                  , W.toStrat = WT.TSDelete
+                                                  , W.resendOnKill = False }
 updateJobData (AddCorpusWithQuery {}) sj = sj { W.timeout = 3000 }
 updateJobData (AddToAnnuaireWithForm {}) sj = sj { W.timeout = 3000 }
-updateJobData (AddWithFile {}) sj = sj { W.timeout = 3000 }
+updateJobData (AddWithFile {}) sj = sj { W.timeout = 3000
+                                       , W.toStrat = WT.TSDelete
+                                       , W.resendOnKill = False }
 updateJobData (DocumentsFromWriteNodes {}) sj = sj { W.timeout = 3000 }
 updateJobData (FrameCalcUpload {}) sj = sj { W.timeout = 3000 }
-updateJobData (JSONPost {}) sj = sj { W.timeout = 3000 }
+updateJobData (JSONPost {}) sj = sj { W.timeout = 3000
+                                    , W.toStrat = WT.TSDelete
+                                    , W.resendOnKill = False }
 updateJobData (NgramsPostCharts {}) sj = sj { W.timeout = 3000 }
 updateJobData (RecomputeGraph {}) sj = sj { W.timeout = 3000 }
 updateJobData (UpdateNode {}) sj = sj { W.timeout = 3000 }

--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -54,9 +54,11 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  )
    where

+import Async.Worker qualified as W
 import Conduit
+import Control.Concurrent.Timeout qualified as Timeout
+import Control.Exception.Safe qualified as CES
 import Control.Lens ( to, view )
-import Control.Monad.Catch
 import Data.Conduit qualified as C
 import Data.Conduit.Internal (zipSources)
 import Data.Conduit.List qualified as CL
@@ -172,7 +174,7 @@ flowDataText :: forall env err m.
                , HasTreeError err
                , HasValidationError err
                , MonadJobStatus m
-                , MonadCatch m
+                , CES.MonadCatch m
                , HasCentralExchangeNotification env
                )
                => User
@@ -207,7 +209,7 @@ flowAnnuaire :: ( IsDBCmd env err m
                , HasTreeError err
                , HasValidationError err
                , MonadJobStatus m
-                , MonadCatch m
+                , CES.MonadCatch m
                , HasCentralExchangeNotification env )
             => MkCorpusUser
             -> TermType Lang
@@ -227,7 +229,7 @@ flowCorpusFile :: ( IsDBCmd env err m
                  , HasTreeError err
                  , HasValidationError err
                  , MonadJobStatus m
-                  , MonadCatch m
+                  , CES.MonadCatch m
                  , HasCentralExchangeNotification env )
           => MkCorpusUser
           -> TermType Lang
@@ -257,7 +259,7 @@ flowCorpus :: ( IsDBCmd env err m
              , HasValidationError err
              , FlowCorpus a
              , MonadJobStatus m
-              , MonadCatch m
+              , CES.MonadCatch m
              , HasCentralExchangeNotification env, Show a )
           => MkCorpusUser
           -> TermType Lang
@@ -279,7 +281,8 @@ flow :: forall env err m a c.
        , MkCorpus c
        , MonadJobStatus m
        , HasCentralExchangeNotification env
-        , MonadCatch m, Show a
+        , CES.MonadCatch m
+        , Show a
        )
        => Maybe c
        -> MkCorpusUser
@@ -319,7 +322,8 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
                             , FlowCorpus document
                             , MkCorpus corpus
                             , MonadLogger m
-                             , MonadCatch m, Show document
+                             , CES.MonadCatch m
+                             , Show document
                             )
                             => Maybe corpus
                             -> TermType Lang
@@ -474,7 +478,7 @@ extractNgramsFromDocument :: ( UniqParameters doc
                             , ExtractNgrams doc
                             , IsDBCmd err env m
                             , MonadLogger m
-                             , MonadCatch m
+                             , CES.MonadCatch m
                             )
                          => NLPServerConfig
                          -> TermType Lang
@@ -487,9 +491,13 @@ extractNgramsFromDocument nlpServer lang doc =
  -- will still be added to the corpus and we can try to regen the ngrams at a later stage.
  UncommittedNgrams . Map.singleton docId <$>
    (documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
-     `catch` \(e :: SomeException) -> do
-       $(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
-       pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
+     `CES.catches`
+     [ CES.Handler $ \(e :: Timeout.Timeout) -> CES.throw e
+     , CES.Handler $ \(e :: W.KillWorkerSafely) -> CES.throw e
+     , CES.Handler $ \(e :: CES.SomeException) -> do
+         $(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
+         pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
+     ]
    )
  where
    docId = DocumentHashId $ newUniqIdHash doc
@@ -520,7 +528,7 @@ extractNgramsFromDocuments :: forall doc env err m.
                           , ExtractNgrams doc
                           , IsDBCmd env err m
                           , MonadLogger m
-                           , MonadCatch m
+                           , CES.MonadCatch m
                           )
                           => NLPServerConfig
                           -> TermType Lang