diff --git a/gargantext.ini_toModify b/gargantext.ini_toModify index d372000f675917bb590e48efecf3f9d8633d2989..c286309ebdb5aee603621c84c42d872c0bb087ae 100644 --- a/gargantext.ini_toModify +++ b/gargantext.ini_toModify @@ -33,6 +33,7 @@ FRAME_VISIO_URL = URL_TO_CHANGE FRAME_SEARX_URL = URL_TO_CHANGE FRAME_ISTEX_URL = URL_TO_CHANGE +MAX_DOCS_PARSERS = 1000000 MAX_DOCS_SCRAPERS = 10000 [server] diff --git a/src/Gargantext/API/Job.hs b/src/Gargantext/API/Job.hs index 1dceb151590fd5ee8c0a3bbfec88e88c9a017678..d0da5e7f8c90df764043ee613805e4b161add2dd 100644 --- a/src/Gargantext/API/Job.hs +++ b/src/Gargantext/API/Job.hs @@ -3,6 +3,7 @@ module Gargantext.API.Job where import Control.Lens (over, _Just) import Data.IORef import Data.Maybe +import qualified Data.Text as T import Gargantext.Prelude @@ -16,6 +17,14 @@ jobLogInit rem = , _scst_failed = Just 0 , _scst_events = Just [] } +addEvent :: T.Text -> T.Text -> JobLog -> JobLog +addEvent level message (JobLog { _scst_events = mEvts, .. }) = JobLog { _scst_events = Just (evts <> [ newEvt ]), .. } + where + evts = fromMaybe [] mEvts + newEvt = ScraperEvent { _scev_message = Just message + , _scev_level = Just level + , _scev_date = Nothing } + jobLogSuccess :: JobLog -> JobLog jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $ over (scst_remaining . _Just) (\x -> x - 1) jl @@ -38,6 +47,9 @@ jobLogFailTotal (JobLog { _scst_succeeded = mSucc Nothing -> (Nothing, mFail) Just rem -> (Just 0, (+ rem) <$> mFail) +jobLogFailTotalWithMessage :: T.Text -> JobLog -> JobLog +jobLogFailTotalWithMessage message jl = addEvent "ERROR" message $ jobLogFailTotal jl + jobLogEvt :: JobLog -> ScraperEvent -> JobLog jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl diff --git a/src/Gargantext/API/Node/Corpus/New.hs b/src/Gargantext/API/Node/Corpus/New.hs index 4ecc9d5222329f8f6774c83b9da0bb66c898aa4e..278042d4e8d9540b0e5e0fd34308683a34c62b49 100644 --- a/src/Gargantext/API/Node/Corpus/New.hs +++ b/src/Gargantext/API/Node/Corpus/New.hs @@ -39,7 +39,7 @@ import Gargantext.Prelude import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events) import Gargantext.API.Admin.Types (HasSettings) -import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal) +import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage) import Gargantext.API.Node.Corpus.New.File import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Types @@ -57,11 +57,12 @@ import Gargantext.Database.Action.Node (mkNodeWithParent) import Gargantext.Database.Action.User (getUserId) import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId) +import Gargantext.Database.Prelude (hasConfig) import Gargantext.Database.Query.Table.Node (getNodeWith) import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata) import Gargantext.Database.Schema.Node (node_hyperdata) import qualified Gargantext.Database.GargDB as GargDB - +import Gargantext.Prelude.Config (gc_max_docs_parsers) ------------------------------------------------------------------------ {- data Query = Query { query_query :: Text @@ -240,7 +241,7 @@ type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint" :> "async" :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog -addToCorpusWithForm :: FlowCmdM env err m +addToCorpusWithForm :: (FlowCmdM env err m) => User -> CorpusId -> NewWithForm @@ -270,7 +271,21 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do Right docs' -> do -- TODO Add progress (jobStatus) update for docs - this is a -- long action - let docs = splitEvery 500 $ take 1000000 docs' + limit' <- view $ hasConfig . gc_max_docs_parsers + let limit = fromIntegral limit' + if length docs' > limit then do + printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs') + let panicMsg' = [ "[addToCorpusWithForm] number of docs (" + , show $ length docs' + , ") exceeds the MAX_DOCS_PARSERS limit (" + , show limit + , ")" ] + let panicMsg = T.concat $ T.pack <$> panicMsg' + logStatus $ jobLogFailTotalWithMessage panicMsg jobLog + panic panicMsg + else + pure () + let docs = splitEvery 500 $ take limit docs' printDebug "Parsing corpus finished : " cid logStatus jobLog2 diff --git a/src/Gargantext/API/Node/FrameCalcUpload.hs b/src/Gargantext/API/Node/FrameCalcUpload.hs index f54101a2077e611e30966195392bc65e74a9eeda..7238a195b32d967dbc223b692c4225c862cf5744 100644 --- a/src/Gargantext/API/Node/FrameCalcUpload.hs +++ b/src/Gargantext/API/Node/FrameCalcUpload.hs @@ -27,6 +27,7 @@ import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Database.Action.Flow.Types import Gargantext.Database.Admin.Types.Hyperdata.Frame import Gargantext.Database.Admin.Types.Node +import Gargantext.Database.Prelude (HasConfig) import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith) import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Prelude @@ -53,7 +54,7 @@ frameCalcUploadAPI uId nId = ) -frameCalcUploadAsync :: FlowCmdM env err m +frameCalcUploadAsync :: (HasConfig env, FlowCmdM env err m) => UserId -> NodeId -> FrameCalcUpload diff --git a/stack.yaml b/stack.yaml index b17c56f7748f9c70c7b2d21a58ba1d246db7a66c..db2fa9498ff040dd6a0a096951a8f53b1b8390a6 100644 --- a/stack.yaml +++ b/stack.yaml @@ -27,8 +27,9 @@ allow-newer: true # "$everything": -haddock extra-deps: -- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git - commit: 3e32ec3aca71eb326805355d3a99b9288dc342ee +- #git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git + git: https://gitlab.iscpif.fr/cgenie/haskell-gargantext-prelude.git + commit: 35b09629a658fc16cc9ff63e7591e58511cd98a7 # Data Mining Libs - git: https://github.com/delanoe/data-time-segment.git commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef