Commit e1dd0752 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[upload zip] add MAX_DOCS_PARSERS limit to ZIP/CSV upload

parent 5e8bc9f2
Pipeline #1967 failed with stage
in 9 minutes and 57 seconds
......@@ -33,6 +33,7 @@ FRAME_VISIO_URL = URL_TO_CHANGE
FRAME_SEARX_URL = URL_TO_CHANGE
FRAME_ISTEX_URL = URL_TO_CHANGE
MAX_DOCS_PARSERS = 1000000
MAX_DOCS_SCRAPERS = 10000
[server]
......
......@@ -3,6 +3,7 @@ module Gargantext.API.Job where
import Control.Lens (over, _Just)
import Data.IORef
import Data.Maybe
import qualified Data.Text as T
import Gargantext.Prelude
......@@ -16,6 +17,14 @@ jobLogInit rem =
, _scst_failed = Just 0
, _scst_events = Just [] }
addEvent :: T.Text -> T.Text -> JobLog -> JobLog
addEvent level message (JobLog { _scst_events = mEvts, .. }) = JobLog { _scst_events = Just (evts <> [ newEvt ]), .. }
where
evts = fromMaybe [] mEvts
newEvt = ScraperEvent { _scev_message = Just message
, _scev_level = Just level
, _scev_date = Nothing }
jobLogSuccess :: JobLog -> JobLog
jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $
over (scst_remaining . _Just) (\x -> x - 1) jl
......@@ -38,6 +47,9 @@ jobLogFailTotal (JobLog { _scst_succeeded = mSucc
Nothing -> (Nothing, mFail)
Just rem -> (Just 0, (+ rem) <$> mFail)
jobLogFailTotalWithMessage :: T.Text -> JobLog -> JobLog
jobLogFailTotalWithMessage message jl = addEvent "ERROR" message $ jobLogFailTotal jl
jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl
......
......@@ -39,7 +39,7 @@ import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
......@@ -57,11 +57,12 @@ import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
......@@ -240,7 +241,7 @@ type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
addToCorpusWithForm :: FlowCmdM env err m
addToCorpusWithForm :: (FlowCmdM env err m)
=> User
-> CorpusId
-> NewWithForm
......@@ -270,7 +271,21 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Right docs' -> do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
let docs = splitEvery 500 $ take 1000000 docs'
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
if length docs' > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
, show $ length docs'
, ") exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure ()
let docs = splitEvery 500 $ take limit docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
......
......@@ -27,6 +27,7 @@ import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (HasConfig)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
......@@ -53,7 +54,7 @@ frameCalcUploadAPI uId nId =
)
frameCalcUploadAsync :: FlowCmdM env err m
frameCalcUploadAsync :: (HasConfig env, FlowCmdM env err m)
=> UserId
-> NodeId
-> FrameCalcUpload
......
......@@ -27,8 +27,9 @@ allow-newer: true
# "$everything": -haddock
extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
commit: 3e32ec3aca71eb326805355d3a99b9288dc342ee
- #git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
git: https://gitlab.iscpif.fr/cgenie/haskell-gargantext-prelude.git
commit: 35b09629a658fc16cc9ff63e7591e58511cd98a7
# Data Mining Libs
- git: https://github.com/delanoe/data-time-segment.git
commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment