Commit 91228141 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch '81-dev-zip-upload' of...

Merge branch '81-dev-zip-upload' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext into dev-merge
parents 2a45a454 e1dd0752
use_nix
...@@ -11,6 +11,7 @@ profiling ...@@ -11,6 +11,7 @@ profiling
# Stack # Stack
.stack-work .stack-work
dist-newstyle
# Emacs # Emacs
TAGS TAGS
...@@ -33,5 +34,7 @@ _darcs ...@@ -33,5 +34,7 @@ _darcs
# Runtime # Runtime
# Repo # Repo
repos
repo.json* repo.json*
tmp*repo*json tmp*repo*json
data
...@@ -33,6 +33,7 @@ FRAME_VISIO_URL = URL_TO_CHANGE ...@@ -33,6 +33,7 @@ FRAME_VISIO_URL = URL_TO_CHANGE
FRAME_SEARX_URL = URL_TO_CHANGE FRAME_SEARX_URL = URL_TO_CHANGE
FRAME_ISTEX_URL = URL_TO_CHANGE FRAME_ISTEX_URL = URL_TO_CHANGE
MAX_DOCS_PARSERS = 1000000
MAX_DOCS_SCRAPERS = 10000 MAX_DOCS_SCRAPERS = 10000
[server] [server]
......
...@@ -3,6 +3,7 @@ module Gargantext.API.Job where ...@@ -3,6 +3,7 @@ module Gargantext.API.Job where
import Control.Lens (over, _Just) import Control.Lens (over, _Just)
import Data.IORef import Data.IORef
import Data.Maybe import Data.Maybe
import qualified Data.Text as T
import Gargantext.Prelude import Gargantext.Prelude
...@@ -16,6 +17,14 @@ jobLogInit rem = ...@@ -16,6 +17,14 @@ jobLogInit rem =
, _scst_failed = Just 0 , _scst_failed = Just 0
, _scst_events = Just [] } , _scst_events = Just [] }
addEvent :: T.Text -> T.Text -> JobLog -> JobLog
addEvent level message (JobLog { _scst_events = mEvts, .. }) = JobLog { _scst_events = Just (evts <> [ newEvt ]), .. }
where
evts = fromMaybe [] mEvts
newEvt = ScraperEvent { _scev_message = Just message
, _scev_level = Just level
, _scev_date = Nothing }
jobLogSuccess :: JobLog -> JobLog jobLogSuccess :: JobLog -> JobLog
jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $ jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $
over (scst_remaining . _Just) (\x -> x - 1) jl over (scst_remaining . _Just) (\x -> x - 1) jl
...@@ -38,6 +47,9 @@ jobLogFailTotal (JobLog { _scst_succeeded = mSucc ...@@ -38,6 +47,9 @@ jobLogFailTotal (JobLog { _scst_succeeded = mSucc
Nothing -> (Nothing, mFail) Nothing -> (Nothing, mFail)
Just rem -> (Just 0, (+ rem) <$> mFail) Just rem -> (Just 0, (+ rem) <$> mFail)
jobLogFailTotalWithMessage :: T.Text -> JobLog -> JobLog
jobLogFailTotalWithMessage message jl = addEvent "ERROR" message $ jobLogFailTotal jl
jobLogEvt :: JobLog -> ScraperEvent -> JobLog jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl
......
...@@ -21,25 +21,25 @@ module Gargantext.API.Node.Corpus.New ...@@ -21,25 +21,25 @@ module Gargantext.API.Node.Corpus.New
import Control.Lens hiding (elements, Empty) import Control.Lens hiding (elements, Empty)
import Data.Aeson import Data.Aeson
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Either import Data.Either
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Swagger import Data.Swagger
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import GHC.Generics (Generic) import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant import Servant
import Servant.Job.Utils (jsonOptions) import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart -- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements) -- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary import Test.QuickCheck.Arbitrary
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal) import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Corpus.Types
...@@ -57,11 +57,12 @@ import Gargantext.Database.Action.Node (mkNodeWithParent) ...@@ -57,11 +57,12 @@ import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId) import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId) import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith) import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata) import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------ ------------------------------------------------------------------------
{- {-
data Query = Query { query_query :: Text data Query = Query { query_query :: Text
...@@ -240,7 +241,7 @@ type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint" ...@@ -240,7 +241,7 @@ type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
:> "async" :> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
addToCorpusWithForm :: FlowCmdM env err m addToCorpusWithForm :: (FlowCmdM env err m)
=> User => User
-> CorpusId -> CorpusId
-> NewWithForm -> NewWithForm
...@@ -260,10 +261,31 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -260,10 +261,31 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
ZIP -> Parser.parseFormat Parser.ZIP ZIP -> Parser.parseFormat Parser.ZIP
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
eDocs <- liftBase $ parse $ cs d let data' = case ft of
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded
_ -> cs d
eDocs <- liftBase $ parse data'
case eDocs of case eDocs of
Right docs' -> do Right docs' -> do
let docs = splitEvery 500 $ take 1000000 docs' -- TODO Add progress (jobStatus) update for docs - this is a
-- long action
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
if length docs' > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
, show $ length docs'
, ") exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure ()
let docs = splitEvery 500 $ take limit docs'
printDebug "Parsing corpus finished : " cid printDebug "Parsing corpus finished : " cid
logStatus jobLog2 logStatus jobLog2
...@@ -283,20 +305,19 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -283,20 +305,19 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
logStatus jobLog3 logStatus jobLog3
pure $ jobLog3 pure $ jobLog3
Left e -> do Left e -> do
printDebug "Error" e printDebug "[addToCorpusWithForm] parse error" e
let evt = ScraperEvent { _scev_message = Just $ T.pack e
, _scev_level = Just "ERROR"
, _scev_date = Nothing }
logStatus jobLogE logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
pure jobLogE pure jobLogE
where where
jobLog2 = jobLogSuccess jobLog jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSuccess jobLog2 jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog jobLogE = jobLogFailTotal jobLog
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
contents <- readFile fp
Parser.parseFormat Parser.CsvGargV3 $ cs contents
{- {-
addToCorpusWithFile :: FlowCmdM env err m addToCorpusWithFile :: FlowCmdM env err m
=> CorpusId => CorpusId
......
...@@ -27,6 +27,7 @@ import Gargantext.Core.Types.Individu (User(..)) ...@@ -27,6 +27,7 @@ import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow.Types import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Hyperdata.Frame import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (HasConfig)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith) import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude import Gargantext.Prelude
...@@ -53,7 +54,7 @@ frameCalcUploadAPI uId nId = ...@@ -53,7 +54,7 @@ frameCalcUploadAPI uId nId =
) )
frameCalcUploadAsync :: FlowCmdM env err m frameCalcUploadAsync :: (HasConfig env, FlowCmdM env err m)
=> UserId => UserId
-> NodeId -> NodeId
-> FrameCalcUpload -> FrameCalcUpload
......
...@@ -24,7 +24,7 @@ import Gargantext.API.Node.Corpus.New.File (FileType) ...@@ -24,7 +24,7 @@ import Gargantext.API.Node.Corpus.New.File (FileType)
------------------------------------------------------- -------------------------------------------------------
data NewWithForm = NewWithForm data NewWithForm = NewWithForm
{ _wf_filetype :: !FileType { _wf_filetype :: !FileType
, _wf_data :: !Text , _wf_data :: !Text -- NOTE for binary files, this represents base-64 data
, _wf_lang :: !(Maybe Lang) , _wf_lang :: !(Maybe Lang)
, _wf_name :: !Text , _wf_name :: !Text
} deriving (Eq, Show, Generic) } deriving (Eq, Show, Generic)
......
...@@ -25,7 +25,8 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl ...@@ -25,7 +25,8 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join) import Control.Monad (join, sequence)
import Control.Monad.IO.Class (liftIO)
import Data.Attoparsec.ByteString (parseOnly, Parser) import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..)) import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
...@@ -43,6 +44,7 @@ import qualified Data.ByteString.Lazy as DBL ...@@ -43,6 +44,7 @@ import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Prelude as Prelude import qualified Prelude as Prelude
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
...@@ -95,9 +97,14 @@ parseFormat WOS bs = do ...@@ -95,9 +97,14 @@ parseFormat WOS bs = do
$ partitionEithers $ partitionEithers
$ [runParser' WOS bs] $ [runParser' WOS bs]
pure $ Right docs pure $ Right docs
parseFormat ZIP _bs = do parseFormat ZIP bs = do
printDebug "[parseFormat]" ZIP path <- emptySystemTempFile "parsed.zip"
pure $ Left "Not implemented for ZIP" DB.writeFile path bs
withArchive path $ do
files <- DM.keys <$> getEntries
filesContents <- mapM getEntry files
ddocs <- liftIO $ mapM (parseFormat CsvGargV3) filesContents
pure $ concat <$> sequence ddocs
parseFormat _ _ = undefined parseFormat _ _ = undefined
-- | Parse file into documents -- | Parse file into documents
......
...@@ -10,7 +10,8 @@ packages: ...@@ -10,7 +10,8 @@ packages:
#- 'deps/accelerate-utility' #- 'deps/accelerate-utility'
docker: docker:
enable: false #enable: false
#enable: true
repo: 'cgenie/stack-build:lts-18.12-garg' repo: 'cgenie/stack-build:lts-18.12-garg'
run-args: run-args:
- '--publish=8008:8008' - '--publish=8008:8008'
...@@ -26,8 +27,9 @@ allow-newer: true ...@@ -26,8 +27,9 @@ allow-newer: true
# "$everything": -haddock # "$everything": -haddock
extra-deps: extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git - #git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
commit: 3e32ec3aca71eb326805355d3a99b9288dc342ee git: https://gitlab.iscpif.fr/cgenie/haskell-gargantext-prelude.git
commit: 35b09629a658fc16cc9ff63e7591e58511cd98a7
# Data Mining Libs # Data Mining Libs
- git: https://github.com/delanoe/data-time-segment.git - git: https://github.com/delanoe/data-time-segment.git
commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment