Commit f18e2dbc authored by Alfredo Di Napoli

Implement a proper incremental parser for TSV documents

This commit introduces/improves the `parseTsvWithDiagnostics`
function so that the input TSV is parsed incrementally, collecting
errors as we go and eventually reporting them upstream via the newly
added `emitTsvParseWarning` function.
parent b0d6365f
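For context, here is a minimal sketch of how such an incremental, error-collecting TSV decoder can be shaped on top of cassava's `Data.Csv.Incremental` interface, reusing the `AtRow` and `ParseCorpusResult` types added by this commit. The signature of `parseTsvWithDiagnostics` and the empty `ByteString` standing in for the raw record bytes are assumptions made for illustration; the in-tree implementation may differ.

```haskell
{-# LANGUAGE ImportQualifiedPost #-}

-- Illustrative sketch only: decode a TSV row by row, turning per-row
-- failures into diagnostics instead of aborting the whole parse.
module SketchTsvDiagnostics where

import Data.ByteString qualified as B
import Data.ByteString.Lazy qualified as BL
import Data.Char (ord)
import Data.Csv (DecodeOptions (..), FromNamedRecord, defaultDecodeOptions)
import Data.Csv.Incremental qualified as I
import Data.Text qualified as T

-- Mirrors the types added in Gargantext.Core.Text.Corpus.Parsers.Types.
data AtRow a = AtRow Int a
  deriving Show

data ParseCorpusResult a
  = ParseRecordSucceeded a
  | ParseTsvRecordFailed (AtRow (T.Text, B.ByteString))
  deriving Show

-- Assumed signature: one result per data row; 'Left' only for header errors.
parseTsvWithDiagnostics
  :: FromNamedRecord a
  => BL.ByteString
  -> Either T.Text [ParseCorpusResult a]
parseTsvWithDiagnostics input =
    header (I.decodeByNameWith tsvOpts) (BL.toChunks input)
  where
    tsvOpts = defaultDecodeOptions { decDelimiter = fromIntegral (ord '\t') }

    -- Feed chunks until the header is parsed; an empty chunk signals EOF.
    header (I.FailH _ err)   _        = Left (T.pack err)
    header (I.PartialH k)    (c : cs) = header (k c) cs
    header (I.PartialH k)    []       = header (k B.empty) []
    header (I.DoneH _ rowsP) cs       = Right (rows 1 rowsP cs)

    -- Collect one result per row; a malformed row never stops the parse.
    rows n (I.Fail _ err) _  = [failureAt n err]
    rows n (I.Done rs)    _  = zipWith fromEither [n ..] rs
    rows n (I.Many rs k)  cs =
      zipWith fromEither [n ..] rs
        <> rows (n + length rs) (k (headOr B.empty cs)) (drop 1 cs)

    fromEither n (Left err) = failureAt n err
    fromEither n (Right r)  = ParseRecordSucceeded r

    -- cassava does not hand us the raw record bytes here, so an empty
    -- ByteString stands in for them in this sketch.
    failureAt n err = ParseTsvRecordFailed (AtRow n (T.pack err, B.empty))

    headOr def []      = def
    headOr _   (x : _) = x
```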
@@ -57,6 +57,7 @@ data-files:
   test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv
   test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir12.csv
   test-data/issue-380/corpus.tsv
+  test-data/issue-380/malformed_row.tsv
   .clippy.dhall
 -- common options
@@ -227,6 +228,7 @@ library
   Gargantext.Core.Text.Corpus.API.OpenAlex
   Gargantext.Core.Text.Corpus.API.Pubmed
   Gargantext.Core.Text.Corpus.Parsers
+  Gargantext.Core.Text.Corpus.Parsers.Types
   Gargantext.Core.Text.Corpus.Parsers.Date
   Gargantext.Core.Text.Corpus.Parsers.TSV
   Gargantext.Core.Text.Corpus.Query
...
@@ -187,6 +187,8 @@ instance MonadJobStatus (GargM DevEnv err) where
   markFailed _ _ = pure ()
+  emitWarning _ _ = pure ()
   addMoreSteps _ _ = pure ()
 instance HasConfig DevEnv where
...
@@ -20,6 +20,7 @@ module Gargantext.API.Job (
   , jobLogFailTotalWithMessage
   , RemainingSteps(..)
   , addErrorEvent
+  , addWarningEvent
   ) where
 import Control.Lens (over, _Just)
@@ -49,6 +50,9 @@ addEvent level message (JobLog { _scst_events = mEvts, .. }) = JobLog { _scst_ev
 addErrorEvent :: ToHumanFriendlyError e => e -> JobLog -> JobLog
 addErrorEvent message = addEvent "ERROR" (mkHumanFriendly message)
+addWarningEvent :: ToHumanFriendlyError e => e -> JobLog -> JobLog
+addWarningEvent message = addEvent "WARNING" (mkHumanFriendly message)
 jobLogProgress :: Int -> JobLog -> JobLog
 jobLogProgress n jl = over (scst_succeeded . _Just) (+ n) $
   over (scst_remaining . _Just) (\x -> max 0 (x - n)) jl
...
@@ -15,15 +15,17 @@ New corpus means either:
 {-# LANGUAGE TemplateHaskell #-}
 {-# LANGUAGE TypeOperators #-}
 {-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE LambdaCase #-}
 module Gargantext.API.Node.Corpus.New
   where
-import Conduit ((.|), yieldMany, mapMC, mapC, transPipe)
+import Conduit ((.|), yieldMany, mapMC, transPipe)
 import Control.Exception.Safe (MonadMask)
 import Control.Lens ( view, non )
 import Data.Conduit.Internal (zipSources)
+import Data.Conduit.List (mapMaybeM)
 import Data.Swagger ( ToSchema(..) )
 import Data.Text qualified as T
 import Database.PostgreSQL.Simple.LargeObjects qualified as PSQL
@@ -34,14 +36,15 @@ import Gargantext.API.Node.Corpus.Searx ( triggerSearxSearch )
 import Gargantext.API.Node.Corpus.Types ( Datafield(Web), datafield2origin )
 import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
 import Gargantext.API.Node.Types
+import Gargantext.Core (withDefaultLanguage, defaultLanguage)
 import Gargantext.Core.Config (gc_jobs, hasConfig)
 import Gargantext.Core.Config.Types (jc_max_docs_parsers)
 import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion, NgramsStatePatch', HasNodeStoryEnv)
 import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC, _ParseFormatError)
+import Gargantext.Core.Text.Corpus.Parsers.Types
 import Gargantext.Core.Text.Corpus.Query qualified as API
 import Gargantext.Core.Types.Individu (User(..))
 import Gargantext.Core.Utils.Prefix (unPrefix)
-import Gargantext.Core (withDefaultLanguage, defaultLanguage)
 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
 import Gargantext.Database.Action.Mail (sendMail)
@@ -52,13 +55,14 @@ import Gargantext.Database.Admin.Types.Hyperdata.File ( HyperdataFile(..) )
 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId)
 import Gargantext.Database.GargDB qualified as GargDB
 import Gargantext.Database.Prelude (readLargeObject, IsDBCmd)
-import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
 import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList)
+import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
 import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds))
 import Gargantext.Database.Schema.Node (node_hyperdata)
 import Gargantext.Prelude
 import Gargantext.System.Logging ( logLocM, LogLevel(..) )
+import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
 import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
 ------------------------------------------------------------------------
@@ -226,9 +230,7 @@ addToCorpusWithTempFile :: ( MonadMask m
                         -> JobHandle m
                         -> m ()
 addToCorpusWithTempFile user cid nwtf jobHandle = do
-  -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
-  -- printDebug "[addToCorpusWithForm] fileType" ft
-  -- printDebug "[addToCorpusWithForm] fileFormat" ff
+  $(logLocM) DEBUG $ "Adding documents to corpus: " <> show cid
   let l = nwtf ^. wtf_lang . non defaultLanguage
   addLanguageToCorpus cid l
@@ -256,7 +258,7 @@ addToCorpusWithTempFile user cid nwtf jobHandle = do
   -- TODO Add progress (jobStatus) update for docs - this is a
   -- long action
-  let docsC' = zipSources (yieldMany [1..]) docsC
+  let docsC' = zipSources (yieldMany [1..]) (transPipe liftBase docsC)
                .| mapMC (\(idx, doc) ->
                    if idx > limit then do
                      --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
@@ -269,7 +271,10 @@ addToCorpusWithTempFile user cid nwtf jobHandle = do
                      panicTrace panicMsg
                    else
                      pure doc)
-               .| mapC toHyperdataDocument
+               .| mapMaybeM (\case
+                    ParseRecordSucceeded r -> pure $ Just $ toHyperdataDocument r
+                    ParseTsvRecordFailed r -> emitTsvParseWarning jobHandle r >> pure Nothing
+                  )
   --printDebug "Parsing corpus finished : " cid
   --logStatus jobLog2
@@ -282,7 +287,7 @@ addToCorpusWithTempFile user cid nwtf jobHandle = do
                  (Multi l)
                  (Just (nwtf ^. wtf_selection))
                  --(Just $ fromIntegral $ length docs, docsC')
-                 (count, transPipe liftBase docsC') -- TODO fix number of docs
+                 (count, docsC') -- TODO fix number of docs
                  --(map (map toHyperdataDocument) docs)
                  jobHandle
@@ -298,6 +303,11 @@ addToCorpusWithTempFile user cid nwtf jobHandle = do
       $(logLocM) ERROR $ "[addToCorpusWithTempFile] parse error: " <> Parser._ParseFormatError parseErr
       markFailed (Just parseErr) jobHandle
+emitTsvParseWarning :: MonadJobStatus m => JobHandle m -> AtRow (Text, ByteString) -> m ()
+emitTsvParseWarning jh (AtRow row_num (t, _r)) =
+  let msg = UnsafeMkHumanFriendlyErrorText ("Parsing of record at row " <> T.pack (show row_num) <> " failed due to: " <> t)
+  in emitWarning msg jh
 {-
 addToCorpusWithFile :: FlowCmdM env err m
                     => CorpusId
...
@@ -13,6 +13,9 @@ Portability : POSIX
 module Gargantext.API.Worker where
+import Data.Aeson qualified as JSON
+import Data.ByteString.Lazy qualified as BL
+import Data.Text.Encoding qualified as TE
 import Gargantext.API.Prelude (IsGargServer)
 import Gargantext.Core.Worker.Jobs (sendJob)
 import Gargantext.Core.Worker.Jobs.Types (Job(..), getWorkerMNodeId)
@@ -41,7 +44,7 @@ serveWorkerAPIM mkJob = WorkerAPI { workerAPIPost }
   where
     workerAPIPost i = do
       job <- mkJob i
-      logM DEBUG $ "[serveWorkerAPI] sending job " <> show job
+      logM DEBUG $ "[serveWorkerAPI] sending job " <> TE.decodeUtf8 (BL.toStrict $ JSON.encode job)
       mId <- sendJob job
       pure $ JobInfo { _ji_message_id = mId
                      , _ji_mNode_id = getWorkerMNodeId job }
...
@@ -62,6 +62,7 @@ import Gargantext.Utils.Jobs.Error
 import Gargantext.Utils.Zip qualified as UZip
 import Protolude ( show )
 import System.FilePath (takeExtension)
+import Gargantext.Core.Text.Corpus.Parsers.Types
 ------------------------------------------------------------------------
 type ParseError = Text
@@ -101,14 +102,14 @@ parseFormatC :: forall m. MonadBaseControl IO m
              => FileType
              -> FileFormat
              -> DB.ByteString
-             -> m (Either ParseFormatError (Integer, ConduitT () HyperdataDocument IO ()))
+             -> m (Either ParseFormatError (Integer, ConduitT () (ParseCorpusResult HyperdataDocument) IO ()))
 parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
   where
     do_parse :: MonadBaseControl IO m
              => FileType
              -> FileFormat
              -> DB.ByteString
-             -> m (Either DT.Text (Integer, ConduitT () HyperdataDocument IO ()))
+             -> m (Either DT.Text (Integer, ConduitT () (ParseCorpusResult HyperdataDocument) IO ()))
     do_parse TsvGargV3 Plain bs = do
       let eParsedC = parseTsvC $ DBL.fromStrict bs
       pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
@@ -117,7 +118,7 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
       pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
     do_parse Istex Plain bs = do
       ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs
-      pure $ (\p -> (1, yieldMany [p])) <$> ep
+      pure $ (\p -> (1, yieldMany [ParseRecordSucceeded p])) <$> ep
     do_parse RisPresse Plain bs = do
       --docs <- enrichWith RisPresse
       let eDocs = runParser' RisPresse bs
@@ -126,7 +127,7 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
              , yieldMany docs
               .| mapC presseEnrich
               .| mapC (map $ both decodeUtf8)
-              .| mapMC (toDoc RIS)) ) <$> eDocs
+              .| mapMC (fmap ParseRecordSucceeded . toDoc RIS)) ) <$> eDocs
     do_parse WOS Plain bs = do
       let eDocs = runParser' WOS bs
       pure $ (\docs ->
@@ -134,7 +135,7 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
              , yieldMany docs
               .| mapC (map $ first WOS.keys)
               .| mapC (map $ both decodeUtf8)
-              .| mapMC (toDoc WOS)) ) <$> eDocs
+              .| mapMC (fmap ParseRecordSucceeded . toDoc WOS)) ) <$> eDocs
     do_parse Iramuteq Plain bs = do
       let eDocs = runParser' Iramuteq bs
       pure $ (\docs ->
@@ -142,13 +143,13 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
              , yieldMany docs
               .| mapC (map $ first Iramuteq.keys)
               .| mapC (map $ both decodeUtf8)
-              .| mapMC (toDoc Iramuteq . map (second (DT.replace "_" " ")))
+              .| mapMC (fmap ParseRecordSucceeded . toDoc Iramuteq . map (second (DT.replace "_" " ")))
              )
            )
           <$> eDocs
     do_parse JSON Plain bs = do
       let eParsedC = parseJSONC $ DBL.fromStrict bs
-      pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
+      pure (second (mapOutput ParseRecordSucceeded . transPipe (pure . runIdentity)) <$> eParsedC)
     do_parse fty ZIP bs = liftBase $ UZip.withZipFileBS bs $ do
       fileNames <- filter (filterZIPFileNameP ft) . DM.keys <$> getEntries
       printDebug "[do_parse] fileNames" fileNames
...
module Gargantext.Core.Text.Corpus.Parsers.Types where
import Data.ByteString
import Data.Csv
import Data.Text
import Prelude
-- | Allows the parser to report errors happening at a given row in the document.
data AtRow a = AtRow Int a
deriving Show
data ParseCorpusResult a
= ParseRecordSucceeded a
| ParseTsvRecordFailed (AtRow (Text, ByteString))
deriving Show
instance FromNamedRecord a => FromNamedRecord (ParseCorpusResult a) where
parseNamedRecord m = ParseRecordSucceeded <$> parseNamedRecord m
@@ -18,8 +18,8 @@ Portability : POSIX
 module Gargantext.Core.Worker where
-import Async.Worker.Broker.Types (toA, getMessage, messageId)
 import Async.Worker qualified as W
+import Async.Worker.Broker.Types (toA, getMessage, messageId)
 import Async.Worker.Types qualified as W
 import Control.Exception.Safe qualified as CES
 import Control.Lens (to)
@@ -33,14 +33,14 @@ import Gargantext.API.Ngrams.List (postAsyncJSON)
 import Gargantext.API.Node.Contact (addContact)
 import Gargantext.API.Node.Corpus.Annuaire qualified as Annuaire
 import Gargantext.API.Node.Corpus.New (addToCorpusWithTempFile, addToCorpusWithQuery)
-import Gargantext.API.Node.DocumentsFromWriteNodes (documentsFromWriteNodes)
 import Gargantext.API.Node.DocumentUpload (documentUploadAsync, remoteImportDocuments)
+import Gargantext.API.Node.DocumentsFromWriteNodes (documentsFromWriteNodes)
 import Gargantext.API.Node.File (addWithFile)
 import Gargantext.API.Node.FrameCalcUpload (frameCalcUploadAsync)
 import Gargantext.API.Node.New (postNode')
 import Gargantext.API.Node.Types (_wtf_file_oid)
-import Gargantext.API.Node.Update.Types (UpdateNodeParams(..), Granularity (..))
 import Gargantext.API.Node.Update (updateNode)
+import Gargantext.API.Node.Update.Types (UpdateNodeParams(..), Granularity (..))
 import Gargantext.API.Server.Named.Ngrams (tableNgramsPostChartsAsync)
 import Gargantext.Core.Config (hasConfig, gc_database_config, gc_jobs, gc_worker, gc_logging)
 import Gargantext.Core.Config.Types (jc_max_docs_scrapers)
@@ -56,7 +56,7 @@ import Gargantext.Core.Worker.Types (JobInfo(..))
 import Gargantext.Database.Prelude (readLargeObject, removeLargeObject)
 import Gargantext.Database.Query.Table.User (getUsersWithEmail)
 import Gargantext.Prelude hiding (to)
-import Gargantext.System.Logging ( logLocM, LogLevel(..), logMsg, withLogger )
+import Gargantext.System.Logging
 import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
 import Gargantext.Utils.Jobs.Monad (MonadJobStatus(markStarted, markComplete, markFailed))
 import System.Posix.Signals (Handler(Catch), installHandler, keyboardSignal)
@@ -90,7 +90,7 @@ notifyJobStarted env (W.State { name }) bm = do
   let j = toA $ getMessage bm
   let job = W.job j
   withLogger (env ^. w_env_config . gc_logging) $ \ioL ->
-    logMsg ioL DEBUG $ "[notifyJobStarted] [" <> name <> " :: " <> show mId <> "] starting job: " <> show j
+    $(logLoc) ioL DEBUG $ T.pack $ "[notifyJobStarted] [" <> name <> " :: " <> show mId <> "] starting job: " <> show j
   let ji = JobInfo { _ji_message_id = messageId bm
                    , _ji_mNode_id = getWorkerMNodeId job }
   let jh = WorkerJobHandle { _w_job_info = ji }
...
@@ -27,7 +27,7 @@ import Data.Maybe (fromJust)
 import Data.Pool qualified as Pool
 import Database.PostgreSQL.Simple qualified as PSQL
 import Gargantext.API.Admin.Orchestrator.Types (JobLog, noJobLog)
-import Gargantext.API.Job (RemainingSteps(..), jobLogStart, jobLogProgress, jobLogFailures, jobLogComplete, addErrorEvent, jobLogFailTotal, jobLogFailTotalWithMessage, jobLogAddMore)
+import Gargantext.API.Job (RemainingSteps(..), jobLogStart, jobLogProgress, jobLogFailures, jobLogComplete, addErrorEvent, jobLogFailTotal, jobLogFailTotalWithMessage, jobLogAddMore, addWarningEvent)
 import Gargantext.API.Prelude (GargM)
 import Gargantext.Core.Notifications.CentralExchange qualified as CE
 import Gargantext.Core.Notifications.CentralExchange.Types qualified as CET
@@ -229,6 +229,7 @@ instance MonadJobStatus WorkerMonad where
     updateJobProgress jh (\latest -> case mb_msg of
                              Nothing -> jobLogFailTotal latest
                              Just msg -> jobLogFailTotalWithMessage msg latest)
+  emitWarning msg jh = updateJobProgress jh (addWarningEvent msg)
   addMoreSteps steps jh = updateJobProgress jh (jobLogAddMore steps)
...
+{-# LANGUAGE TemplateHaskell #-}
 {-|
 Module : Gargantext.Core.Worker.Jobs
 Description : Worker job definitions
@@ -22,7 +23,7 @@ import Gargantext.Core.Worker.Jobs.Types (Job(..))
 import Gargantext.Core.Worker.PGMQTypes (HasWorkerBroker, MessageId, SendJob)
 import Gargantext.Database.Prelude (Cmd)
 import Gargantext.Prelude
-import Gargantext.System.Logging (logMsg, withLogger, LogLevel(..))
+import Gargantext.System.Logging
 sendJob :: (HasWorkerBroker, HasConfig env)
@@ -45,7 +46,7 @@ sendJobWithCfg gcConfig job = do
   let queueName = _wdQueue wd
   let job' = (updateJobData job $ W.mkDefaultSendJob' b queueName job) { W.delay = _wsDefaultDelay }
   withLogger (gcConfig ^. gc_logging) $ \ioL ->
-    logMsg ioL DEBUG $ "[sendJob] sending job " <> show job <> " (delay " <> show (W.delay job') <> ")"
+    $(logLoc) ioL DEBUG $ "[sendJob] sending job " <> show job <> " (delay " <> show (W.delay job') <> ")"
   W.sendJob' job'
 -- | We want to fine-tune job metadata parameters, for each job type
...
@@ -240,9 +240,10 @@ fromField' field mb = do
   where
     valueToHyperdata v = case fromJSON v of
       Success a -> pure a
-      Error _err -> returnError ConversionFailed field
+      Error err -> returnError ConversionFailed field
                      $ DL.unwords [ "cannot parse hyperdata for JSON: "
                                   , show v
+                                  , err
                                   ]
 dbCheck :: DBCmd err Bool
...
@@ -104,6 +104,9 @@ class MonadJobStatus m where
   -- message to the failure.
   markFailed :: forall e. ToHumanFriendlyError e => Maybe e -> JobHandle m -> m ()
+  -- Logs a new event with \"WARNING\" severity.
+  emitWarning :: forall e. ToHumanFriendlyError e => e -> JobHandle m -> m ()
   -- | Add 'n' more steps to the running computation, they will be marked as remaining.
   addMoreSteps :: MonadJobStatus m => Int -> JobHandle m -> m ()
...
Title Authors Source Abstract Publication Year Publication Month Publication Day
#Ecuador | 🚨Se insta antisananews mastodon.social #Ecuador | 🚨Se instaló la audiencia de juicio contra el exalcalde de #Quito, Jorge Yunda, y 13 personas más, procesadas por #peculado en la compra de 100.000 pruebas para detectar #COVID19 y que presuntamente abrían causado un perjuicio al Estado por 2’235.491,16 dólares.#Comparta #Suscríbase↩👍Sígame enRadio Antisana Media Online: https://antisananews.blogspot.com/TikTok: https://www.tiktok.com/@antisanamediaonline?lang=esTelegram: https://t.me/AntisanaMediaOnlineVK: https://vk.com/antisanamultimediosX: https://twitter.com/AntisanaNews 2024 02 06
#NorthCarolina bann MatthewChat@mstdn.social toot.io #NorthCarolina banned immunocompromized people from wearing masks in public. This does NOT apply to the #KKK, as their is a specific exemption for them. #covid #masking 2024 05 16
'The coronation of t BenHigbie@mastodon.social fosstodon.org 'The coronation of the Serbian Tsar Stefan Dušan as East Roman Emperor' from 'The Slav Epic' by Alphonse Mucha (1926) #art #arts #artist #artists #artlover #artlovers #arthistory #illustration #painting #paintings #inspiration #artmuseum #museum #artmuseums #museums #artnet 2024 05 05
3/5 Krankenstand 202 ToveHarris mastodon.social 3/5 Krankenstand 2022 und 2023 komme Verlust von 350.000 Beschäftigten gleich. Arbeitsausfall werde derzeit durch Überstunden + erhöhte Produktivität aufgefangen. Ohne diese Leistungen der Beschäftigten gäbe es eine Lücke von 700.000 Beschäftigten.Und #CovidIsNotOver #LongCOVID #COVID #COVID19 #Coronahttps://www.vfa.de/de/wirtschaft-politik/macroscope/macroscope-hoher-krankenstand-drueckt-deutschland-in-die-rezession 2024 01 27
@ABScientist @Hidde,justafrog@mstdn.social,mastodon.social,@ABScientist @Hidde @roelgrif How about this one?https://nos.nl/artikel/2457983-viroloog-koopmans-coronagolf-in-china-nu-niet-heel-zorgelijk-voor-nederland 2023 12 28
@cassandra17lina In gemswinc counter.social @cassandra17lina In a way, Covid was a gift to introverts 2024 01 24
@erictopol This is t SpookieRobieTheCat@mastodon.social toot.io @erictopol This is the #CovidBrain that Trump and MAGA suffer from. All those #Antivaxx proponents will suffer the consequences too. And I'm ok with it. Be anti-science, be wilfully ignorant and live a life in agony, that's their choice. I shouldn't have to pay a dime for their stupidity. 2024 01 09
A bunch of maskless crowgirl@hachyderm.io toot.io "A bunch of maskless #Covid "experts" like Gregg Gonsalves are now trying to sound credible by warning about H5N1.I think H5N1 is a serious problem that requires airborne mitigations and decontaminating the food supply.And I also don't dine in restaurants. Gregg on the other hand is proud of his masklessness.Be warned. Don't let these professional Covid minimizers get away with this crap with #H5N1." 2024 06 03
@@ -9,12 +9,17 @@ module Test.API.Private.List (
   ) where
 import Data.Aeson.QQ
+import Data.Text qualified as T
 import Data.Text.IO qualified as TIO
 import Fmt
+import Gargantext.API.HashedResponse
 import Gargantext.API.Ngrams.List.Types
 import Gargantext.API.Ngrams.Types qualified as APINgrams
 import Gargantext.API.Node.Corpus.New.Types qualified as FType
+import Gargantext.API.Node.Types
+import Gargantext.Core qualified as Lang
 import Gargantext.Core.Config
+import Gargantext.Core.Text.List.Social
 import Gargantext.Core.Types
 import Gargantext.Core.Types.Individu
 import Gargantext.Core.Worker.Types
@@ -25,13 +30,23 @@ import Servant.Client.Streaming
 import Test.API.Prelude (newCorpusForUser, checkEither)
 import Test.API.Routes
 import Test.API.Setup
-import Test.API.UpdateList qualified as UpdateList
 import Test.Database.Types
 import Test.Hspec (Spec, it, aroundAll, describe, sequential)
 import Test.Hspec.Expectations
 import Test.Hspec.Wai.Internal (withApplication)
 import Test.Utils
+mkNewWithForm :: T.Text -> T.Text -> NewWithForm
+mkNewWithForm content name = NewWithForm
+  { _wf_filetype = FType.TSV
+  , _wf_fileformat = FType.Plain
+  , _wf_data = content
+  , _wf_lang = Just Lang.EN
+  , _wf_name = name
+  , _wf_selection = FlowSocialListWithPriority MySelfFirst
+  }
 importTermsTSV :: SpecContext () -> String -> IO (JobInfo, CorpusId, ListId)
 importTermsTSV (SpecContext testEnv port app _) name = do
   cId <- liftIO $ newCorpusForUser testEnv "alice"
@@ -58,8 +73,8 @@ importCorpusTSV (SpecContext testEnv port app _) name = do
   ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|]
   -- Upload the CSV doc
   simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName name)
-  let params = UpdateList.mkNewWithForm simpleNgrams "simple.tsv"
-  pendingJob <- checkEither $ liftIO $ runClientM (importCorpus token listId params) clientEnv
+  let params = mkNewWithForm simpleNgrams "simple.tsv"
+  pendingJob <- checkEither $ liftIO $ runClientM (importCorpus token cId params) clientEnv
   jobInfo <- pollUntilWorkFinished log_cfg port pendingJob
   pure (jobInfo, cId, listId)
@@ -85,11 +100,21 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
             length terms `shouldSatisfy` (>= 1)
     it "should handle dirty TSV as per issue #380" $ \ctx@(SpecContext _testEnv port app _) -> do
-      (_, cId, listId) <- importCorpusTSV ctx "test-data/issue-380/corpus.tsv"
+      (_, cId, _listId) <- importCorpusTSV ctx "test-data/issue-380/corpus.tsv"
       withApplication app $ do
         withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
           -- Now check that we can retrieve the ngrams, and the ngrams list is not empty!
           liftIO $ do
-            eRes <- checkEither $ runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv
-            let (APINgrams.NgramsTable terms) = APINgrams._vc_data eRes
-            length terms `shouldSatisfy` (>= 1)
+            eRes <- checkEither $ runClientM (get_table token cId (Just APINgrams.Docs) Nothing Nothing Nothing Nothing Nothing) clientEnv
+            let (HashedResponse _ TableResult{tr_docs}) = eRes
+            length tr_docs `shouldBe` 7
+    it "should skip problematic rows" $ \ctx@(SpecContext _testEnv port app _) -> do
+      (_, cId, _listId) <- importCorpusTSV ctx "test-data/issue-380/malformed_row.tsv"
+      withApplication app $ do
+        withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
+          -- Now check that we can retrieve the ngrams, and the ngrams list is not empty!
+          liftIO $ do
+            eRes <- checkEither $ runClientM (get_table token cId (Just APINgrams.Docs) Nothing Nothing Nothing Nothing Nothing) clientEnv
+            let (HashedResponse _ TableResult{tr_docs}) = eRes
+            length tr_docs `shouldBe` 6 -- it must have skipped the broken row
@@ -89,6 +89,7 @@ instance MonadJobStatus TestMonad where
   markFailure _ _ _ = TestMonad $ pure ()
   markComplete _ = TestMonad $ pure ()
   markFailed _ _ = TestMonad $ pure ()
+  emitWarning _ _ = TestMonad $ pure ()
   addMoreSteps _ _ = TestMonad $ pure ()
 data DBHandle = DBHandle {
...