Commit da4f2596 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Generate and commit later ngrams in insertMasterDocs

This commit refactors the flow code to generate the ngrams for
the master docs separately, and then it "commits" them later after such
docs have been associated with a `Node`.
parent d35f64fd
Pipeline #7639 failed with stages
in 49 minutes and 41 seconds
...@@ -19,6 +19,7 @@ module CLI.Import where ...@@ -19,6 +19,7 @@ module CLI.Import where
import CLI.Parsers import CLI.Parsers
import CLI.Types import CLI.Types
import Control.Monad.Catch (MonadCatch)
import Gargantext.API.Admin.EnvTypes (DevEnv(..), DevJobHandle(..)) import Gargantext.API.Admin.EnvTypes (DevEnv(..), DevJobHandle(..))
import Gargantext.API.Dev (withDevEnv, runCmdGargDev) import Gargantext.API.Dev (withDevEnv, runCmdGargDev)
import Gargantext.API.Errors.Types ( BackendInternalError ) import Gargantext.API.Errors.Types ( BackendInternalError )
...@@ -42,14 +43,14 @@ importCLI (ImportArgs fun user name settingsPath corpusPath) = do ...@@ -42,14 +43,14 @@ importCLI (ImportArgs fun user name settingsPath corpusPath) = do
let let
tt = Multi EN tt = Multi EN
format = TsvGargV3 format = TsvGargV3
corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text) mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text)
corpus = flowCorpusFile mkCorpusUser tt format Plain corpusPath Nothing DevJobHandle corpus = flowCorpusFile mkCorpusUser tt format Plain corpusPath Nothing DevJobHandle
corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusTsvHal = flowCorpusFile mkCorpusUser tt TsvHal Plain corpusPath Nothing DevJobHandle corpusTsvHal = flowCorpusFile mkCorpusUser tt TsvHal Plain corpusPath Nothing DevJobHandle
annuaire :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId annuaire :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
annuaire = flowAnnuaire (MkCorpusUserNormalCorpusName (UserName $ cs user) "Annuaire") (Multi EN) corpusPath DevJobHandle annuaire = flowAnnuaire (MkCorpusUserNormalCorpusName (UserName $ cs user) "Annuaire") (Multi EN) corpusPath DevJobHandle
withDevEnv settingsPath $ \env -> do withDevEnv settingsPath $ \env -> do
......
...@@ -702,6 +702,7 @@ executable gargantext ...@@ -702,6 +702,7 @@ executable gargantext
, containers ^>= 0.6.7 , containers ^>= 0.6.7
, cryptohash ^>= 0.11.9 , cryptohash ^>= 0.11.9
, directory ^>= 1.3.7.1 , directory ^>= 1.3.7.1
, exceptions >= 0.9.0 && < 0.11
, extra ^>= 1.7.9 , extra ^>= 1.7.9
, gargantext , gargantext
, gargantext-prelude , gargantext-prelude
......
...@@ -18,6 +18,7 @@ module Gargantext.API.Node.Contact ...@@ -18,6 +18,7 @@ module Gargantext.API.Node.Contact
where where
import Conduit ( yield ) import Conduit ( yield )
import Control.Monad.Catch (MonadCatch)
import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser(AuthenticatedUser) ) import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser(AuthenticatedUser) )
import Gargantext.API.Admin.EnvTypes (Env) import Gargantext.API.Admin.EnvTypes (Env)
import Gargantext.API.Errors.Types ( BackendInternalError ) import Gargantext.API.Errors.Types ( BackendInternalError )
...@@ -57,7 +58,7 @@ apiAsync u nId = Named.ContactAsyncAPI { ...@@ -57,7 +58,7 @@ apiAsync u nId = Named.ContactAsyncAPI {
, _ac_user = u } , _ac_user = u }
} }
addContact :: (FlowCmdM env err m, MonadJobStatus m) addContact :: (FlowCmdM env err m, MonadJobStatus m, MonadCatch m)
=> User => User
-> NodeId -> NodeId
-> AddContactParams -> AddContactParams
......
...@@ -24,6 +24,7 @@ module Gargantext.API.Node.Corpus.New ...@@ -24,6 +24,7 @@ module Gargantext.API.Node.Corpus.New
import Conduit ((.|), yieldMany, mapMC, transPipe) import Conduit ((.|), yieldMany, mapMC, transPipe)
import Control.Exception.Safe (MonadMask) import Control.Exception.Safe (MonadMask)
import Control.Lens ( view, non ) import Control.Lens ( view, non )
import Control.Monad.Catch (MonadCatch)
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
import Data.Conduit.List (mapMaybeM) import Data.Conduit.List (mapMaybeM)
import Data.Swagger ( ToSchema(..) ) import Data.Swagger ( ToSchema(..) )
...@@ -56,6 +57,7 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId) ...@@ -56,6 +57,7 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId)
import Gargantext.Database.GargDB qualified as GargDB import Gargantext.Database.GargDB qualified as GargDB
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList) import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata) import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds)) import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds))
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata)
...@@ -63,7 +65,6 @@ import Gargantext.Prelude ...@@ -63,7 +65,6 @@ import Gargantext.Prelude
import Gargantext.System.Logging ( logLocM, LogLevel(..) ) import Gargantext.System.Logging ( logLocM, LogLevel(..) )
import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..)) import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..)) import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
------------------------------------------------------------------------ ------------------------------------------------------------------------
{- {-
...@@ -150,6 +151,7 @@ type AddWithFile = Summary "Add with MultipartData to corpus endpoint" ...@@ -150,6 +151,7 @@ type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
addToCorpusWithQuery :: ( FlowCmdM env err m addToCorpusWithQuery :: ( FlowCmdM env err m
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
) )
=> User => User
-> CorpusId -> CorpusId
......
...@@ -14,6 +14,7 @@ Portability : POSIX ...@@ -14,6 +14,7 @@ Portability : POSIX
module Gargantext.API.Node.Corpus.Searx where module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view) import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.Aeson qualified as Aeson import Data.Aeson qualified as Aeson
import Data.Text qualified as T import Data.Text qualified as T
import Data.Text qualified as Text import Data.Text qualified as Text
...@@ -40,6 +41,7 @@ import Gargantext.Database.Query.Table.Node (getOrMkList, insertDefaultNodeIfNot ...@@ -40,6 +41,7 @@ import Gargantext.Database.Query.Table.Node (getOrMkList, insertDefaultNodeIfNot
import Gargantext.Database.Query.Table.Node.Error (HasNodeError) import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Tree.Error (HasTreeError) import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.System.Logging (MonadLogger)
import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..)) import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
import Network.HTTP.Client import Network.HTTP.Client
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
...@@ -118,6 +120,8 @@ insertSearxResponse :: ( MonadBase IO m ...@@ -118,6 +120,8 @@ insertSearxResponse :: ( MonadBase IO m
, HasNodeError err , HasNodeError err
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, MonadCatch m
, MonadLogger m
) )
=> User => User
-> CorpusId -> CorpusId
...@@ -155,6 +159,8 @@ triggerSearxSearch :: ( MonadBase IO m ...@@ -155,6 +159,8 @@ triggerSearxSearch :: ( MonadBase IO m
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
, MonadLogger m
) )
=> User => User
-> CorpusId -> CorpusId
......
...@@ -16,6 +16,7 @@ Portability : POSIX ...@@ -16,6 +16,7 @@ Portability : POSIX
module Gargantext.API.Node.DocumentUpload where module Gargantext.API.Node.DocumentUpload where
import Control.Lens (view) import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types (AuthenticatedUser(..)) import Gargantext.API.Admin.Auth.Types (AuthenticatedUser(..))
import Gargantext.API.Admin.EnvTypes (Env) import Gargantext.API.Admin.EnvTypes (Env)
...@@ -32,16 +33,16 @@ import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv) ...@@ -32,16 +33,16 @@ import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit) import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import Gargantext.Core.Text.Terms (TermType(..)) import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Core.Worker.Jobs.Types (WorkSplit(..)) import Gargantext.Core.Worker.Jobs.Types (WorkSplit(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Action.Flow (addDocumentsToHyperCorpus) import Gargantext.Database.Action.Flow (addDocumentsToHyperCorpus)
import Gargantext.Database.Action.Flow.Types ( FlowCmdM ) import Gargantext.Database.Action.Flow.Types ( FlowCmdM )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus ) import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus )
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Node ( DocId, NodeId, NodeType(NodeCorpus), ParentId ) import Gargantext.Database.Admin.Types.Node ( DocId, NodeId, NodeType(NodeCorpus), ParentId )
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType') import Gargantext.Database.Query.Table.Node (getClosestParentIdByType')
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Schema.Node (_node_hyperdata) import Gargantext.Database.Schema.Node (_node_hyperdata)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.System.Logging (logLocM, LogLevel(..), MonadLogger) import Gargantext.System.Logging (logLocM, LogLevel(..), MonadLogger)
...@@ -55,7 +56,7 @@ api nId = Named.DocumentUploadAPI { ...@@ -55,7 +56,7 @@ api nId = Named.DocumentUploadAPI {
, _ud_node_id = nId } , _ud_node_id = nId }
} }
documentUploadAsync :: (FlowCmdM env err m, MonadJobStatus m) documentUploadAsync :: (FlowCmdM env err m, MonadJobStatus m, MonadCatch m)
=> NodeId => NodeId
-> DocumentUpload -> DocumentUpload
-> JobHandle m -> JobHandle m
...@@ -66,7 +67,7 @@ documentUploadAsync nId doc jobHandle = do ...@@ -66,7 +67,7 @@ documentUploadAsync nId doc jobHandle = do
-- printDebug "documentUploadAsync" docIds -- printDebug "documentUploadAsync" docIds
markComplete jobHandle markComplete jobHandle
documentUpload :: (FlowCmdM env err m) documentUpload :: (FlowCmdM env err m, MonadCatch m)
=> NodeId => NodeId
-> DocumentUpload -> DocumentUpload
-> m [DocId] -> m [DocId]
...@@ -110,6 +111,7 @@ remoteImportDocuments :: ( HasNodeError err ...@@ -110,6 +111,7 @@ remoteImportDocuments :: ( HasNodeError err
, HasNodeStoryEnv env err , HasNodeStoryEnv env err
, IsDBCmd env err m , IsDBCmd env err m
, MonadLogger m , MonadLogger m
, MonadCatch m
, MonadIO m) , MonadIO m)
=> AuthenticatedUser => AuthenticatedUser
-> ParentId -> ParentId
......
...@@ -17,6 +17,8 @@ module Gargantext.API.Node.DocumentsFromWriteNodes ...@@ -17,6 +17,8 @@ module Gargantext.API.Node.DocumentsFromWriteNodes
where where
import Conduit ( yieldMany ) import Conduit ( yieldMany )
import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.List qualified as List import Data.List qualified as List
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser, auth_node_id, auth_user_id ) import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser, auth_node_id, auth_user_id )
...@@ -39,15 +41,14 @@ import Gargantext.Database.Action.Flow.Types (FlowCmdM) ...@@ -39,15 +41,14 @@ import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame ( HyperdataFrame(..), getHyperdataFrameContents ) import Gargantext.Database.Admin.Types.Hyperdata.Frame ( HyperdataFrame(..), getHyperdataFrameContents )
import Gargantext.Database.Admin.Types.Node ( NodeId, Node, NodeType(..) ) import Gargantext.Database.Admin.Types.Node ( NodeId, Node, NodeType(..) )
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith, getOrMkList) import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith, getOrMkList)
import Gargantext.Database.Schema.Node (node_hyperdata, node_name, node_date) import Gargantext.Database.Schema.Node (node_hyperdata, node_name, node_date)
import Gargantext.Database.Prelude
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.System.Logging (logLocM, LogLevel(..)) import Gargantext.System.Logging (logLocM, LogLevel(..))
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..)) import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Servant.Server.Generic (AsServerT) import Servant.Server.Generic (AsServerT)
import Control.Lens (view)
api :: AuthenticatedUser api :: AuthenticatedUser
-- ^ The logged-in user -- ^ The logged-in user
...@@ -63,6 +64,7 @@ api authenticatedUser nId = ...@@ -63,6 +64,7 @@ api authenticatedUser nId =
documentsFromWriteNodes :: ( FlowCmdM env err m documentsFromWriteNodes :: ( FlowCmdM env err m
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
) )
=> AuthenticatedUser => AuthenticatedUser
-- ^ The logged-in user -- ^ The logged-in user
......
...@@ -17,13 +17,15 @@ Portability : POSIX ...@@ -17,13 +17,15 @@ Portability : POSIX
{-# LANGUAGE ConstrainedClassMethods #-} {-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE InstanceSigs #-} {-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TupleSections #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( DataText(..) ( DataText(..)
...@@ -54,7 +56,6 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -54,7 +56,6 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
import Conduit import Conduit
import Control.Lens ( to, view ) import Control.Lens ( to, view )
import Data.Bifunctor qualified as B
import Data.Conduit qualified as C import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as CL import Data.Conduit.List qualified as CL
...@@ -67,12 +68,12 @@ import Data.Text qualified as T ...@@ -67,12 +68,12 @@ import Data.Text qualified as T
import Gargantext.API.Ngrams.Tools (getTermsWith) import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (NgramsTerm) import Gargantext.API.Ngrams.Types (NgramsTerm)
import Gargantext.Core (Lang(..), withDefaultLanguage, NLPServerConfig) import Gargantext.Core (Lang(..), withDefaultLanguage, NLPServerConfig)
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Config (GargConfig(..), hasConfig) import Gargantext.Core.Config (GargConfig(..), hasConfig)
import Gargantext.Core.Config.Types (APIsConfig(..)) import Gargantext.Core.Config.Types (APIsConfig(..))
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire) import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.NLP (HasNLPServer, nlpServerGet) import Gargantext.Core.NLP (HasNLPServer, nlpServerGet)
import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStoryEnv (..)) import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStoryEnv (..))
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Text.Corpus.API qualified as API import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.List (buildNgramsLists)
...@@ -86,28 +87,30 @@ import Gargantext.Core.Types.Individu (User(..)) ...@@ -86,28 +87,30 @@ import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) ) import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' ) import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' )
import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus ) import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus, DocumentIdWithNgrams (..) )
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, insertDocs, mkNodeIdNgramsMap, ngramsByDoc, mapDocumentIdWithNgrams) import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, insertDocs, mkNodeIdNgramsMap, ngramsByDoc, documentIdWithNgrams)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore) import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
import Gargantext.Database.Action.Search (searchDocInDatabase) import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact ) import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) ) import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument )
import Gargantext.Database.Admin.Types.Node hiding (DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId) import Gargantext.Database.Admin.Types.Node hiding (ERROR, DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 ) import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 )
import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotExists, getOrMkList, getNodeWith ) import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotExists, getOrMkList, getNodeWith )
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add) import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode) ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..)) import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode), UniqParameters (..) ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..)) import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId) import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId) import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser) import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId ) import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node
import Gargantext.Prelude hiding (to) import Gargantext.Database.Types
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger ) import Gargantext.Prelude hiding (catch, onException, to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) ) import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) )
import Control.Monad.Catch
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Imports for upgrade function -- Imports for upgrade function
...@@ -168,6 +171,7 @@ flowDataText :: forall env err m. ...@@ -168,6 +171,7 @@ flowDataText :: forall env err m.
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env , HasCentralExchangeNotification env
) )
=> User => User
...@@ -202,6 +206,7 @@ flowAnnuaire :: ( IsDBCmd env err m ...@@ -202,6 +206,7 @@ flowAnnuaire :: ( IsDBCmd env err m
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env ) , HasCentralExchangeNotification env )
=> MkCorpusUser => MkCorpusUser
-> TermType Lang -> TermType Lang
...@@ -221,6 +226,7 @@ flowCorpusFile :: ( IsDBCmd env err m ...@@ -221,6 +226,7 @@ flowCorpusFile :: ( IsDBCmd env err m
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env ) , HasCentralExchangeNotification env )
=> MkCorpusUser => MkCorpusUser
-> TermType Lang -> TermType Lang
...@@ -250,6 +256,7 @@ flowCorpus :: ( IsDBCmd env err m ...@@ -250,6 +256,7 @@ flowCorpus :: ( IsDBCmd env err m
, HasValidationError err , HasValidationError err
, FlowCorpus a , FlowCorpus a
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env ) , HasCentralExchangeNotification env )
=> MkCorpusUser => MkCorpusUser
-> TermType Lang -> TermType Lang
...@@ -271,6 +278,7 @@ flow :: forall env err m a c. ...@@ -271,6 +278,7 @@ flow :: forall env err m a c.
, MkCorpus c , MkCorpus c
, MonadJobStatus m , MonadJobStatus m
, HasCentralExchangeNotification env , HasCentralExchangeNotification env
, MonadCatch m
) )
=> Maybe c => Maybe c
-> MkCorpusUser -> MkCorpusUser
...@@ -309,6 +317,8 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m ...@@ -309,6 +317,8 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
, HasNLPServer env , HasNLPServer env
, FlowCorpus document , FlowCorpus document
, MkCorpus corpus , MkCorpus corpus
, MonadLogger m
, MonadCatch m
) )
=> Maybe corpus => Maybe corpus
-> TermType Lang -> TermType Lang
...@@ -318,7 +328,13 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m ...@@ -318,7 +328,13 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
addDocumentsToHyperCorpus mb_hyper la corpusId docs = do addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
cfg <- view hasConfig cfg <- view hasConfig
nlp <- view (nlpServerGet $ _tt_lang la) nlp <- view (nlpServerGet $ _tt_lang la)
ids <- insertMasterDocs cfg nlp mb_hyper la docs -- First extract all the ngrams for the input documents via the nlp server,
-- collect errors (if any) and pass to 'insertMasterDocs' only the documents
-- for which the ngrams extraction succeeded. At the moment errors are just
-- logged, but in the future they could be returned upstream so that we can
-- display a final result of how many were skipped, how many succeded etc.
uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
ids <- runDBTx $ insertMasterDocs cfg uncommittedNgrams mb_hyper docs
runDBTx $ do runDBTx $ do
void $ Doc.add corpusId (map nodeId2ContextId ids) void $ Doc.add corpusId (map nodeId2ContextId ids)
pure ids pure ids
...@@ -413,44 +429,136 @@ buildSocialList l user userCorpusId listId ctype = \case ...@@ -413,44 +429,136 @@ buildSocialList l user userCorpusId listId ctype = \case
_mastListId <- runDBTx $ getOrMkList masterCorpusId masterUserId _mastListId <- runDBTx $ getOrMkList masterCorpusId masterUserId
pure () pure ()
-------------------------------------------------------------------------------
--
-- Splitting Ngrams extraction from document creation
--
-------------------------------------------------------------------------------
--
-- There is a bit of tension between extracting the Ngrams and creating the documents:
-- We need to produce a map between a given 'NodeId' and the ngrams associated with it, where
-- the latter are extract via the NLP server. However, each ngrams has to be matched to the
-- NodeId associated with the new resource being created as part of 'insertMasterDocs'. This
-- creates a bit of a chicken-and-egg problem in trying to make 'insertMasterDocs' a 'DBUpdate'
-- function: we need a 'NodeId' to exist by the time we call 'extractNgrams' but the latter can't
-- be execute in a pure fashion without a 'NodeId'.
--
-- To fix this, we need a data structure which would index the ngrams by some other notion of
-- index, and later have a transformation function which would re-index these ngrams to the actual
-- 'NodeId' created during the DB Transaction.
-- | Ngrams that have been fully \"committed\", i.e. associated to the respective document
-- where the latter has been persisted (i.e. committed) on secondary storage.
type CommittedNgrams =
HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
-- | Ngrams that have been extracted from the input 'doc' but not fully associated with
-- a persisted entity on the database.
newtype UncommittedNgrams doc = UncommittedNgrams
{ _UncommittedNgrams :: HashMap.HashMap ContextId (DocumentIdWithNgrams ContextId doc ExtractedNgrams) }
deriving stock Show
deriving newtype (Semigroup, Monoid)
data InsertDocError
= NgramsNotFound !ContextId !DocId
extractNgramsFromDocument :: ( UniqParameters doc
, ExtractNgrams doc
, IsDBCmd err env m
, MonadLogger m
, MonadCatch m
)
=> NLPServerConfig
-> TermType Lang
-> doc
-> m (UncommittedNgrams doc)
extractNgramsFromDocument nlpServer lang doc =
-- In case of an exception from the NLP server, treat this as having no ngrams,
-- but still index it in the final map, so that later reconciliation still works.
-- Pratically speaking it means this won't have any ngrams associated, but the document
-- will still be added to the corpus and we can try to regen the ngrams at a later stage.
UncommittedNgrams . HashMap.singleton docId <$>
(documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
`catch` \(e :: SomeException) -> do
$(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
)
where
docId = UnsafeMkContextId $ hash $ uniqParameters doc
commitNgramsForDocument :: UniqParameters doc
=> UncommittedNgrams doc
-> Node doc
-> Either InsertDocError CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) node =
case HashMap.lookup docId ng of
Nothing -> Left $ NgramsNotFound docId (_node_id node)
Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
where
docId = UnsafeMkContextId $ hash $ uniqParameters $ _node_hyperdata node
reIndex :: DocumentIdWithNgrams ContextId doc ExtractedNgrams
-> DocumentIdWithNgrams NodeId doc ExtractedNgrams
reIndex did =
let (Indexed _ a) = documentWithId did
in did { documentWithId = Indexed (_node_id node) a }
extractNgramsFromDocuments :: forall doc env err m.
( UniqParameters doc
, ExtractNgrams doc
, IsDBCmd env err m
, MonadLogger m
, MonadCatch m
)
=> NLPServerConfig
-> TermType Lang
-> [doc]
-> m (UncommittedNgrams doc)
extractNgramsFromDocuments nlpServer lang docs =
foldlM go mempty docs
where
go :: UncommittedNgrams doc -> doc -> m (UncommittedNgrams doc)
go !acc inputDoc = do
ngrams <- extractNgramsFromDocument nlpServer lang inputDoc
pure $ acc <> ngrams
commitNgramsForDocuments :: UniqParameters doc
=> UncommittedNgrams doc
-> [Node doc]
-> ([InsertDocError], CommittedNgrams)
commitNgramsForDocuments ng nodes =
let (errs, successes) = partitionEithers $ map (commitNgramsForDocument ng) nodes
in (errs, mconcat successes)
-- FIME(adn): the use of 'extractNgramsT' is iffy and problematic -- we shouldn't
-- be contacting the NLP server in the middle of some DB ops! we should extract
-- the tokens /before/ inserting things into the DB.
insertMasterDocs :: ( HasNodeError err insertMasterDocs :: ( HasNodeError err
, FlowCorpus a , UniqParameters doc
, FlowCorpus doc
, MkCorpus c , MkCorpus c
, IsDBCmd env err m
) )
=> GargConfig => GargConfig
-> NLPServerConfig -> UncommittedNgrams doc
-- ^ The ngrams extracted for /all/ the documents
-- and indexed by the hash of the given document.
-- We can use this map to associate the document
-- with the node being created.
-> Maybe c -> Maybe c
-> TermType Lang -> [doc]
-> [a] -> DBUpdate err [DocId]
-> m [DocId] insertMasterDocs cfg uncommittedNgrams c hs = do
insertMasterDocs cfg nlpServer c lang hs = do (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
(ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
(masterUserId, masterCorpusId, documentsWithId, ids') <- runDBTx $ do _ <- Doc.add masterCorpusId ids'
(master_user_id, _, master_corpus_id) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
(ids_prime, documents_with_id) <- insertDocs master_user_id master_corpus_id (map (toNode master_user_id Nothing) hs )
_ <- Doc.add master_corpus_id ids_prime
pure (master_user_id, master_corpus_id, documents_with_id, ids_prime)
-- TODO -- TODO
-- create a corpus with database name (CSV or PubMed) -- create a corpus with database name (CSV or PubMed)
-- add documents to the corpus (create node_node link) -- add documents to the corpus (create node_node link)
-- this will enable global database monitoring -- this will enable global database monitoring
ngramsDocsMap :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount))) let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams (map _unIndex documentsWithId)
<- mkNodeIdNgramsMap
<$> mapDocumentIdWithNgrams
(extractNgrams nlpServer $ withLang lang documentsWithId)
(map (B.first contextId2NodeId) documentsWithId)
runDBTx $ do lId <- getOrMkList masterCorpusId masterUserId
lId <- getOrMkList masterCorpusId masterUserId _ <- saveDocNgramsWith lId ngramsDocsMap
_ <- saveDocNgramsWith lId ngramsDocsMap pure $ map contextId2NodeId ids'
pure $ map contextId2NodeId ids'
saveDocNgramsWith :: ListId saveDocNgramsWith :: ListId
......
...@@ -292,7 +292,7 @@ instance ToSchema NodeId ...@@ -292,7 +292,7 @@ instance ToSchema NodeId
-- | An identifier for a 'Context' in gargantext. -- | An identifier for a 'Context' in gargantext.
newtype ContextId = UnsafeMkContextId { _ContextId :: Int } newtype ContextId = UnsafeMkContextId { _ContextId :: Int }
deriving stock (Show, Eq, Ord, Generic) deriving stock (Show, Eq, Ord, Generic)
deriving newtype (Csv.ToField, ToJSONKey, FromJSONKey, ToJSON, FromJSON, ToField, ToSchema) deriving newtype (Csv.ToField, ToJSONKey, FromJSONKey, ToJSON, FromJSON, ToField, ToSchema, Hashable)
deriving anyclass ToExpr deriving anyclass ToExpr
deriving FromField via NodeId deriving FromField via NodeId
......
...@@ -11,6 +11,7 @@ Portability : POSIX ...@@ -11,6 +11,7 @@ Portability : POSIX
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE UndecidableInstances #-}
{-# OPTIONS_GHC -Wno-orphans #-} {-# OPTIONS_GHC -Wno-orphans #-}
{-# LANGUAGE TypeApplications #-}
module Test.Database.Types where module Test.Database.Types where
...@@ -24,6 +25,7 @@ import Data.Map qualified as Map ...@@ -24,6 +25,7 @@ import Data.Map qualified as Map
import Data.Pool import Data.Pool
import Database.PostgreSQL.Simple qualified as PG import Database.PostgreSQL.Simple qualified as PG
import Database.Postgres.Temp qualified as Tmp import Database.Postgres.Temp qualified as Tmp
import GHC.IO.Exception (userError)
import Gargantext hiding (to) import Gargantext hiding (to)
import Gargantext.API.Admin.Orchestrator.Types import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Errors.Types import Gargantext.API.Errors.Types
...@@ -33,13 +35,12 @@ import Gargantext.Core.Config.Mail (MailConfig(..), LoginType(NoAuth), SendEmail ...@@ -33,13 +35,12 @@ import Gargantext.Core.Config.Mail (MailConfig(..), LoginType(NoAuth), SendEmail
import Gargantext.Core.Mail.Types (HasMail(..)) import Gargantext.Core.Mail.Types (HasMail(..))
import Gargantext.Core.NLP (HasNLPServer(..)) import Gargantext.Core.NLP (HasNLPServer(..))
import Gargantext.Core.NodeStory import Gargantext.Core.NodeStory
import Gargantext.System.Logging (HasLogger(..), Logger, MonadLogger(..)) import Gargantext.System.Logging (HasLogger(..), Logger, MonadLogger(..), LogLevel(..))
import Gargantext.System.Logging.Loggers import Gargantext.System.Logging.Loggers
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..)) import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Network.URI (parseURI) import Network.URI (parseURI)
import Prelude qualified import Prelude qualified
import System.Log.FastLogger qualified as FL import System.Log.FastLogger qualified as FL
import GHC.IO.Exception (userError)
newtype Counter = Counter { _Counter :: IORef Int } newtype Counter = Counter { _Counter :: IORef Int }
...@@ -75,6 +76,19 @@ newtype TestMonadM env err a = TestMonad { _TestMonad :: ReaderT env IO a } ...@@ -75,6 +76,19 @@ newtype TestMonadM env err a = TestMonad { _TestMonad :: ReaderT env IO a }
, MonadThrow , MonadThrow
) )
instance HasLogger (TestMonadM TestEnv BackendInternalError) where
data instance Logger (TestMonadM TestEnv BackendInternalError) = TestLogger { _IOLogger :: IOStdLogger }
type instance LogInitParams (TestMonadM TestEnv BackendInternalError) = LogConfig
type instance LogPayload (TestMonadM TestEnv BackendInternalError) = Prelude.String
initLogger cfg = fmap TestLogger $ (liftIO $ ioStdLogger cfg)
destroyLogger = liftIO . _iosl_destroy . _IOLogger
logMsg (TestLogger ioLogger) lvl msg = liftIO $ _iosl_log_msg ioLogger lvl msg
logTxt (TestLogger ioLogger) lvl msg = liftIO $ _iosl_log_txt ioLogger lvl msg
instance MonadLogger (TestMonadM TestEnv BackendInternalError) where
getLogger = TestMonad $ do
initLogger @(TestMonadM TestEnv BackendInternalError) (LogConfig Nothing ERROR)
runTestMonadM :: env -> TestMonadM env err a -> IO a runTestMonadM :: env -> TestMonadM env err a -> IO a
runTestMonadM env = flip runReaderT env . _TestMonad runTestMonadM env = flip runReaderT env . _TestMonad
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment