Verified Commit 337d2af5 authored by Przemyslaw Kaminski

Merge branch 'dev' into 471-dev-node-multiterms

parents b0913118 d19839d8
Pipeline #7651 failed in 62 minutes and 31 seconds
@@ -19,6 +19,7 @@ module CLI.Import where
import CLI.Parsers
import CLI.Types
import Control.Monad.Catch (MonadCatch)
import Gargantext.API.Admin.EnvTypes (DevEnv(..), DevJobHandle(..))
import Gargantext.API.Dev (withDevEnv, runCmdGargDev)
import Gargantext.API.Errors.Types ( BackendInternalError )
@@ -42,14 +43,14 @@ importCLI (ImportArgs fun user name settingsPath corpusPath) = do
let
tt = Multi EN
format = TsvGargV3
corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
corpus :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text)
corpus = flowCorpusFile mkCorpusUser tt format Plain corpusPath Nothing DevJobHandle
corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusTsvHal = flowCorpusFile mkCorpusUser tt TsvHal Plain corpusPath Nothing DevJobHandle
annuaire :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
annuaire :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, MonadCatch m, JobHandle m ~ DevJobHandle) => m CorpusId
annuaire = flowAnnuaire (MkCorpusUserNormalCorpusName (UserName $ cs user) "Annuaire") (Multi EN) corpusPath DevJobHandle
withDevEnv settingsPath $ \env -> do
......
@@ -702,6 +702,7 @@ executable gargantext
, containers ^>= 0.6.7
, cryptohash ^>= 0.11.9
, directory ^>= 1.3.7.1
, exceptions >= 0.9.0 && < 0.11
, extra ^>= 1.7.9
, gargantext
, gargantext-prelude
......
@@ -18,6 +18,7 @@ module Gargantext.API.Node.Contact
where
import Conduit ( yield )
import Control.Monad.Catch (MonadCatch)
import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser(AuthenticatedUser) )
import Gargantext.API.Admin.EnvTypes (Env)
import Gargantext.API.Errors.Types ( BackendInternalError )
@@ -57,7 +58,7 @@ apiAsync u nId = Named.ContactAsyncAPI {
, _ac_user = u }
}
addContact :: (FlowCmdM env err m, MonadJobStatus m)
addContact :: (FlowCmdM env err m, MonadJobStatus m, MonadCatch m)
=> User
-> NodeId
-> AddContactParams
......
@@ -24,6 +24,7 @@ module Gargantext.API.Node.Corpus.New
import Conduit ((.|), yieldMany, mapMC, transPipe)
import Control.Exception.Safe (MonadMask)
import Control.Lens ( view, non )
import Control.Monad.Catch (MonadCatch)
import Data.Conduit.Internal (zipSources)
import Data.Conduit.List (mapMaybeM)
import Data.Swagger ( ToSchema(..) )
@@ -56,6 +57,7 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId)
import Gargantext.Database.GargDB qualified as GargDB
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds))
import Gargantext.Database.Schema.Node (node_hyperdata)
@@ -63,7 +65,6 @@ import Gargantext.Prelude
import Gargantext.System.Logging ( logLocM, LogLevel(..) )
import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
------------------------------------------------------------------------
{-
@@ -150,6 +151,7 @@ type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
addToCorpusWithQuery :: ( FlowCmdM env err m
, MonadJobStatus m
, MonadCatch m
)
=> User
-> CorpusId
......
@@ -14,6 +14,7 @@ Portability : POSIX
module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.Aeson qualified as Aeson
import Data.Text qualified as T
import Data.Text qualified as Text
@@ -40,6 +41,7 @@ import Gargantext.Database.Query.Table.Node (getOrMkList, insertDefaultNodeIfNot
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Prelude
import Gargantext.System.Logging (MonadLogger)
import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
import Network.HTTP.Client
import Network.HTTP.Client.TLS (tlsManagerSettings)
@@ -118,6 +120,8 @@ insertSearxResponse :: ( MonadBase IO m
, HasNodeError err
, HasTreeError err
, HasValidationError err
, MonadCatch m
, MonadLogger m
)
=> User
-> CorpusId
@@ -155,6 +159,8 @@ triggerSearxSearch :: ( MonadBase IO m
, HasTreeError err
, HasValidationError err
, MonadJobStatus m
, MonadCatch m
, MonadLogger m
)
=> User
-> CorpusId
......
@@ -16,6 +16,7 @@ Portability : POSIX
module Gargantext.API.Node.DocumentUpload where
import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types (AuthenticatedUser(..))
import Gargantext.API.Admin.EnvTypes (Env)
@@ -32,16 +33,16 @@ import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Core.Worker.Jobs.Types (WorkSplit(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Action.Flow (addDocumentsToHyperCorpus)
import Gargantext.Database.Action.Flow.Types ( FlowCmdM )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus )
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Node ( DocId, NodeId, NodeType(NodeCorpus), ParentId )
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType')
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Schema.Node (_node_hyperdata)
import Gargantext.Prelude
import Gargantext.System.Logging (logLocM, LogLevel(..), MonadLogger)
@@ -55,7 +56,7 @@ api nId = Named.DocumentUploadAPI {
, _ud_node_id = nId }
}
documentUploadAsync :: (FlowCmdM env err m, MonadJobStatus m)
documentUploadAsync :: (FlowCmdM env err m, MonadJobStatus m, MonadCatch m)
=> NodeId
-> DocumentUpload
-> JobHandle m
@@ -66,7 +67,7 @@ documentUploadAsync nId doc jobHandle = do
-- printDebug "documentUploadAsync" docIds
markComplete jobHandle
documentUpload :: (FlowCmdM env err m)
documentUpload :: (FlowCmdM env err m, MonadCatch m)
=> NodeId
-> DocumentUpload
-> m [DocId]
@@ -110,6 +111,7 @@ remoteImportDocuments :: ( HasNodeError err
, HasNodeStoryEnv env err
, IsDBCmd env err m
, MonadLogger m
, MonadCatch m
, MonadIO m)
=> AuthenticatedUser
-> ParentId
......
@@ -17,6 +17,8 @@ module Gargantext.API.Node.DocumentsFromWriteNodes
where
import Conduit ( yieldMany )
import Control.Lens (view)
import Control.Monad.Catch (MonadCatch)
import Data.List qualified as List
import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser, auth_node_id, auth_user_id )
@@ -39,15 +41,14 @@ import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame ( HyperdataFrame(..), getHyperdataFrameContents )
import Gargantext.Database.Admin.Types.Node ( NodeId, Node, NodeType(..) )
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith, getOrMkList)
import Gargantext.Database.Schema.Node (node_hyperdata, node_name, node_date)
import Gargantext.Database.Prelude
import Gargantext.Prelude
import Gargantext.System.Logging (logLocM, LogLevel(..))
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Gargantext.Utils.Jobs.Error (HumanFriendlyErrorText(..))
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Servant.Server.Generic (AsServerT)
import Control.Lens (view)
api :: AuthenticatedUser
-- ^ The logged-in user
@@ -63,6 +64,7 @@ api authenticatedUser nId =
documentsFromWriteNodes :: ( FlowCmdM env err m
, MonadJobStatus m
, MonadCatch m
)
=> AuthenticatedUser
-- ^ The logged-in user
......
@@ -116,13 +116,12 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> NLPServerConfig
-> TermType Lang
-> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
class ExtractNgrams h where
extractNgrams :: NLPServerConfig
-> TermType Lang
-> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms { .. }) =
......
@@ -15,15 +15,17 @@ Portability : POSIX
-- TODO-EVENTS: InsertedNodes
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TupleSections #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( DataText(..)
@@ -54,7 +56,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
import Conduit
import Control.Lens ( to, view )
import Data.Bifunctor qualified as B
import Control.Monad.Catch
import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as CL
@@ -67,12 +69,13 @@ import Data.Text qualified as T
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (NgramsTerm)
import Gargantext.Core (Lang(..), withDefaultLanguage, NLPServerConfig)
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Config (GargConfig(..), hasConfig)
import Gargantext.Core.Config.Types (APIsConfig(..))
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.NLP (HasNLPServer, nlpServerGet)
import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStoryEnv (..))
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Text (HasText)
import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists)
@@ -86,27 +89,28 @@ import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Action.Flow.Extract () -- ExtractNgramsT instances
import Gargantext.Database.Action.Flow.List ( flowList_DbRepo, toNodeNgramsW' )
import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus )
import Gargantext.Database.Action.Flow.Utils (documentIdWithNgrams, insertDocNgrams, insertDocs, mapNodeIdNgrams, ngramsByDoc)
import Gargantext.Database.Action.Flow.Types ( do_api, DataOrigin(..), DataText(..), FlowCorpus, DocumentIdWithNgrams (..) )
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, insertDocs, mkNodeIdNgramsMap, ngramsByDoc, documentIdWithNgrams)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument )
import Gargantext.Database.Admin.Types.Node hiding (DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Admin.Types.Node hiding (ERROR, DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 )
import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotExists, getOrMkList, getNodeWith )
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode) ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode), UniqParameters (..), newUniqIdHash ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude hiding (to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger )
import Gargantext.Database.Schema.Node
import Gargantext.Database.Types
import Gargantext.Prelude hiding (catch, onException, to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) )
------------------------------------------------------------------------
@@ -168,6 +172,7 @@ flowDataText :: forall env err m.
, HasTreeError err
, HasValidationError err
, MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env
)
=> User
@@ -202,6 +207,7 @@ flowAnnuaire :: ( IsDBCmd env err m
, HasTreeError err
, HasValidationError err
, MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env )
=> MkCorpusUser
-> TermType Lang
@@ -221,6 +227,7 @@ flowCorpusFile :: ( IsDBCmd env err m
, HasTreeError err
, HasValidationError err
, MonadJobStatus m
, MonadCatch m
, HasCentralExchangeNotification env )
=> MkCorpusUser
-> TermType Lang
@@ -250,7 +257,8 @@ flowCorpus :: ( IsDBCmd env err m
, HasValidationError err
, FlowCorpus a
, MonadJobStatus m
, HasCentralExchangeNotification env )
, MonadCatch m
, HasCentralExchangeNotification env, Show a )
=> MkCorpusUser
-> TermType Lang
-> Maybe FlowSocialListWith
@@ -271,6 +279,7 @@ flow :: forall env err m a c.
, MkCorpus c
, MonadJobStatus m
, HasCentralExchangeNotification env
, MonadCatch m, Show a
)
=> Maybe c
-> MkCorpusUser
@@ -309,6 +318,8 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
, HasNLPServer env
, FlowCorpus document
, MkCorpus corpus
, MonadLogger m
, MonadCatch m, Show document
)
=> Maybe corpus
-> TermType Lang
@@ -318,7 +329,13 @@ addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
cfg <- view hasConfig
nlp <- view (nlpServerGet $ _tt_lang la)
ids <- insertMasterDocs cfg nlp mb_hyper la docs
-- First extract all the ngrams for the input documents via the NLP server,
-- collect errors (if any) and pass to 'insertMasterDocs' only the documents
-- for which the ngrams extraction succeeded. At the moment errors are just
-- logged, but in the future they could be returned upstream so that we can
-- display a final result of how many were skipped, how many succeeded, etc.
uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
ids <- runDBTx $ insertMasterDocs cfg uncommittedNgrams mb_hyper docs
runDBTx $ do
void $ Doc.add corpusId (map nodeId2ContextId ids)
pure ids
@@ -413,44 +430,147 @@ buildSocialList l user userCorpusId listId ctype = \case
_mastListId <- runDBTx $ getOrMkList masterCorpusId masterUserId
pure ()
-------------------------------------------------------------------------------
--
-- Splitting Ngrams extraction from document creation
--
-------------------------------------------------------------------------------
--
-- There is a bit of tension between extracting the Ngrams and creating the documents:
-- We need to produce a map between a given 'NodeId' and the ngrams associated with it, where
-- the latter are extracted via the NLP server. However, each ngrams has to be matched to the
-- NodeId associated with the new resource being created as part of 'insertMasterDocs'. This
-- creates a bit of a chicken-and-egg problem in trying to make 'insertMasterDocs' a 'DBUpdate'
-- function: we need a 'NodeId' to exist by the time we call 'extractNgrams' but the latter can't
-- be executed in a pure fashion without a 'NodeId'.
--
-- To fix this, we need a data structure which would index the ngrams by some other notion of
-- index, and later have a transformation function which would re-index these ngrams to the actual
-- 'NodeId' created during the DB Transaction.
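--
-- As a rough sketch of that transformation (hypothetical helper, not part of
-- this change; the real machinery is 'commitNgramsForDocument' below):
--
-- reindexByNodeId :: Map.Map DocumentHashId ngrams  -- hash-indexed extraction results
--                 -> Map.Map DocumentHashId NodeId  -- hash -> freshly created NodeId
--                 -> Map.Map NodeId ngrams
-- reindexByNodeId extracted assigned = Map.fromList
--   [ (nid, ngs)
--   | (h, ngs) <- Map.toList extracted
--   , Just nid <- [Map.lookup h assigned]
--   ]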
-- | Ngrams that have been fully \"committed\", i.e. associated with the respective document
-- where the latter has been persisted (i.e. committed) on secondary storage.
type CommittedNgrams =
HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
newtype DocumentHashId
= DocumentHashId { _DocumentHashId :: T.Text }
deriving stock (Show, Eq)
deriving newtype Ord
-- | Ngrams that have been extracted from the input 'doc' but not fully associated with
-- a persisted entity on the database.
newtype UncommittedNgrams doc = UncommittedNgrams
{ _UncommittedNgrams :: Map.Map DocumentHashId (DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams) }
deriving stock Show
deriving newtype (Semigroup, Monoid)
data InsertDocError
= NgramsNotFound !(Maybe DocumentHashId) !DocId
deriving Show
extractNgramsFromDocument :: ( UniqParameters doc
, HasText doc
, ExtractNgrams doc
, IsDBCmd err env m
, MonadLogger m
, MonadCatch m
)
=> NLPServerConfig
-> TermType Lang
-> doc
-> m (UncommittedNgrams doc)
extractNgramsFromDocument nlpServer lang doc =
-- In case of an exception from the NLP server, treat this as having no ngrams,
-- but still index it in the final map, so that later reconciliation still works.
-- Practically speaking it means this won't have any ngrams associated, but the document
-- will still be added to the corpus and we can try to regen the ngrams at a later stage.
UncommittedNgrams . Map.singleton docId <$>
(documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
`catch` \(e :: SomeException) -> do
$(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
)
where
docId = DocumentHashId $ newUniqIdHash doc
commitNgramsForDocument :: UniqParameters doc
=> UncommittedNgrams doc
-> Indexed ContextId (Node doc)
-> Either InsertDocError CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
docId <- mb_docId
case Map.lookup docId ng of
Nothing -> Left $ NgramsNotFound (Just docId) (_node_id node)
Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
where
mb_docId = case DocumentHashId <$> _node_hash_id node of
Nothing -> Left $ NgramsNotFound Nothing (_node_id node)
Just dId -> Right dId
reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams
-> DocumentIdWithNgrams NodeId doc ExtractedNgrams
reIndex did =
let (Indexed _ a) = documentWithId did
in did { documentWithId = Indexed (contextId2NodeId oldIx) a }
extractNgramsFromDocuments :: forall doc env err m.
( HasText doc
, UniqParameters doc
, ExtractNgrams doc
, IsDBCmd env err m
, MonadLogger m
, MonadCatch m
)
=> NLPServerConfig
-> TermType Lang
-> [doc]
-> m (UncommittedNgrams doc)
extractNgramsFromDocuments nlpServer lang docs =
foldlM go mempty docs
where
go :: UncommittedNgrams doc -> doc -> m (UncommittedNgrams doc)
go !acc inputDoc = do
ngrams <- extractNgramsFromDocument nlpServer lang inputDoc
pure $ acc <> ngrams
commitNgramsForDocuments :: UniqParameters doc
=> UncommittedNgrams doc
-> [Indexed ContextId (Node doc)]
-> ([InsertDocError], CommittedNgrams)
commitNgramsForDocuments ng nodes =
let (errs, successes) = partitionEithers $ map (commitNgramsForDocument ng) nodes
in (errs, mconcat successes)
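-- For orientation, a hypothetical caller showing how the two phases compose
-- (same shape as 'addDocumentsToHyperCorpus' above; sketch only):
--
-- ingestDocuments nlp la cfg mb_corpus docs = do
--   -- Phase 1: contact the NLP server outside of any DB transaction.
--   uncommitted <- extractNgramsFromDocuments nlp la docs
--   -- Phase 2: a single DB transaction creates the document nodes and
--   -- commits the hash-indexed ngrams against the fresh NodeIds.
--   runDBTx $ insertMasterDocs cfg uncommitted mb_corpus docs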
-- FIXME(adn): the use of 'extractNgramsT' is iffy and problematic -- we shouldn't
-- be contacting the NLP server in the middle of some DB ops! we should extract
-- the tokens /before/ inserting things into the DB.
insertMasterDocs :: ( HasNodeError err
, FlowCorpus a
, MkCorpus c
, IsDBCmd env err m
, UniqParameters doc
, FlowCorpus doc
, MkCorpus c, Show doc
)
=> GargConfig
-> NLPServerConfig
-> UncommittedNgrams doc
-- ^ The ngrams extracted for /all/ the documents
-- and indexed by the hash of the given document.
-- We can use this map to associate the document
-- with the node being created.
-> Maybe c
-> TermType Lang
-> [a]
-> m [DocId]
insertMasterDocs cfg nlpServer c lang hs = do
(masterUserId, masterCorpusId, documentsWithId, ids') <- runDBTx $ do
(master_user_id, _, master_corpus_id) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
(ids_prime, documents_with_id) <- insertDocs master_user_id master_corpus_id (map (toNode master_user_id Nothing) hs )
_ <- Doc.add master_corpus_id ids_prime
pure (master_user_id, master_corpus_id, documents_with_id, ids_prime)
-> [doc]
-> DBUpdate err [DocId]
insertMasterDocs cfg uncommittedNgrams c hs = do
(masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
(ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
_ <- Doc.add masterCorpusId ids'
-- TODO
-- create a corpus with database name (CSV or PubMed)
-- add documents to the corpus (create node_node link)
-- this will enable global database monitoring
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT nlpServer $ withLang lang documentsWithId)
(map (B.first contextId2NodeId) documentsWithId)
let (_failedExtraction, ngramsDocsMap) = commitNgramsForDocuments uncommittedNgrams documentsWithId
runDBTx $ do
lId <- getOrMkList masterCorpusId masterUserId
_ <- saveDocNgramsWith lId mapNgramsDocs'
pure $ map contextId2NodeId ids'
lId <- getOrMkList masterCorpusId masterUserId
_ <- saveDocNgramsWith lId ngramsDocsMap
pure $ map contextId2NodeId ids'
saveDocNgramsWith :: ListId
......
@@ -25,7 +25,7 @@ import Gargantext.Core (Lang, NLPServerConfig(server))
import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms (ExtractNgramsT, ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractNgramsT, extractTerms, tt_lang)
import Gargantext.Core.Text.Terms (ExtractNgrams(..), ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractTerms, tt_lang)
import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
@@ -39,59 +39,57 @@ import Gargantext.Prelude
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT _ncs l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
instance ExtractNgrams HyperdataContact where
extractNgrams _ncs _l = pure . HashMap.mapKeys (cleanExtractedNgrams 255) . extract
where
extract :: HyperdataContact
-> HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount)
extract hc' =
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
in HashMap.fromList $ [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
-- | Main ngrams extraction functionality.
-- For NgramsTerms, this calls NLP server under the hood.
-- For Sources, Institutes, Authors, this uses simple split on " ".
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: NLPServerConfig
-> TermType Lang
-> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where
extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
$ doc ^. hd_source
institutes = map text2ngrams
$ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
$ doc ^. hd_institutes
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors
termsWithCounts' :: [(NgramsPostag, TermsCount)] <-
map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList
$ [(SimpleNgrams source, (DM.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', (DM.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
instance ExtractNgrams HyperdataDocument where
extractNgrams :: NLPServerConfig
-> TermType Lang
-> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgrams ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where
extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
$ doc ^. hd_source
institutes = map text2ngrams
$ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
$ doc ^. hd_institutes
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors
termsWithCounts' :: [(NgramsPostag, TermsCount)] <-
map (first (enrichedTerms (lang ^. tt_lang) (server ncs) NP)) . concat <$>
liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList
$ [(SimpleNgrams source, (DM.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', (DM.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgrams a, HasText a) => ExtractNgrams (Node a)
where
extractNgramsT ncs l (Node { _node_hyperdata = h }) = extractNgramsT ncs l h
extractNgrams ncs l (Node { _node_hyperdata = h }) = extractNgrams ncs l h
instance HasText a => HasText (Node a)
......
@@ -25,7 +25,7 @@ import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text ( HasText )
import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms ( ExtractNgramsT )
import Gargantext.Core.Text.Terms ( ExtractNgrams )
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
@@ -52,7 +52,7 @@ type FlowCmdM env err m =
type FlowCorpus a = ( UniqParameters a
, InsertDb a
, ExtractNgramsT a
, ExtractNgrams a
, HasText a
, ToNode a
, ToJSON a
@@ -66,9 +66,9 @@ type FlowInsertDB a = ( AddUniqId a
data DocumentIdWithNgrams a b =
data DocumentIdWithNgrams ix a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
{ documentWithId :: Indexed ix a
, documentNgrams :: HashMap b (Map NgramsType TermsWeight, TermsCount)
} deriving (Show)
......
@@ -13,9 +13,10 @@ module Gargantext.Database.Action.Flow.Utils
( docNgrams
, docNgrams'
, documentIdWithNgrams
, mapDocumentIdWithNgrams
, insertDocNgrams
, insertDocs
, mapNodeIdNgrams
, mkNodeIdNgramsMap
, ngramsByDoc )
where
@@ -39,7 +40,6 @@ import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly(..), insertContextNodeNgrams )
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..), text2ngrams)
import Gargantext.Database.Types ( Indexed(..), index )
@@ -94,34 +94,40 @@ docNgrams' lang ts txt =
termsInText lang (buildPatternsWith lang ts) txt
documentIdWithNgrams :: HasNodeError err
=> ( a
-> DBCmd err (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )
-> [Indexed NodeId a]
-> DBCmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
documentIdWithNgrams :: Monad m
=> ( a -> m (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )
-> Indexed ix a
-> m (DocumentIdWithNgrams ix a b)
documentIdWithNgrams f = toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams { documentWithId = d
, documentNgrams = e }
mapDocumentIdWithNgrams :: Monad m
=> ( a -> m (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )
-> [Indexed ix a]
-> m [DocumentIdWithNgrams ix a b]
mapDocumentIdWithNgrams f = mapM (documentIdWithNgrams f)
-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-- | Creates a NodeIdNgrams map out of the input 'DocumentIdWithNgrams' list.
-- TODO check optimization
mkNodeIdNgramsMap :: forall ix a b. (Ord b, Hashable b, Ord ix)
=> [DocumentIdWithNgrams ix a b]
-> HashMap.HashMap b
(Map NgramsType
(Map NodeId (TermsWeight, TermsCount))
(Map ix (TermsWeight, TermsCount))
)
mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
mkNodeIdNgramsMap = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b
-> HashMap.HashMap b (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
f :: DocumentIdWithNgrams ix a b
-> HashMap.HashMap b (Map NgramsType (Map ix (TermsWeight, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\w -> DM.singleton nId (w, cnt)) ngramsTypeMap) $ documentNgrams d
where
nId = _index $ documentWithId d
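-- Illustrative example of the note above (hypothetical values): one term,
-- two ngrams types, same document index 42, count 3 copied into each type:
--
-- f (DocumentIdWithNgrams (Indexed 42 doc)
--     (HashMap.fromList [(term, (DM.fromList [(NgramsTerms, w), (Sources, w)], 3))]))
-- == HashMap.fromList
--      [(term, DM.fromList [ (NgramsTerms, DM.singleton 42 (w, 3))
--                          , (Sources,     DM.singleton 42 (w, 3)) ])]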
......
@@ -43,7 +43,7 @@ instance NFData HyperdataContact where
instance HasText HyperdataContact
where
hasText = undefined
hasText = mempty
defaultHyperdataContact :: HyperdataContact
defaultHyperdataContact =
......
@@ -292,7 +292,7 @@ instance ToSchema NodeId
-- | An identifier for a 'Context' in gargantext.
newtype ContextId = UnsafeMkContextId { _ContextId :: Int }
deriving stock (Show, Eq, Ord, Generic)
deriving newtype (Csv.ToField, ToJSONKey, FromJSONKey, ToJSON, FromJSON, ToField, ToSchema)
deriving newtype (Csv.ToField, ToJSONKey, FromJSONKey, ToJSON, FromJSON, ToField, ToSchema, Hashable)
deriving anyclass ToExpr
deriving FromField via NodeId
......
@@ -72,7 +72,7 @@ import Gargantext.Database.Prelude
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (hash, toLower)
import Gargantext.Prelude.Crypto.Hash (hash)
import Gargantext.Prelude.Crypto.Hash (hash, Hash)
{-| To Print result query
import Data.ByteString.Internal (ByteString)
@@ -221,9 +221,9 @@ instance UniqParameters HyperdataContact
where
uniqParameters _ = ""
instance UniqParameters (Node a)
instance UniqParameters a => UniqParameters (Node a)
where
uniqParameters _ = undefined
uniqParameters = uniqParameters . _node_hyperdata
filterText :: Text -> Text
@@ -232,9 +232,13 @@ filterText = DT.toLower . DT.filter isAlphaNum
instance (UniqParameters a, ToJSON a, HasDBid NodeType) => AddUniqId (Node a)
where
addUniqId (Node nid _ t u p n d h) = Node nid (Just newHash) t u p n d h
addUniqId node = node { _node_hash_id = Just $ newUniqIdHash node }
where
newHash = "\\x" <> hash (uniqParameters h)
-- | Returns a new unique ID computed by hashing the uniq parameters of the input
-- and prefixing everything with '\\x'.
newUniqIdHash :: UniqParameters a => a -> Hash
newUniqIdHash a = "\\x" <> hash (uniqParameters a)
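-- Illustrative property (sketch, not part of this change): since
-- 'UniqParameters (Node a)' now delegates to the hyperdata, hashing a node
-- agrees with hashing its payload:
--
-- newUniqIdHash node == "\\x" <> hash (uniqParameters (_node_hyperdata node))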
---------------------------------------------------------------------------
......
@@ -11,6 +11,7 @@ Portability : POSIX
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE UndecidableInstances #-}
{-# OPTIONS_GHC -Wno-orphans #-}
{-# LANGUAGE TypeApplications #-}
module Test.Database.Types where
@@ -24,6 +25,7 @@ import Data.Map qualified as Map
import Data.Pool
import Database.PostgreSQL.Simple qualified as PG
import Database.Postgres.Temp qualified as Tmp
import GHC.IO.Exception (userError)
import Gargantext hiding (to)
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Errors.Types
@@ -33,13 +35,12 @@ import Gargantext.Core.Config.Mail (MailConfig(..), LoginType(NoAuth), SendEmail
import Gargantext.Core.Mail.Types (HasMail(..))
import Gargantext.Core.NLP (HasNLPServer(..))
import Gargantext.Core.NodeStory
import Gargantext.System.Logging (HasLogger(..), Logger, MonadLogger(..))
import Gargantext.System.Logging (HasLogger(..), Logger, MonadLogger(..), LogLevel(..))
import Gargantext.System.Logging.Loggers
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Network.URI (parseURI)
import Prelude qualified
import System.Log.FastLogger qualified as FL
import GHC.IO.Exception (userError)
newtype Counter = Counter { _Counter :: IORef Int }
@@ -75,6 +76,19 @@ newtype TestMonadM env err a = TestMonad { _TestMonad :: ReaderT env IO a }
, MonadThrow
)
instance HasLogger (TestMonadM TestEnv BackendInternalError) where
data instance Logger (TestMonadM TestEnv BackendInternalError) = TestLogger { _IOLogger :: IOStdLogger }
type instance LogInitParams (TestMonadM TestEnv BackendInternalError) = LogConfig
type instance LogPayload (TestMonadM TestEnv BackendInternalError) = Prelude.String
initLogger cfg = fmap TestLogger $ (liftIO $ ioStdLogger cfg)
destroyLogger = liftIO . _iosl_destroy . _IOLogger
logMsg (TestLogger ioLogger) lvl msg = liftIO $ _iosl_log_msg ioLogger lvl msg
logTxt (TestLogger ioLogger) lvl msg = liftIO $ _iosl_log_txt ioLogger lvl msg
instance MonadLogger (TestMonadM TestEnv BackendInternalError) where
getLogger = TestMonad $ do
initLogger @(TestMonadM TestEnv BackendInternalError) (LogConfig Nothing ERROR)
runTestMonadM :: env -> TestMonadM env err a -> IO a
runTestMonadM env = flip runReaderT env . _TestMonad
......