Commit c62480c7 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Proper support for importing documents

parent 6019587c
Pipeline #7206 passed with stages
in 50 minutes and 56 seconds
......@@ -25,8 +25,8 @@ import Control.Lens ( view, non )
import Data.ByteString.Base64 qualified as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Swagger ( ToSchema(..) )
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.API.Ngrams (commitStatePatch, Versioned(..))
......@@ -35,14 +35,14 @@ import Gargantext.API.Node.Corpus.Searx ( triggerSearxSearch )
import Gargantext.API.Node.Corpus.Types ( Datafield(Web), database2origin )
import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
import Gargantext.API.Node.Types
import Gargantext.Core (withDefaultLanguage, defaultLanguage)
import Gargantext.Core.Config (gc_jobs, hasConfig)
import Gargantext.Core.Config.Types (jc_max_docs_parsers)
import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion, NgramsStatePatch')
import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion, NgramsStatePatch', HasNodeStoryEnv)
import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC, _ParseFormatError)
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Core (withDefaultLanguage, defaultLanguage)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
......@@ -52,6 +52,8 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(
import Gargantext.Database.Admin.Types.Hyperdata.File ( HyperdataFile(..) )
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId)
import Gargantext.Database.GargDB qualified as GargDB
import Gargantext.Database.Prelude (IsDBCmd)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds))
......@@ -366,11 +368,15 @@ addToCorpusWithFile user cid nwf@(NewWithFile _d (withDefaultLanguage -> l) fNam
--- UTILITIES
commitCorpus :: ( FlowCmdM env err m
commitCorpus :: ( IsDBCmd env err m
, HasNodeStoryEnv env
, HasNodeError err
, HasNodeArchiveStoryImmediateSaver env
, HasNodeStoryImmediateSaver env )
=> ParentId -> User -> m (Versioned NgramsStatePatch')
commitCorpus cid user = do
=> ParentId
-> User
-> m (Versioned NgramsStatePatch')
commitCorpus cid user = do
userId <- getUserId user
listId <- getOrMkList cid userId
v <- currentVersion listId
......
......@@ -29,13 +29,14 @@ import Gargantext.Utils.Servant (ZIP)
import Gargantext.Utils.Zip (zipContentsPureWithLastModified)
import Protolude
import Servant (MimeRender(..), MimeUnrender(..))
import Prelude (show)
-- | Document Export
data DocumentExport =
DocumentExport { _de_documents :: [Document]
, _de_garg_version :: Text
} deriving (Generic)
} deriving (Generic, Show, Eq)
instance Serialise DocumentExport where
......@@ -44,14 +45,20 @@ data DocumentExportZIP =
DocumentExportZIP { _dez_dexp :: DocumentExport
, _dez_doc_id :: DocId
, _dez_last_modified :: Integer } deriving (Generic)
data Document =
Document { _d_document :: Node HyperdataDocument
, _d_ngrams :: Ngrams
, _d_hash :: Hash
} deriving (Generic)
instance Eq Document where
(Document _ _ h1) == (Document _ _ h2) = h1 == h2 -- compare by their hashes
instance Show Document where
show (Document _ _ h1) = "Document " <> Prelude.show h1
instance Serialise Document where
--instance Read Document where
-- read "" = panic "not implemented"
......@@ -119,7 +126,7 @@ $(deriveJSON (unPrefix "_de_") ''DocumentExport)
-- Needs to be here because of deriveJSON TH above
dezFileName :: DocumentExportZIP -> Text
dezFileName (DocumentExportZIP { .. }) = "GarganText_DocsList-" <> show _dez_doc_id <> ".json"
dezFileName (DocumentExportZIP { .. }) = "GarganText_DocsList-" <> Protolude.show _dez_doc_id <> ".json"
instance MimeRender ZIP DocumentExportZIP where
mimeRender _ dexpz@(DocumentExportZIP { .. }) =
......
......@@ -11,33 +11,43 @@ Portability : POSIX
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE MonoLocalBinds #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Node.DocumentUpload where
import Control.Lens (view)
import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types (AuthenticatedUser(..))
import Gargantext.API.Admin.EnvTypes (Env)
import Gargantext.API.Errors.Types ( BackendInternalError )
import Gargantext.API.Errors.Types ( BackendInternalError (..) )
import Gargantext.API.Node.Corpus.New (commitCorpus)
import Gargantext.API.Node.Document.Export.Types ( Document(..))
import Gargantext.API.Node.Document.Export.Types (DocumentExport(..))
import Gargantext.API.Node.DocumentUpload.Types
import Gargantext.API.Prelude ( GargM )
import Gargantext.API.Routes.Named.Document qualified as Named
import Gargantext.API.Worker (serveWorkerAPI)
import Gargantext.Core (Lang(..))
import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.NLP (nlpServerGet, HasNLPServer)
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv, HasNodeArchiveStoryImmediateSaver)
import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Action.Flow (addDocumentsToHyperCorpus)
import Gargantext.Database.Action.Flow.Types ( FlowCmdM )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus )
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Node ( DocId, NodeId, NodeType(NodeCorpus) )
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType')
import Gargantext.Database.Prelude (IsDBCmd)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType', getClosestParentIdByType)
import Gargantext.Database.Schema.Node (_node_hyperdata)
import Gargantext.Prelude
import Gargantext.System.Logging (logLocM, LogLevel(..), MonadLogger)
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Servant.Server.Generic (AsServerT)
api :: NodeId -> Named.DocumentUploadAPI (AsServerT (GargM Env BackendInternalError))
api nId = Named.DocumentUploadAPI {
uploadDocAsyncEp = serveWorkerAPI $ \p ->
......@@ -91,3 +101,31 @@ documentUpload nId doc = do
let lang = EN
ncs <- view $ nlpServerGet lang
addDocumentsToHyperCorpus ncs (Nothing :: Maybe HyperdataCorpus) (Multi lang) cId [hd]
-- | Imports the documents contained into this 'DocumentExport' into this (local) version
-- of the running node.
-- /NOTE(adn)/: We should compare the gargantext version and ensure that we are importing
-- only compatible versions.
remoteImportDocuments :: ( HasNodeError err
, HasNLPServer env
, HasNodeArchiveStoryImmediateSaver env
, HasNodeStoryEnv env
, IsDBCmd env err m
, MonadLogger m
, MonadIO m)
=> AuthenticatedUser
-> NodeId
-> DocumentExport
-> m [NodeId]
remoteImportDocuments loggedInUser nodeId (DocumentExport documents _gargVersion) = do
mb_corpusId <- getClosestParentIdByType nodeId NodeCorpus
case mb_corpusId of
Nothing -> panicTrace $ "remoteImportDocuments: impossible, freshly imported doc node without parent corpus"
Just corpusId -> do
let la = Multi EN
nlpServerConfig <- view $ nlpServerGet (_tt_lang la)
$(logLocM) INFO $ "Importing " <> T.pack (show $ length documents) <> " documents for corpus node " <> T.pack (show nodeId)
docs <- addDocumentsToHyperCorpus nlpServerConfig (Nothing :: Maybe HyperdataCorpus) la corpusId (map (_node_hyperdata . _d_document) documents)
_versioned <- commitCorpus corpusId (RootId $ _auth_node_id loggedInUser)
$(logLocM) INFO $ "Done importing " <> T.pack (show $ length documents) <> " documents for corpus node " <> T.pack (show nodeId)
pure docs
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.API.Server.Named.Remote (
remoteAPI
......@@ -30,22 +31,24 @@ import Gargantext.API.Errors.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Types (NgramsList)
import Gargantext.API.Node.Document.Export (get_document_json)
import Gargantext.API.Node.Document.Export.Types (DocumentExport)
import Gargantext.API.Node.Document.Export.Types
import Gargantext.API.Prelude (IsGargServer)
import Gargantext.API.Routes.Client (remoteImportClient)
import Gargantext.API.Routes.Named.Remote qualified as Named
import Gargantext.Core.Config
import Gargantext.Core (lookupDBid)
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv, HasNodeArchiveStoryImmediateSaver)
import Gargantext.Core.Types.Main
import Gargantext.Core.Worker.Jobs (sendJob)
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Admin.Types.Node hiding (INFO)
import Gargantext.Database.Prelude (IsDBCmd)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError, nodeError, NodeError (..))
import Gargantext.Database.Query.Table.Node (insertNodeWithHyperdata, getNodes, getUserRootPrivateNode)
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Orphans ()
import Gargantext.System.Logging (logLocM, LogLevel(..), MonadLogger)
import GHC.Generics (Generic)
import Prelude
import Servant.Client.Streaming (mkClientEnv, withClientM, ClientError)
......@@ -64,7 +67,7 @@ data ExportableNode =
instance Serialise ExportableNode where
remoteAPI :: (MonadIO m, IsGargServer env BackendInternalError m)
remoteAPI :: (MonadIO m, IsGargServer env BackendInternalError m, HasNodeArchiveStoryImmediateSaver env)
=> AuthenticatedUser
-> Named.RemoteAPI (AsServerT m)
remoteAPI authenticatedUser = Named.RemoteAPI $
......@@ -80,7 +83,10 @@ remoteImportHandler :: forall err env m.
( HasNodeStoryEnv env
, HasNodeError err
, HasBackendInternalError err
, HasNodeArchiveStoryImmediateSaver env
, IsDBCmd env err m
, HasNLPServer env
, MonadLogger m
, MonadIO m)
=> AuthenticatedUser
-> ConduitT () Named.RemoteBinaryData IO ()
......@@ -92,25 +98,33 @@ remoteImportHandler loggedInUser c = do
case deserialiseOrFail @ExpectedPayload (B.toLazyByteString $ mconcat chunks) of
Left err -> throwError $ _BackendInternalError # InternalUnexpectedError (toException $ userError $ "Deserialization error: " ++ show err)
Right (TreeN x xs) -> do
$(logLocM) INFO $ "Importing " <> T.pack (show $ _node_id $ _en_node $ x)
-- NOTE(adn) By default, we append the imported node(s) to the user's
-- private folder.
privateFolderId <- _node_id <$> getUserRootPrivateNode (_auth_user_id loggedInUser)
$(logLocM) INFO $ "Attaching " <> T.pack (show $ _node_id $ _en_node $ x) <> " to private folder " <> T.pack (show privateFolderId)
-- Attempts to insert nodes a we go along.
rootNode <- insertNode (Just privateFolderId) x
foldlM (insertTrees (Just rootNode)) [rootNode] xs
nodes <- foldlM (insertTrees (Just rootNode)) [rootNode] xs
$(logLocM) INFO $ "Successfully imported all the requested nodes."
pure nodes
where
insertNode :: Maybe NodeId -> ExportableNode -> m NodeId
insertNode mb_parent (ExportableNode x _mb_docs mb_terms) = case lookupDBid $ _node_typename x of
insertNode mb_parent (ExportableNode x mb_docs mb_terms) = case lookupDBid $ _node_typename x of
Nothing -> throwError $ _BackendInternalError # InternalUnexpectedError (toException $ userError $ "remoteImportHandler: impossible, node with invalid type.")
Just ty -> do
new_node <- insertNodeWithHyperdata ty (_node_name x) (_node_hyperdata x) mb_parent (_auth_user_id loggedInUser)
$(logLocM) INFO $ "Created a new node " <> T.pack (show $ new_node) <> " of type " <> T.pack (show ty)
for_ mb_docs $ \docsList -> do
$(logLocM) INFO $ "Found document list to import..."
let payload = Jobs.ImportRemoteDocumentsPayload loggedInUser new_node docsList
void $ sendJob $ Jobs.ImportRemoteDocuments payload
for_ mb_terms $ \ngramsList -> do
$(logLocM) INFO $ "Found ngrams list to import..."
void $ sendJob $ Jobs.JSONPost { _jp_list_id = new_node
, _jp_ngrams_list = ngramsList
}
--for_ mb_docs $ \docsList -> do
-- addToCorpusWithForm user corpusId new_with_form (noJobHandle @m Proxy)
, _jp_ngrams_list = ngramsList
}
pure new_node
insertTrees :: Maybe NodeId -> [NodeId] -> Tree ExportableNode -> m [NodeId]
......@@ -118,8 +132,6 @@ remoteImportHandler loggedInUser c = do
childrenRoot <- insertNode currentParent x
(`mappend` acc) <$> foldlM (insertTrees (Just childrenRoot)) [childrenRoot] xs
remoteExportHandler :: ( MonadIO m, Safe.MonadCatch m
, IsGargServer err env m
)
......@@ -142,8 +154,8 @@ makeExportable userNodeId (TreeN x xs)
| Just nty <- lookupDBid (_node_typename x)
= do
mb_docs <- case nty of
NodeDocument -> Just <$> get_document_json userNodeId (_node_id x)
_ -> pure Nothing
NodeTexts -> Just <$> get_document_json userNodeId (_node_id x)
_ -> pure Nothing
mb_ngrams <- case nty of
NodeList -> Just <$> getNgramsList (_node_id x)
_ -> pure Nothing
......
......@@ -30,7 +30,7 @@ import Gargantext.API.Node.Corpus.Annuaire qualified as Annuaire
import Gargantext.API.Node.Contact (addContact)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm, addToCorpusWithQuery)
import Gargantext.API.Node.DocumentsFromWriteNodes (documentsFromWriteNodes)
import Gargantext.API.Node.DocumentUpload (documentUploadAsync)
import Gargantext.API.Node.DocumentUpload (documentUploadAsync, remoteImportDocuments)
import Gargantext.API.Node.FrameCalcUpload (frameCalcUploadAsync)
import Gargantext.API.Node.File (addWithFile)
import Gargantext.API.Node.New (postNode')
......@@ -45,7 +45,7 @@ import Gargantext.Core.Viz.Graph.API (graphRecompute)
import Gargantext.Core.Worker.Broker (initBrokerWithDBCreate)
import Gargantext.Core.Worker.Env
import Gargantext.Core.Worker.PGMQTypes (BrokerMessage, HasWorkerBroker, WState)
import Gargantext.Core.Worker.Jobs.Types (Job(..), getWorkerMNodeId)
import Gargantext.Core.Worker.Jobs.Types (Job(..), getWorkerMNodeId, ImportRemoteDocumentsPayload(..))
import Gargantext.Core.Worker.Types (JobInfo(..))
import Gargantext.Database.Query.Table.User (getUsersWithEmail)
import Gargantext.Prelude hiding (to)
......@@ -296,3 +296,8 @@ performAction env _state bm = do
UploadDocument { .. } -> runWorkerMonad env $ do
$(logLocM) DEBUG $ "[performAction] upload document"
void $ documentUploadAsync _ud_node_id _ud_args jh
-- | Remotely import documents
ImportRemoteDocuments (ImportRemoteDocumentsPayload loggedInUser corpusId docs) -> runWorkerMonad env $ do
$(logLocM) DEBUG $ "[performAction] import remote documents"
void $ remoteImportDocuments loggedInUser corpusId docs
......@@ -9,26 +9,51 @@ Portability : POSIX
-}
{-# LANGUAGE TypeApplications #-}
module Gargantext.Core.Worker.Jobs.Types where
import Data.Aeson ((.:), (.=), object, withObject)
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.Aeson qualified as JS
import Data.Aeson.KeyMap qualified as KM
import Data.Text qualified as T
import Gargantext.API.Admin.Auth.Types (AuthenticatedUser, ForgotPasswordAsyncParams)
import Gargantext.API.Ngrams.Types (NgramsList, UpdateTableNgramsCharts(_utn_list_id))
import Gargantext.API.Node.Corpus.Annuaire (AnnuaireWithForm)
import Gargantext.API.Node.Contact.Types (AddContactParams)
import Gargantext.API.Node.Corpus.Annuaire (AnnuaireWithForm)
import Gargantext.API.Node.Document.Export.Types (DocumentExport)
import Gargantext.API.Node.DocumentsFromWriteNodes.Types qualified as DFWN
import Gargantext.API.Node.DocumentUpload.Types (DocumentUpload)
import Gargantext.API.Node.FrameCalcUpload.Types (FrameCalcUpload)
import Gargantext.API.Node.New.Types ( PostNode(..) )
import Gargantext.API.Node.Update.Types (UpdateNodeParams)
import Gargantext.API.Node.Types (NewWithFile, NewWithForm, WithQuery(..))
import Gargantext.API.Node.Update.Types (UpdateNodeParams)
import Gargantext.Core.Types.Individu (User)
import Gargantext.Database.Admin.Types.Node (AnnuaireId, CorpusId, ListId, NodeId(UnsafeMkNodeId))
import Gargantext.Prelude
data ImportRemoteDocumentsPayload
= ImportRemoteDocumentsPayload
{ _irdp_user :: AuthenticatedUser
, _irdp_corpus_id :: NodeId
, _irdp_document_export :: DocumentExport
} deriving (Show, Eq)
instance ToJSON ImportRemoteDocumentsPayload where
toJSON ImportRemoteDocumentsPayload{..} =
object [ "user" .= _irdp_user
, "corpus_id" .= _irdp_corpus_id
, "document_export" .= _irdp_document_export
]
instance FromJSON ImportRemoteDocumentsPayload where
parseJSON = withObject "ImportRemoteDocumentsPayload" $ \o -> do
_irdp_user <- o .: "user"
_irdp_corpus_id <- o .: "corpus_id"
_irdp_document_export <- o .: "document_export"
pure ImportRemoteDocumentsPayload{..}
data Job =
Ping
......@@ -65,6 +90,7 @@ data Job =
, _un_args :: UpdateNodeParams }
| UploadDocument { _ud_node_id :: NodeId
, _ud_args :: DocumentUpload }
| ImportRemoteDocuments !ImportRemoteDocumentsPayload
deriving (Show, Eq)
instance FromJSON Job where
parseJSON = withObject "Job" $ \o -> do
......@@ -132,6 +158,8 @@ instance FromJSON Job where
_ud_node_id <- o .: "node_id"
_ud_args <- o .: "args"
return $ UploadDocument { .. }
"ImportRemoteDocuments" ->
ImportRemoteDocuments <$> parseJSON (JS.Object o)
s -> prependFailure "parsing job type failed, " (typeMismatch "type" s)
instance ToJSON Job where
toJSON Ping = object [ "type" .= ("Ping" :: Text) ]
......@@ -196,10 +224,12 @@ instance ToJSON Job where
object [ "type" .= ("UploadDocument" :: Text)
, "node_id" .= _ud_node_id
, "args" .= _ud_args ]
toJSON (ImportRemoteDocuments payload) =
case toJSON payload of
(JS.Object o) ->
let o1 = KM.fromList [ ("type", toJSON @T.Text "ImportRemoteDocuments") ]
in JS.Object $ o1 <> o
_ -> errorTrace "impossible, toJSON ImportRemoteDocuments did not return an Object."
-- | We want to have a way to specify 'Maybe NodeId' from given worker
-- parameters. The given 'Maybe CorpusId' is an alternative, when
......@@ -223,3 +253,4 @@ getWorkerMNodeId (PostNodeAsync { _pna_node_id }) = Just _pna_node_id
getWorkerMNodeId (RecomputeGraph { _rg_node_id }) = Just _rg_node_id
getWorkerMNodeId (UpdateNode { _un_node_id }) = Just _un_node_id
getWorkerMNodeId (UploadDocument { _ud_node_id }) = Just _ud_node_id
getWorkerMNodeId (ImportRemoteDocuments (ImportRemoteDocumentsPayload _ corpusId _)) = Just corpusId
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment