[sqlite] first implementation of sqlite export

parent 13457ca8
Pipeline #7332 failed with stages
in 59 minutes and 16 seconds
......@@ -326,6 +326,7 @@ library
Gargantext.API.Node.Contact
Gargantext.API.Node.Corpus.Export
Gargantext.API.Node.Corpus.Export.Types
Gargantext.API.Node.Corpus.Export.Utils
Gargantext.API.Node.Corpus.Searx
Gargantext.API.Node.Corpus.Subcorpus
Gargantext.API.Node.Document.Export
......@@ -603,6 +604,7 @@ library
, singletons ^>= 3.0.2
, singletons-th >= 3.1 && < 3.2
, smtp-mail >= 0.3.0.0
, sqlite-simple >= 0.4.19 && < 0.5
, stemmer == 0.5.2
, stm >= 2.5.1.0 && < 2.6
, stm-containers >= 1.2.0.3 && < 1.3
......
{-# LANGUAGE TypeOperators #-}
{-|
Module : Gargantext.API.Node.Corpus.Export
Description : Corpus export
......@@ -17,27 +16,22 @@ Main exports of Gargantext:
module Gargantext.API.Node.Corpus.Export
where
import Data.HashMap.Strict qualified as HashMap
import Control.Exception.Safe qualified as CES
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text (pack)
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
import Gargantext.API.Ngrams.Tools (getRepo)
import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) )
import Gargantext.API.Node.Corpus.Export.Types ( Corpus(..) )
import Gargantext.API.Node.Corpus.Export.Types ( Corpus(..), CorpusSQLite(..) )
import Gargantext.API.Node.Corpus.Export.Utils (getContextNgrams, mkCorpusSQLite)
import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport
import Gargantext.API.Prelude (IsGargServer)
import Gargantext.Core.NodeStory.Types ( NodeListStory )
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Database.Prelude (DBCmdExtra)
import Gargantext.Database.Query.Table.Node ( defaultList )
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Schema.Context (_context_id)
import Gargantext.Prelude hiding (hash)
......@@ -48,10 +42,13 @@ import qualified Gargantext.API.Routes.Named.Corpus as Named
--------------------------------------------------
-- | Hashes are ordered by Set
getCorpus :: forall env err m. IsGargServer env err m
getCorpus :: (CES.MonadMask m, IsGargServer env err m)
=> CorpusId
-> Named.CorpusExportAPI (AsServerT m)
getCorpus cId = Named.CorpusExportAPI $ \lId nt' -> get_corpus lId nt'
getCorpus cId = Named.CorpusExportAPI {
Named.corpusExportEp = get_corpus
, Named.corpusSQLiteEp = getCorpusSQLite cId
}
where
get_corpus :: IsGargServer env err m
......@@ -89,23 +86,15 @@ getCorpus cId = Named.CorpusExportAPI $ \lId nt' -> get_corpus lId nt'
$ Corpus { _c_corpus = Map.elems r
, _c_hash = hash $ List.map DocumentExport._d_hash $ Map.elems r }
getContextNgrams :: HasNodeError err
=> CorpusId
-> ListId
-> ListType
-> NgramsType
-> NodeListStory
-> DBCmdExtra err (Map ContextId (Set NgramsTerm))
getContextNgrams cId lId listType nt repo = do
-- lId <- case lId' of
-- Nothing -> defaultList cId
-- Just l -> pure l
lIds <- selectNodesWithUsername NodeList userMaster
let ngs = filterListWithRoot [listType] $ mapTermListRoot [lId] nt repo
-- TODO HashMap
r <- getNgramsByContextOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
pure r
getCorpusSQLite :: (CES.MonadMask m, IsGargServer env err m)
=> CorpusId
-> Maybe ListId
-> m (Headers '[Header "Content-Disposition" Text] CorpusSQLite)
getCorpusSQLite cId lId = do
corpusSQLite <- mkCorpusSQLite cId lId
pure $ addHeader ("attachment; filename=GarganText_corpus-" <> pack (show cId) <> ".sqlite")
$ corpusSQLite
-- TODO
-- Exports List
......
......@@ -13,14 +13,13 @@ Portability : POSIX
module Gargantext.API.Node.Corpus.Export.Types where
import Data.Aeson.TH (deriveJSON)
import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, ToParamSchema(..) )
import Data.Text (Text)
import GHC.Generics (Generic)
import Data.ByteString.Lazy qualified as BSL
import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, ToParamSchema(..), NamedSchema(..), binarySchema )
import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport
import Gargantext.Core.Types ( TODO )
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Servant
import Gargantext.Prelude
import Servant (Accept(..), MimeRender(mimeRender), OctetStream)
-- Corpus Export
......@@ -37,3 +36,17 @@ instance ToSchema Corpus where
instance ToParamSchema Corpus where
toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
$(deriveJSON (unPrefix "_c_") ''Corpus)
newtype CorpusSQLite =
CorpusSQLite { _cs_bs :: BSL.ByteString }
deriving (Generic)
instance Accept CorpusSQLite where
contentType _ = contentType (Proxy :: Proxy OctetStream)
instance MimeRender OctetStream CorpusSQLite where
mimeRender _ (CorpusSQLite bs) = bs
instance ToSchema CorpusSQLite where
declareNamedSchema _ = pure $ NamedSchema (Just "CorpusSQLite") binarySchema
{-|
Module : Gargantext.API.Node.Corpus.Export.Utils
Description : Corpus export
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TypeApplications #-}
module Gargantext.API.Node.Corpus.Export.Utils
where
import Control.Exception.Safe qualified as CES
import Data.Aeson qualified as Aeson
import Data.ByteString.Lazy qualified as BSL
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Version (showVersion)
import Database.SQLite.Simple qualified as S
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) )
import Gargantext.API.Node.Corpus.Export.Types (CorpusSQLite(..))
import Gargantext.Core.NodeStory.Types ( HasNodeStoryEnv, NodeListStory )
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms))
import Gargantext.Core.Types (CorpusId, ListId, NodeType(NodeList))
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.List (HyperdataList)
import Gargantext.Database.Admin.Types.Node (unNodeId, ContextId(_ContextId))
import Gargantext.Database.Prelude (DBCmd, IsDBCmd)
import Gargantext.Database.Schema.Context (context_id, context_name, context_date, context_hyperdata)
import Gargantext.Database.Schema.Node (node_hash_id, node_hyperdata, node_name, node_parent_id)
import Gargantext.Database.Query.Table.Node ( defaultList, getNodeWith )
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
import Gargantext.Prelude
import Paths_gargantext qualified as PG -- cabal magic build module
import System.Directory (removeDirectoryRecursive)
import System.IO.Temp (createTempDirectory, getCanonicalTemporaryDirectory)
getContextNgrams :: HasNodeError err
=> CorpusId
-> ListId
-> ListType
-> NgramsType
-> NodeListStory
-> DBCmd err (Map ContextId (Set NgramsTerm))
getContextNgrams cId lId listType nt repo = do
-- lId <- case lId' of
-- Nothing -> defaultList cId
-- Just l -> pure l
lIds <- selectNodesWithUsername NodeList userMaster
let ngs = filterListWithRoot [listType] $ mapTermListRoot [lId] nt repo
-- TODO HashMap
r <- getNgramsByContextOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
pure r
mkCorpusSQLite :: ( CES.MonadMask m
, HasNodeStoryEnv env
, HasNodeError err
, IsDBCmd env err m )
=> CorpusId
-> Maybe ListId
-> m CorpusSQLite
mkCorpusSQLite cId lId =
CES.bracket setup tearDown $ \(fp, _fname, fpath) -> do
corpus <- getNodeWith cId (Proxy @HyperdataCorpus)
listId <- case lId of
Nothing -> defaultList cId
Just l -> pure l
l <- getNodeWith listId (Proxy @HyperdataList)
repo <- getRepo [listId]
let nt = NgramsTerms
mapNgs <- getContextNgrams cId listId MapTerm nt repo
stopNgs <- getContextNgrams cId listId StopTerm nt repo
candidateNgs <- getContextNgrams cId listId CandidateTerm nt repo
docs <- selectDocNodes cId
liftBase $ putText $ "[mkCorpusSQLite] listId: " <> show listId
liftBase $ putText $ "[mkCorpusSQLite] fp: " <> show fp
liftBase $ S.withConnection fpath $ \conn -> do
-- better performance
-- https://kerkour.com/sqlite-for-servers
S.execute_ conn "PRAGMA journal_mode = WAL"
S.execute_ conn "CREATE TABLE info (key, value);"
S.execute conn "INSERT INTO info (key, value) VALUES ('gargVersion', ?)" (S.Only $ showVersion PG.version)
S.execute conn "INSERT INTO info (key, value) VALUES ('corpusId', ?)" (S.Only $ unNodeId cId)
S.execute conn "INSERT INTO info (key, value) VALUES ('listId', ?)" (S.Only $ unNodeId listId)
S.execute_ conn "INSERT INTO info (key, value) VALUES ('created', datetime())"
S.execute_ conn "CREATE TABLE corpus (id, name, hash, parent_id, hyperdata)"
S.execute conn "INSERT INTO corpus (id, name, hash, parent_id, hyperdata) VALUES (?, ?, ?, ?, ?)"
(unNodeId cId, corpus ^. node_name, corpus ^. node_hash_id, unNodeId <$> (corpus ^. node_parent_id), Aeson.encode (corpus ^. node_hyperdata))
S.execute_ conn "CREATE TABLE lists (id, name, parent_id, hyperdata)"
S.execute conn "INSERT INTO lists (id, name, parent_id, hyperdata) VALUES (?, ?, ?, ?)"
(unNodeId listId, l ^. node_name, unNodeId <$> (l ^. node_parent_id), Aeson.encode (l ^. node_hyperdata))
S.execute_ conn "CREATE TABLE ngrams (context_id, terms, type_)"
let insertTerms ngs type_ = do
let ngs' = concatMap (\(ctxId, ngrams) -> (\n -> (_ContextId ctxId, unNgramsTerm n, type_)) <$> Set.toList ngrams) (Map.toList ngs)
S.executeMany conn "INSERT INTO ngrams (context_id, terms, type_) VALUES (?, ?, ?)" ngs'
insertTerms mapNgs ("map" :: Text)
insertTerms stopNgs ("stop" :: Text)
insertTerms candidateNgs ("candidate" :: Text)
S.execute_ conn "CREATE TABLE documents (context_id, name, date, hyperdata)"
S.executeMany conn "INSERT INTO documents (context_id, name, date, hyperdata) VALUES (?, ?, ?, ?)"
((\c -> (unNodeId (c ^. context_id), c ^. context_name, c ^. context_date, Aeson.encode (c ^. context_hyperdata))) <$> docs)
bsl <- liftBase $ BSL.readFile fpath
pure $ CorpusSQLite bsl
where
setup = do
tmpDir <- liftBase getCanonicalTemporaryDirectory
fp <- liftBase $ createTempDirectory tmpDir "sqlite"
let fname = "gargantext.sqlite"
let fpath = fp <> "/" <> fname
pure (fp, fname, fpath)
tearDown (fp, _fname, _fpath) = do
liftBase $ removeDirectoryRecursive fp
......@@ -25,7 +25,7 @@ import Data.Aeson.TH (deriveJSON)
import Data.Swagger (ToSchema(..), genericDeclareNamedSchema)
import Data.Text (Text)
import GHC.Generics
import Gargantext.API.Node.Corpus.Export.Types (Corpus)
import Gargantext.API.Node.Corpus.Export.Types (Corpus, CorpusSQLite)
import Gargantext.API.Node.Types (NewWithForm, WithQuery)
import Gargantext.API.Worker (WorkerAPI)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
......@@ -35,12 +35,16 @@ import Gargantext.Prelude (Bool)
import Servant
--------------------------------------------------
newtype CorpusExportAPI mode = CorpusExportAPI
data CorpusExportAPI mode = CorpusExportAPI
{ corpusExportEp :: mode :- Summary "Corpus Export"
:> "export"
:> QueryParam "listId" ListId
:> QueryParam "ngramsType" NgramsType
:> Get '[JSON] (Headers '[Servant.Header "Content-Disposition" Text] Corpus)
, corpusSQLiteEp :: mode :- Summary "Corpus SQLite export"
:> "sqlite"
:> QueryParam "listId" ListId
:> Get '[OctetStream] (Headers '[Servant.Header "Content-Disposition" Text] CorpusSQLite)
} deriving Generic
newtype AddWithForm mode = AddWithForm
......
......@@ -43,27 +43,27 @@ serverPrivateGargAPI'
:: AuthenticatedUser -> Named.GargPrivateAPI' (AsServerT (GargM Env BackendInternalError))
serverPrivateGargAPI' authenticatedUser@(AuthenticatedUser userNodeId userId)
= Named.GargPrivateAPI'
{ gargAdminAPI = serverGargAdminAPI
, nodeEp = nodeAPI authenticatedUser
, contextEp = contextAPI (Proxy :: Proxy HyperdataAny) authenticatedUser
, corpusNodeAPI = corpusNodeAPI authenticatedUser
, corpusNodeNodeAPI = nodeNodeAPI (Proxy :: Proxy HyperdataAny) authenticatedUser
, corpusExportAPI = CorpusExport.getCorpus
, annuaireEp = annuaireNodeAPI authenticatedUser
, contactAPI = contactAPI authenticatedUser
, tableNgramsAPI = apiNgramsTableDoc authenticatedUser
, phyloExportAPI = PhyloExport.api userNodeId
, documentExportAPI = documentExportAPI userNodeId
, countAPI = Count.countAPI
, graphAPI = Viz.graphAPI authenticatedUser userId
, treeAPI = Tree.treeAPI authenticatedUser
, treeFlatAPI = Tree.treeFlatAPI authenticatedUser
, membersAPI = members
, addWithFormAPI = addCorpusWithForm (RootId userNodeId)
, addWithQueryEp = addCorpusWithQuery (RootId userNodeId)
, makeSubcorpusAPI = Subcorpus.makeSubcorpus userId
, listGetAPI = List.getAPI
, listJsonAPI = List.jsonAPI
, listTsvAPI = List.tsvAPI
, shareUrlAPI = shareURL
{ gargAdminAPI = serverGargAdminAPI
, nodeEp = nodeAPI authenticatedUser
, contextEp = contextAPI (Proxy :: Proxy HyperdataAny) authenticatedUser
, corpusNodeAPI = corpusNodeAPI authenticatedUser
, corpusNodeNodeAPI = nodeNodeAPI (Proxy :: Proxy HyperdataAny) authenticatedUser
, corpusExportAPI = CorpusExport.getCorpus
, annuaireEp = annuaireNodeAPI authenticatedUser
, contactAPI = contactAPI authenticatedUser
, tableNgramsAPI = apiNgramsTableDoc authenticatedUser
, phyloExportAPI = PhyloExport.api userNodeId
, documentExportAPI = documentExportAPI userNodeId
, countAPI = Count.countAPI
, graphAPI = Viz.graphAPI authenticatedUser userId
, treeAPI = Tree.treeAPI authenticatedUser
, treeFlatAPI = Tree.treeFlatAPI authenticatedUser
, membersAPI = members
, addWithFormAPI = addCorpusWithForm (RootId userNodeId)
, addWithQueryEp = addCorpusWithQuery (RootId userNodeId)
, makeSubcorpusAPI = Subcorpus.makeSubcorpus userId
, listGetAPI = List.getAPI
, listJsonAPI = List.jsonAPI
, listTsvAPI = List.tsvAPI
, shareUrlAPI = shareURL
}
......@@ -305,6 +305,7 @@ selectNodesIdWithType nt = proc () -> do
restrict -< tn .== (sqlInt4 $ toDBid nt)
returnA -< _node_id row
-- | Get node, Hyperdata is 'Aeson.Value'
getNode :: HasNodeError err => NodeId -> DBCmd err (Node Value)
getNode nId = do
maybeNode <- headMay <$> runOpaQuery (selectNode (pgNodeId nId))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment