[sqlite] first implementation of sqlite export

parent 13457ca8
Pipeline #7332 failed with stages
in 59 minutes and 16 seconds
...@@ -326,6 +326,7 @@ library ...@@ -326,6 +326,7 @@ library
Gargantext.API.Node.Contact Gargantext.API.Node.Contact
Gargantext.API.Node.Corpus.Export Gargantext.API.Node.Corpus.Export
Gargantext.API.Node.Corpus.Export.Types Gargantext.API.Node.Corpus.Export.Types
Gargantext.API.Node.Corpus.Export.Utils
Gargantext.API.Node.Corpus.Searx Gargantext.API.Node.Corpus.Searx
Gargantext.API.Node.Corpus.Subcorpus Gargantext.API.Node.Corpus.Subcorpus
Gargantext.API.Node.Document.Export Gargantext.API.Node.Document.Export
...@@ -603,6 +604,7 @@ library ...@@ -603,6 +604,7 @@ library
, singletons ^>= 3.0.2 , singletons ^>= 3.0.2
, singletons-th >= 3.1 && < 3.2 , singletons-th >= 3.1 && < 3.2
, smtp-mail >= 0.3.0.0 , smtp-mail >= 0.3.0.0
, sqlite-simple >= 0.4.19 && < 0.5
, stemmer == 0.5.2 , stemmer == 0.5.2
, stm >= 2.5.1.0 && < 2.6 , stm >= 2.5.1.0 && < 2.6
, stm-containers >= 1.2.0.3 && < 1.3 , stm-containers >= 1.2.0.3 && < 1.3
......
{-# LANGUAGE TypeOperators #-}
{-| {-|
Module : Gargantext.API.Node.Corpus.Export Module : Gargantext.API.Node.Corpus.Export
Description : Corpus export Description : Corpus export
...@@ -17,27 +16,22 @@ Main exports of Gargantext: ...@@ -17,27 +16,22 @@ Main exports of Gargantext:
module Gargantext.API.Node.Corpus.Export module Gargantext.API.Node.Corpus.Export
where where
import Data.HashMap.Strict qualified as HashMap import Control.Exception.Safe qualified as CES
import Data.List qualified as List import Data.List qualified as List
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Set qualified as Set import Data.Set qualified as Set
import Data.Text (pack) import Data.Text (pack)
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo) import Gargantext.API.Ngrams.Tools (getRepo)
import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) ) import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) )
import Gargantext.API.Node.Corpus.Export.Types ( Corpus(..) ) import Gargantext.API.Node.Corpus.Export.Types ( Corpus(..), CorpusSQLite(..) )
import Gargantext.API.Node.Corpus.Export.Utils (getContextNgrams, mkCorpusSQLite)
import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport
import Gargantext.API.Prelude (IsGargServer) import Gargantext.API.Prelude (IsGargServer)
import Gargantext.Core.NodeStory.Types ( NodeListStory )
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) ) import Gargantext.Core.Types.Main ( ListType(MapTerm) )
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Database.Prelude (DBCmdExtra)
import Gargantext.Database.Query.Table.Node ( defaultList ) import Gargantext.Database.Query.Table.Node ( defaultList )
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes) import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Schema.Context (_context_id) import Gargantext.Database.Schema.Context (_context_id)
import Gargantext.Prelude hiding (hash) import Gargantext.Prelude hiding (hash)
...@@ -48,10 +42,13 @@ import qualified Gargantext.API.Routes.Named.Corpus as Named ...@@ -48,10 +42,13 @@ import qualified Gargantext.API.Routes.Named.Corpus as Named
-------------------------------------------------- --------------------------------------------------
-- | Hashes are ordered by Set -- | Hashes are ordered by Set
getCorpus :: forall env err m. IsGargServer env err m getCorpus :: (CES.MonadMask m, IsGargServer env err m)
=> CorpusId => CorpusId
-> Named.CorpusExportAPI (AsServerT m) -> Named.CorpusExportAPI (AsServerT m)
getCorpus cId = Named.CorpusExportAPI $ \lId nt' -> get_corpus lId nt' getCorpus cId = Named.CorpusExportAPI {
Named.corpusExportEp = get_corpus
, Named.corpusSQLiteEp = getCorpusSQLite cId
}
where where
get_corpus :: IsGargServer env err m get_corpus :: IsGargServer env err m
...@@ -89,23 +86,15 @@ getCorpus cId = Named.CorpusExportAPI $ \lId nt' -> get_corpus lId nt' ...@@ -89,23 +86,15 @@ getCorpus cId = Named.CorpusExportAPI $ \lId nt' -> get_corpus lId nt'
$ Corpus { _c_corpus = Map.elems r $ Corpus { _c_corpus = Map.elems r
, _c_hash = hash $ List.map DocumentExport._d_hash $ Map.elems r } , _c_hash = hash $ List.map DocumentExport._d_hash $ Map.elems r }
getContextNgrams :: HasNodeError err
=> CorpusId
-> ListId
-> ListType
-> NgramsType
-> NodeListStory
-> DBCmdExtra err (Map ContextId (Set NgramsTerm))
getContextNgrams cId lId listType nt repo = do
-- lId <- case lId' of
-- Nothing -> defaultList cId
-- Just l -> pure l
lIds <- selectNodesWithUsername NodeList userMaster getCorpusSQLite :: (CES.MonadMask m, IsGargServer env err m)
let ngs = filterListWithRoot [listType] $ mapTermListRoot [lId] nt repo => CorpusId
-- TODO HashMap -> Maybe ListId
r <- getNgramsByContextOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs) -> m (Headers '[Header "Content-Disposition" Text] CorpusSQLite)
pure r getCorpusSQLite cId lId = do
corpusSQLite <- mkCorpusSQLite cId lId
pure $ addHeader ("attachment; filename=GarganText_corpus-" <> pack (show cId) <> ".sqlite")
$ corpusSQLite
-- TODO -- TODO
-- Exports List -- Exports List
......
...@@ -13,14 +13,13 @@ Portability : POSIX ...@@ -13,14 +13,13 @@ Portability : POSIX
module Gargantext.API.Node.Corpus.Export.Types where module Gargantext.API.Node.Corpus.Export.Types where
import Data.Aeson.TH (deriveJSON) import Data.ByteString.Lazy qualified as BSL
import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, ToParamSchema(..) ) import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema, ToParamSchema(..), NamedSchema(..), binarySchema )
import Data.Text (Text)
import GHC.Generics (Generic)
import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport import Gargantext.API.Node.Document.Export.Types qualified as DocumentExport
import Gargantext.Core.Types ( TODO ) import Gargantext.Core.Types ( TODO )
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Servant import Gargantext.Prelude
import Servant (Accept(..), MimeRender(mimeRender), OctetStream)
-- Corpus Export -- Corpus Export
...@@ -37,3 +36,17 @@ instance ToSchema Corpus where ...@@ -37,3 +36,17 @@ instance ToSchema Corpus where
instance ToParamSchema Corpus where instance ToParamSchema Corpus where
toParamSchema _ = toParamSchema (Proxy :: Proxy TODO) toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
$(deriveJSON (unPrefix "_c_") ''Corpus) $(deriveJSON (unPrefix "_c_") ''Corpus)
newtype CorpusSQLite =
CorpusSQLite { _cs_bs :: BSL.ByteString }
deriving (Generic)
instance Accept CorpusSQLite where
contentType _ = contentType (Proxy :: Proxy OctetStream)
instance MimeRender OctetStream CorpusSQLite where
mimeRender _ (CorpusSQLite bs) = bs
instance ToSchema CorpusSQLite where
declareNamedSchema _ = pure $ NamedSchema (Just "CorpusSQLite") binarySchema
{-|
Module : Gargantext.API.Node.Corpus.Export.Utils
Description : Corpus export
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TypeApplications #-}
module Gargantext.API.Node.Corpus.Export.Utils
where
import Control.Exception.Safe qualified as CES
import Data.Aeson qualified as Aeson
import Data.ByteString.Lazy qualified as BSL
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Version (showVersion)
import Database.SQLite.Simple qualified as S
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) )
import Gargantext.API.Node.Corpus.Export.Types (CorpusSQLite(..))
import Gargantext.Core.NodeStory.Types ( HasNodeStoryEnv, NodeListStory )
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms))
import Gargantext.Core.Types (CorpusId, ListId, NodeType(NodeList))
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.List (HyperdataList)
import Gargantext.Database.Admin.Types.Node (unNodeId, ContextId(_ContextId))
import Gargantext.Database.Prelude (DBCmd, IsDBCmd)
import Gargantext.Database.Schema.Context (context_id, context_name, context_date, context_hyperdata)
import Gargantext.Database.Schema.Node (node_hash_id, node_hyperdata, node_name, node_parent_id)
import Gargantext.Database.Query.Table.Node ( defaultList, getNodeWith )
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
import Gargantext.Prelude
import Paths_gargantext qualified as PG -- cabal magic build module
import System.Directory (removeDirectoryRecursive)
import System.IO.Temp (createTempDirectory, getCanonicalTemporaryDirectory)
getContextNgrams :: HasNodeError err
=> CorpusId
-> ListId
-> ListType
-> NgramsType
-> NodeListStory
-> DBCmd err (Map ContextId (Set NgramsTerm))
getContextNgrams cId lId listType nt repo = do
-- lId <- case lId' of
-- Nothing -> defaultList cId
-- Just l -> pure l
lIds <- selectNodesWithUsername NodeList userMaster
let ngs = filterListWithRoot [listType] $ mapTermListRoot [lId] nt repo
-- TODO HashMap
r <- getNgramsByContextOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
pure r
mkCorpusSQLite :: ( CES.MonadMask m
, HasNodeStoryEnv env
, HasNodeError err
, IsDBCmd env err m )
=> CorpusId
-> Maybe ListId
-> m CorpusSQLite
mkCorpusSQLite cId lId =
CES.bracket setup tearDown $ \(fp, _fname, fpath) -> do
corpus <- getNodeWith cId (Proxy @HyperdataCorpus)
listId <- case lId of
Nothing -> defaultList cId
Just l -> pure l
l <- getNodeWith listId (Proxy @HyperdataList)
repo <- getRepo [listId]
let nt = NgramsTerms
mapNgs <- getContextNgrams cId listId MapTerm nt repo
stopNgs <- getContextNgrams cId listId StopTerm nt repo
candidateNgs <- getContextNgrams cId listId CandidateTerm nt repo
docs <- selectDocNodes cId
liftBase $ putText $ "[mkCorpusSQLite] listId: " <> show listId
liftBase $ putText $ "[mkCorpusSQLite] fp: " <> show fp
liftBase $ S.withConnection fpath $ \conn -> do
-- better performance
-- https://kerkour.com/sqlite-for-servers
S.execute_ conn "PRAGMA journal_mode = WAL"
S.execute_ conn "CREATE TABLE info (key, value);"
S.execute conn "INSERT INTO info (key, value) VALUES ('gargVersion', ?)" (S.Only $ showVersion PG.version)
S.execute conn "INSERT INTO info (key, value) VALUES ('corpusId', ?)" (S.Only $ unNodeId cId)
S.execute conn "INSERT INTO info (key, value) VALUES ('listId', ?)" (S.Only $ unNodeId listId)
S.execute_ conn "INSERT INTO info (key, value) VALUES ('created', datetime())"
S.execute_ conn "CREATE TABLE corpus (id, name, hash, parent_id, hyperdata)"
S.execute conn "INSERT INTO corpus (id, name, hash, parent_id, hyperdata) VALUES (?, ?, ?, ?, ?)"
(unNodeId cId, corpus ^. node_name, corpus ^. node_hash_id, unNodeId <$> (corpus ^. node_parent_id), Aeson.encode (corpus ^. node_hyperdata))
S.execute_ conn "CREATE TABLE lists (id, name, parent_id, hyperdata)"
S.execute conn "INSERT INTO lists (id, name, parent_id, hyperdata) VALUES (?, ?, ?, ?)"
(unNodeId listId, l ^. node_name, unNodeId <$> (l ^. node_parent_id), Aeson.encode (l ^. node_hyperdata))
S.execute_ conn "CREATE TABLE ngrams (context_id, terms, type_)"
let insertTerms ngs type_ = do
let ngs' = concatMap (\(ctxId, ngrams) -> (\n -> (_ContextId ctxId, unNgramsTerm n, type_)) <$> Set.toList ngrams) (Map.toList ngs)
S.executeMany conn "INSERT INTO ngrams (context_id, terms, type_) VALUES (?, ?, ?)" ngs'
insertTerms mapNgs ("map" :: Text)
insertTerms stopNgs ("stop" :: Text)
insertTerms candidateNgs ("candidate" :: Text)
S.execute_ conn "CREATE TABLE documents (context_id, name, date, hyperdata)"
S.executeMany conn "INSERT INTO documents (context_id, name, date, hyperdata) VALUES (?, ?, ?, ?)"
((\c -> (unNodeId (c ^. context_id), c ^. context_name, c ^. context_date, Aeson.encode (c ^. context_hyperdata))) <$> docs)
bsl <- liftBase $ BSL.readFile fpath
pure $ CorpusSQLite bsl
where
setup = do
tmpDir <- liftBase getCanonicalTemporaryDirectory
fp <- liftBase $ createTempDirectory tmpDir "sqlite"
let fname = "gargantext.sqlite"
let fpath = fp <> "/" <> fname
pure (fp, fname, fpath)
tearDown (fp, _fname, _fpath) = do
liftBase $ removeDirectoryRecursive fp
...@@ -25,7 +25,7 @@ import Data.Aeson.TH (deriveJSON) ...@@ -25,7 +25,7 @@ import Data.Aeson.TH (deriveJSON)
import Data.Swagger (ToSchema(..), genericDeclareNamedSchema) import Data.Swagger (ToSchema(..), genericDeclareNamedSchema)
import Data.Text (Text) import Data.Text (Text)
import GHC.Generics import GHC.Generics
import Gargantext.API.Node.Corpus.Export.Types (Corpus) import Gargantext.API.Node.Corpus.Export.Types (Corpus, CorpusSQLite)
import Gargantext.API.Node.Types (NewWithForm, WithQuery) import Gargantext.API.Node.Types (NewWithForm, WithQuery)
import Gargantext.API.Worker (WorkerAPI) import Gargantext.API.Worker (WorkerAPI)
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
...@@ -35,12 +35,16 @@ import Gargantext.Prelude (Bool) ...@@ -35,12 +35,16 @@ import Gargantext.Prelude (Bool)
import Servant import Servant
-------------------------------------------------- --------------------------------------------------
newtype CorpusExportAPI mode = CorpusExportAPI data CorpusExportAPI mode = CorpusExportAPI
{ corpusExportEp :: mode :- Summary "Corpus Export" { corpusExportEp :: mode :- Summary "Corpus Export"
:> "export" :> "export"
:> QueryParam "listId" ListId :> QueryParam "listId" ListId
:> QueryParam "ngramsType" NgramsType :> QueryParam "ngramsType" NgramsType
:> Get '[JSON] (Headers '[Servant.Header "Content-Disposition" Text] Corpus) :> Get '[JSON] (Headers '[Servant.Header "Content-Disposition" Text] Corpus)
, corpusSQLiteEp :: mode :- Summary "Corpus SQLite export"
:> "sqlite"
:> QueryParam "listId" ListId
:> Get '[OctetStream] (Headers '[Servant.Header "Content-Disposition" Text] CorpusSQLite)
} deriving Generic } deriving Generic
newtype AddWithForm mode = AddWithForm newtype AddWithForm mode = AddWithForm
......
...@@ -305,6 +305,7 @@ selectNodesIdWithType nt = proc () -> do ...@@ -305,6 +305,7 @@ selectNodesIdWithType nt = proc () -> do
restrict -< tn .== (sqlInt4 $ toDBid nt) restrict -< tn .== (sqlInt4 $ toDBid nt)
returnA -< _node_id row returnA -< _node_id row
-- | Get node, Hyperdata is 'Aeson.Value'
getNode :: HasNodeError err => NodeId -> DBCmd err (Node Value) getNode :: HasNodeError err => NodeId -> DBCmd err (Node Value)
getNode nId = do getNode nId = do
maybeNode <- headMay <$> runOpaQuery (selectNode (pgNodeId nId)) maybeNode <- headMay <$> runOpaQuery (selectNode (pgNodeId nId))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment