[sqlite] implement ngrams reading for CorpusSQLiteData

parent 625d6bbe
...@@ -19,12 +19,12 @@ import Data.ByteString.Lazy qualified as BSL ...@@ -19,12 +19,12 @@ import Data.ByteString.Lazy qualified as BSL
import Data.HashMap.Strict qualified as HashMap import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Set qualified as Set import Data.Set qualified as Set
import Data.Time.Clock (getCurrentTime) import Data.Time.Clock (getCurrentTime, UTCTime)
import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show) import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show)
import Data.Version (parseVersion, showVersion) import Data.Version (parseVersion, showVersion)
import Database.SQLite.Simple qualified as S import Database.SQLite.Simple qualified as S
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo) import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
import Gargantext.API.Ngrams.Types ( NgramsTerm(unNgramsTerm) ) import Gargantext.API.Ngrams.Types ( NgramsTerm(..) )
import Gargantext.API.Node.Corpus.Export.Types (CorpusSQLite(..), CorpusSQLiteData(..)) import Gargantext.API.Node.Corpus.Export.Types (CorpusSQLite(..), CorpusSQLiteData(..))
import Gargantext.Core.NodeStory.Types ( HasNodeStoryEnv, NodeListStory ) import Gargantext.Core.NodeStory.Types ( HasNodeStoryEnv, NodeListStory )
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms)) import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms))
...@@ -33,8 +33,9 @@ import Gargantext.Core.Types.Main (ListType(..)) ...@@ -33,8 +33,9 @@ import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser) import Gargantext.Database.Action.Metrics.NgramsByContext (getNgramsByContextOnlyUser)
import Gargantext.Database.Admin.Config (userMaster) import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus) import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument)
import Gargantext.Database.Admin.Types.Hyperdata.List (HyperdataList) import Gargantext.Database.Admin.Types.Hyperdata.List (HyperdataList)
import Gargantext.Database.Admin.Types.Node (unNodeId, ContextId(_ContextId), NodeId(UnsafeMkNodeId)) import Gargantext.Database.Admin.Types.Node (unNodeId, ContextId(..), NodeId(UnsafeMkNodeId))
import Gargantext.Database.Prelude (DBCmd, IsDBCmd) import Gargantext.Database.Prelude (DBCmd, IsDBCmd)
import Gargantext.Database.Schema.Context (context_id, context_name, context_date, context_hyperdata) import Gargantext.Database.Schema.Context (context_id, context_name, context_date, context_hyperdata)
import Gargantext.Database.Schema.Node (node_hash_id, node_hyperdata, node_name, node_parent_id) import Gargantext.Database.Schema.Node (node_hash_id, node_hyperdata, node_name, node_parent_id)
...@@ -178,7 +179,7 @@ readCorpusSQLite :: ( CES.MonadMask m ...@@ -178,7 +179,7 @@ readCorpusSQLite :: ( CES.MonadMask m
=> CorpusSQLite => CorpusSQLite
-> m (Either Text CorpusSQLiteData) -> m (Either Text CorpusSQLiteData)
readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname, fpath) -> liftBase $ do readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname, fpath) -> liftBase $ do
(info, corpusData, listData, documents) <- S.withConnection fpath $ \conn -> do (info, corpusData, listData, documents, ngrams) <- S.withConnection fpath $ \conn -> do
[S.Only version] <- S.query_ conn "SELECT value FROM info WHERE key = 'gargVersion'" [S.Only version] <- S.query_ conn "SELECT value FROM info WHERE key = 'gargVersion'"
[S.Only cId] <- S.query_ conn "SELECT value FROM info WHERE key = 'corpusId'" [S.Only cId] <- S.query_ conn "SELECT value FROM info WHERE key = 'corpusId'"
[S.Only lId] <- S.query_ conn "SELECT value FROM info WHERE key = 'listId'" [S.Only lId] <- S.query_ conn "SELECT value FROM info WHERE key = 'listId'"
...@@ -187,28 +188,27 @@ readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname, ...@@ -187,28 +188,27 @@ readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname,
[corpusData] <- S.query_ conn "SELECT name, hash, parent_id, hyperdata FROM corpus" [corpusData] <- S.query_ conn "SELECT name, hash, parent_id, hyperdata FROM corpus"
[listData] <- S.query_ conn "SELECT name, parent_id, hyperdata FROM lists" [listData] <- S.query_ conn "SELECT name, parent_id, hyperdata FROM lists"
-- [ngrams] <- S.query_ conn "SELECT context_id, terms, type_ FROM ngrams"
documents <- S.query_ conn "SELECT context_id, name, date, hyperdata FROM documents" documents <- S.query_ conn "SELECT context_id, name, date, hyperdata FROM documents"
ngrams <- S.query_ conn "SELECT context_id, terms, type_ FROM ngrams"
pure (info, corpusData, listData, documents) pure (info, corpusData, listData, documents, ngrams)
let (version, cId, lId, created) = info let (version, cId, lId, created) = info
let (_csd_corpus_name, _csd_corpus_hash, corpusParent, corpusHyperdata) = corpusData let (_csd_corpus_name, _csd_corpus_hash, corpusParent, corpusHyperdata) = corpusData
let (_csd_list_name, listParent, listHyperdata) = listData let (_csd_list_name, listParent, listHyperdata) = listData
let parseCtx (ctxId, name, date, hd) =
case ( iso8601ParseM date, Aeson.decode hd ) of
( Just d, Just h ) -> Right ( UnsafeMkNodeId ctxId, name, d, h )
_ -> Left ("Context " <> show ctxId <> " parse error" :: Text)
let (context_errors, _csd_contexts) = partitionEithers (parseCtx <$> documents) let (context_errors, _csd_contexts) = partitionEithers (parseCtx <$> documents)
let (ngrams_errors, ngrams_) = partitionEithers (parseNgrams <$> ngrams)
-- TODO pure $ do in Either monad
case ( readP_to_S parseVersion version case ( readP_to_S parseVersion version
, iso8601ParseM created , iso8601ParseM created
, Aeson.decode corpusHyperdata , Aeson.decode corpusHyperdata
, Aeson.decode listHyperdata , Aeson.decode listHyperdata
, context_errors ) of , context_errors
([(_csd_version, _)], Just _csd_created, Just _csd_corpus_hyperdata, Just _csd_list_hyperdata, []) -> do , ngrams_errors ) of
([(_csd_version, _)], Just _csd_created, Just _csd_corpus_hyperdata, Just _csd_list_hyperdata, [], []) -> do
let _csd_cId = UnsafeMkNodeId cId let _csd_cId = UnsafeMkNodeId cId
let _csd_lId = UnsafeMkNodeId lId let _csd_lId = UnsafeMkNodeId lId
...@@ -216,14 +216,35 @@ readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname, ...@@ -216,14 +216,35 @@ readCorpusSQLite (CorpusSQLite { _cs_bs }) = withTempSQLiteDir $ \(_fp, _fname,
let _csd_list_parent = UnsafeMkNodeId <$> listParent let _csd_list_parent = UnsafeMkNodeId <$> listParent
-- TODO let _csd_map_context_ngrams = filterNgrams MapTerm ngrams_
let _csd_map_context_ngrams = Map.empty let _csd_stop_context_ngrams = filterNgrams StopTerm ngrams_
let _csd_stop_context_ngrams = Map.empty let _csd_candidate_context_ngrams = filterNgrams CandidateTerm ngrams_
let _csd_candidate_context_ngrams = Map.empty
pure $ Right $ CorpusSQLiteData { .. } pure $ Right $ CorpusSQLiteData { .. }
_ -> pure $ Left "Parse error" _ -> pure $ Left "Parse error"
where
parseCtx :: (Int, Text, Prelude.String, BSL.ByteString)
-> Either Text (NodeId, Text, UTCTime, HyperdataDocument)
parseCtx (ctxId, name, date, hd) =
case ( iso8601ParseM date, Aeson.decode hd ) of
( Just d, Just h ) -> Right ( UnsafeMkNodeId ctxId, name, d, h )
_ -> Left ("Context " <> show ctxId <> " parse error" :: Text)
parseNgrams :: (Int, Text, Text) -> Either Text (ListType, (ContextId, NgramsTerm))
parseNgrams (ctxId, term, type_) =
case type_ of
"map" -> Right ( MapTerm, ( UnsafeMkContextId ctxId, NgramsTerm term ) )
"stop" -> Right ( StopTerm, ( UnsafeMkContextId ctxId, NgramsTerm term ) )
"candidate" -> Right ( CandidateTerm, ( UnsafeMkContextId ctxId, NgramsTerm term ) )
_ -> Left ("Unknown term " <> term)
filterNgrams :: ListType -> [(ListType, (ContextId, NgramsTerm))] -> Map ContextId (Set NgramsTerm)
filterNgrams lt ngrams_ = Map.fromListWith (<>) $
map (\(_, (ctxId, term)) -> (ctxId, Set.singleton term))
(filter (\(lt_, _) -> lt == lt_) ngrams_)
withTempSQLiteDir :: (CES.MonadMask m, MonadBase IO m) withTempSQLiteDir :: (CES.MonadMask m, MonadBase IO m)
=> ((FilePath, Prelude.String, FilePath) -> m a) => ((FilePath, Prelude.String, FilePath) -> m a)
-> m a -> m a
......
...@@ -65,6 +65,9 @@ tests = sequential $ around withTestDBAndPort $ beforeWith dbEnvSetup $ do ...@@ -65,6 +65,9 @@ tests = sequential $ around withTestDBAndPort $ beforeWith dbEnvSetup $ do
_csd_cId `shouldBe` corpusId _csd_cId `shouldBe` corpusId
_csd_lId `shouldBe` aliceListId _csd_lId `shouldBe` aliceListId
length _csd_contexts `shouldBe` 2 length _csd_contexts `shouldBe` 2
length _csd_map_context_ngrams `shouldBe` 0
length _csd_stop_context_ngrams `shouldBe` 0
length _csd_candidate_context_ngrams `shouldBe` 0
describe "GET /api/v1.0/corpus/cId/sqlite" $ do describe "GET /api/v1.0/corpus/cId/sqlite" $ do
it "returns correct SQLite db" $ \ctx -> do it "returns correct SQLite db" $ \ctx -> do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment