{-| Module : Gargantext.Core.NodeStory Description : NodeStory Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX A Node Story is a Map between NodeId and an Archive (with state, version and history) for that node. Couple of words on how this is implemented. First version used files which stored Archive for each NodeId in a separate .cbor file. For performance reasons, it is rewritten to use the DB. The table `node_stories` contains two columns: `node_id` and `archive`. Next, it was observed that `a_history` in `Archive` takes much space. So a new table was created, `node_story_archive_history` with columns: `node_id`, `ngrams_type_id`, `patch`. This is because each history item is in fact a map from `NgramsType` to `NgramsTablePatch` (see the `NgramsStatePatch'` type). Moreover, since in `G.A.Ngrams.commitStatePatch` we use current state only, with only recent history items, I concluded that it is not necessary to load whole history into memory. Instead, it is kept in DB (history is immutable) and only recent changes are added to `a_history`. Then that record is cleared whenever `Archive` is saved. Please note that TODO: - remove - filter - charger les listes -} {-# LANGUAGE Arrows #-} {-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE QuasiQuotes #-} module Gargantext.Core.NodeStory ( module Gargantext.Core.NodeStory.Types , getNodesArchiveHistory , Archive(..) , nodeExists , getNodesIdWithType , mkNodeStoryEnv , upsertNodeStories -- , getNodeStory , getNodeStory' , nodeStoriesQuery , currentVersion , archiveStateFromList , archiveStateToList , fixNodeStoryVersions , fixChildrenDuplicatedAsParents , getParentsChildren ) where import Control.Lens ((%~), non, _Just, at, over) import Data.Map.Strict qualified as Map import Data.Set qualified as Set import Database.PostgreSQL.Simple qualified as PGS import Database.PostgreSQL.Simple.SqlQQ (sql) import Database.PostgreSQL.Simple.ToField qualified as PGS import Gargantext.API.Ngrams.Types import Gargantext.Core.NodeStory.DB import Gargantext.Core.NodeStory.Types import Gargantext.Core.Text.Ngrams qualified as Ngrams import Gargantext.Database.Admin.Types.Node ( ListId, NodeId(..) ) import Gargantext.Database.Admin.Config () import Gargantext.Database.Prelude import Gargantext.Prelude hiding (to) getNodeStory' :: NodeId -> DBQuery err x ArchiveList getNodeStory' nId = do --res <- withResource pool $ \c -> runSelect c query :: IO [NodeStoryPoly NodeId Version Int Int NgramsRepoElement] res <- mkPGQuery nodeStoriesQuery (PGS.Only $ PGS.toField nId) :: DBQuery err x [(Version, Ngrams.NgramsType, NgramsTerm, NgramsRepoElement)] -- We have multiple rows with same node_id and different (ngrams_type_id, ngrams_id). -- Need to create a map: {<node_id>: {<ngrams_type_id>: {<ngrams_id>: <data>}}} let dbData = map (\(version, ngramsType, ngrams, ngrams_repo_element) -> Archive { _a_version = version , _a_history = [] , _a_state = Map.singleton ngramsType $ Map.singleton ngrams ngrams_repo_element }) res -- NOTE Sanity check: all versions in the DB should be the same -- TODO Maybe redesign the DB so that `node_stories` has only -- `node_id`, `version` and there is a M2M table -- `node_stories_ngrams` without the `version` colum? Then we would -- have `version` in only one place. {- let versionsS = Set.fromList $ map (\a -> a ^. a_version) dbData if Set.size versionsS > 1 then panic $ Text.pack $ "[getNodeStory] versions for " <> show nodeId <> " differ! " <> show versionsS else pure () -} pure $ foldl' combine initArchive dbData where -- NOTE (<>) for Archive doesn't concatenate states, so we have to use `combine` combine a1 a2 = a1 & a_state %~ combineState (a2 ^. a_state) & a_version .~ (a2 ^. a_version) -- version should be updated from list, not taken from the empty Archive getNodeStory :: NodeId -> DBQuery err x NodeListStory getNodeStory nId = do a <- getNodeStory' nId pure $ NodeStory $ Map.singleton nId a -- |Functions to convert archive state (which is a `Map NgramsType -- (Map NgramsTerm NgramsRepoElement`)) to/from a flat list archiveStateToList :: NgramsState' -> ArchiveStateList archiveStateToList s = mconcat $ (\(nt, ntm) -> (\(n, nre) -> (nt, n, nre)) <$> Map.toList ntm) <$> Map.toList s archiveStateFromList :: ArchiveStateList -> NgramsState' archiveStateFromList l = Map.fromListWith (<>) $ (\(nt, t, nre) -> (nt, Map.singleton t nre)) <$> l archiveStateSet :: ArchiveStateList -> ArchiveStateSet archiveStateSet lst = Set.fromList $ (\(nt, term, _) -> (nt, term)) <$> lst archiveStateListFilterFromSet :: ArchiveStateSet -> ArchiveStateList -> ArchiveStateList archiveStateListFilterFromSet set = filter (\(nt, term, _) -> Set.member (nt, term) set) -- | This function inserts whole new node story and archive for given node_id. insertNodeStory :: NodeId -> ArchiveList -> DBUpdate err () insertNodeStory nId a = do insertArchiveStateList nId (a ^. a_version) (archiveStateToList $ a ^. a_state) -- | This function updates the node story and archive for given node_id. updateNodeStory :: NodeId -> ArchiveList -> ArchiveList -> DBUpdate err () updateNodeStory nodeId currentArchive newArchive = do -- STEPS -- 0. We assume we're inside an advisory lock -- 1. Find differences (inserts/updates/deletes) let currentList = archiveStateToList $ currentArchive ^. a_state let newList = archiveStateToList $ newArchive ^. a_state let currentSet = archiveStateSet currentList let newSet = archiveStateSet newList -- printDebug "[updateNodeStory] new - current = " $ Set.difference newSet currentSet let inserts = archiveStateListFilterFromSet (Set.difference newSet currentSet) newList -- printDebug "[updateNodeStory] inserts" inserts -- printDebug "[updateNodeStory] current - new" $ Set.difference currentSet newSet let deletes = archiveStateListFilterFromSet (Set.difference currentSet newSet) currentList -- printDebug "[updateNodeStory] deletes" deletes -- updates are the things that are in new but not in current let commonSet = Set.intersection currentSet newSet let commonNewList = archiveStateListFilterFromSet commonSet newList let commonCurrentList = archiveStateListFilterFromSet commonSet currentList let updates = Set.toList $ Set.difference (Set.fromList commonNewList) (Set.fromList commonCurrentList) -- printDebug "[updateNodeStory] updates" $ Text.unlines $ (Text.pack . show) <$> updates -- 2. Perform inserts/deletes/updates -- printDebug "[updateNodeStory] applying inserts" inserts insertArchiveStateList nodeId (newArchive ^. a_version) inserts --printDebug "[updateNodeStory] insert applied" () --TODO Use currentArchive ^. a_version in delete and report error -- if entries with (node_id, ngrams_type_id, ngrams_id) but -- different version are found. deleteArchiveStateList nodeId deletes --printDebug "[updateNodeStory] delete applied" () updateArchiveStateList nodeId (newArchive ^. a_version) updates --printDebug "[updateNodeStory] update applied" () pure () -- where -- update = Update { uTable = nodeStoryTable -- , uUpdateWith = updateEasy (\(NodeStoryDB { node_id }) -> -- NodeStoryDB { archive = sqlValueJSONB $ Archive { _a_history = emptyHistory -- , ..} -- , .. }) -- , uWhere = (\row -> node_id row .== sqlInt4 nId) -- , uReturning = rCount } upsertNodeStories :: NodeId -> ArchiveList -> DBUpdate err () upsertNodeStories nodeId newArchive = do -- printDebug "[upsertNodeStories] START nId" nId -- printDebug "[upsertNodeStories] locking nId" nId (NodeStory m) <- getNodeStory nodeId case Map.lookup nodeId m of Nothing -> do _ <- insertNodeStory nodeId newArchive pure () Just currentArchive -> do _ <- updateNodeStory nodeId currentArchive newArchive pure () -- 3. Now we need to set versions of all node state to be the same updateNodeStoryVersion nodeId newArchive -- printDebug "[upsertNodeStories] STOP nId" nId -- | Returns a `NodeListStory`, updating the given one for given `NodeId` nodeStoryInc :: NodeListStory -> NodeId -> DBQuery err x NodeListStory nodeStoryInc ns@(NodeStory nls) nId = do case Map.lookup nId nls of Nothing -> do NodeStory nls' <- getNodeStory nId pure $ NodeStory $ Map.unionWith archiveAdvance nls' nls Just _ -> pure ns -- | NgramsRepoElement contains, in particular, `nre_list`, -- `nre_parent` and `nre_children`. We want to make sure that all -- children entries (i.e. ones that have `nre_parent`) have the same -- `list` as their parent entry. fixChildrenInNgrams :: NgramsState' -> NgramsState' fixChildrenInNgrams ns = archiveStateFromList $ nsParents <> nsChildrenFixed where (nsParents, nsChildren) = getParentsChildren ns parentNtMap = Map.fromList $ (\(_nt, t, nre) -> (t, nre ^. nre_list)) <$> nsParents nsChildrenFixed = (\(nt, t, nre) -> ( nt , t , nre & nre_list %~ (\l -> parentNtMap ^. at (nre ^. nre_parent . _Just) . non l) ) ) <$> nsChildren -- | (#281) Sometimes, when we upload a new list, a child can be left -- without a parent. Find such ngrams and set their 'root' and -- 'parent' to 'Nothing'. fixChildrenWithNoParent :: NgramsState' -> NgramsState' fixChildrenWithNoParent ns = archiveStateFromList $ nsParents <> nsChildrenFixed where (nsParents, nsChildren) = getParentsChildren ns parentNtMap = Map.fromList $ (\(_nt, t, nre) -> (t, nre ^. nre_children & mSetToSet)) <$> nsParents nsChildrenFixFunc (nt, t, nre) = ( nt , t , nre { _nre_root = root , _nre_parent = parent } ) where (root, parent) = case parentNtMap ^. at (nre ^. nre_parent . _Just) . _Just . at t of Just _ -> (nre ^. nre_root, nre ^. nre_parent) Nothing -> (Nothing, Nothing) nsChildrenFixed = nsChildrenFixFunc <$> nsChildren -- | Sometimes children can also become parents (e.g. #313). Find such -- | children and remove them from the list. fixChildrenDuplicatedAsParents :: NgramsState' -> NgramsState' fixChildrenDuplicatedAsParents ns = archiveStateFromList $ nsChildren <> nsParentsFixed where (nsParents, nsChildren) = getParentsChildren ns parentNtMap = Map.fromList $ (\(_nt, t, nre) -> (t, nre ^. nre_children & mSetToSet)) <$> nsParents parentsSet = Set.fromList $ Map.keys parentNtMap nsParentsFixed = (\(nt, t, nre) -> ( nt , t , over nre_children (\c -> mSetFromSet $ Set.difference (mSetToSet c) parentsSet) nre ) ) <$> nsParents getParentsChildren :: NgramsState' -> (ArchiveStateList, ArchiveStateList) getParentsChildren ns = (nsParents, nsChildren) where nls = archiveStateToList ns nsParents = filter (\(_nt, _t, nre) -> isNothing $ nre ^. nre_parent) nls nsChildren = filter (\(_nt, _t, nre) -> isJust $ nre ^. nre_parent) nls ------------------------------------ mkNodeStoryEnv :: NodeStoryEnv err mkNodeStoryEnv = do -- tvar <- nodeStoryVar pool Nothing [] let saver_immediate nId a = do -- ns <- atomically $ -- readTVar tvar -- -- fix children so their 'list' is the same as their parents' -- >>= pure . fixChildrenTermTypes -- -- fix children that don't have a parent anymore -- >>= pure . fixChildrenWithNoParent -- >>= writeTVar tvar -- >> readTVar tvar --printDebug "[mkNodeStorySaver] will call writeNodeStories, ns" ns -- writeNodeStories c $ fixChildrenWithNoParent $ fixChildrenTermTypes ns -- |NOTE Fixing a_state is kinda a hack. We shouldn't land -- |with bad state in the first place. upsertNodeStories nId $ a & a_state %~ ( fixChildrenDuplicatedAsParents . fixChildrenInNgrams . fixChildrenWithNoParent ) let archive_saver_immediate nId a = do insertNodeArchiveHistory nId (a ^. a_version) $ reverse $ a ^. a_history pure $ a & a_history .~ [] -- mapM_ (\(nId, a) -> do -- insertNodeArchiveHistory c nId (a ^. a_version) $ reverse $ a ^. a_history -- ) $ Map.toList nls -- pure $ clearHistory ns NodeStoryEnv { _nse_saver = saver_immediate , _nse_archive_saver = archive_saver_immediate , _nse_getter = getNodeStory' , _nse_getter_multi = \nIds -> foldM nodeStoryInc (NodeStory Map.empty) nIds } currentVersion :: ListId -> DBQuery err x Version currentVersion listId = do nls <- getNodeStory listId pure $ nls ^. unNodeStory . at listId . _Just . a_version ----------------------------------------- -- | To be called from the REPL fixNodeStoryVersions :: (HasNodeStory env err m, IsDBCmd env err m) => m () fixNodeStoryVersions = runDBTx $ do nIds <- mkPGQuery [sql| SELECT id FROM nodes WHERE ? |] (PGS.Only True) :: DBQuery err x [PGS.Only Int64] -- printDebug "[fixNodeStoryVersions] nIds" nIds mapM_ (\(PGS.Only nId) -> do -- printDebug "[fixNodeStoryVersions] nId" nId updateVer Ngrams.Authors nId updateVer Ngrams.Institutes nId updateVer Ngrams.Sources nId updateVer Ngrams.NgramsTerms nId ) nIds where maxVerQuery :: PGS.Query maxVerQuery = [sql| SELECT max(version) FROM node_stories WHERE node_id = ? AND ngrams_type_id = ? |] updateVerQuery :: PGS.Query updateVerQuery = [sql| UPDATE node_stories SET version = ? WHERE node_id = ? AND ngrams_type_id = ? |] updateVer :: Ngrams.NgramsType -> Int64 -> DBUpdate err () updateVer ngramsType nId = do maxVer <- mkPGQuery maxVerQuery (nId, ngramsType) :: DBUpdate err [PGS.Only (Maybe Int64)] case maxVer of [] -> pure () [PGS.Only Nothing] -> pure () [PGS.Only (Just maxVersion)] -> do void $ mkPGUpdate updateVerQuery (maxVersion, nId, ngramsType) _ -> panicTrace "Should get only 1 result!" ----------------------------------------- -- DEPRECATED -- nodeStoryVar :: Pool PGS.Connection -- -> Maybe (TVar NodeListStory) -- -> [NodeId] -- -> IO (TVar NodeListStory) -- nodeStoryVar pool Nothing nIds = do -- state' <- withResource pool $ \c -> nodeStoryIncrementalRead c Nothing nIds -- atomically $ newTVar state' -- nodeStoryVar pool (Just tv) nIds = do -- nls <- atomically $ readTVar tv -- nls' <- withResource pool -- $ \c -> nodeStoryIncrementalRead c (Just nls) nIds -- _ <- atomically $ writeTVar tv nls' -- pure tv -- clearHistory :: NodeListStory -> NodeListStory -- clearHistory (NodeStory ns) = NodeStory $ ns & (traverse . a_history) .~ emptyHistory -- where -- emptyHistory = [] :: [NgramsStatePatch'] -- fixChildrenWithNoParent :: NodeListStory -> NodeListStory -- fixChildrenWithNoParent (NodeStory nls) = -- NodeStory $ Map.fromList [ (nId, a & a_state %~ fixChildrenWithNoParentStatePatch) -- | (nId, a) <- Map.toList nls ] -- fixChildrenTermTypes :: NodeListStory -> NodeListStory -- fixChildrenTermTypes (NodeStory nls) = -- NodeStory $ Map.fromList [ (nId, a & a_state %~ fixChildrenInNgramsStatePatch) -- | (nId, a) <- Map.toList nls ] -- nodeStoryIncrementalRead :: PGS.Connection -> Maybe NodeListStory -> [NodeId] -> IO NodeListStory -- nodeStoryIncrementalRead _ Nothing [] = pure $ NodeStory Map.empty -- nodeStoryIncrementalRead c Nothing (ni:ns) = do -- m <- getNodeStory c ni -- nodeStoryIncrementalRead c (Just m) ns -- nodeStoryIncrementalRead c (Just nls) ns = foldM (\m n -> nodeStoryInc c m n) nls ns -- nodeStoryDec :: Pool PGS.Connection -> NodeListStory -> NodeId -> IO NodeListStory -- nodeStoryDec pool ns@(NodeStory nls) ni = do -- case Map.lookup ni nls of -- Nothing -> do -- _ <- nodeStoryRemove pool ni -- pure ns -- Just _ -> do -- let ns' = Map.filterWithKey (\k _v -> k /= ni) nls -- _ <- nodeStoryRemove pool ni -- pure $ NodeStory ns' ------------------------------------ -- writeNodeStories :: PGS.Connection -> NodeListStory -> IO () -- writeNodeStories c (NodeStory nls) = do -- mapM_ (\(nId, a) -> upsertNodeStories c nId a) $ Map.toList nls -- nodeStoryRemove :: Pool PGS.Connection -> NodeId -> IO Int64 -- nodeStoryRemove pool (NodeId nId) = withResource pool $ \c -> runDelete c delete -- where -- delete = Delete { dTable = nodeStoryTable -- , dWhere = (\row -> node_id row .== sqlInt4 nId) -- , dReturning = rCount }