Commit 0426c9f1 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli Committed by Alfredo Di Napoli

Rewrite buildForest in terms of graphs (and strongly-connected

components)

This commit rewrites the logic behind `buildForest` to build first a
graph, discovering loops in there (via the strongly-connected components
machinery) and finally converting the DAG (freshly-made so) into a
Forest.
parent 4739268a
Pipeline #7945 failed with stages
in 43 minutes and 33 seconds
...@@ -44,7 +44,7 @@ TODO: ...@@ -44,7 +44,7 @@ TODO:
{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-} {-# LANGUAGE TupleSections #-}
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
module Gargantext.Core.NodeStory module Gargantext.Core.NodeStory
( module Gargantext.Core.NodeStory.Types ( module Gargantext.Core.NodeStory.Types
...@@ -72,16 +72,17 @@ module Gargantext.Core.NodeStory ...@@ -72,16 +72,17 @@ module Gargantext.Core.NodeStory
, pruneForest , pruneForest
) where ) where
import Control.Lens ((%~), non, _Just, at, over, Lens', (#), to) import Control.Lens ((%~), non, _Just, at, over, Lens', (#), view)
import Control.Monad.State.Strict (modify') import Data.Graph
import Database.PostgreSQL.Simple qualified as PGS import Data.Graph qualified as G
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.ToField qualified as PGS
import Data.List qualified as L import Data.List qualified as L
import Data.ListZipper import Data.ListZipper
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Set qualified as Set import Data.Set qualified as Set
import Data.Tree import Data.Tree
import Database.PostgreSQL.Simple qualified as PGS
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.ToField qualified as PGS
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.Core.NodeStory.DB import Gargantext.Core.NodeStory.DB
import Gargantext.Core.NodeStory.Types import Gargantext.Core.NodeStory.Types
...@@ -109,6 +110,15 @@ instance HasNgramParent NgramsRepoElement where ...@@ -109,6 +110,15 @@ instance HasNgramParent NgramsRepoElement where
instance HasNgramParent NgramsElement where instance HasNgramParent NgramsElement where
ngramsElementParent = ne_parent ngramsElementParent = ne_parent
class HasNgramRoot e where
ngramsElementRoot :: Lens' e (Maybe NgramsTerm)
instance HasNgramRoot NgramsRepoElement where
ngramsElementRoot = nre_root
instance HasNgramRoot NgramsElement where
ngramsElementRoot = ne_root
-- | A 'Forest' (i.e. a list of trees) that models a hierarchy of ngrams terms, possibly grouped in -- | A 'Forest' (i.e. a list of trees) that models a hierarchy of ngrams terms, possibly grouped in
-- a nested fashion, all wrapped in a 'Zipper'. Why using a 'Zipper'? Because when traversing the -- a nested fashion, all wrapped in a 'Zipper'. Why using a 'Zipper'? Because when traversing the
-- forest (for example to fix the children in case of dangling imports) we need sometimes to search -- forest (for example to fix the children in case of dangling imports) we need sometimes to search
...@@ -119,6 +129,18 @@ type ArchiveStateForest = ListZipper (Tree (NgramsTerm, NgramsRepoElement)) ...@@ -119,6 +129,18 @@ type ArchiveStateForest = ListZipper (Tree (NgramsTerm, NgramsRepoElement))
type TreeNode e = (NgramsTerm, e) type TreeNode e = (NgramsTerm, e)
-- | A 'NodeWithKey' is morally a 'TreeNode' but with custom 'Eq' and 'Ord' instances
-- which indexes everything on the 'NgramsTerm', which is our unique key.
data NodeWithKey e = NWK { _nwk_key :: NgramsTerm, _nwk_value :: e }
deriving Show
instance Eq (NodeWithKey e) where
(NWK k1 _) == (NWK k2 _) = k1 == k2
instance Ord (NodeWithKey e) where
compare = comparing _nwk_key
-- | An algorithm to break loops in a graph.
data LoopBreakAlgorithm data LoopBreakAlgorithm
= -- | Just break the loop the easiest possible way = -- | Just break the loop the easiest possible way
LBA_just_do_it LBA_just_do_it
...@@ -129,11 +151,212 @@ data LoopBreakAlgorithm ...@@ -129,11 +151,212 @@ data LoopBreakAlgorithm
-- (CURRENTLY UNIMPLEMENTED) -- (CURRENTLY UNIMPLEMENTED)
| LBA_prefer_largest_occurrences_chain | LBA_prefer_largest_occurrences_chain
-- | A user preference on what to do if a loop is detected in a graph.
data OnLoopDetectedStrategy data OnLoopDetectedStrategy
= -- When a loop is detected don't do anything, just fail. = -- When a loop is detected don't do anything, just fail.
FailOnLoop FailOnLoop
| BreakLoop LoopBreakAlgorithm | BreakLoop LoopBreakAlgorithm
--
-- Dealing with loops
--
-- For dealing with loops (which it would be the case even for externally-sourced data)
-- we need something more than just Forests, we need a full blown graph, out of which we
-- can compute the strongly-connected-components, remove the loops with one of
-- the given strategies, find the spanning forest and finally materialise it.
--
sccsOf
:: forall e. HasNgramChildren e
=> Map.Map NgramsTerm e
-> [SCC (NodeWithKey e)]
sccsOf mp =
let triples =
[ (NWK t e
, t
, fromMaybe [] $ childrenOf <$> Map.lookup t mp)
| (t, e) <- Map.toList mp
]
in G.stronglyConnComp triples
where
childrenOf :: HasNgramChildren e => e -> [NgramsTerm]
childrenOf e = mSetToList (e ^. ngramsElementChildren)
-- Deterministic DFS spanning forest inside a set of nodes.
-- We return the set of *kept* edges (u -> v) that form a forest.
spanningEdges
:: (Ord k)
=> Map.Map k [k]
-- adjacency restricted to the SCC
-> [(k, k)]
spanningEdges adjScc =
let -- make the starting order deterministic
order = L.sort (Map.keys adjScc)
go seen u acc =
let nbrs = L.sort (Map.findWithDefault [] u adjScc)
-- if we have already seen this, skip it (or it would form a cycle)
step (s, edges_) v
| v `Set.member` s = (s, edges_)
| otherwise =
let (s', edges') = go (Set.insert v s) v edges_
in (s', (u,v):edges')
in L.foldl' step (seen, acc) nbrs
finalEdges =
snd $ L.foldl'
(\(seen, edges_) u ->
if u `Set.member` seen
then (seen, edges_)
else go (Set.insert u seen) u edges_)
(Set.empty, [])
order
in finalEdges
-- Given a map whose children have been rewritten to be acyclic (DAG),
-- set each node’s parent and roots from incoming edges.
syncAncestorshipFromChildren
:: (HasNgramChildren e, HasNgramParent e, HasNgramRoot e)
=> Map.Map NgramsTerm e
-> Map.Map NgramsTerm e
syncAncestorshipFromChildren mp =
let children_Of t =
maybe [] (mSetToList . view ngramsElementChildren) (Map.lookup t mp)
incoming :: Map.Map NgramsTerm [NgramsTerm]
incoming =
Map.fromListWith (<>)
[ (v, [u])
| (u, _) <- Map.toList mp
, v <- children_Of u
, Map.member v mp
]
-- Pick a parent out of the incoming edges
selectParent :: [NgramsTerm] -> Maybe NgramsTerm
selectParent [] = Nothing
selectParent [p] = Just p
selectParent ps = Just (minimum ps)
-- Pick the root by feeding the /parent/, and if
-- the parent has no root either (because it *is* the root)
-- then the root is the parent itself.
selectRoot :: NgramsTerm -> Maybe NgramsTerm
selectRoot currentParentId = case Map.lookup currentParentId mp of
Nothing -> Nothing
Just currentParent -> view ngramsElementRoot currentParent <|> Just currentParentId
in
Map.mapWithKey
(\t e -> let e' = e & ngramsElementParent .~ selectParent (Map.findWithDefault [] t incoming)
in e' & ngramsElementRoot .~ (view ngramsElementParent e' >>= selectRoot)
)
mp
-- | Break cycles just by "doing it", i.e. pick the simplest loop breaker possible.
breakCycles_justDoIt
:: forall e. (HasNgramChildren e, Ord NgramsTerm)
=> Map.Map NgramsTerm e
-> [SCC (NodeWithKey e)]
-> Map.Map NgramsTerm e
breakCycles_justDoIt mp0 sccs =
foldl' rewriteOneScc mp0 cyclicSets
where
cyclicSets :: [Set.Set NgramsTerm]
cyclicSets =
[ Set.fromList (map _nwk_key xs)
| CyclicSCC xs <- sccs
]
-- Rewrite exactly one SCC
rewriteOneScc :: Map.Map NgramsTerm e -> Set.Set NgramsTerm -> Map.Map NgramsTerm e
rewriteOneScc cur sccSet
| Set.null sccSet = cur
| otherwise =
let nodes = sort (Set.toList sccSet)
-- Snapshot children from *cur* for nodes in this SCC
oldChildren :: Map.Map NgramsTerm [NgramsTerm]
oldChildren =
Map.fromList
[ (u
, filter (/= u) -- drop self-loops proactively
$ maybe [] (mSetToList . (^. ngramsElementChildren)) (Map.lookup u cur)
)
| u <- nodes
]
-- Adjacency restricted to the SCC (used for DFS)
insideAdj :: Map.Map NgramsTerm [NgramsTerm]
insideAdj =
Map.fromList
[ (u, [ v | v <- Map.findWithDefault [] u oldChildren
, v `Set.member` sccSet ])
| u <- nodes
]
keptIn :: Map.Map NgramsTerm [NgramsTerm]
keptIn = Map.fromListWith (++)
[ (u, [v]) | (u,v) <- spanningEdges insideAdj ]
-- For each node, replace children with: (outside edges) ++ (kept inside edges)
cur' :: Map.Map NgramsTerm e
cur' =
foldl'
(\m u ->
let olds = Map.findWithDefault [] u oldChildren
outs = [ v | v <- olds, not (v `Set.member` sccSet) ]
kept = Map.findWithDefault [] u keptIn
-- keep a stable order: outs first (as in original), then kept (both sorted)
newKs = L.nub (sort outs ++ sort kept)
in Map.adjust (\e -> e & ngramsElementChildren .~ mSetFromList newKs) u m
)
cur
nodes
in cur'
-- | Find all the loops given the strongly connected components.
findLoops :: [SCC (NodeWithKey e)] -> [[NgramsTerm]]
findLoops sccs = [ [ t | NWK t _ <- xs ] | CyclicSCC xs <- sccs ]
-- | Builds an ngrams forest from the input ngrams table map. Under the hood
-- this function does a few things:
--
-- * Given that the input map can contain loops, we need to turn the map
-- (which is, in fact, an adjacency map for a graph) into a full blown graph
-- or rather its strongly connected components;
-- * Find any loop in the SCCs and, if there are any, we apply the input 'OnLoopDetectedStrategy';
-- * Once the graph has been made cycle-free,i.e. it's a DAG again we can convert
-- it into a 'Forest'.
buildForest
:: forall e. (Show e, HasNgramChildren e, HasNgramParent e, HasNgramRoot e)
=> OnLoopDetectedStrategy
-> Map.Map NgramsTerm e
-> Either BuildForestError (Forest (TreeNode e))
buildForest strat mp = case findLoops sccs of
[] -> pure $ unfoldForest (mkTreeNode mp) $ Map.toList mp
(l:_loops) -> do
dag <- case strat of
FailOnLoop -> Left $ BFE_loop_detected (Set.fromList $ map (uncurry VN) (zip [1..] l))
BreakLoop LBA_just_do_it ->
pure $ syncAncestorshipFromChildren $ breakCycles_justDoIt mp sccs
-- Future work: the other two algorithms
BreakLoop _other ->
pure $ syncAncestorshipFromChildren $ breakCycles_justDoIt mp sccs
pure $ unfoldForest (mkTreeNode dag) $ Map.toList dag
where
sccs :: [ SCC (NodeWithKey e) ]
sccs = sccsOf mp
mkTreeNode :: Map.Map NgramsTerm e -> TreeNode e -> (TreeNode e, [TreeNode e])
mkTreeNode dag (k, el) = ((k, el), mapMaybe (findChildren dag) $ mSetToList (el ^. ngramsElementChildren))
findChildren :: Map.Map NgramsTerm e -> NgramsTerm -> Maybe (TreeNode e)
findChildren dag t = Map.lookup t dag <&> \el -> (t, el)
buildForestsFromArchiveState :: NgramsState' buildForestsFromArchiveState :: NgramsState'
-> Either BuildForestError (Map Ngrams.NgramsType (Forest (TreeNode NgramsRepoElement))) -> Either BuildForestError (Map Ngrams.NgramsType (Forest (TreeNode NgramsRepoElement)))
buildForestsFromArchiveState = traverse (buildForest (BreakLoop LBA_just_do_it)) buildForestsFromArchiveState = traverse (buildForest (BreakLoop LBA_just_do_it))
...@@ -141,106 +364,6 @@ buildForestsFromArchiveState = traverse (buildForest (BreakLoop LBA_just_do_it)) ...@@ -141,106 +364,6 @@ buildForestsFromArchiveState = traverse (buildForest (BreakLoop LBA_just_do_it))
destroyArchiveStateForest :: Map Ngrams.NgramsType (Forest (TreeNode NgramsRepoElement)) -> NgramsState' destroyArchiveStateForest :: Map Ngrams.NgramsType (Forest (TreeNode NgramsRepoElement)) -> NgramsState'
destroyArchiveStateForest = Map.map destroyForest destroyArchiveStateForest = Map.map destroyForest
-- | Builds an ngrams forest from the input ngrams table map.
buildForest :: forall e. (HasNgramParent e, HasNgramChildren e)
=> OnLoopDetectedStrategy
-- ^ A strategy to apply when a loop is found.
-> Map NgramsTerm e
-> Either BuildForestError (Forest (TreeNode e))
buildForest onLoopStrategy mp = flip evalState (BuildForestState 1 mempty []) . runExceptT $
unfoldForestM buildTree $ Map.toList mp
where
buildTree :: TreeNode e
-> ExceptT BuildForestError (State (BuildForestState e)) (TreeNode e, [TreeNode e])
buildTree (n, el) = do
lift $ modify' (\st -> st { _bfs_visited = mempty, _bfs_children = [] })
let initialChildren = getChildren mp (mSetToList $ el ^. ngramsElementChildren)
children <- unfold_node onLoopStrategy mp initialChildren
-- Create the final ngram by setting the children in the root node to be
-- the children computed by unfold_node.
let root = el & ngramsElementChildren .~ (mSetFromList $ map fst children)
pure ((n, root), children)
getChildren :: Map NgramsTerm e -> [NgramsTerm] -> [TreeNode e]
getChildren mp = mapMaybe (\t -> (t,) <$> Map.lookup t mp)
data BuildForestState e
= BuildForestState
{ _bfs_pos :: !Int
, _bfs_visited :: !(Set VisitedNode)
-- | The children we computed for the target root.
, _bfs_children :: [TreeNode e]
}
-- This function is quite simple: the internal 'State' keeps track of the current
-- position of the visit, and if we discover a term we already seen before, we throw
-- an error, otherwise we store it in the state at the current position and carry on.
unfold_node :: HasNgramChildren e
=> OnLoopDetectedStrategy
-> Map NgramsTerm e
-> [ TreeNode e ]
-> ExceptT BuildForestError (State (BuildForestState e)) [TreeNode e]
unfold_node _ _ [] = L.reverse <$> gets _bfs_children
unfold_node onLoopStrategy mp (x:xs) = do
(BuildForestState !pos !visited !children_so_far) <- get
let nt = fst x
case Set.member (VN pos nt) visited of
True -> case onLoopStrategy of
FailOnLoop -> throwError $ BFE_loop_detected visited
BreakLoop algo -> breakLoopByAlgo mp x xs algo
False -> do
put (BuildForestState (pos + 1) (Set.insert (VN (pos + 1) nt) visited) (x : children_so_far))
unfold_node onLoopStrategy mp xs
breakLoopByAlgo :: HasNgramChildren e
=> Map NgramsTerm e
-> TreeNode e
-> [TreeNode e]
-> LoopBreakAlgorithm
-> ExceptT BuildForestError (State (BuildForestState e)) [TreeNode e]
breakLoopByAlgo mp x xs = \case
LBA_just_do_it -> justDoItLoopBreaker mp x xs
LBA_prefer_longest_children_chain -> preferLongestChildrenLoopBreaker mp x xs
LBA_prefer_largest_occurrences_chain -> preferLargestOccurrencesLoopBreaker mp x xs
justDoItLoopBreaker :: HasNgramChildren e
=> Map NgramsTerm e
-> TreeNode e
-> [ TreeNode e ]
-> ExceptT BuildForestError (State (BuildForestState e)) [TreeNode e]
justDoItLoopBreaker mp (nt, el) xs = do
(BuildForestState !pos !visited !children_so_far) <- get
-- We need to find the edges which are loopy and remove them
let loopyEdges = findLoopyEdges el visited
let el' = el & over ngramsElementChildren (\mchildren -> mchildren `mSetDifference` loopyEdges)
let x' = (nt, el')
put (BuildForestState pos visited (x' : children_so_far))
unfold_node (BreakLoop LBA_just_do_it) mp xs
findLoopyEdges :: HasNgramChildren e => e -> Set VisitedNode -> MSet NgramsTerm
findLoopyEdges e vns = mSetFromSet $
(e ^. ngramsElementChildren . to mSetToSet) `Set.intersection` allVisitedNgramsTerms vns
-- FIXME(adinapoli) At the moment this is unimplemented, just an alias for the simplest version.
preferLongestChildrenLoopBreaker :: HasNgramChildren e
=> Map NgramsTerm e
-> TreeNode e
-> [ TreeNode e ]
-> ExceptT BuildForestError (State (BuildForestState e)) [TreeNode e]
preferLongestChildrenLoopBreaker mp x = justDoItLoopBreaker mp x
-- FIXME(adinapoli) At the moment this is unimplemented, just an alias for the simplest version.
preferLargestOccurrencesLoopBreaker :: HasNgramChildren e
=> Map NgramsTerm e
-> TreeNode e
-> [ TreeNode e ]
-> ExceptT BuildForestError (State (BuildForestState e)) [TreeNode e]
preferLargestOccurrencesLoopBreaker mp x = justDoItLoopBreaker mp x
-- | Folds an Ngrams forest back to a table map. -- | Folds an Ngrams forest back to a table map.
-- This function doesn't aggregate information, but merely just recostructs the original -- This function doesn't aggregate information, but merely just recostructs the original
-- map without loss of information. To perform operations on the forest, use the appropriate -- map without loss of information. To perform operations on the forest, use the appropriate
......
...@@ -198,7 +198,7 @@ data BuildForestError ...@@ -198,7 +198,7 @@ data BuildForestError
= -- We found a loop, something that shouldn't normally happen if the calling = -- We found a loop, something that shouldn't normally happen if the calling
-- code is correct by construction, but if that does happen, the value will -- code is correct by construction, but if that does happen, the value will
-- contain the full path to the cycle. -- contain the full path to the cycle.
BFE_loop_detected !(Set VisitedNode) BFE_loop_detected !(Set VisitedNode)
deriving (Show, Eq) deriving (Show, Eq)
instance ToHumanFriendlyError BuildForestError where instance ToHumanFriendlyError BuildForestError where
......
...@@ -235,7 +235,7 @@ testFlat05 = do ...@@ -235,7 +235,7 @@ testFlat05 = do
testForestSearchProp :: Property testForestSearchProp :: Property
testForestSearchProp = forAll arbitrary $ \(AcyclicTableMap ngramsTable el) -> do testForestSearchProp = forAll arbitrary $ \(AcyclicTableMap ngramsTable el) -> do
case searchTableNgrams (Versioned 0 ngramsTable) (searchQuery el) of case searchTableNgrams (Versioned 0 ngramsTable) (searchQuery el) of
Left (BFE_loop_detected err) -> fail (T.unpack $ renderLoop err) Left (BFE_loop_detected err) -> fail (T.unpack $ renderLoop err)
Right res -> res ^. vc_data `shouldSatisfy` (any (containsTerm (_ne_ngrams el)) . getNgramsTable) Right res -> res ^. vc_data `shouldSatisfy` (any (containsTerm (_ne_ngrams el)) . getNgramsTable)
where where
searchQuery term = NgramsSearchQuery { searchQuery term = NgramsSearchQuery {
......
...@@ -315,8 +315,8 @@ testLoopBreaker_02 = ...@@ -315,8 +315,8 @@ testLoopBreaker_02 =
let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"]) let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"])
, ( "bar", mkMapTerm "bar" & ne_children .~ mSetFromList ["foo"]) , ( "bar", mkMapTerm "bar" & ne_children .~ mSetFromList ["foo"])
] ]
in (buildForestOrBreakLoop t1) `compareForestVisually` [r| in (pruneForest $ buildForestOrBreakLoop t1) `compareForestVisually` [r|
foo bar
| |
`- bar `- foo
|] |]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment