Commit 4b0e60f2 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Try introducing buildForest and destroyForest in Gargantext.API.Ngrams

parent fca2b73e
......@@ -84,17 +84,17 @@ module Gargantext.API.Ngrams
-- * Handlers to be used when serving top-level API requests
, getTableNgramsCorpusHandler
-- * Internals, for testing
-- * Internals for testing
, compute_new_state_patches
, PatchHistory(..)
, newNgramsFromNgramsStatePatch
, filterNgramsNodes
, rootOfNgramsElement
, matchingNode
, buildForest
, destroyForest
)
where
import Control.Lens (view, (^..), (+~), (%~), msumOf, at, ix, _Just, Each(..), (%%~), ifolded, to, withIndex, over)
import Control.Lens (view, (^..), (+~), (%~), msumOf, at, ix, _Just, Each(..), (%%~), ifolded, to, withIndex)
import Data.Aeson.Text qualified as DAT
import Data.List qualified as List
import Data.Map.Strict qualified as Map
......@@ -103,6 +103,7 @@ import Data.Patch.Class (Action(act), Transformable(..), ours)
import Data.Set qualified as Set
import Data.Text (isInfixOf, toLower, unpack)
import Data.Text.Lazy.IO as DTL ( writeFile )
import Data.Tree
import Gargantext.API.Ngrams.Tools (getNodeStory)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.NodeStory (ArchiveList, HasNodeStory, NgramsStatePatch', a_history, a_state, a_version, currentVersion, NodeStoryEnv, hasNodeArchiveStoryImmediateSaver, hasNodeStoryImmediateSaver, HasNodeStoryEnv (..))
......@@ -426,49 +427,20 @@ dumpJsonTableMap fpath nodeId ngramsType = do
pure ()
-- | Filters the given `tableMap` with the search criteria. It returns
-- the input map, where each bucket indexed by a 'NgramsTerm' has been
-- filtered via the given predicate. Removes the key from the map if
-- the filtering would result in the empty set.
filterNgramsNodes :: Maybe ListType
-> Maybe MinSize
-> Maybe MaxSize
-> (NgramsTerm -> Bool)
-> Map NgramsTerm NgramsElement
-> Map NgramsTerm NgramsElement
-> Map NgramsTerm NgramsElement
filterNgramsNodes listTy minSize maxSize searchFn tblMap =
filterNgramsNodes listTy minSize maxSize searchFn tblMap =
Set.fromList $ concatMap (findRootPath tblMap) selectedNodes
where
allNodes = Set.fromList $ Map.elems tblMap
selectedNodes = Set.filter (matchingNode listTy minSize maxSize searchFn) allNodes
-- | Given the full forest of ngrams and the current element we are iterating on,
-- returns the full path of ngrams between this node and its /root/. This informs
-- us of all the nodes we have to keep in the final result set, because if we were
-- to filter them, we would be omitting important hierarchical information and this
-- will cause ngrams to not be displayed correctly on the frontend.
findRootPath :: Map NgramsTerm NgramsElement -> NgramsElement -> [NgramsElement]
findRootPath tblMap node = go node []
where
go current !acc =
case _ne_parent current >>= (`Map.lookup` tblMap) of
Nothing -> current : acc
Just parentNode -> go parentNode (current : acc)
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOfNgramsElement :: Maybe ListType
-> Map NgramsTerm NgramsElement
-> NgramsElement
-> NgramsElement
rootOfNgramsElement listType tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList listType) r
| otherwise
flip Map.mapMaybe tblMap $ \e ->
case matchingNode listTy minSize maxSize searchFn e of
False -> Nothing
True -> Just e
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'.
......@@ -489,7 +461,32 @@ matchingNode listType minSize maxSize searchQuery inputNode =
&& searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- | Builds an ngrams forest from the input ngrams table map.
buildForest :: Map NgramsTerm NgramsElement -> Forest NgramsElement
buildForest mp = unfoldForest mkTreeNode (Map.toList mp)
where
mkTreeNode :: (NgramsTerm, NgramsElement) -> (NgramsElement, [(NgramsTerm, NgramsElement)])
mkTreeNode (_, el) = (el, mapMaybe findChildren $ mSetToList (_ne_children el))
findChildren :: NgramsTerm -> Maybe (NgramsTerm, NgramsElement)
findChildren t = Map.lookup t mp <&> \el -> (t, el)
-- | Folds an Ngrams forest back to a table map.
-- FIXME(adn) propagate the root information.
destroyForest :: Forest NgramsElement -> Map NgramsTerm NgramsElement
destroyForest f = Map.fromList . map (foldTree destroyTree) $ f
where
destroyTree :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> (NgramsTerm, NgramsElement)
destroyTree rootEl childrenEl = (_ne_ngrams rootEl, squashElements rootEl childrenEl)
-- Given a list of children, generate a single node that has as the parent
-- the children, as the score the sum of the individual elements.
squashElements :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> NgramsElement
squashElements r c =
r { _ne_size = _ne_size r <> sum (map (_ne_size . snd) c)
, _ne_occurrences = _ne_occurrences r <> (mconcat $ map (_ne_occurrences . snd) c)
, _ne_children = mSetFromList $ map fst c
}
-- | TODO Errors management
-- TODO: polymorphic for Annuaire or Corpus or ...
......@@ -507,10 +504,11 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement)
searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
let tableMap = versionedTableMap ^. v_data
filteredData = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery tableMap
forestRoots = Set.fromList . Map.elems . destroyForest . buildForest $ filteredData
tableMapSorted = versionedTableMap
& v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ filteredData)
& v_data .~ (NgramsTable . sortAndPaginate $ forestRoots)
in toVersionedWithCount (Set.size filteredData) tableMapSorted
in toVersionedWithCount (Set.size forestRoots) tableMapSorted
where
-- Sorts the input 'NgramsElement' list.
......@@ -530,8 +528,8 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
-- | For each input root, extends its occurrence count with
-- the information found in the subitems.
withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
withInners tblMap roots = Set.map addSubitemsOccurrences roots
_withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
_withInners tblMap roots = Set.map addSubitemsOccurrences roots
where
addSubitemsOccurrences :: NgramsElement -> NgramsElement
addSubitemsOccurrences e =
......
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}
module Test.Offline.Ngrams (tests) where
import Prelude
import Control.Lens
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams (filterNgramsNodes)
import Gargantext.API.Ngrams (filterNgramsNodes, buildForest)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
......@@ -23,6 +24,10 @@ import Test.Instances ()
import Test.Ngrams.Query (mkMapTerm)
import Test.QuickCheck
import Test.QuickCheck qualified as QC
import Data.Tree
import Text.RawString.QQ (r)
import Data.Char (isSpace)
import Data.Map.Strict (Map)
genScientificText :: Gen T.Text
......@@ -95,22 +100,27 @@ tests = describe "Ngrams" $ do
it "return results for non-empty input terms" $ property testBuildPatternsNonEmpty
describe "docNgrams" $ do
it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch
describe "ngram forests" $ do
it "building a simple tree works" testBuildNgramsTree_01
it "building a complex tree works" testBuildNgramsTree_02
describe "hierarchical grouping" $ do
it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery
hierarchicalTableMap :: Map NgramsTerm NgramsElement
hierarchicalTableMap = Map.fromList [
("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"])
, ("car", mkMapTerm "car" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "vehicle"
& ne_children .~ mSetFromList ["ford"])
, ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "car")
]
testFilterNgramsNodesEmptyQuery :: Assertion
testFilterNgramsNodesEmptyQuery = do
let input = Map.fromList [
("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"])
, ("car", mkMapTerm "car" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "vehicle"
& ne_children .~ mSetFromList ["ford"])
, ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "car")
]
let input = hierarchicalTableMap
let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input
actual @?= (Set.fromList $ Map.elems input)
actual @?= input
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
......@@ -125,3 +135,57 @@ testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
newtype ASCIIForest = ASCIIForest String
deriving Eq
instance Show ASCIIForest where
show (ASCIIForest x) = x
compareForestVisually :: Forest NgramsElement -> String -> Property
compareForestVisually f expected =
let actual = init $ drawForest (map (fmap renderEl) f)
outermostIndentation = T.length . T.takeWhile isSpace . T.dropWhile (=='\n') . T.pack $ expected
in ASCIIForest actual === ASCIIForest (sanitiseDrawing outermostIndentation expected)
where
renderEl :: NgramsElement -> String
renderEl = T.unpack . unNgramsTerm . _ne_ngrams
toTextPaths :: String -> [T.Text]
toTextPaths = T.splitOn "\n" . T.strip . T.pack
sanitiseDrawing :: Int -> String -> String
sanitiseDrawing outermostIndentation =
let dropLayout t = case T.uncons t of
Just (' ', _) -> T.drop outermostIndentation t
_ -> t -- leave it be
in T.unpack . T.unlines . map dropLayout . toTextPaths
testBuildNgramsTree_01 :: Property
testBuildNgramsTree_01 =
let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"])
, ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo")
]
in (buildForest t1) `compareForestVisually` [r|
bar
foo
|
`- bar
|]
testBuildNgramsTree_02 :: Property
testBuildNgramsTree_02 =
buildForest hierarchicalTableMap `compareForestVisually` [r|
car
|
`- ford
ford
vehicle
|
`- car
|
`- ford
|]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment