Commit 4b0e60f2 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Try introducing buildForest and destroyForest in Gargantext.API.Ngrams

parent fca2b73e
...@@ -84,17 +84,17 @@ module Gargantext.API.Ngrams ...@@ -84,17 +84,17 @@ module Gargantext.API.Ngrams
-- * Handlers to be used when serving top-level API requests -- * Handlers to be used when serving top-level API requests
, getTableNgramsCorpusHandler , getTableNgramsCorpusHandler
-- * Internals, for testing -- * Internals for testing
, compute_new_state_patches , compute_new_state_patches
, PatchHistory(..) , PatchHistory(..)
, newNgramsFromNgramsStatePatch , newNgramsFromNgramsStatePatch
, filterNgramsNodes , filterNgramsNodes
, rootOfNgramsElement , buildForest
, matchingNode , destroyForest
) )
where where
import Control.Lens (view, (^..), (+~), (%~), msumOf, at, ix, _Just, Each(..), (%%~), ifolded, to, withIndex, over) import Control.Lens (view, (^..), (+~), (%~), msumOf, at, ix, _Just, Each(..), (%%~), ifolded, to, withIndex)
import Data.Aeson.Text qualified as DAT import Data.Aeson.Text qualified as DAT
import Data.List qualified as List import Data.List qualified as List
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
...@@ -103,6 +103,7 @@ import Data.Patch.Class (Action(act), Transformable(..), ours) ...@@ -103,6 +103,7 @@ import Data.Patch.Class (Action(act), Transformable(..), ours)
import Data.Set qualified as Set import Data.Set qualified as Set
import Data.Text (isInfixOf, toLower, unpack) import Data.Text (isInfixOf, toLower, unpack)
import Data.Text.Lazy.IO as DTL ( writeFile ) import Data.Text.Lazy.IO as DTL ( writeFile )
import Data.Tree
import Gargantext.API.Ngrams.Tools (getNodeStory) import Gargantext.API.Ngrams.Tools (getNodeStory)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.Core.NodeStory (ArchiveList, HasNodeStory, NgramsStatePatch', a_history, a_state, a_version, currentVersion, NodeStoryEnv, hasNodeArchiveStoryImmediateSaver, hasNodeStoryImmediateSaver, HasNodeStoryEnv (..)) import Gargantext.Core.NodeStory (ArchiveList, HasNodeStory, NgramsStatePatch', a_history, a_state, a_version, currentVersion, NodeStoryEnv, hasNodeArchiveStoryImmediateSaver, hasNodeStoryImmediateSaver, HasNodeStoryEnv (..))
...@@ -426,49 +427,20 @@ dumpJsonTableMap fpath nodeId ngramsType = do ...@@ -426,49 +427,20 @@ dumpJsonTableMap fpath nodeId ngramsType = do
pure () pure ()
-- | Filters the given `tableMap` with the search criteria. It returns -- | Filters the given `tableMap` with the search criteria. It returns
-- the input map, where each bucket indexed by a 'NgramsTerm' has been
-- filtered via the given predicate. Removes the key from the map if
-- the filtering would result in the empty set.
filterNgramsNodes :: Maybe ListType filterNgramsNodes :: Maybe ListType
-> Maybe MinSize -> Maybe MinSize
-> Maybe MaxSize -> Maybe MaxSize
-> (NgramsTerm -> Bool) -> (NgramsTerm -> Bool)
-> Map NgramsTerm NgramsElement -> Map NgramsTerm NgramsElement
-> Map NgramsTerm NgramsElement -> Map NgramsTerm NgramsElement
filterNgramsNodes listTy minSize maxSize searchFn tblMap = filterNgramsNodes listTy minSize maxSize searchFn tblMap =
filterNgramsNodes listTy minSize maxSize searchFn tblMap = flip Map.mapMaybe tblMap $ \e ->
Set.fromList $ concatMap (findRootPath tblMap) selectedNodes case matchingNode listTy minSize maxSize searchFn e of
where False -> Nothing
allNodes = Set.fromList $ Map.elems tblMap True -> Just e
selectedNodes = Set.filter (matchingNode listTy minSize maxSize searchFn) allNodes
-- | Given the full forest of ngrams and the current element we are iterating on,
-- returns the full path of ngrams between this node and its /root/. This informs
-- us of all the nodes we have to keep in the final result set, because if we were
-- to filter them, we would be omitting important hierarchical information and this
-- will cause ngrams to not be displayed correctly on the frontend.
findRootPath :: Map NgramsTerm NgramsElement -> NgramsElement -> [NgramsElement]
findRootPath tblMap node = go node []
where
go current !acc =
case _ne_parent current >>= (`Map.lookup` tblMap) of
Nothing -> current : acc
Just parentNode -> go parentNode (current : acc)
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOfNgramsElement :: Maybe ListType
-> Map NgramsTerm NgramsElement
-> NgramsElement
-> NgramsElement
rootOfNgramsElement listType tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList listType) r
| otherwise
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria -- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'. -- mandated by 'NgramsSearchQuery'.
...@@ -489,7 +461,32 @@ matchingNode listType minSize maxSize searchQuery inputNode = ...@@ -489,7 +461,32 @@ matchingNode listType minSize maxSize searchQuery inputNode =
&& searchQuery (inputNode ^. ne_ngrams) && searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list) && matchesListType (inputNode ^. ne_list)
-- | Builds an ngrams forest from the input ngrams table map.
buildForest :: Map NgramsTerm NgramsElement -> Forest NgramsElement
buildForest mp = unfoldForest mkTreeNode (Map.toList mp)
where
mkTreeNode :: (NgramsTerm, NgramsElement) -> (NgramsElement, [(NgramsTerm, NgramsElement)])
mkTreeNode (_, el) = (el, mapMaybe findChildren $ mSetToList (_ne_children el))
findChildren :: NgramsTerm -> Maybe (NgramsTerm, NgramsElement)
findChildren t = Map.lookup t mp <&> \el -> (t, el)
-- | Folds an Ngrams forest back to a table map.
-- FIXME(adn) propagate the root information.
destroyForest :: Forest NgramsElement -> Map NgramsTerm NgramsElement
destroyForest f = Map.fromList . map (foldTree destroyTree) $ f
where
destroyTree :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> (NgramsTerm, NgramsElement)
destroyTree rootEl childrenEl = (_ne_ngrams rootEl, squashElements rootEl childrenEl)
-- Given a list of children, generate a single node that has as the parent
-- the children, as the score the sum of the individual elements.
squashElements :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> NgramsElement
squashElements r c =
r { _ne_size = _ne_size r <> sum (map (_ne_size . snd) c)
, _ne_occurrences = _ne_occurrences r <> (mconcat $ map (_ne_occurrences . snd) c)
, _ne_children = mSetFromList $ map fst c
}
-- | TODO Errors management -- | TODO Errors management
-- TODO: polymorphic for Annuaire or Corpus or ... -- TODO: polymorphic for Annuaire or Corpus or ...
...@@ -507,10 +504,11 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement) ...@@ -507,10 +504,11 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement)
searchTableNgrams versionedTableMap NgramsSearchQuery{..} = searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
let tableMap = versionedTableMap ^. v_data let tableMap = versionedTableMap ^. v_data
filteredData = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery tableMap filteredData = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery tableMap
forestRoots = Set.fromList . Map.elems . destroyForest . buildForest $ filteredData
tableMapSorted = versionedTableMap tableMapSorted = versionedTableMap
& v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ filteredData) & v_data .~ (NgramsTable . sortAndPaginate $ forestRoots)
in toVersionedWithCount (Set.size filteredData) tableMapSorted in toVersionedWithCount (Set.size forestRoots) tableMapSorted
where where
-- Sorts the input 'NgramsElement' list. -- Sorts the input 'NgramsElement' list.
...@@ -530,8 +528,8 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} = ...@@ -530,8 +528,8 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
-- | For each input root, extends its occurrence count with -- | For each input root, extends its occurrence count with
-- the information found in the subitems. -- the information found in the subitems.
withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement _withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
withInners tblMap roots = Set.map addSubitemsOccurrences roots _withInners tblMap roots = Set.map addSubitemsOccurrences roots
where where
addSubitemsOccurrences :: NgramsElement -> NgramsElement addSubitemsOccurrences :: NgramsElement -> NgramsElement
addSubitemsOccurrences e = addSubitemsOccurrences e =
......
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}
module Test.Offline.Ngrams (tests) where module Test.Offline.Ngrams (tests) where
import Prelude import Prelude
import Control.Lens import Control.Lens
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.API.Ngrams (filterNgramsNodes) import Gargantext.API.Ngrams (filterNgramsNodes, buildForest)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Types qualified as NT import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core import Gargantext.Core
...@@ -23,6 +24,10 @@ import Test.Instances () ...@@ -23,6 +24,10 @@ import Test.Instances ()
import Test.Ngrams.Query (mkMapTerm) import Test.Ngrams.Query (mkMapTerm)
import Test.QuickCheck import Test.QuickCheck
import Test.QuickCheck qualified as QC import Test.QuickCheck qualified as QC
import Data.Tree
import Text.RawString.QQ (r)
import Data.Char (isSpace)
import Data.Map.Strict (Map)
genScientificText :: Gen T.Text genScientificText :: Gen T.Text
...@@ -95,22 +100,27 @@ tests = describe "Ngrams" $ do ...@@ -95,22 +100,27 @@ tests = describe "Ngrams" $ do
it "return results for non-empty input terms" $ property testBuildPatternsNonEmpty it "return results for non-empty input terms" $ property testBuildPatternsNonEmpty
describe "docNgrams" $ do describe "docNgrams" $ do
it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch
describe "ngram forests" $ do
it "building a simple tree works" testBuildNgramsTree_01
it "building a complex tree works" testBuildNgramsTree_02
describe "hierarchical grouping" $ do describe "hierarchical grouping" $ do
it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery
hierarchicalTableMap :: Map NgramsTerm NgramsElement
hierarchicalTableMap = Map.fromList [
("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"])
, ("car", mkMapTerm "car" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "vehicle"
& ne_children .~ mSetFromList ["ford"])
, ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "car")
]
testFilterNgramsNodesEmptyQuery :: Assertion testFilterNgramsNodesEmptyQuery :: Assertion
testFilterNgramsNodesEmptyQuery = do testFilterNgramsNodesEmptyQuery = do
let input = Map.fromList [ let input = hierarchicalTableMap
("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"])
, ("car", mkMapTerm "car" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "vehicle"
& ne_children .~ mSetFromList ["ford"])
, ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "car")
]
let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input
actual @?= (Set.fromList $ Map.elems input) actual @?= input
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) = testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
...@@ -125,3 +135,57 @@ testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property ...@@ -125,3 +135,57 @@ testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts = testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0 in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
newtype ASCIIForest = ASCIIForest String
deriving Eq
instance Show ASCIIForest where
show (ASCIIForest x) = x
compareForestVisually :: Forest NgramsElement -> String -> Property
compareForestVisually f expected =
let actual = init $ drawForest (map (fmap renderEl) f)
outermostIndentation = T.length . T.takeWhile isSpace . T.dropWhile (=='\n') . T.pack $ expected
in ASCIIForest actual === ASCIIForest (sanitiseDrawing outermostIndentation expected)
where
renderEl :: NgramsElement -> String
renderEl = T.unpack . unNgramsTerm . _ne_ngrams
toTextPaths :: String -> [T.Text]
toTextPaths = T.splitOn "\n" . T.strip . T.pack
sanitiseDrawing :: Int -> String -> String
sanitiseDrawing outermostIndentation =
let dropLayout t = case T.uncons t of
Just (' ', _) -> T.drop outermostIndentation t
_ -> t -- leave it be
in T.unpack . T.unlines . map dropLayout . toTextPaths
testBuildNgramsTree_01 :: Property
testBuildNgramsTree_01 =
let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"])
, ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo")
]
in (buildForest t1) `compareForestVisually` [r|
bar
foo
|
`- bar
|]
testBuildNgramsTree_02 :: Property
testBuildNgramsTree_02 =
buildForest hierarchicalTableMap `compareForestVisually` [r|
car
|
`- ford
ford
vehicle
|
`- car
|
`- ford
|]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment