Commit fb8b17c4 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Change implementation of filterNgramsNodes

This commit changes the implementation of filterNgramsNodes to not
filter too aggressively nodes that are present in a path.

This causes a test failure that needs to be investigated.
parent 51a2fe5a
......@@ -18,12 +18,13 @@ add get
{-# OPTIONS_GHC -fno-warn-unused-top-binds #-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE IncoherentInstances #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Ngrams
......@@ -87,6 +88,9 @@ module Gargantext.API.Ngrams
, compute_new_state_patches
, PatchHistory(..)
, newNgramsFromNgramsStatePatch
, filterNgramsNodes
, rootOfNgramsElement
, matchingNode
)
where
......@@ -112,7 +116,6 @@ import Gargantext.Prelude hiding (log, to, toLower, (%), isInfixOf)
import Text.Collate qualified as Unicode
{-
-- TODO sequences of modifications (Patchs)
type NgramsIdPatch = Patch NgramsId NgramsPatch
......@@ -422,6 +425,71 @@ dumpJsonTableMap fpath nodeId ngramsType = do
liftBase $ DTL.writeFile (unpack fpath) (DAT.encodeToLazyText m)
pure ()
-- | Filters the given `tableMap` with the search criteria. It returns
-- a set of 'NgramsElement' all matching the input 'NGramsSearchQuery'.
filterNgramsNodes :: Maybe ListType
-> Maybe MinSize
-> Maybe MaxSize
-> (NgramsTerm -> Bool)
-> Map NgramsTerm NgramsElement
-> Set NgramsElement
filterNgramsNodes listTy minSize maxSize searchFn tblMap =
Set.fromList $ concatMap (findRootPath tblMap) selectedNodes
where
allNodes = Set.fromList $ Map.elems tblMap
selectedNodes = Set.filter (matchingNode listTy minSize maxSize searchFn) allNodes
-- | Given the full forest of ngrams and the current element we are iterating on,
-- returns the full path of ngrams between this node and its /root/. This informs
-- us of all the nodes we have to keep in the final result set, because if we were
-- to filter them, we would be omitting important hierarchical information and this
-- will cause ngrams to not be displayed correctly on the frontend.
findRootPath :: Map NgramsTerm NgramsElement -> NgramsElement -> [NgramsElement]
findRootPath tblMap node = go node []
where
go current !acc =
case _ne_parent current >>= (`Map.lookup` tblMap) of
Nothing -> current : acc
Just parentNode -> go parentNode (current : acc)
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOfNgramsElement :: Maybe ListType
-> Map NgramsTerm NgramsElement
-> NgramsElement
-> NgramsElement
rootOfNgramsElement listType tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList listType) r
| otherwise
-> ne
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'.
matchingNode :: Maybe ListType
-> Maybe MinSize
-> Maybe MaxSize
-> (NgramsTerm -> Bool)
-> NgramsElement
-> Bool
matchingNode listType minSize maxSize searchQuery inputNode =
let nodeSize = inputNode ^. ne_size
matchesListType = maybe (const True) (==) listType
respectsMinSize = maybe (const True) ((<=) . getMinSize) minSize
respectsMaxSize = maybe (const True) ((>=) . getMaxSize) maxSize
in respectsMinSize nodeSize
&& respectsMaxSize nodeSize
&& searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- | TODO Errors management
-- TODO: polymorphic for Annuaire or Corpus or ...
......@@ -438,42 +506,13 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement)
-> VersionedWithCount NgramsTable
searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
let tableMap = versionedTableMap ^. v_data
filteredData = filterNodes tableMap
filteredData = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery tableMap
tableMapSorted = versionedTableMap
& v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ filteredData)
in toVersionedWithCount (Set.size filteredData) tableMapSorted
where
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOf :: Map NgramsTerm NgramsElement -> NgramsElement -> NgramsElement
rootOf tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList _nsq_listType) r
| otherwise
-> ne
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'.
matchingNode :: NgramsElement -> Bool
matchingNode inputNode =
let nodeSize = inputNode ^. ne_size
matchesListType = maybe (const True) (==) _nsq_listType
respectsMinSize = maybe (const True) ((<=) . getMinSize) _nsq_minSize
respectsMaxSize = maybe (const True) ((>=) . getMaxSize) _nsq_maxSize
in respectsMinSize nodeSize
&& respectsMaxSize nodeSize
&& _nsq_searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- Sorts the input 'NgramsElement' list.
-- /IMPORTANT/: As we might be sorting ngrams in all sorts of language,
-- some of them might include letters with accents and other unicode symbols,
......@@ -489,14 +528,6 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams)
ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams)
-- | Filters the given `tableMap` with the search criteria. It returns
-- a set of 'NgramsElement' all matching the input 'NGramsSearchQuery'.
filterNodes :: Map NgramsTerm NgramsElement -> Set NgramsElement
filterNodes tblMap = Set.map (rootOf tblMap) selectedNodes
where
allNodes = Set.fromList $ Map.elems tblMap
selectedNodes = Set.filter matchingNode allNodes
-- | For each input root, extends its occurrence count with
-- the information found in the subitems.
withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
......
......@@ -5,9 +5,9 @@ import Prelude
import Control.Lens
import Data.Map.Strict qualified as Map
import Data.Map.Strict.Patch qualified as PM
import Data.Patch.Class (Replace(..))
import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams (filterNgramsNodes)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
......@@ -17,13 +17,11 @@ import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Schema.Context
import Test.Hspec
import Test.HUnit
import Test.Hspec
import Test.Instances ()
import Test.QuickCheck
import qualified Data.Patch.Class as Patch
import qualified Data.Validity as Validity
import qualified Test.QuickCheck as QC
import Test.QuickCheck qualified as QC
genScientificText :: Gen T.Text
......@@ -97,65 +95,37 @@ tests = describe "Ngrams" $ do
describe "docNgrams" $ do
it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch
describe "hierarchical grouping" $ do
it "attaching a child with children to a parent should preserve ancestorship" testHierarchicalGrouping
hierarchicalCorpus :: NgramsTableMap
hierarchicalCorpus = Map.fromList [
( "car", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = mSetFromList [ "Ford" ]
})
, ( "Ford", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Just "car"
, _nre_parent = Just "car"
, _nre_children = mempty
})
]
patchedHierarchicalCorpus :: NgramsTableMap
patchedHierarchicalCorpus = Map.fromList [
( "vehicle", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = mSetFromList [ "car" ]
})
, ( "car", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Just "vehicle"
, _nre_parent = Just "vehicle"
, _nre_children = mSetFromList [ "Ford" ]
})
, ( "Ford", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Just "vehicle"
, _nre_parent = Just "car"
, _nre_children = mempty
})
]
patchHierarchical :: NgramsTablePatch
patchHierarchical = mkNgramsTablePatch $ Map.fromList [
(NgramsTerm "vehicle", NgramsPatch
{ _patch_children = PatchMSet
$ fst
$ PM.fromList
$ [ ( "car", addPatch ) ]
, _patch_list = Keep
}
)
]
testHierarchicalGrouping :: Assertion
testHierarchicalGrouping = do
-- Check the patch is applicable
Validity.validationIsValid (Patch.applicable patchHierarchical (Just hierarchicalCorpus)) @?= True
Patch.act patchHierarchical (Just hierarchicalCorpus) @?= Just patchedHierarchicalCorpus
it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery
testFilterNgramsNodesEmptyQuery :: Assertion
testFilterNgramsNodesEmptyQuery = do
let input = Map.fromList [
("car", NgramsElement { _ne_ngrams = "car"
, _ne_size = 1
, _ne_list = MapTerm
, _ne_occurrences = mempty
, _ne_root = Just "vehicle"
, _ne_parent = Just "vehicle"
, _ne_children = mSetFromList ["ford"]
})
, ("ford", NgramsElement { _ne_ngrams = "ford"
, _ne_size = 1
, _ne_list = MapTerm
, _ne_occurrences = mempty
, _ne_root = Just "vehicle"
, _ne_parent = Just "car"
, _ne_children = mempty})
, ("vehicle", NgramsElement { _ne_ngrams = "vehicle"
, _ne_size = 1
, _ne_list = MapTerm
, _ne_occurrences = mempty
, _ne_root = Nothing
, _ne_parent = Nothing
, _ne_children = mSetFromList ["car"]})
]
let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input
actual @?= (Set.fromList $ Map.elems input)
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment