Commit fb8b17c4 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Change implementation of filterNgramsNodes

This commit changes the implementation of filterNgramsNodes to not
filter too aggressively nodes that are present in a path.

This causes a test failure that needs to be investigated.
parent 51a2fe5a
...@@ -18,12 +18,13 @@ add get ...@@ -18,12 +18,13 @@ add get
{-# OPTIONS_GHC -fno-warn-unused-top-binds #-} {-# OPTIONS_GHC -fno-warn-unused-top-binds #-}
{-# OPTIONS_GHC -fno-warn-deprecations #-} {-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE IncoherentInstances #-} {-# LANGUAGE IncoherentInstances #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
module Gargantext.API.Ngrams module Gargantext.API.Ngrams
...@@ -87,6 +88,9 @@ module Gargantext.API.Ngrams ...@@ -87,6 +88,9 @@ module Gargantext.API.Ngrams
, compute_new_state_patches , compute_new_state_patches
, PatchHistory(..) , PatchHistory(..)
, newNgramsFromNgramsStatePatch , newNgramsFromNgramsStatePatch
, filterNgramsNodes
, rootOfNgramsElement
, matchingNode
) )
where where
...@@ -112,7 +116,6 @@ import Gargantext.Prelude hiding (log, to, toLower, (%), isInfixOf) ...@@ -112,7 +116,6 @@ import Gargantext.Prelude hiding (log, to, toLower, (%), isInfixOf)
import Text.Collate qualified as Unicode import Text.Collate qualified as Unicode
{- {-
-- TODO sequences of modifications (Patchs) -- TODO sequences of modifications (Patchs)
type NgramsIdPatch = Patch NgramsId NgramsPatch type NgramsIdPatch = Patch NgramsId NgramsPatch
...@@ -422,6 +425,71 @@ dumpJsonTableMap fpath nodeId ngramsType = do ...@@ -422,6 +425,71 @@ dumpJsonTableMap fpath nodeId ngramsType = do
liftBase $ DTL.writeFile (unpack fpath) (DAT.encodeToLazyText m) liftBase $ DTL.writeFile (unpack fpath) (DAT.encodeToLazyText m)
pure () pure ()
-- | Filters the given `tableMap` with the search criteria. It returns
-- a set of 'NgramsElement' all matching the input 'NGramsSearchQuery'.
filterNgramsNodes :: Maybe ListType
-> Maybe MinSize
-> Maybe MaxSize
-> (NgramsTerm -> Bool)
-> Map NgramsTerm NgramsElement
-> Set NgramsElement
filterNgramsNodes listTy minSize maxSize searchFn tblMap =
Set.fromList $ concatMap (findRootPath tblMap) selectedNodes
where
allNodes = Set.fromList $ Map.elems tblMap
selectedNodes = Set.filter (matchingNode listTy minSize maxSize searchFn) allNodes
-- | Given the full forest of ngrams and the current element we are iterating on,
-- returns the full path of ngrams between this node and its /root/. This informs
-- us of all the nodes we have to keep in the final result set, because if we were
-- to filter them, we would be omitting important hierarchical information and this
-- will cause ngrams to not be displayed correctly on the frontend.
findRootPath :: Map NgramsTerm NgramsElement -> NgramsElement -> [NgramsElement]
findRootPath tblMap node = go node []
where
go current !acc =
case _ne_parent current >>= (`Map.lookup` tblMap) of
Nothing -> current : acc
Just parentNode -> go parentNode (current : acc)
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOfNgramsElement :: Maybe ListType
-> Map NgramsTerm NgramsElement
-> NgramsElement
-> NgramsElement
rootOfNgramsElement listType tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList listType) r
| otherwise
-> ne
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'.
matchingNode :: Maybe ListType
-> Maybe MinSize
-> Maybe MaxSize
-> (NgramsTerm -> Bool)
-> NgramsElement
-> Bool
matchingNode listType minSize maxSize searchQuery inputNode =
let nodeSize = inputNode ^. ne_size
matchesListType = maybe (const True) (==) listType
respectsMinSize = maybe (const True) ((<=) . getMinSize) minSize
respectsMaxSize = maybe (const True) ((>=) . getMaxSize) maxSize
in respectsMinSize nodeSize
&& respectsMaxSize nodeSize
&& searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- | TODO Errors management -- | TODO Errors management
-- TODO: polymorphic for Annuaire or Corpus or ... -- TODO: polymorphic for Annuaire or Corpus or ...
...@@ -438,42 +506,13 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement) ...@@ -438,42 +506,13 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement)
-> VersionedWithCount NgramsTable -> VersionedWithCount NgramsTable
searchTableNgrams versionedTableMap NgramsSearchQuery{..} = searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
let tableMap = versionedTableMap ^. v_data let tableMap = versionedTableMap ^. v_data
filteredData = filterNodes tableMap filteredData = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery tableMap
tableMapSorted = versionedTableMap tableMapSorted = versionedTableMap
& v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ filteredData) & v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ filteredData)
in toVersionedWithCount (Set.size filteredData) tableMapSorted in toVersionedWithCount (Set.size filteredData) tableMapSorted
where where
-- | Returns the \"root\" of the 'NgramsElement', or it falls back to the input
-- 'NgramsElement' itself, if no root can be found.
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOf :: Map NgramsTerm NgramsElement -> NgramsElement -> NgramsElement
rootOf tblMap ne = case ne ^. ne_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList _nsq_listType) r
| otherwise
-> ne
-- | Returns 'True' if the input 'NgramsElement' satisfies the search criteria
-- mandated by 'NgramsSearchQuery'.
matchingNode :: NgramsElement -> Bool
matchingNode inputNode =
let nodeSize = inputNode ^. ne_size
matchesListType = maybe (const True) (==) _nsq_listType
respectsMinSize = maybe (const True) ((<=) . getMinSize) _nsq_minSize
respectsMaxSize = maybe (const True) ((>=) . getMaxSize) _nsq_maxSize
in respectsMinSize nodeSize
&& respectsMaxSize nodeSize
&& _nsq_searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- Sorts the input 'NgramsElement' list. -- Sorts the input 'NgramsElement' list.
-- /IMPORTANT/: As we might be sorting ngrams in all sorts of language, -- /IMPORTANT/: As we might be sorting ngrams in all sorts of language,
-- some of them might include letters with accents and other unicode symbols, -- some of them might include letters with accents and other unicode symbols,
...@@ -489,14 +528,6 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} = ...@@ -489,14 +528,6 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams) ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams)
ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams) ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams)
-- | Filters the given `tableMap` with the search criteria. It returns
-- a set of 'NgramsElement' all matching the input 'NGramsSearchQuery'.
filterNodes :: Map NgramsTerm NgramsElement -> Set NgramsElement
filterNodes tblMap = Set.map (rootOf tblMap) selectedNodes
where
allNodes = Set.fromList $ Map.elems tblMap
selectedNodes = Set.filter matchingNode allNodes
-- | For each input root, extends its occurrence count with -- | For each input root, extends its occurrence count with
-- the information found in the subitems. -- the information found in the subitems.
withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
......
...@@ -5,9 +5,9 @@ import Prelude ...@@ -5,9 +5,9 @@ import Prelude
import Control.Lens import Control.Lens
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Map.Strict.Patch qualified as PM import Data.Set qualified as Set
import Data.Patch.Class (Replace(..))
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.API.Ngrams (filterNgramsNodes)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Types qualified as NT import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core import Gargantext.Core
...@@ -17,13 +17,11 @@ import Gargantext.Core.Types ...@@ -17,13 +17,11 @@ import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams) import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Schema.Context import Gargantext.Database.Schema.Context
import Test.Hspec
import Test.HUnit import Test.HUnit
import Test.Hspec
import Test.Instances () import Test.Instances ()
import Test.QuickCheck import Test.QuickCheck
import qualified Data.Patch.Class as Patch import Test.QuickCheck qualified as QC
import qualified Data.Validity as Validity
import qualified Test.QuickCheck as QC
genScientificText :: Gen T.Text genScientificText :: Gen T.Text
...@@ -97,65 +95,37 @@ tests = describe "Ngrams" $ do ...@@ -97,65 +95,37 @@ tests = describe "Ngrams" $ do
describe "docNgrams" $ do describe "docNgrams" $ do
it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch
describe "hierarchical grouping" $ do describe "hierarchical grouping" $ do
it "attaching a child with children to a parent should preserve ancestorship" testHierarchicalGrouping it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery
hierarchicalCorpus :: NgramsTableMap testFilterNgramsNodesEmptyQuery :: Assertion
hierarchicalCorpus = Map.fromList [ testFilterNgramsNodesEmptyQuery = do
( "car", NgramsRepoElement { _nre_size = 1 let input = Map.fromList [
, _nre_list = MapTerm ("car", NgramsElement { _ne_ngrams = "car"
, _nre_root = Nothing , _ne_size = 1
, _nre_parent = Nothing , _ne_list = MapTerm
, _nre_children = mSetFromList [ "Ford" ] , _ne_occurrences = mempty
}) , _ne_root = Just "vehicle"
, ( "Ford", NgramsRepoElement { _nre_size = 1 , _ne_parent = Just "vehicle"
, _nre_list = MapTerm , _ne_children = mSetFromList ["ford"]
, _nre_root = Just "car" })
, _nre_parent = Just "car" , ("ford", NgramsElement { _ne_ngrams = "ford"
, _nre_children = mempty , _ne_size = 1
}) , _ne_list = MapTerm
] , _ne_occurrences = mempty
, _ne_root = Just "vehicle"
patchedHierarchicalCorpus :: NgramsTableMap , _ne_parent = Just "car"
patchedHierarchicalCorpus = Map.fromList [ , _ne_children = mempty})
( "vehicle", NgramsRepoElement { _nre_size = 1 , ("vehicle", NgramsElement { _ne_ngrams = "vehicle"
, _nre_list = MapTerm , _ne_size = 1
, _nre_root = Nothing , _ne_list = MapTerm
, _nre_parent = Nothing , _ne_occurrences = mempty
, _nre_children = mSetFromList [ "car" ] , _ne_root = Nothing
}) , _ne_parent = Nothing
, ( "car", NgramsRepoElement { _nre_size = 1 , _ne_children = mSetFromList ["car"]})
, _nre_list = MapTerm ]
, _nre_root = Just "vehicle" let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input
, _nre_parent = Just "vehicle" actual @?= (Set.fromList $ Map.elems input)
, _nre_children = mSetFromList [ "Ford" ]
})
, ( "Ford", NgramsRepoElement { _nre_size = 1
, _nre_list = MapTerm
, _nre_root = Just "vehicle"
, _nre_parent = Just "car"
, _nre_children = mempty
})
]
patchHierarchical :: NgramsTablePatch
patchHierarchical = mkNgramsTablePatch $ Map.fromList [
(NgramsTerm "vehicle", NgramsPatch
{ _patch_children = PatchMSet
$ fst
$ PM.fromList
$ [ ( "car", addPatch ) ]
, _patch_list = Keep
}
)
]
testHierarchicalGrouping :: Assertion
testHierarchicalGrouping = do
-- Check the patch is applicable
Validity.validationIsValid (Patch.applicable patchHierarchical (Just hierarchicalCorpus)) @?= True
Patch.act patchHierarchical (Just hierarchicalCorpus) @?= Just patchedHierarchicalCorpus
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) = testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment