Commit d38dbd35 authored by Fabien Maniere

Revert "Merge branch 'revert-97441f3d' into 'dev'"

This reverts merge request !449
parent c767088d
Pipeline #7924 passed with stages in 41 minutes and 41 seconds
@@ -17,14 +17,12 @@ add get
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE IncoherentInstances #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Ngrams
(
@@ -114,7 +112,7 @@ import Gargantext.API.Ngrams.Types
import Gargantext.Core.NodeStory hiding (buildForest)
import Gargantext.Core.NodeStory qualified as NodeStory
import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType)
import Gargantext.Core.Types (ListType(..), NodeId, ListId, TODO, assertValid, ContextId, HasValidationError)
import Gargantext.Core.Types (ListType(..), NodeId, ListId, TODO, assertValid, HasValidationError)
import Gargantext.Core.Types.Query (Limit(..), Offset(..), MinSize(..), MaxSize(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast)
import Gargantext.Database.Prelude
@@ -458,14 +456,16 @@ buildForest = fmap (map (fmap snd)) . NodeStory.buildForest
-- This function doesn't aggregate information, but merely reconstructs the original
-- map without loss of information. To perform operations on the forest, use the appropriate
-- functions.
destroyForest :: Forest NgramsElement -> Map NgramsTerm NgramsElement
destroyForest f = Map.fromList . map (foldTree destroyTree) $ f
-- /NOTA BENE:/ We return a list and not a Map because we might have sorted the forest, and
-- converting into a map would trash the carefully-constructed sorting.
destroyForest :: Forest NgramsElement -> [(NgramsTerm, NgramsElement)]
destroyForest f = concatMap (map (\el -> (_ne_ngrams el, el)) . flatten) $ f
where
destroyTree :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> (NgramsTerm, NgramsElement)
destroyTree rootEl childrenEl = (_ne_ngrams rootEl, squashElements rootEl childrenEl)
-- _destroyTree :: NgramsElement -> [(NgramsTerm, [NgramsElement])] -> (NgramsTerm, [NgramsElement])
-- _destroyTree rootEl childrenEl = (_ne_ngrams rootEl, childrenEl)
squashElements :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> NgramsElement
squashElements r _ = r
-- _squashElements :: NgramsElement -> [(NgramsTerm, NgramsElement)] -> NgramsElement
-- _squashElements r _ = r
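-- Illustrative sketch (not part of the patch): for an acyclic table, the
-- association list produced by 'destroyForest' rebuilds exactly the map the
-- forest was built from; only the forest ordering is forgotten by the 'Map'.
_destroyForestRebuildsMap :: Map NgramsTerm NgramsElement -> Bool
_destroyForestRebuildsMap tableMap =
  case buildForest tableMap of
    Left _loopErr -> False  -- the input contained a parent/child loop
    Right forest  -> Map.fromList (destroyForest forest) == tableMap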
-- | TODO Errors management
-- TODO: polymorphic for Annuaire or Corpus or ...
@@ -482,59 +482,64 @@ searchTableNgrams :: Versioned (Map NgramsTerm NgramsElement)
-> Either BuildForestError (VersionedWithCount NgramsTable)
searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
let tableMap = versionedTableMap ^. v_data
in case buildForest tableMap of
in case keepRoots <$> buildForest tableMap of
Left err -> Left err
Right fs ->
let forestRoots = Set.fromList
. Map.elems
. destroyForest
. filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery
$ fs
let forestRoots = filterNgramsNodes _nsq_listType _nsq_minSize _nsq_maxSize _nsq_searchQuery $ fs
tableMapSorted = versionedTableMap
& v_data .~ (NgramsTable . sortAndPaginate . withInners tableMap $ forestRoots)
& v_data .~ (NgramsTable . map snd
. destroyForest
. sortAndPaginateForest _nsq_offset _nsq_limit _nsq_orderBy
. withInnersForest
$ forestRoots
)
in Right $ toVersionedWithCount (Set.size forestRoots) tableMapSorted
where
in Right $ toVersionedWithCount (length forestRoots) tableMapSorted
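-- | Keeps only the trees whose root element is a proper root of the table,
-- i.e. it carries no root reference or no parent reference.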
keepRoots :: Forest NgramsElement -> Forest NgramsElement
keepRoots = filter (\(Node r _) -> isNothing (_ne_root r) || isNothing (_ne_parent r))
-- Sorts the input 'NgramsElement' list.
-- /IMPORTANT/: As we might be sorting ngrams in all sorts of languages,
-- some of them might include letters with accents and other unicode symbols,
-- but we need to filter those /diacritics/ out so that the sorting would
-- happen in the way users would expect. See ticket #331.
sortOnOrder :: Maybe OrderBy -> ([NgramsElement] -> [NgramsElement])
sortOnOrder Nothing = sortOnOrder (Just ScoreDesc)
sortOnOrder (Just TermAsc) = List.sortBy ngramTermsAscSorter
sortOnOrder (Just TermDesc) = List.sortBy ngramTermsDescSorter
sortOnOrder (Just ScoreAsc) = List.sortOn $ view (ne_occurrences . to Set.size)
sortOnOrder (Just ScoreDesc) = List.sortOn $ Down . view (ne_occurrences . to Set.size)
ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams)
ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams)
-- | For each input root, extends its occurrence count with
-- the information found in the subitems.
withInners :: Map NgramsTerm NgramsElement -> Set NgramsElement -> Set NgramsElement
withInners tblMap roots = Set.map addSubitemsOccurrences roots
-- | For each input root, extends its occurrence count with
-- the information found in the subforest.
withInnersForest :: Forest NgramsElement -> Forest NgramsElement
withInnersForest = map sumSubitemsOccurrences
where
addSubitemsOccurrences :: NgramsElement -> NgramsElement
addSubitemsOccurrences e =
e { _ne_occurrences = foldl' alterOccurrences (e ^. ne_occurrences) (e ^. ne_children) }
alterOccurrences :: Set ContextId -> NgramsTerm -> Set ContextId
alterOccurrences occs t = case Map.lookup t tblMap of
Nothing -> occs
Just e' -> occs <> e' ^. ne_occurrences
-- | Paginate the results
sortAndPaginate :: Set NgramsElement -> [NgramsElement]
sortAndPaginate xs =
let offset' = getOffset $ maybe 0 identity _nsq_offset
in take (getLimit _nsq_limit)
sumSubitemsOccurrences :: Tree NgramsElement -> Tree NgramsElement
sumSubitemsOccurrences (Node root children) =
let children' = withInnersForest children
root' = root { _ne_occurrences = (_ne_occurrences root) <> foldMap (_ne_occurrences . rootLabel) children' }
in Node root' children'
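-- A minimal standalone sketch (assumed names, not part of the patch) of the
-- same aggregation pattern on plain Ints: each node absorbs the already
-- aggregated values of its direct children, so counts propagate from the
-- leaves all the way up to the roots, as the occurrence sets do above.
_sumInnersForest :: Forest Int -> Forest Int
_sumInnersForest = map go
  where
    go (Node x children) =
      let children' = _sumInnersForest children
      in Node (x + sum (map rootLabel children')) children'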
sortAndPaginateForest :: Maybe Offset
-> Limit
-> Maybe OrderBy
-> Forest NgramsElement
-> Forest NgramsElement
sortAndPaginateForest mb_offset limit orderBy xs =
let offset' = getOffset $ maybe 0 identity mb_offset
in take (getLimit limit)
. drop offset'
. sortOnOrder _nsq_orderBy
. Set.toList
. sortOnOrderForest orderBy
$ xs
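-- Hypothetical walk-through (not part of the patch): with Offset 10 and
-- Limit 5 the pipeline above keeps the roots ranked 11..15 once sorted,
-- exactly like the plain-list analogue take 5 (drop 10 [1..26]) == [11..15].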
-- Sorts the input 'NgramsElement' list.
-- /IMPORTANT/: As we might be sorting ngrams in all sorts of languages,
-- some of them might include letters with accents and other unicode symbols,
-- but we need to filter those /diacritics/ out so that the sorting would
-- happen in the way users would expect. See ticket #331.
sortOnOrderForest :: Maybe OrderBy -> (Forest NgramsElement -> Forest NgramsElement)
sortOnOrderForest Nothing = sortOnOrderForest (Just ScoreDesc)
sortOnOrderForest (Just TermAsc) = List.sortBy (\(Node t1 _) (Node t2 _) -> ngramTermsAscSorter t1 t2)
sortOnOrderForest (Just TermDesc) = List.sortBy (\(Node t1 _) (Node t2 _) -> ngramTermsDescSorter t1 t2)
sortOnOrderForest (Just ScoreAsc) = List.sortOn $ \(Node root _) -> root ^. (ne_occurrences . to Set.size)
sortOnOrderForest (Just ScoreDesc) = List.sortOn $ Down . (\(Node root _) -> root ^. (ne_occurrences . to Set.size))
ngramTermsAscSorter :: NgramsElement -> NgramsElement -> Ordering
ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams)
ngramTermsDescSorter :: NgramsElement -> NgramsElement -> Ordering
ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams)
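-- Illustration (hypothetical values, not part of the patch): DUCET-based
-- collation interleaves accented terms with their unaccented neighbours
-- instead of pushing them past the whole ASCII range, e.g. sorting
-- ["zebra", "étude", "energy"] with 'unicodeDUCETSorter' yields
-- ["energy", "étude", "zebra"], whereas plain codepoint ordering would
-- put "étude" last.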
-- | This function allows sorting two texts via their Unicode collation order
-- (as opposed to the standard lexicographical sorting) by relying on
-- the DUCET table, a table that specifies the ordering of all unicode
......
@@ -214,22 +214,23 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
eRes <- runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv
eRes `shouldSatisfy` isRight
let (Right res) = eRes
-- /NOTA BENE/ The count is 1 because the count applies to roots only.
Just res `shouldBe` JSON.decode [json| {"version":5
,"count":3
,"count":1
,"data":[
{"ngrams":"guitar pedals"
{"ngrams":"overdrives"
,"size":1
,"list":"MapTerm"
,"root":"overdrives"
,"parent":"overdrives"
,"occurrences":[]
,"children":["tube screamers"]
,"children":["guitar pedals"]
},
{"ngrams":"overdrives"
{"ngrams":"guitar pedals"
,"size":1
,"list":"MapTerm"
,"root":"overdrives"
,"parent":"overdrives"
,"occurrences":[]
,"children":["guitar pedals"]
,"children":["tube screamers"]
},
{"ngrams":"tube screamers"
,"size":1
@@ -309,16 +310,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
-- check that new term is parent of old one
checkNgrams getNgrams [json| {"version": 2
,"count":2
,"count":1
,"data":[
{"ngrams":"abelian group"
,"size":2
,"list":"MapTerm"
,"root": "new abelian group"
,"parent": "new abelian group"
,"occurrences":[]
,"children":[]
},
{"ngrams":"new abelian group"
,"size":1
,"list":"MapTerm"
@@ -326,6 +319,14 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
,"parent":null
,"occurrences":[]
,"children":["abelian group"]
},
{"ngrams":"abelian group"
,"size":2
,"list":"MapTerm"
,"root": "new abelian group"
,"parent": "new abelian group"
,"occurrences":[]
,"children":[]
}
]
}
@@ -341,16 +342,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
-- In essence, this JSON needs to be exactly the same as the previous one,
-- i.e. what is important is that the topology doesn't change.
checkNgrams getNgrams [json| {"version": 2
,"count":2
,"count":1
,"data":[
{"ngrams":"abelian group"
,"size":2
,"list":"MapTerm"
,"root": "new abelian group"
,"parent": "new abelian group"
,"occurrences":[]
,"children":[]
},
{"ngrams":"new abelian group"
,"size":1
,"list":"MapTerm"
@@ -358,6 +351,14 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
,"parent":null
,"occurrences":[]
,"children":["abelian group"]
},
{"ngrams":"abelian group"
,"size":2
,"list":"MapTerm"
,"root": "new abelian group"
,"parent": "new abelian group"
,"occurrences":[]
,"children":[]
}
]
}
......
@@ -768,7 +768,7 @@ genCorpusWithMatchingElement = do
depth <- choose (1, 5)
let mkEntry = do
trm <- arbitrary
el <- over ne_children (breakLoop trm) <$> (resize depth arbitrary)
el <- over ne_children (breakLoop trm) . makeItRoot <$> (resize depth arbitrary)
pure (trm, el { _ne_ngrams = trm })
-- Let's build the map first, so that duplicates will be overwritten.
fullMap <- (Map.fromList <$> vectorOf depth mkEntry) `suchThat` (\x -> isRight (buildForest x)) -- exclude loops
@@ -778,6 +778,10 @@ genCorpusWithMatchingElement = do
breakLoop :: NgramsTerm -> MSet NgramsTerm -> MSet NgramsTerm
breakLoop t = mSetFromSet . Set.delete t . mSetToSet
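-- | Clears the root and parent references of a generated element,
-- presumably so that the element the property later searches for is
-- guaranteed to surface among the forest roots kept by 'searchTableNgrams'.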
makeItRoot :: NgramsElement -> NgramsElement
makeItRoot ne = ne & ne_root .~ Nothing
& ne_parent .~ Nothing
instance Arbitrary AcyclicTableMap where
arbitrary = genCorpusWithMatchingElement
shrink = shrinkTree
......
@@ -236,7 +236,7 @@ testForestSearchProp :: Property
testForestSearchProp = forAll arbitrary $ \(AcyclicTableMap ngramsTable el) -> do
case searchTableNgrams (Versioned 0 ngramsTable) (searchQuery el) of
Left (BFE_loop_detected err) -> fail (T.unpack $ renderLoop err)
Right res -> res ^. vc_data `shouldSatisfy` (elem (_ne_ngrams el) . map _ne_ngrams . getNgramsTable)
Right res -> res ^. vc_data `shouldSatisfy` (any (containsTerm (_ne_ngrams el)) . getNgramsTable)
where
searchQuery term = NgramsSearchQuery {
_nsq_limit = Limit 5
@@ -255,7 +255,9 @@ testSearchNestedTerms :: Assertion
testSearchNestedTerms = do
case searchTableNgrams (Versioned 0 hierarchicalTableMap) searchQuery of
Left (BFE_loop_detected err) -> fail (T.unpack $ renderLoop err)
Right res -> res ^. vc_data `shouldSatisfy` (elem "ford" . map _ne_ngrams . getNgramsTable)
Right res ->
-- it should appear at the top level or as one of the children.
res ^. vc_data `shouldSatisfy` (any (containsTerm "ford") . getNgramsTable)
where
searchQuery = NgramsSearchQuery {
_nsq_limit = Limit 5
@@ -267,6 +269,11 @@ testSearchNestedTerms = do
, _nsq_searchQuery = mockQueryFn (Just "ford")
}
-- | Returns True if the input 'NgramsElement' contains (either in the root or in the children)
-- the input term.
containsTerm :: NgramsTerm -> NgramsElement -> Bool
containsTerm t (NgramsElement{..}) = _ne_ngrams == t || any ((==) t) (mSetToList _ne_children)
-- Pagination tests
test_pagination_allTerms :: Assertion
@@ -390,7 +397,7 @@ test_paginationQuantum = do
Left err -> fail (show err)
Right res -> do
let elems = coerce @NgramsTable @[NgramsElement] $ _vc_data res
length elems @?= 10
countRoots elems @?= 10
forM_ elems $ \term ->
assertBool ("found " <> show (_ne_list term) <> " in: " <> show elems) (_ne_list term == MapTerm)
where
@@ -404,13 +411,20 @@ test_paginationQuantum = do
, _nsq_searchQuery = mockQueryFn Nothing
}
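-- | Counts the elements that are forest roots (no root or no parent
-- reference): the paginated table now returns each root together with its
-- children, so only the roots should match the requested page size.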
countRoots :: [NgramsElement] -> Int
countRoots [] = 0
countRoots (x:xs) =
if isNothing (_ne_root x) || isNothing (_ne_parent x)
then 1 + countRoots xs
else countRoots xs
test_paginationQuantum_02 :: Assertion
test_paginationQuantum_02 = do
case searchTableNgrams quantumComputingCorpus searchQuery of
Left err -> fail (show err)
Right res -> do
let elems = coerce @NgramsTable @[NgramsElement] $ _vc_data res
assertBool ("found only " <> show (length elems) <> " in: " <> show elems) (length elems == 10)
assertBool ("found only " <> show (length elems) <> " in: " <> show elems) (countRoots elems == 10)
where
searchQuery = NgramsSearchQuery {
_nsq_limit = Limit 10
......
@@ -269,7 +269,7 @@ testBuildNgramsTree_03 =
-- /PRECONDITION/: The '_ne_ngrams' field always matches the 'NgramsTerm' key of the map.
buildDestroyForestRoundtrips :: AcyclicTableMap -> Property
buildDestroyForestRoundtrips (AcyclicTableMap mp _) =
(destroyForest . buildForestOrFail $ mp) === mp
(Map.fromList . destroyForest . buildForestOrFail $ mp) === mp
testPruningNgramsForest_01 :: Property
testPruningNgramsForest_01 =
......