Commit 37979721 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Use DUCET sorting for ngrams elements

parent 1259eabd
Pipeline #5884 passed with stages
in 167 minutes and 16 seconds
......@@ -628,6 +628,7 @@ library
, tomland >= 1.3.3.2
, tuple ^>= 0.3.0.2
, unordered-containers ^>= 0.2.16.0
, unicode-collation >= 0.1.3.6
, uri-encode ^>= 1.5.0.7
, utf8-string ^>= 1.0.2
, uuid ^>= 1.3.15
......@@ -959,6 +960,7 @@ test-suite garg-test-tasty
, tmp-postgres >= 1.34.1 && < 1.35
, tree-diff
, unordered-containers ^>= 0.2.16.0
, unicode-collation >= 0.1.3.6
, validity ^>= 0.11.0.1
, vector ^>= 0.12.3.0
, wai
......
......@@ -120,6 +120,7 @@ import Gargantext.Prelude hiding (log, to, toLower, (%), isInfixOf)
import Gargantext.Prelude.Clock (hasTime, getTime)
import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
import Servant hiding (Patch)
import Text.Collate qualified as Unicode
{-
-- TODO sequences of modifications (Patchs)
......@@ -556,13 +557,21 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
&& _nsq_searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
-- Sorts the input 'NgramsElement' list.
-- /IMPORTANT/: As we might be sorting ngrams in all sorts of language,
-- some of them might include letters with accents and other unicode symbols,
-- but we need to filter those /diacritics/ out so that the sorting would
-- happen in the way users would expect. See ticket #331.
sortOnOrder :: Maybe OrderBy -> ([NgramsElement] -> [NgramsElement])
sortOnOrder Nothing = sortOnOrder (Just ScoreDesc)
sortOnOrder (Just TermAsc) = List.sortOn $ view ne_ngrams
sortOnOrder (Just TermDesc) = List.sortOn $ Down . view ne_ngrams
sortOnOrder (Just TermAsc) = List.sortBy ngramTermsAscSorter
sortOnOrder (Just TermDesc) = List.sortBy ngramTermsDescSorter
sortOnOrder (Just ScoreAsc) = List.sortOn $ view (ne_occurrences . to Set.size)
sortOnOrder (Just ScoreDesc) = List.sortOn $ Down . view (ne_occurrences . to Set.size)
ngramTermsAscSorter = on unicodeDUCETSorter (unNgramsTerm . view ne_ngrams)
ngramTermsDescSorter = on (\n1 n2 -> unicodeDUCETSorter n2 n1) (unNgramsTerm . view ne_ngrams)
-- | Filters the given `tableMap` with the search criteria. It returns
-- a set of 'NgramsElement' all matching the input 'NGramsSearchQuery'.
filterNodes :: Map NgramsTerm NgramsElement -> Set NgramsElement
......@@ -595,6 +604,13 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
. Set.toList
$ xs
-- | This function allows sorting two texts via their unicode sorting
-- (as opposed as the standard lexicographical sorting) by relying on
-- the DUCET table, a table that specifies the ordering of all unicode
-- characters. This is enough for mimicking the \"natural sort\" effect
-- that users would expect.
unicodeDUCETSorter :: Text -> Text -> Ordering
unicodeDUCETSorter = Unicode.collate Unicode.rootCollator
getTableNgrams :: forall env err m.
( HasNodeStory env err m
......
......@@ -10,6 +10,7 @@ import Data.Patch.Class qualified as Patch
import Data.String
import Data.Text qualified as T
import Data.Validity qualified as Validity
import Text.Collate qualified as Unicode
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Types.Main
......@@ -48,7 +49,9 @@ unitTests = testGroup "Query tests"
[ -- Sorting
testCase "Simple query mockFlatCorpus" testFlat01
, testCase "Simple query (desc sorting)" testFlat02
, testCase "[#331] Sort must ignore diacritics" testSortDiacritics
, testCase "[#331] sorting via DUCET works" testSortDiacriticsDucet
, testCase "[#331] Natural sort ascending works" testNaturalSortAsceding
, testCase "[#331] Natural sort descending works" testNaturalSortDescending
-- -- Filtering
, testCase "Simple query (listType = MapTerm)" testFlat03
, testCase "Simple query (listType = StopTerm)" testFlat04
......@@ -102,10 +105,40 @@ testFlat02 = do
, _nsq_searchQuery = mockQueryFn Nothing
}
testSortDiacritics :: Assertion
testSortDiacritics = do
testSortDiacriticsDucet :: Assertion
testSortDiacriticsDucet = do
let inputData = [ "étude", "âge", "vue", "période" ]
let expected = [ "âge", "étude", "période", "vue" ]
expected @??= sortBy (Unicode.collate Unicode.rootCollator) inputData
testNaturalSortAsceding :: Assertion
testNaturalSortAsceding = do
let res = searchTableNgrams frenchCorpus searchQuery
res @??= VersionedWithCount 0 4 ( NgramsTable $ map mkMapTerm [ "âge", "étude", "période", "vue" ])
where
frenchCorpus :: Versioned (Map NgramsTerm NgramsElement)
frenchCorpus = Versioned 0 $ Map.fromList [
( "doc_01", mkMapTerm "période")
, ( "doc_02", mkMapTerm "vue")
, ( "doc_03", mkMapTerm "âge")
, ( "doc_04", mkMapTerm "étude")
]
searchQuery = NgramsSearchQuery {
_nsq_limit = Limit 10
, _nsq_offset = Nothing
, _nsq_listType = Nothing
, _nsq_minSize = Nothing
, _nsq_maxSize = Nothing
, _nsq_orderBy = Just TermAsc
, _nsq_searchQuery = mockQueryFn Nothing
}
testNaturalSortDescending :: Assertion
testNaturalSortDescending = do
let res = searchTableNgrams frenchCorpus searchQuery
res @??= VersionedWithCount 0 4 ( NgramsTable $ map mkMapTerm [ "vue", "période", "étude", "âge" ])
where
frenchCorpus :: Versioned (Map NgramsTerm NgramsElement)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment