Commit 2e282303 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Tighten generator for NgramsTermNonEmpty

Previously the generator was generating all sorts of unicode symbols,
which doesn't play well for things like tab separators, carriage returns
and other things.

Furthermore, we need to be careful to not use the same symbol set of
`isSep` when we generate terms, because we are simulating an ngrams
search in a document and ngrams do not contain those separators
(i.e. `k2(` is not a valid ngram, but `k2` is).
parent d8bdcdee
{-# LANGUAGE TypeApplications #-}
module Test.Offline.Ngrams (tests) where
import Prelude
......@@ -5,13 +6,49 @@ import Prelude
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Admin.Types.Hyperdata
import Test.Instances ()
import Test.QuickCheck
import Test.Tasty
import Test.Tasty.QuickCheck (testProperty)
import Control.Lens
import qualified Test.QuickCheck as QC
import Gargantext.Core.Text.Terms.Mono (isSep)
genScientificText :: Gen T.Text
genScientificText = T.pack <$> listOf genScientificChar
-- | Roughly simulate the kind of text we might find in scientific papers and their abstracts.
genScientificChar :: Gen Char
genScientificChar = frequency
[ (10, choose ('a', 'z'))
, (10, choose ('A', 'Z'))
, (5, choose ('0', '9'))
, (2, QC.elements ws) -- Whitespace
, (2, QC.elements punctuation) -- Punctuation
, (1, QC.elements "éèàçöñüßøåÆŒ") -- Diacritics
, (1, QC.elements "αβγδΔθλμπσφχΩ≤≥≠≈") -- Greek/math
]
ws :: String
ws = " \t\n"
punctuation :: String
punctuation = ",.();:-"
genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty
genNgramsTermNonEmpty = do
singleChar <- arbitrary `suchThat` (\x -> x /= ' ' && isAllowed x)
txt <- filter isAllowed <$> listOf1 genScientificChar
pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt)
where
isAllowed :: Char -> Bool
isAllowed s = not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s)
-- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two
-- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that
......@@ -20,19 +57,54 @@ newtype NgramsTermNonEmpty = NgramsTermNonEmpty { unNgramsTermNonEmpty :: T.Text
deriving (Eq, Show)
instance Arbitrary NgramsTermNonEmpty where
arbitrary = genNgramsTermNonEmpty
data DocumentWithMatches =
DocumentWithMatches
{ _dwm_terms :: NonEmptyList NgramsTermNonEmpty
, _dwn_document :: ContextOnlyId HyperdataDocument
} deriving (Show)
-- | Generates a document where the title contains at least one
-- of the generated terms.
instance Arbitrary DocumentWithMatches where
arbitrary = do
singleChar <- arbitrary `suchThat` ((/=) ' ')
txt <- arbitrary
pure $ NgramsTermNonEmpty $ (T.singleton singleChar <> txt)
generatedTerms <- arbitrary
txtFragments <- listOf1 genScientificText
mkText <- do
sf <- shuffle txtFragments
let txt' = (map unNgramsTermNonEmpty $ getNonEmpty generatedTerms) <> sf
shuffle txt'
contextOnlyDoc <- arbitrary
let doc = _context_oid_hyperdata contextOnlyDoc
let doc' = doc { _hd_title = Just $ T.intercalate " " mkText }
let hyperDoc = contextOnlyDoc { _context_oid_hyperdata = doc' }
pure $ DocumentWithMatches generatedTerms hyperDoc
tests :: TestTree
tests = testGroup "Ngrams" [
testGroup "docNgrams" [
testProperty "return results for non-empty input ngrams" testDocNgramsWithTerms
testGroup "buildPatterns internal correctness" [
testProperty "patterns, no matter how simple, can be searched" prop_patterns_internal_consistency
]
, testGroup "buildPatternsWith" [
testProperty "return results for non-empty input terms" testBuildPatternsNonEmpty
]
, testGroup "docNgrams" [
testProperty "always matches if the input text contains any of the terms" testDocNgramsOKMatch
]
]
testDocNgramsWithTerms :: Lang -> NonEmptyList NgramsTermNonEmpty -> ContextOnlyId HyperdataDocument -> Property
testDocNgramsWithTerms lang ts doc =
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in conjoin [
counterexample "patterns empty" $ length (buildPatternsWith lang ts') > 0
, counterexample "termsInText empty" $ length (termsInText lang (buildPatternsWith lang ts') (doc ^. context_oid_hyperdata . hd_title . _Just)) > 0
, counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0
]
testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment