{-# LANGUAGE TypeApplications #-} module Test.Offline.Ngrams (tests) where import Prelude import Data.Text qualified as T import Gargantext.API.Ngrams.Types qualified as NT import Gargantext.Core import Gargantext.Core.Text.Terms.WithList import Gargantext.Core.Types import Gargantext.Database.Action.Flow.Utils (docNgrams) import Gargantext.Database.Schema.Context import Gargantext.Database.Admin.Types.Hyperdata import Test.Instances () import Test.QuickCheck import Test.Tasty import Test.Tasty.QuickCheck (testProperty) import Control.Lens import qualified Test.QuickCheck as QC import Gargantext.Core.Text.Terms.Mono (isSep) genScientificText :: Gen T.Text genScientificText = T.pack <$> listOf genScientificChar -- | Roughly simulate the kind of text we might find in scientific papers and their abstracts. genScientificChar :: Gen Char genScientificChar = frequency [ (10, choose ('a', 'z')) , (10, choose ('A', 'Z')) , (5, choose ('0', '9')) , (2, QC.elements ws) -- Whitespace , (2, QC.elements punctuation) -- Punctuation , (1, QC.elements "éèàçöñüßøåÆŒ") -- Diacritics , (1, QC.elements "αβγδΔθλμπσφχΩ≤≥≠≈") -- Greek/math ] ws :: String ws = " \t\n" punctuation :: String punctuation = ",.();:-" genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty genNgramsTermNonEmpty = do singleChar <- arbitrary `suchThat` (\x -> x /= ' ' && isAllowed x) txt <- filter isAllowed <$> listOf1 genScientificChar pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt) where isAllowed :: Char -> Bool isAllowed s = not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s) -- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two -- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that -- /might/ occasionally generate empty text fragments. newtype NgramsTermNonEmpty = NgramsTermNonEmpty { unNgramsTermNonEmpty :: T.Text } deriving (Eq, Show) instance Arbitrary NgramsTermNonEmpty where arbitrary = genNgramsTermNonEmpty data DocumentWithMatches = DocumentWithMatches { _dwm_terms :: NonEmptyList NgramsTermNonEmpty , _dwn_document :: ContextOnlyId HyperdataDocument } deriving (Show) -- | Generates a document where the title contains at least one -- of the generated terms. instance Arbitrary DocumentWithMatches where arbitrary = do generatedTerms <- arbitrary txtFragments <- listOf1 genScientificText mkText <- do sf <- shuffle txtFragments let txt' = (map unNgramsTermNonEmpty $ getNonEmpty generatedTerms) <> sf shuffle txt' contextOnlyDoc <- arbitrary let doc = _context_oid_hyperdata contextOnlyDoc let doc' = doc { _hd_title = Just $ T.intercalate " " mkText } let hyperDoc = contextOnlyDoc { _context_oid_hyperdata = doc' } pure $ DocumentWithMatches generatedTerms hyperDoc tests :: TestTree tests = testGroup "Ngrams" [ testGroup "buildPatterns internal correctness" [ testProperty "patterns, no matter how simple, can be searched" prop_patterns_internal_consistency ] , testGroup "buildPatternsWith" [ testProperty "return results for non-empty input terms" testBuildPatternsNonEmpty ] , testGroup "docNgrams" [ testProperty "always matches if the input text contains any of the terms" testDocNgramsOKMatch ] ] testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property testDocNgramsOKMatch lang (DocumentWithMatches ts doc) = let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts in conjoin [ counterexample "patterns empty" $ length (buildPatternsWith lang ts') > 0 , counterexample "termsInText empty" $ length (termsInText lang (buildPatternsWith lang ts') (doc ^. context_oid_hyperdata . hd_title . _Just)) > 0 , counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0 ] testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property testBuildPatternsNonEmpty lang ts = let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0