{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE ViewPatterns #-} module Test.Offline.Ngrams (tests) where import Prelude import Control.Lens import Data.Char (isSpace) import Data.Char qualified as Char import Data.Map.Strict (Map) import Data.Map.Strict qualified as Map import Data.Text qualified as T import Data.Tree import Gargantext.API.Ngrams (filterNgramsNodes, buildForest, destroyForest, pruneForest) import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types qualified as NT import Gargantext.Core import Gargantext.Core.Text.Terms.Mono (isSep) import Gargantext.Core.Text.Terms.WithList import Gargantext.Core.Types import Gargantext.Database.Action.Flow.Utils (docNgrams) import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Schema.Context import Test.HUnit import Test.Hspec import Test.Hspec.QuickCheck (prop) import Test.Instances () import Test.Ngrams.Query (mkMapTerm) import Test.QuickCheck import Test.QuickCheck qualified as QC import Text.RawString.QQ (r) genScientificText :: Gen T.Text genScientificText = T.pack <$> listOf genScientificChar -- | Roughly simulate the kind of text we might find in scientific papers and their abstracts. genScientificChar :: Gen Char genScientificChar = frequency [ (10, choose ('a', 'z')) , (10, choose ('A', 'Z')) , (5, choose ('0', '9')) , (2, QC.elements ws) -- Whitespace , (2, QC.elements punctuation) -- Punctuation , (1, QC.elements "éèàçöñüßøåÆŒ") -- Diacritics , (1, QC.elements "αβγδΔθλμπσφχΩ≤≥≠≈") -- Greek/math ] ws :: String ws = " \t\n" punctuation :: String punctuation = ",.();:-" genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty genNgramsTermNonEmpty = do singleChar <- arbitrary `suchThat` isAllowed txt <- filter isAllowed <$> listOf1 genScientificChar pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt) where isAllowed :: Char -> Bool isAllowed s = not (Char.isSpace s) && not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s) -- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two -- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that -- /might/ occasionally generate empty text fragments. newtype NgramsTermNonEmpty = NgramsTermNonEmpty { unNgramsTermNonEmpty :: T.Text } deriving (Eq, Show) instance Arbitrary NgramsTermNonEmpty where arbitrary = genNgramsTermNonEmpty data DocumentWithMatches = DocumentWithMatches { _dwm_terms :: NonEmptyList NgramsTermNonEmpty , _dwn_document :: ContextOnlyId HyperdataDocument } deriving (Show) -- | Generates a document where the title contains at least one -- of the generated terms. instance Arbitrary DocumentWithMatches where arbitrary = do generatedTerms <- arbitrary txtFragments <- listOf1 genScientificText mkText <- do sf <- shuffle txtFragments let txt' = (map unNgramsTermNonEmpty $ getNonEmpty generatedTerms) <> sf shuffle txt' contextOnlyDoc <- arbitrary let doc = _context_oid_hyperdata contextOnlyDoc let doc' = doc { _hd_title = Just $ T.intercalate " " mkText } let hyperDoc = contextOnlyDoc { _context_oid_hyperdata = doc' } pure $ DocumentWithMatches generatedTerms hyperDoc tests :: Spec tests = describe "Ngrams" $ do describe "buildPatterns internal correctness" $ do it "patterns, no matter how simple, can be searched" $ property prop_patterns_internal_consistency describe "buildPatternsWith" $ do it "return results for non-empty input terms" $ property testBuildPatternsNonEmpty describe "docNgrams" $ do it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch describe "ngram forests" $ do it "building a simple tree works" testBuildNgramsTree_01 it "building a complex tree works" testBuildNgramsTree_02 it "building a complex deep tree works" testBuildNgramsTree_03 it "pruning a simple tree works" testPruningNgramsForest_01 it "pruning a complex tree works" testPruningNgramsForest_02 prop "destroyForest . buildForest === id" buildDestroyForestRoundtrips describe "hierarchical grouping" $ do it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery hierarchicalTableMap :: Map NgramsTerm NgramsElement hierarchicalTableMap = Map.fromList [ ("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"]) , ("car", mkMapTerm "car" & ne_root .~ Just "vehicle" & ne_parent .~ Just "vehicle" & ne_children .~ mSetFromList ["ford"]) , ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle" & ne_parent .~ Just "car") ] testFilterNgramsNodesEmptyQuery :: Assertion testFilterNgramsNodesEmptyQuery = do let input = hierarchicalTableMap let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input actual @?= input testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property testDocNgramsOKMatch lang (DocumentWithMatches ts doc) = let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts in conjoin [ counterexample "patterns empty" $ length (buildPatternsWith lang ts') > 0 , counterexample "termsInText empty" $ length (termsInText lang (buildPatternsWith lang ts') (doc ^. context_oid_hyperdata . hd_title . _Just)) > 0 , counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0 ] testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property testBuildPatternsNonEmpty lang ts = let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0 newtype ASCIIForest = ASCIIForest String deriving Eq instance Show ASCIIForest where show (ASCIIForest x) = x compareForestVisually :: Forest NgramsElement -> String -> Property compareForestVisually f expected = let actual = init $ drawForest (map (fmap renderEl) f) outermostIndentation = T.length . T.takeWhile isSpace . T.dropWhile (=='\n') . T.pack $ expected in ASCIIForest actual === ASCIIForest (sanitiseDrawing outermostIndentation expected) where renderEl :: NgramsElement -> String renderEl = T.unpack . unNgramsTerm . _ne_ngrams toTextPaths :: String -> [T.Text] toTextPaths = T.splitOn "\n" . T.strip . T.pack sanitiseDrawing :: Int -> String -> String sanitiseDrawing outermostIndentation = let dropLayout t = case T.uncons t of Just (' ', _) -> T.drop outermostIndentation t _ -> t -- leave it be in T.unpack . T.unlines . map dropLayout . toTextPaths testBuildNgramsTree_01 :: Property testBuildNgramsTree_01 = let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"]) , ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo") ] in (buildForest t1) `compareForestVisually` [r| bar foo | `- bar |] testBuildNgramsTree_02 :: Property testBuildNgramsTree_02 = buildForest hierarchicalTableMap `compareForestVisually` [r| car | `- ford ford vehicle | `- car | `- ford |] testBuildNgramsTree_03 :: Property testBuildNgramsTree_03 = let input = Map.fromList [ ("animalia", mkMapTerm "animalia" & ne_children .~ mSetFromList ["chordata"]) , ("chordata", mkMapTerm "chordata" & ne_root .~ Just "animalia" & ne_parent .~ Just "animalia" & ne_children .~ mSetFromList ["mammalia"]) , ("mammalia", mkMapTerm "mammalia" & ne_root .~ Just "animalia" & ne_parent .~ Just "chordata" & ne_children .~ mSetFromList ["carnivora", "primates"] ) , ("carnivora", mkMapTerm "carnivora" & ne_root .~ Just "animalia" & ne_parent .~ Just "mammalia" & ne_children .~ mSetFromList ["felidae"] ) , ("felidae", mkMapTerm "felidae" & ne_root .~ Just "animalia" & ne_parent .~ Just "carnivora" & ne_children .~ mSetFromList ["panthera"] ) , ("panthera", mkMapTerm "panthera" & ne_root .~ Just "animalia" & ne_parent .~ Just "felidae" & ne_children .~ mSetFromList ["panthera leo", "panthera tigris"] ) , ("panthera leo", mkMapTerm "panthera leo" & ne_root .~ Just "animalia" & ne_parent .~ Just "pathera" ) , ("panthera tigris", mkMapTerm "panthera tigris" & ne_root .~ Just "animalia" & ne_parent .~ Just "panthera" ) , ("panthera tigris", mkMapTerm "panthera tigris" & ne_root .~ Just "animalia" & ne_parent .~ Just "panthera" ) , ("primates", mkMapTerm "primates" & ne_root .~ Just "animalia" & ne_parent .~ Just "mammalia" & ne_children .~ mSetFromList ["hominidae"] ) , ("hominidae", mkMapTerm "hominidae" & ne_root .~ Just "animalia" & ne_parent .~ Just "primates" & ne_children .~ mSetFromList ["homo"] ) , ("homo", mkMapTerm "homo" & ne_root .~ Just "animalia" & ne_parent .~ Just "hominidae" & ne_children .~ mSetFromList ["homo sapiens"] ) , ("homo sapies", mkMapTerm "homo sapiens" & ne_root .~ Just "animalia" & ne_parent .~ Just "homo" ) ] in pruneForest (buildForest input) `compareForestVisually` [r| animalia | `- chordata | `- mammalia | +- carnivora | | | `- felidae | | | `- panthera | | | +- panthera leo | | | `- panthera tigris | `- primates | `- hominidae | `- homo |] newtype TableMapLockStep = TableMapLockStep { getTableMap :: Map NgramsTerm NgramsElement } deriving (Show, Eq) instance Arbitrary TableMapLockStep where arbitrary = do pairs <- map (\(k,v) -> (k, v & ne_ngrams .~ k)) <$> arbitrary pure $ TableMapLockStep (Map.fromList pairs) -- /PRECONDITION/: The '_ne_ngrams' field always matches the 'NgramsTerm', key of the map. buildDestroyForestRoundtrips :: TableMapLockStep -> Property buildDestroyForestRoundtrips (TableMapLockStep mp) = (destroyForest . buildForest $ mp) === mp testPruningNgramsForest_01 :: Property testPruningNgramsForest_01 = let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"]) , ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo") ] in (pruneForest $ buildForest t1) `compareForestVisually` [r| foo | `- bar |] testPruningNgramsForest_02 :: Property testPruningNgramsForest_02 = (pruneForest $ buildForest hierarchicalTableMap) `compareForestVisually` [r| vehicle | `- car | `- ford |]