Commit 4ceffa69 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'adinapoli/issue-395' into 'dev'

Fix a bug in `buildPatterns` and friends

Closes #395

See merge request !413
parents f5f0ae1b 1642c59b
Pipeline #7629 failed with stages
in 128 minutes and 4 seconds
......@@ -860,6 +860,7 @@ test-suite garg-test-tasty
Test.Ngrams.Terms
Test.Offline.Errors
Test.Offline.JSON
Test.Offline.Ngrams
Test.Offline.Phylo
Test.Offline.Stemming.Lancaster
Test.Parsers.Date
......
......@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1.
-}
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words, isSep)
where
import Data.List qualified as L
......
......@@ -14,14 +14,29 @@ commentary with @some markup@.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList where
module Gargantext.Core.Text.Terms.WithList (
termsInText
, buildPatterns
, Patterns
, Pattern(_pat_length, _pat_terms)
, MatchedText
, buildPatternsWith
, extractTermsWithList
, extractTermsWithList'
-- * Properties
, prop_patterns_internal_consistency
) where
import Prelude (show)
import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List
import Data.Ord
import Data.Text (concat)
import Data.Text qualified as Text
import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context
......@@ -29,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat)
import GHC.Exts (sortWith)
import Test.QuickCheck (Property, counterexample, conjoin, (===))
------------------------------------------------------------------------
-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern
{ _pat_table :: !(KMP.Table Text)
{ -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
-- To not be leaked outside.
_pat_table :: !(KMP.Table Text)
, _pat_length :: !Int
, _pat_terms :: ![Text]
}
instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
type Patterns = [Pattern]
------------------------------------------------------------------------
......@@ -68,19 +90,27 @@ replaceTerms rplaceTerms pats terms = go 0
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\k -> (Text.chunksOf 1 $ unNgramsTerm k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1 k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
where
buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
where
f alt | "" `elem` alt = errorTrace ("buildPatterns: ERR1" <> show(label))
| null alt = errorTrace "buildPatterns: ERR2"
| otherwise =
Pattern (KMP.build alt) (length alt) label
--(Terms label $ Set.empty) -- TODO check stems
buildPattern :: Label -> [MultiTerm] -> [Pattern]
buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)
mkPattern :: Label -> [Text] -> Maybe Pattern
mkPattern label alt
| "" `elem` alt = Nothing
| null alt = Nothing
| otherwise = Just $
-- /IMPORTANT/ The call to 'toLower' is crucial, because it means
-- we will be building a pattern that will match lowercase terms,
-- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
--(Terms label $ Set.empty) -- TODO check stems
--------------------------------------------------------------------------
......@@ -110,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = Text.unwords . (Text.chunksOf 1)
addSpaces = T.unwords . (T.chunksOf 1)
--------------------------------------------------------------------------
......@@ -134,3 +164,12 @@ filterWith' termList f f' xs = f' xs
where
pats = buildPatterns termList
-}
prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
conjoin [
counterexample "{\RS1" $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
, counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
, counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
, counterexample "no match" $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
]
......@@ -61,6 +61,9 @@ data ContextPolyOnlyId id hyperdata =
, _context_oid_hyperdata :: !hyperdata }
deriving (Show, Generic)
instance (Arbitrary id, Arbitrary hyperdata) => Arbitrary (ContextPolyOnlyId id hyperdata) where
arbitrary = ContextOnlyId <$> arbitrary <*> arbitrary
------------------------------------------------------------------------
-- Automatic instances derivation
$(deriveJSON (unPrefix "_context_oid_") ''ContextPolyOnlyId)
......
{-# LANGUAGE TypeApplications #-}
module Test.Offline.Ngrams (tests) where
import Prelude
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Admin.Types.Hyperdata
import Test.Instances ()
import Test.QuickCheck
import Test.Tasty
import Test.Tasty.QuickCheck (testProperty)
import Control.Lens
import qualified Test.QuickCheck as QC
import Gargantext.Core.Text.Terms.Mono (isSep)
genScientificText :: Gen T.Text
genScientificText = T.pack <$> listOf genScientificChar
-- | Roughly simulate the kind of text we might find in scientific papers and their abstracts.
genScientificChar :: Gen Char
genScientificChar = frequency
[ (10, choose ('a', 'z'))
, (10, choose ('A', 'Z'))
, (5, choose ('0', '9'))
, (2, QC.elements ws) -- Whitespace
, (2, QC.elements punctuation) -- Punctuation
, (1, QC.elements "éèàçöñüßøåÆŒ") -- Diacritics
, (1, QC.elements "αβγδΔθλμπσφχΩ≤≥≠≈") -- Greek/math
]
ws :: String
ws = " \t\n"
punctuation :: String
punctuation = ",.();:-"
genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty
genNgramsTermNonEmpty = do
singleChar <- arbitrary `suchThat` (\x -> x /= ' ' && isAllowed x)
txt <- filter isAllowed <$> listOf1 genScientificChar
pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt)
where
isAllowed :: Char -> Bool
isAllowed s = not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s)
-- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two
-- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that
-- /might/ occasionally generate empty text fragments.
newtype NgramsTermNonEmpty = NgramsTermNonEmpty { unNgramsTermNonEmpty :: T.Text }
deriving (Eq, Show)
instance Arbitrary NgramsTermNonEmpty where
arbitrary = genNgramsTermNonEmpty
data DocumentWithMatches =
DocumentWithMatches
{ _dwm_terms :: NonEmptyList NgramsTermNonEmpty
, _dwn_document :: ContextOnlyId HyperdataDocument
} deriving (Show)
-- | Generates a document where the title contains at least one
-- of the generated terms.
instance Arbitrary DocumentWithMatches where
arbitrary = do
generatedTerms <- arbitrary
txtFragments <- listOf1 genScientificText
mkText <- do
sf <- shuffle txtFragments
let txt' = (map unNgramsTermNonEmpty $ getNonEmpty generatedTerms) <> sf
shuffle txt'
contextOnlyDoc <- arbitrary
let doc = _context_oid_hyperdata contextOnlyDoc
let doc' = doc { _hd_title = Just $ T.intercalate " " mkText }
let hyperDoc = contextOnlyDoc { _context_oid_hyperdata = doc' }
pure $ DocumentWithMatches generatedTerms hyperDoc
tests :: TestTree
tests = testGroup "Ngrams" [
testGroup "buildPatterns internal correctness" [
testProperty "patterns, no matter how simple, can be searched" prop_patterns_internal_consistency
]
, testGroup "buildPatternsWith" [
testProperty "return results for non-empty input terms" testBuildPatternsNonEmpty
]
, testGroup "docNgrams" [
testProperty "always matches if the input text contains any of the terms" testDocNgramsOKMatch
]
]
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in conjoin [
counterexample "patterns empty" $ length (buildPatternsWith lang ts') > 0
, counterexample "termsInText empty" $ length (termsInText lang (buildPatternsWith lang ts') (doc ^. context_oid_hyperdata . hd_title . _Just)) > 0
, counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0
]
testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
......@@ -28,6 +28,7 @@ import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Ngrams.Terms as NgramsTerms
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Ngrams as Ngrams
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
......@@ -73,6 +74,7 @@ main = do
, CorpusQuery.tests
, TSVParser.tests
, JSON.tests
, Ngrams.tests
, Errors.tests
, similaritySpec
, Phylo.tests
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment