Commit 4ceffa69 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'adinapoli/issue-395' into 'dev'

Fix a bug in `buildPatterns` and friends

Closes #395

See merge request !413
parents f5f0ae1b 1642c59b
Pipeline #7629 failed with stages
in 128 minutes and 4 seconds
...@@ -860,6 +860,7 @@ test-suite garg-test-tasty ...@@ -860,6 +860,7 @@ test-suite garg-test-tasty
Test.Ngrams.Terms Test.Ngrams.Terms
Test.Offline.Errors Test.Offline.Errors
Test.Offline.JSON Test.Offline.JSON
Test.Offline.Ngrams
Test.Offline.Phylo Test.Offline.Phylo
Test.Offline.Stemming.Lancaster Test.Offline.Stemming.Lancaster
Test.Parsers.Date Test.Parsers.Date
......
...@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1. ...@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1.
-} -}
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words) module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words, isSep)
where where
import Data.List qualified as L import Data.List qualified as L
......
...@@ -14,14 +14,29 @@ commentary with @some markup@. ...@@ -14,14 +14,29 @@ commentary with @some markup@.
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList where module Gargantext.Core.Text.Terms.WithList (
termsInText
, buildPatterns
, Patterns
, Pattern(_pat_length, _pat_terms)
, MatchedText
, buildPatternsWith
, extractTermsWithList
, extractTermsWithList'
-- * Properties
, prop_patterns_internal_consistency
) where
import Prelude (show)
import Data.Algorithms.KMP qualified as KMP import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List import Data.List qualified as List
import Data.Ord import Data.Ord
import Data.Text (concat) import Data.Text (concat)
import Data.Text qualified as Text import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..)) import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH)) import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
...@@ -29,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) ...@@ -29,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat) import Gargantext.Prelude hiding (concat)
import GHC.Exts (sortWith) import Test.QuickCheck (Property, counterexample, conjoin, (===))
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern data Pattern = Pattern
{ _pat_table :: !(KMP.Table Text) { -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
-- To not be leaked outside.
_pat_table :: !(KMP.Table Text)
, _pat_length :: !Int , _pat_length :: !Int
, _pat_terms :: ![Text] , _pat_terms :: ![Text]
} }
instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
type Patterns = [Pattern] type Patterns = [Pattern]
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -68,19 +90,27 @@ replaceTerms rplaceTerms pats terms = go 0 ...@@ -68,19 +90,27 @@ replaceTerms rplaceTerms pats terms = go 0
if len2 < len1 then (len1, lab1) else (len2, lab2) if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\k -> (Text.chunksOf 1 $ unNgramsTerm k, [])) ts buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1 k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts
buildPatterns :: TermList -> Patterns buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
where where
buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts) buildPattern :: Label -> [MultiTerm] -> [Pattern]
where buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)
f alt | "" `elem` alt = errorTrace ("buildPatterns: ERR1" <> show(label))
| null alt = errorTrace "buildPatterns: ERR2" mkPattern :: Label -> [Text] -> Maybe Pattern
| otherwise = mkPattern label alt
Pattern (KMP.build alt) (length alt) label | "" `elem` alt = Nothing
--(Terms label $ Set.empty) -- TODO check stems | null alt = Nothing
| otherwise = Just $
-- /IMPORTANT/ The call to 'toLower' is crucial, because it means
-- we will be building a pattern that will match lowercase terms,
-- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
--(Terms label $ Set.empty) -- TODO check stems
-------------------------------------------------------------------------- --------------------------------------------------------------------------
...@@ -110,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat ...@@ -110,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
-------------------------------------------------------------------------- --------------------------------------------------------------------------
addSpaces :: Text -> Text addSpaces :: Text -> Text
addSpaces = Text.unwords . (Text.chunksOf 1) addSpaces = T.unwords . (T.chunksOf 1)
-------------------------------------------------------------------------- --------------------------------------------------------------------------
...@@ -134,3 +164,12 @@ filterWith' termList f f' xs = f' xs ...@@ -134,3 +164,12 @@ filterWith' termList f f' xs = f' xs
where where
pats = buildPatterns termList pats = buildPatterns termList
-} -}
prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
conjoin [
counterexample "{\RS1" $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
, counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
, counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
, counterexample "no match" $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
]
...@@ -61,6 +61,9 @@ data ContextPolyOnlyId id hyperdata = ...@@ -61,6 +61,9 @@ data ContextPolyOnlyId id hyperdata =
, _context_oid_hyperdata :: !hyperdata } , _context_oid_hyperdata :: !hyperdata }
deriving (Show, Generic) deriving (Show, Generic)
instance (Arbitrary id, Arbitrary hyperdata) => Arbitrary (ContextPolyOnlyId id hyperdata) where
arbitrary = ContextOnlyId <$> arbitrary <*> arbitrary
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Automatic instances derivation -- Automatic instances derivation
$(deriveJSON (unPrefix "_context_oid_") ''ContextPolyOnlyId) $(deriveJSON (unPrefix "_context_oid_") ''ContextPolyOnlyId)
......
{-# LANGUAGE TypeApplications #-}
module Test.Offline.Ngrams (tests) where
import Prelude
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Admin.Types.Hyperdata
import Test.Instances ()
import Test.QuickCheck
import Test.Tasty
import Test.Tasty.QuickCheck (testProperty)
import Control.Lens
import qualified Test.QuickCheck as QC
import Gargantext.Core.Text.Terms.Mono (isSep)
genScientificText :: Gen T.Text
genScientificText = T.pack <$> listOf genScientificChar
-- | Roughly simulate the kind of text we might find in scientific papers and their abstracts.
genScientificChar :: Gen Char
genScientificChar = frequency
[ (10, choose ('a', 'z'))
, (10, choose ('A', 'Z'))
, (5, choose ('0', '9'))
, (2, QC.elements ws) -- Whitespace
, (2, QC.elements punctuation) -- Punctuation
, (1, QC.elements "éèàçöñüßøåÆŒ") -- Diacritics
, (1, QC.elements "αβγδΔθλμπσφχΩ≤≥≠≈") -- Greek/math
]
ws :: String
ws = " \t\n"
punctuation :: String
punctuation = ",.();:-"
genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty
genNgramsTermNonEmpty = do
singleChar <- arbitrary `suchThat` (\x -> x /= ' ' && isAllowed x)
txt <- filter isAllowed <$> listOf1 genScientificChar
pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt)
where
isAllowed :: Char -> Bool
isAllowed s = not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s)
-- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two
-- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that
-- /might/ occasionally generate empty text fragments.
newtype NgramsTermNonEmpty = NgramsTermNonEmpty { unNgramsTermNonEmpty :: T.Text }
deriving (Eq, Show)
instance Arbitrary NgramsTermNonEmpty where
arbitrary = genNgramsTermNonEmpty
data DocumentWithMatches =
DocumentWithMatches
{ _dwm_terms :: NonEmptyList NgramsTermNonEmpty
, _dwn_document :: ContextOnlyId HyperdataDocument
} deriving (Show)
-- | Generates a document where the title contains at least one
-- of the generated terms.
instance Arbitrary DocumentWithMatches where
arbitrary = do
generatedTerms <- arbitrary
txtFragments <- listOf1 genScientificText
mkText <- do
sf <- shuffle txtFragments
let txt' = (map unNgramsTermNonEmpty $ getNonEmpty generatedTerms) <> sf
shuffle txt'
contextOnlyDoc <- arbitrary
let doc = _context_oid_hyperdata contextOnlyDoc
let doc' = doc { _hd_title = Just $ T.intercalate " " mkText }
let hyperDoc = contextOnlyDoc { _context_oid_hyperdata = doc' }
pure $ DocumentWithMatches generatedTerms hyperDoc
tests :: TestTree
tests = testGroup "Ngrams" [
testGroup "buildPatterns internal correctness" [
testProperty "patterns, no matter how simple, can be searched" prop_patterns_internal_consistency
]
, testGroup "buildPatternsWith" [
testProperty "return results for non-empty input terms" testBuildPatternsNonEmpty
]
, testGroup "docNgrams" [
testProperty "always matches if the input text contains any of the terms" testDocNgramsOKMatch
]
]
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in conjoin [
counterexample "patterns empty" $ length (buildPatternsWith lang ts') > 0
, counterexample "termsInText empty" $ length (termsInText lang (buildPatternsWith lang ts') (doc ^. context_oid_hyperdata . hd_title . _Just)) > 0
, counterexample "docNgrams returned no results" $ length (docNgrams lang ts' doc) > 0
]
testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
...@@ -28,6 +28,7 @@ import qualified Test.Ngrams.Query as NgramsQuery ...@@ -28,6 +28,7 @@ import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Ngrams.Terms as NgramsTerms import qualified Test.Ngrams.Terms as NgramsTerms
import qualified Test.Offline.Errors as Errors import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.JSON as JSON import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Ngrams as Ngrams
import qualified Test.Offline.Phylo as Phylo import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD import qualified Test.Parsers.Date as PD
...@@ -73,6 +74,7 @@ main = do ...@@ -73,6 +74,7 @@ main = do
, CorpusQuery.tests , CorpusQuery.tests
, TSVParser.tests , TSVParser.tests
, JSON.tests , JSON.tests
, Ngrams.tests
, Errors.tests , Errors.tests
, similaritySpec , similaritySpec
, Phylo.tests , Phylo.tests
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment