Commit 1642c59b authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Fix bug in buildPatterns related to case sensitiveness

Fixes a bug in the implementation of `buildPatterns`. In particular,
when we are building a `Pattern`, we need to do so in a case insenstive
fashion, otherwise later in the call to `replaceTerms` we would be
calling this from `extractTermsWithList` that cast everything into
lowercase due to the use of `monoTextsBySentence`.

This means that before this commit if we tried to search "Map" into
the text "Map is what I use when I'm lost" we wouldn't get a match,
because the latter would be converted into lowercase first
(i.e. "map is what i use when i'm lost") and we were trying to look
for the string "Map" (i.e. the former) into the transformer, yielding
no matches.
parent 2e282303
Pipeline #7614 passed with stages
in 44 minutes and 53 seconds
......@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1.
-}
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words, isSep)
where
import Data.List qualified as L
......
......@@ -15,22 +15,28 @@ commentary with @some markup@.
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList (
termsInText
termsInText
, buildPatterns
, Patterns
, Pattern(..)
, Pattern(_pat_length, _pat_terms)
, MatchedText
, buildPatternsWith
, extractTermsWithList
, extractTermsWithList'
-- * Properties
, prop_patterns_internal_consistency
) where
import Prelude (show)
import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List
import Data.Ord
import Data.Text (concat)
import Data.Text qualified as Text
import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context
......@@ -38,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat)
import GHC.Exts (sortWith)
import Test.QuickCheck (Property, counterexample, conjoin, (===))
------------------------------------------------------------------------
-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern
{ _pat_table :: !(KMP.Table Text)
{ -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
-- To not be leaked outside.
_pat_table :: !(KMP.Table Text)
, _pat_length :: !Int
, _pat_terms :: ![Text]
}
instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
type Patterns = [Pattern]
------------------------------------------------------------------------
......@@ -77,8 +90,8 @@ replaceTerms rplaceTerms pats terms = go 0
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.chunksOf 1 k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.splitOn " " k, [])) ts
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1 k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
......@@ -91,7 +104,12 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
| "" `elem` alt = Nothing
| null alt = Nothing
| otherwise = Just $
Pattern (KMP.build alt) (length alt) label
-- /IMPORTANT/ The call to 'toLower' is crucial, because it means
-- we will be building a pattern that will match lowercase terms,
-- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
--(Terms label $ Set.empty) -- TODO check stems
......@@ -122,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = Text.unwords . (Text.chunksOf 1)
addSpaces = T.unwords . (T.chunksOf 1)
--------------------------------------------------------------------------
......@@ -146,3 +164,12 @@ filterWith' termList f f' xs = f' xs
where
pats = buildPatterns termList
-}
prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
conjoin [
counterexample "{\RS1" $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
, counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
, counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
, counterexample "no match" $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment