Fix bug in buildPatterns related to case sensitiveness

Fixes a bug in the implementation of `buildPatterns`. In particular, when we are building a `Pattern`, we need to do so in a case insenstive fashion, otherwise later in the call to `replaceTerms` we would be calling this from `extractTermsWithList` that cast everything into lowercase due to the use of `monoTextsBySentence`. This means that before this commit if we tried to search "Map" into the text "Map is what I use when I'm lost" we wouldn't get a match, because the latter would be converted into lowercase first (i.e. "map is what i use when i'm lost") and we were trying to look for the string "Map" (i.e. the former) into the transformer, yielding no matches.

Fix bug in buildPatterns related to case sensitiveness
Fixes a bug in the implementation of `buildPatterns`. In particular, when we are building a `Pattern`, we need to do so in a case insenstive fashion, otherwise later in the call to `replaceTerms` we would be calling this from `extractTermsWithList` that cast everything into lowercase due to the use of `monoTextsBySentence`. This means that before this commit if we tried to search "Map" into the text "Map is what I use when I'm lost" we wouldn't get a match, because the latter would be converted into lowercase first (i.e. "map is what i use when i'm lost") and we were trying to look for the string "Map" (i.e. the former) into the transformer, yielding no matches.
1642c59b · Alfredo Di Napoli · 2e282303 · 1642c59b · 1642c59b
Commit 1642c59b authored May 26, 2025 by Alfredo Di Napoli
Hide whitespace changes
Inline Side-by-side

Showing with 37 additions and 10 deletions

Mono.hs src/Gargantext/Core/Text/Terms/Mono.hs +1 -1

WithList.hs src/Gargantext/Core/Text/Terms/WithList.hs +36 -9

No files found.
--- a/src/Gargantext/Core/Text/Terms/Mono.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono.hs
@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1.
 -}
-module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
+module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words, isSep)
  where
 import Data.List qualified as L

--- a/src/Gargantext/Core/Text/Terms/WithList.hs
+++ b/src/Gargantext/Core/Text/Terms/WithList.hs
@@ -15,22 +15,28 @@ commentary with @some markup@.
 {-# LANGUAGE ViewPatterns      #-}
 module Gargantext.Core.Text.Terms.WithList (
-  termsInText
+    termsInText
  , buildPatterns
  , Patterns
-  , Pattern(..)
+  , Pattern(_pat_length, _pat_terms)
  , MatchedText
  , buildPatternsWith
  , extractTermsWithList
  , extractTermsWithList'
+  -- * Properties
+  , prop_patterns_internal_consistency
  ) where
+import Prelude (show)
 import Data.Algorithms.KMP qualified as KMP
 import Data.IntMap.Strict qualified as IntMap
 import Data.List qualified as List
 import Data.Ord
 import Data.Text (concat)
-import Data.Text qualified as Text
+import Data.Text qualified as T
+import GHC.Exts (sortWith)
 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
 import Gargantext.Core (Lang(ZH))
 import Gargantext.Core.Text.Context
@@ -38,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
 import Gargantext.Core.Types (TermsCount)
 import Gargantext.Core.Utils (groupWithCounts)
 import Gargantext.Prelude hiding (concat)
-import GHC.Exts (sortWith)
+import Test.QuickCheck (Property, counterexample, conjoin, (===))
 ------------------------------------------------------------------------
+-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
 data Pattern = Pattern
-  { _pat_table  :: !(KMP.Table Text)
+  { -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
+    -- To not be leaked outside.
+    _pat_table  :: !(KMP.Table Text)
  , _pat_length :: !Int
  , _pat_terms  :: ![Text]
  }
+instance Show Pattern where
+  show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
 type Patterns = [Pattern]
 ------------------------------------------------------------------------
@@ -77,8 +90,8 @@ replaceTerms rplaceTerms pats terms = go 0
            if len2 < len1 then (len1, lab1) else (len2, lab2)
 buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
-buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.chunksOf 1  k, [])) ts
+buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1  k, [])) ts
-buildPatternsWith _  ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.splitOn " " k, [])) ts
+buildPatternsWith _  ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts
 buildPatterns :: TermList -> Patterns
 buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
@@ -91,7 +104,12 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
      | "" `elem` alt = Nothing
      | null alt      = Nothing
      | otherwise     = Just $
-        Pattern (KMP.build alt) (length alt) label
+        -- /IMPORTANT/ The call to 'toLower' is crucial, because it means
+        -- we will be building a pattern that will match lowercase terms,
+        -- which is exactly what we will be given as part of 'termsInText',
+        -- that calls 'monoTextsBySentence'. If we don't lower here it
+        -- means we won't be getting matches, whereas in theory we could.
+        Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
                         --(Terms label $ Set.empty) -- TODO check stems
@@ -122,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
 --------------------------------------------------------------------------
 addSpaces :: Text -> Text
-addSpaces = Text.unwords . (Text.chunksOf 1)
+addSpaces = T.unwords . (T.chunksOf 1)
 --------------------------------------------------------------------------
@@ -146,3 +164,12 @@ filterWith' termList f f' xs = f' xs
    where
      pats = buildPatterns termList
 -}
+prop_patterns_internal_consistency :: Text -> Property
+prop_patterns_internal_consistency termFragment =
+  conjoin [
+    counterexample "{\RS1"     $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
+  , counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
+  , counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
+  , counterexample "no match"  $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
+  ]