Commit 1642c59b authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Fix bug in buildPatterns related to case sensitiveness

Fixes a bug in the implementation of `buildPatterns`. In particular,
when we are building a `Pattern`, we need to do so in a case insenstive
fashion, otherwise later in the call to `replaceTerms` we would be
calling this from `extractTermsWithList` that cast everything into
lowercase due to the use of `monoTextsBySentence`.

This means that before this commit if we tried to search "Map" into
the text "Map is what I use when I'm lost" we wouldn't get a match,
because the latter would be converted into lowercase first
(i.e. "map is what i use when i'm lost") and we were trying to look
for the string "Map" (i.e. the former) into the transformer, yielding
no matches.
parent 2e282303
Pipeline #7614 passed with stages
in 44 minutes and 53 seconds
...@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1. ...@@ -12,7 +12,7 @@ Mono-terms are Nterms where n == 1.
-} -}
module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words) module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words, isSep)
where where
import Data.List qualified as L import Data.List qualified as L
......
...@@ -15,22 +15,28 @@ commentary with @some markup@. ...@@ -15,22 +15,28 @@ commentary with @some markup@.
{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList ( module Gargantext.Core.Text.Terms.WithList (
termsInText termsInText
, buildPatterns , buildPatterns
, Patterns , Patterns
, Pattern(..) , Pattern(_pat_length, _pat_terms)
, MatchedText , MatchedText
, buildPatternsWith , buildPatternsWith
, extractTermsWithList , extractTermsWithList
, extractTermsWithList' , extractTermsWithList'
-- * Properties
, prop_patterns_internal_consistency
) where ) where
import Prelude (show)
import Data.Algorithms.KMP qualified as KMP import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List import Data.List qualified as List
import Data.Ord import Data.Ord
import Data.Text (concat) import Data.Text (concat)
import Data.Text qualified as Text import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..)) import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH)) import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
...@@ -38,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) ...@@ -38,15 +44,22 @@ import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat) import Gargantext.Prelude hiding (concat)
import GHC.Exts (sortWith) import Test.QuickCheck (Property, counterexample, conjoin, (===))
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern data Pattern = Pattern
{ _pat_table :: !(KMP.Table Text) { -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
-- To not be leaked outside.
_pat_table :: !(KMP.Table Text)
, _pat_length :: !Int , _pat_length :: !Int
, _pat_terms :: ![Text] , _pat_terms :: ![Text]
} }
instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
type Patterns = [Pattern] type Patterns = [Pattern]
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -77,8 +90,8 @@ replaceTerms rplaceTerms pats terms = go 0 ...@@ -77,8 +90,8 @@ replaceTerms rplaceTerms pats terms = go 0
if len2 < len1 then (len1, lab1) else (len2, lab2) if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.chunksOf 1 k, [])) ts buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1 k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (Text.splitOn " " k, [])) ts buildPatternsWith _ ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts
buildPatterns :: TermList -> Patterns buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern) buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
...@@ -91,7 +104,12 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern) ...@@ -91,7 +104,12 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
| "" `elem` alt = Nothing | "" `elem` alt = Nothing
| null alt = Nothing | null alt = Nothing
| otherwise = Just $ | otherwise = Just $
Pattern (KMP.build alt) (length alt) label -- /IMPORTANT/ The call to 'toLower' is crucial, because it means
-- we will be building a pattern that will match lowercase terms,
-- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
--(Terms label $ Set.empty) -- TODO check stems --(Terms label $ Set.empty) -- TODO check stems
...@@ -122,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat ...@@ -122,7 +140,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
-------------------------------------------------------------------------- --------------------------------------------------------------------------
addSpaces :: Text -> Text addSpaces :: Text -> Text
addSpaces = Text.unwords . (Text.chunksOf 1) addSpaces = T.unwords . (T.chunksOf 1)
-------------------------------------------------------------------------- --------------------------------------------------------------------------
...@@ -146,3 +164,12 @@ filterWith' termList f f' xs = f' xs ...@@ -146,3 +164,12 @@ filterWith' termList f f' xs = f' xs
where where
pats = buildPatterns termList pats = buildPatterns termList
-} -}
prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
conjoin [
counterexample "{\RS1" $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
, counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
, counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
, counterexample "no match" $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment