Commit 43901901 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Fix bug in GargPorter implementation

parent d44831d0
Pipeline #6830 failed with stages
in 45 minutes and 57 seconds
{-|
Module : Gargantext.Core.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
where
import Control.Monad
import Data.Either
import Data.List ((!!))
import Data.List qualified as List hiding (map, head)
import Data.Maybe
import Data.Text (pack, unpack)
import Gargantext.Prelude
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i
| c `elem` vowels = False
| c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True
where
c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1]
containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . List.init . (True:)
. map fromJust . map head
. List.group . byIndex (map . isConsonant)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse
where
startsWithDouble l = case l of
(x:y:_) -> x == y && x `List.notElem` vowels
_ -> False
cvc :: [Char] -> Bool
cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) &&
List.last word `List.notElem` ['w','x','y']
where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement
| end `List.isSuffixOf` str = Just replaced
| otherwise = Nothing
where
part = take (length str - length end) str
replaced | predicate part = Right (part <> replacement)
| otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement
pure (either identity identity result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result
where
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result
where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
cond1 x = do { v <- x; pure (Left v) }
result =
cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result
where
double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z'])
mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (List.init word)
`mplus` iif mEq1AndCvc (word <> "e")
step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b
step1c :: [Char] -> [Char]
step1c word = fromMaybe word result
where result = replaceEnd containsVowel word "y" "i"
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result
where
result = findStem (measureGT 0) word
[ ("ational", "ate" )
, ("tional", "tion")
, ("enci", "ence")
, ("anci", "ance")
, ("izer", "ize" )
, ("bli", "ble" )
, ("alli", "al" )
, ("entli", "ent" )
, ("eli", "e" )
, ("ousli", "ous" )
, ("ization", "ize" )
, ("ation", "ate" )
, ("ator", "ate" )
, ("alism", "al" )
, ("iveness", "ive" )
, ("fulness", "ful" )
, ("ousness", "ous" )
, ("aliti", "al" )
, ("iviti", "ive" )
, ("biliti", "ble" )
, ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result
where
result = findStem (measureGT 0) word
[ ("icate", "ic")
, ("ative", "" )
, ("alize", "al")
, ("iciti", "ic")
, ("ical" , "ic")
, ("ful" , "" )
, ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result
where
gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t']
findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result
where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result
where
cond s = List.last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
| otherwise = allSteps s
--fixpoint :: Eq t => (t -> t) -> t -> t
--fixpoint f x = let fx = f x in
-- if fx == x
-- then x
-- else fixpoint f fx
--
--fixstem :: [Char] -> [Char]
--fixstem = fixpoint stem'
{-
main :: IO ()
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content
-}
{-|
Module : Gargantext.Core.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
where
import Control.Monad
import Data.Either
import Data.List ((!!))
import Data.List qualified as List hiding (map, head)
import Data.Maybe
import Data.Text (pack, unpack)
import Gargantext.Prelude
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i
| c `elem` vowels = False
| c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True
where
c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1]
containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . List.init . (True:)
. map fromJust . map head
. List.group . byIndex (map . isConsonant)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse
where
startsWithDouble l = case l of
(x:y:_) -> x == y && x `List.notElem` vowels
_ -> False
cvc :: [Char] -> Bool
cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) &&
List.last word `List.notElem` ['w','x','y']
where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement
| end `List.isSuffixOf` str = Just replaced
| otherwise = Nothing
where
part = take (length str - length end) str
replaced | predicate part = Right (part <> replacement)
| otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement
pure (either identity identity result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result
where
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result
where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
cond1 x = do { v <- x; pure (Left v) }
result =
cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result
where
double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z'])
mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (List.init word)
`mplus` iif mEq1AndCvc (word <> "e")
step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b
-- Issue #415: According to the Porter stemming rules, we need to replace `y` with `i` only if there
-- are no other vocals at the end.
step1c :: [Char] -> [Char]
step1c word
| length word > 2 && List.last word == 'y' && isConsonant word (List.length word - 2)
= fromMaybe word $ replaceEnd containsVowel word "y" "i"
| otherwise
= word
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result
where
result = findStem (measureGT 0) word
[ ("ational", "ate" )
, ("tional", "tion")
, ("enci", "ence")
, ("anci", "ance")
, ("izer", "ize" )
, ("bli", "ble" )
, ("alli", "al" )
, ("entli", "ent" )
, ("eli", "e" )
, ("ousli", "ous" )
, ("ization", "ize" )
, ("ation", "ate" )
, ("ator", "ate" )
, ("alism", "al" )
, ("iveness", "ive" )
, ("fulness", "ful" )
, ("ousness", "ous" )
, ("aliti", "al" )
, ("iviti", "ive" )
, ("biliti", "ble" )
, ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result
where
result = findStem (measureGT 0) word
[ ("icate", "ic")
, ("ative", "" )
, ("alize", "al")
, ("iciti", "ic")
, ("ical" , "ic")
, ("ful" , "" )
, ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result
where
gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t']
findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result
where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result
where
cond s = List.last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
| otherwise = allSteps s
--fixpoint :: Eq t => (t -> t) -> t -> t
--fixpoint f x = let fx = f x in
-- if fx == x
-- then x
-- else fixpoint f fx
--
--fixstem :: [Char] -> [Char]
--fixstem = fixpoint stem'
{-
main :: IO ()
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content
-}
......@@ -52,6 +52,7 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
let corpusId = _sctx_data
withApplication _sctx_app $ do
withValidLogin _sctx_port "alice" (GargPassword "alice") $ \clientEnv token -> do
liftIO $ do
(HashedResponse _ tr1)
<- checkEither $ runClientM (get_table token
......@@ -63,8 +64,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
(Just $ RawQuery "soy")
Nothing
) clientEnv
length (tr_docs tr1) `shouldBe` 4
length (tr_docs tr1) `shouldBe` 3
createSoySauceCorpus :: SpecContext () -> IO (SpecContext CorpusId)
createSoySauceCorpus ctx@SpecContext{..} = do
......
......@@ -135,6 +135,8 @@ stemmingTest :: TestEnv -> Assertion
stemmingTest _env = do
stem EN GargPorterAlgorithm "Ajeje" `shouldBe` "Ajeje"
stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:"
stem EN GargPorterAlgorithm "soy" `shouldBe` "soy"
stem EN GargPorterAlgorithm "cry" `shouldBe` "cri"
-- This test outlines the main differences between Porter and Lancaster.
stem EN GargPorterAlgorithm "dancer" `shouldBe` "dancer"
stem EN LancasterAlgorithm "dancer" `shouldBe` "dant"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment