Commit 43901901 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Fix bug in GargPorter implementation

parent d44831d0
Pipeline #6830 failed with stages
in 45 minutes and 57 seconds
{-| {-|
Module : Gargantext.Core.Text.Ngrams.Stem.En Module : Gargantext.Core.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Adapted from: Adapted from:
- source: https://hackage.haskell.org/package/porter - source: https://hackage.haskell.org/package/porter
- [Char] -> [Text] - [Char] -> [Text]
- adding Types signatures - adding Types signatures
- fixes unseen cases - fixes unseen cases
-} -}
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem) module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
where where
import Control.Monad import Control.Monad
import Data.Either import Data.Either
import Data.List ((!!)) import Data.List ((!!))
import Data.List qualified as List hiding (map, head) import Data.List qualified as List hiding (map, head)
import Data.Maybe import Data.Maybe
import Data.Text (pack, unpack) import Data.Text (pack, unpack)
import Gargantext.Prelude import Gargantext.Prelude
vowels :: [Char] vowels :: [Char]
vowels = ['a','e','i','o','u'] vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool isConsonant :: [Char] -> Int -> Bool
isConsonant str i isConsonant str i
| c `elem` vowels = False | c `elem` vowels = False
| c == 'y' = i == 0 || isVowel str (i - 1) | c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True | otherwise = True
where where
c = str !! i c = str !! i
isVowel :: [Char] -> Int -> Bool isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2 byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1] byIndex fun str = fun str [0..length str - 1]
containsVowel :: [Char] -> Bool containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel) containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust -- | /!\ unsafe fromJust
measure :: [Char] -> Int measure :: [Char] -> Int
measure = length . filter not . List.init . (True:) measure = length . filter not . List.init . (True:)
. map fromJust . map head . map fromJust . map head
. List.group . byIndex (map . isConsonant) . List.group . byIndex (map . isConsonant)
endsWithDouble :: [Char] -> Bool endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse endsWithDouble = startsWithDouble . reverse
where where
startsWithDouble l = case l of startsWithDouble l = case l of
(x:y:_) -> x == y && x `List.notElem` vowels (x:y:_) -> x == y && x `List.notElem` vowels
_ -> False _ -> False
cvc :: [Char] -> Bool cvc :: [Char] -> Bool
cvc word | length word < 3 = False cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex && | otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) && isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) && isConsonant word (lastIndex - 2) &&
List.last word `List.notElem` ['w','x','y'] List.last word `List.notElem` ['w','x','y']
where lastIndex = length word - 1 where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool) statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a] -> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a]) -> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement statefulReplace predicate str end replacement
| end `List.isSuffixOf` str = Just replaced | end `List.isSuffixOf` str = Just replaced
| otherwise = Nothing | otherwise = Nothing
where where
part = take (length str - length end) str part = take (length str - length end) str
replaced | predicate part = Right (part <> replacement) replaced | predicate part = Right (part <> replacement)
| otherwise = Left str | otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a] replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement result <- statefulReplace predicate str end replacement
pure (either identity identity result) pure (either identity identity result)
findStem findStem
:: (Foldable t, Functor t, Eq a) => :: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a] ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure) measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char] step1a :: [Char] -> [Char]
step1a word = fromMaybe word result step1a word = fromMaybe word result
where where
result = findStem (const True) word suffixes result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")] suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char] beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result beforeStep1b word = fromMaybe (Left word) result
where where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v } cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
cond1 x = do { v <- x; pure (Left v) } cond1 x = do { v <- x; pure (Left v) }
result = result =
cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus` cond1 (replaceEnd (measureGT 0) word "eed" "ee") `mplus`
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus` cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" ) cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char] afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result afterStep1b word = fromMaybe word result
where where
double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z']) double = endsWithDouble word && not (any ((`List.isSuffixOf` word) . return) ['l','s','z'])
mEq1AndCvc = measure word == 1 && cvc word mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")] result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (List.init word) `mplus` iif double (List.init word)
`mplus` iif mEq1AndCvc (word <> "e") `mplus` iif mEq1AndCvc (word <> "e")
step1b :: [Char] -> [Char] step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b step1b = either identity afterStep1b . beforeStep1b
step1c :: [Char] -> [Char] -- Issue #415: According to the Porter stemming rules, we need to replace `y` with `i` only if there
step1c word = fromMaybe word result -- are no other vocals at the end.
where result = replaceEnd containsVowel word "y" "i" step1c :: [Char] -> [Char]
step1c word
step1 :: [Char] -> [Char] | length word > 2 && List.last word == 'y' && isConsonant word (List.length word - 2)
step1 = step1c . step1b . step1a = fromMaybe word $ replaceEnd containsVowel word "y" "i"
| otherwise
step2 :: [Char] -> [Char] = word
step2 word = fromMaybe word result
where step1 :: [Char] -> [Char]
result = findStem (measureGT 0) word step1 = step1c . step1b . step1a
[ ("ational", "ate" )
, ("tional", "tion") step2 :: [Char] -> [Char]
, ("enci", "ence") step2 word = fromMaybe word result
, ("anci", "ance") where
, ("izer", "ize" ) result = findStem (measureGT 0) word
, ("bli", "ble" ) [ ("ational", "ate" )
, ("alli", "al" ) , ("tional", "tion")
, ("entli", "ent" ) , ("enci", "ence")
, ("eli", "e" ) , ("anci", "ance")
, ("ousli", "ous" ) , ("izer", "ize" )
, ("ization", "ize" ) , ("bli", "ble" )
, ("ation", "ate" ) , ("alli", "al" )
, ("ator", "ate" ) , ("entli", "ent" )
, ("alism", "al" ) , ("eli", "e" )
, ("iveness", "ive" ) , ("ousli", "ous" )
, ("fulness", "ful" ) , ("ization", "ize" )
, ("ousness", "ous" ) , ("ation", "ate" )
, ("aliti", "al" ) , ("ator", "ate" )
, ("iviti", "ive" ) , ("alism", "al" )
, ("biliti", "ble" ) , ("iveness", "ive" )
, ("logi", "log" ) ] , ("fulness", "ful" )
, ("ousness", "ous" )
step3 :: [Char] -> [Char] , ("aliti", "al" )
step3 word = fromMaybe word result , ("iviti", "ive" )
where , ("biliti", "ble" )
result = findStem (measureGT 0) word , ("logi", "log" ) ]
[ ("icate", "ic")
, ("ative", "" ) step3 :: [Char] -> [Char]
, ("alize", "al") step3 word = fromMaybe word result
, ("iciti", "ic") where
, ("ical" , "ic") result = findStem (measureGT 0) word
, ("ful" , "" ) [ ("icate", "ic")
, ("ness" , "" ) ] , ("ative", "" )
, ("alize", "al")
step4 :: [Char] -> [Char] , ("iciti", "ic")
step4 word = fromMaybe word result , ("ical" , "ic")
where , ("ful" , "" )
gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t'] , ("ness" , "" ) ]
findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus` step4 :: [Char] -> [Char]
(findStem gt1andST word [("ion","")]) `mplus` step4 word = fromMaybe word result
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"]) where
gt1andST str = (measureGT 1) str && any ((`List.isSuffixOf` str) . return) ['s','t']
step5a :: [Char] -> [Char] findGT1 = findStem (measureGT 1) word . map (flip (,) "")
step5a word = fromMaybe word result result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
where (findStem gt1andST word [("ion","")]) `mplus`
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str)) (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
result = replaceEnd test word "e" ""
step5a :: [Char] -> [Char]
step5b :: [Char] -> [Char] step5a word = fromMaybe word result
step5b word = fromMaybe word result where
where test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
cond s = List.last s == 'l' && measureGT 1 s result = replaceEnd test word "e" ""
result = replaceEnd cond word "l" ""
step5b :: [Char] -> [Char]
step5 :: [Char] -> [Char] step5b word = fromMaybe word result
step5 = step5b . step5a where
cond s = List.last s == 'l' && measureGT 1 s
allSteps :: [Char] -> [Char] result = replaceEnd cond word "l" ""
allSteps = step5 . step4 . step3 . step2 . step1
step5 :: [Char] -> [Char]
stem :: Text -> Text step5 = step5b . step5a
stem s = pack (stem' $ unpack s)
allSteps :: [Char] -> [Char]
stem' :: [Char] -> [Char] allSteps = step5 . step4 . step3 . step2 . step1
stem' s | length s < 3 = s
| otherwise = allSteps s stem :: Text -> Text
stem s = pack (stem' $ unpack s)
--fixpoint :: Eq t => (t -> t) -> t -> t
--fixpoint f x = let fx = f x in stem' :: [Char] -> [Char]
-- if fx == x stem' s | length s < 3 = s
-- then x | otherwise = allSteps s
-- else fixpoint f fx
-- --fixpoint :: Eq t => (t -> t) -> t -> t
--fixstem :: [Char] -> [Char] --fixpoint f x = let fx = f x in
--fixstem = fixpoint stem' -- if fx == x
-- then x
-- else fixpoint f fx
{- --
--fixstem :: [Char] -> [Char]
main :: IO () --fixstem = fixpoint stem'
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content {-
-} main :: IO ()
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content
-}
...@@ -52,6 +52,7 @@ tests = sequential $ aroundAll withTestDBAndPort $ do ...@@ -52,6 +52,7 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
let corpusId = _sctx_data let corpusId = _sctx_data
withApplication _sctx_app $ do withApplication _sctx_app $ do
withValidLogin _sctx_port "alice" (GargPassword "alice") $ \clientEnv token -> do withValidLogin _sctx_port "alice" (GargPassword "alice") $ \clientEnv token -> do
liftIO $ do liftIO $ do
(HashedResponse _ tr1) (HashedResponse _ tr1)
<- checkEither $ runClientM (get_table token <- checkEither $ runClientM (get_table token
...@@ -63,8 +64,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ do ...@@ -63,8 +64,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
(Just $ RawQuery "soy") (Just $ RawQuery "soy")
Nothing Nothing
) clientEnv ) clientEnv
length (tr_docs tr1) `shouldBe` 4
length (tr_docs tr1) `shouldBe` 3
createSoySauceCorpus :: SpecContext () -> IO (SpecContext CorpusId) createSoySauceCorpus :: SpecContext () -> IO (SpecContext CorpusId)
createSoySauceCorpus ctx@SpecContext{..} = do createSoySauceCorpus ctx@SpecContext{..} = do
......
...@@ -135,6 +135,8 @@ stemmingTest :: TestEnv -> Assertion ...@@ -135,6 +135,8 @@ stemmingTest :: TestEnv -> Assertion
stemmingTest _env = do stemmingTest _env = do
stem EN GargPorterAlgorithm "Ajeje" `shouldBe` "Ajeje" stem EN GargPorterAlgorithm "Ajeje" `shouldBe` "Ajeje"
stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:" stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:"
stem EN GargPorterAlgorithm "soy" `shouldBe` "soy"
stem EN GargPorterAlgorithm "cry" `shouldBe` "cri"
-- This test outlines the main differences between Porter and Lancaster. -- This test outlines the main differences between Porter and Lancaster.
stem EN GargPorterAlgorithm "dancer" `shouldBe` "dancer" stem EN GargPorterAlgorithm "dancer" `shouldBe` "dancer"
stem EN LancasterAlgorithm "dancer" `shouldBe` "dant" stem EN LancasterAlgorithm "dancer" `shouldBe` "dant"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment