Commit fa0ed7df authored by Alexandre Delanoë's avatar Alexandre Delanoë

Revert "[FEAT/STEM] implemenging Porter lib into Gargantext for English language."

This reverts commit e21bad3e.
parent 800756bc
...@@ -40,7 +40,6 @@ library: ...@@ -40,7 +40,6 @@ library:
- Gargantext.Ngrams.CoreNLP - Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser - Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En - Gargantext.Ngrams.Lang.En
- Gargantext.Ngrams.Stem.En
- Gargantext.Ngrams.Lang.Fr - Gargantext.Ngrams.Lang.Fr
- Gargantext.Ngrams.Metrics - Gargantext.Ngrams.Metrics
- Gargantext.Ngrams.TextMining - Gargantext.Ngrams.TextMining
......
...@@ -23,9 +23,8 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters ...@@ -23,9 +23,8 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics , module Gargantext.Ngrams.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis, clean , Ngrams(..), ngrams, occ, sumOcc, text2fis
, ListName(..), equivNgrams, isGram, sentences , ListName(..), equivNgrams, isGram
, ngramsTest
--, module Gargantext.Ngrams.Words --, module Gargantext.Ngrams.Words
) where ) where
...@@ -44,10 +43,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS ...@@ -44,10 +43,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.List (sort) import Data.List (sort)
import Data.Char (Char, isAlphaNum, isSpace) import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, filter, toLower, split, lines, concat) import Data.Text (Text, words, filter, toLower)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
import Data.Map.Strict (Map import Data.Map.Strict (Map
, empty , empty
, insertWith, unionWith , insertWith, unionWith
...@@ -84,19 +80,11 @@ type Occ = Int ...@@ -84,19 +80,11 @@ type Occ = Int
ngrams :: Text -> [Text] ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs ngrams xs = monograms $ toLower $ filter isGram xs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
monograms :: Text -> [Text] monograms :: Text -> [Text]
monograms txt = split isWord txt monograms = words
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\''] isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/']
-- | Compute the occurrences (occ) -- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ occ :: Ord a => [a] -> Map a Occ
...@@ -141,30 +129,4 @@ text2fis n xs = list2fis n (map ngrams xs) ...@@ -141,30 +129,4 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined --text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest = ws
where
txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
-- | Number of sentences
ls = sentences <$> txt
-- | Number of monograms used in the full text
ws = ngrams <$> txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs = occ <$> ws
{-| module Language.Porter (stem, fixstem)
Module : Gargantext. where
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Ngrams.Stem.En
where
import Control.Monad import Control.Monad
import Data.Either
import Data.Maybe import Data.Maybe
import Data.Text (Text(), pack, unpack) import Data.List
import Data.List hiding (map, head)
import Gargantext.Prelude
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i isConsonant str i
| c `elem` vowels = False | c `elem` "aeiou" = False
| c == 'y' = i == 0 || isVowel str (i - 1) | c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True | otherwise = True
where where
c = str !! i c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1] byIndex fun str = fun str [0..length str - 1]
containsVowel :: [Char] -> Bool measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant)
containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . init . (True:)
. map fromJust . map head
. group . byIndex (map . isConsonant)
containsVowel = byIndex (any . isVowel)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse endsWithDouble = startsWithDouble . reverse
where where
startsWithDouble l = case l of startsWithDouble l | length l < 2 = False
(x:y:_) -> x == y && x `notElem` vowels | otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou"
_ -> False
cvc :: [Char] -> Bool
cvc word | length word < 3 = False cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex && | otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) && isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) && isConsonant word (lastIndex - 2) &&
last word `notElem` ['w','x','y'] last word `notElem` "wxy"
where lastIndex = length word - 1 where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement statefulReplace predicate str end replacement
| end `isSuffixOf` str = Just replaced | end `isSuffixOf` str = Just replaced
| otherwise = Nothing | otherwise = Nothing
...@@ -82,26 +40,17 @@ statefulReplace predicate str end replacement ...@@ -82,26 +40,17 @@ statefulReplace predicate str end replacement
replaced | predicate part = Right (part ++ replacement) replaced | predicate part = Right (part ++ replacement)
| otherwise = Left str | otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement result <- statefulReplace predicate str end replacement
return (either identity identity result) return (either id id result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure) measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result step1a word = fromMaybe word result
where where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result beforeStep1b word = fromMaybe (Left word) result
where where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v } cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
...@@ -111,27 +60,22 @@ beforeStep1b word = fromMaybe (Left word) result ...@@ -111,27 +60,22 @@ beforeStep1b word = fromMaybe (Left word) result
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus` cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" ) cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result afterStep1b word = fromMaybe word result
where where
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z']) double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz")
mEq1AndCvc = measure word == 1 && cvc word mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")] result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (init word) `mplus` iif double (init word)
`mplus` iif mEq1AndCvc (word ++ "e") `mplus` iif mEq1AndCvc (word ++ "e")
step1b :: [Char] -> [Char] step1b = either id afterStep1b . beforeStep1b
step1b = either identity afterStep1b . beforeStep1b
step1c :: [Char] -> [Char]
step1c word = fromMaybe word result step1c word = fromMaybe word result
where result = replaceEnd containsVowel word "y" "i" where result = replaceEnd containsVowel word "y" "i"
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result step2 word = fromMaybe word result
where where
result = findStem (measureGT 0) word result = findStem (measureGT 0) word
...@@ -157,7 +101,6 @@ step2 word = fromMaybe word result ...@@ -157,7 +101,6 @@ step2 word = fromMaybe word result
, ("biliti", "ble" ) , ("biliti", "ble" )
, ("logi", "log" ) ] , ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result step3 word = fromMaybe word result
where where
result = findStem (measureGT 0) word result = findStem (measureGT 0) word
...@@ -169,49 +112,37 @@ step3 word = fromMaybe word result ...@@ -169,49 +112,37 @@ step3 word = fromMaybe word result
, ("ful" , "" ) , ("ful" , "" )
, ("ness" , "" ) ] , ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result step4 word = fromMaybe word result
where where
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t'] gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st"
findGT1 = findStem (measureGT 1) word . map (flip (,) "") findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus` result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus` (findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"]) (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result step5a word = fromMaybe word result
where where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str)) test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" "" result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result step5b word = fromMaybe word result
where where
cond s = last s == 'l' && measureGT 1 s cond s = last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" "" result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1 allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text stem s | length s < 3 = s
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
| otherwise = allSteps s | otherwise = allSteps s
fixpoint :: Eq t => (t -> t) -> t -> t
fixpoint f x = let fx = f x in fixpoint f x = let fx = f x in
if fx == x if fx == x
then x then x
else fixpoint f fx else fixpoint f fx
fixstem :: [Char] -> [Char] fixstem = fixpoint stem
fixstem = fixpoint stem'
{- {-
......
...@@ -42,9 +42,8 @@ import Protolude ( Bool(True, False), Int, Double, Integer ...@@ -42,9 +42,8 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, Eq, (==), (>=), (<=), (<>), (/=) , Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not , (&&), (||), not
, fst, snd, toS , fst, snd, toS
, elem, die, mod, div, const , elem, die, mod, div
, curry, uncurry , curry, uncurry
, otherwise
) )
-- TODO import functions optimized in Utils.Count -- TODO import functions optimized in Utils.Count
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment