Commit fa0ed7df authored by Alexandre Delanoë's avatar Alexandre Delanoë

Revert "[FEAT/STEM] implemenging Porter lib into Gargantext for English language."

This reverts commit e21bad3e.
parent 800756bc
......@@ -40,7 +40,6 @@ library:
- Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En
- Gargantext.Ngrams.Stem.En
- Gargantext.Ngrams.Lang.Fr
- Gargantext.Ngrams.Metrics
- Gargantext.Ngrams.TextMining
......
......@@ -23,9 +23,8 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
, ListName(..), equivNgrams, isGram, sentences
, ngramsTest
, Ngrams(..), ngrams, occ, sumOcc, text2fis
, ListName(..), equivNgrams, isGram
--, module Gargantext.Ngrams.Words
) where
......@@ -44,10 +43,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.List (sort)
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, filter, toLower, split, lines, concat)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
......@@ -84,19 +80,11 @@ type Occ = Int
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
monograms :: Text -> [Text]
monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
monograms = words
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/']
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
......@@ -141,30 +129,4 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest = ws
where
txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
-- | Number of sentences
ls = sentences <$> txt
-- | Number of monograms used in the full text
ws = ngrams <$> txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs = occ <$> ws
{-|
Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Ngrams.Stem.En
where
module Language.Porter (stem, fixstem)
where
import Control.Monad
import Data.Either
import Data.Maybe
import Data.Text (Text(), pack, unpack)
import Data.List hiding (map, head)
import Gargantext.Prelude
import Data.List
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i
| c `elem` vowels = False
| c `elem` "aeiou" = False
| c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True
where
c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1]
containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . init . (True:)
. map fromJust . map head
. group . byIndex (map . isConsonant)
measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant)
containsVowel = byIndex (any . isVowel)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse
where
startsWithDouble l = case l of
(x:y:_) -> x == y && x `notElem` vowels
_ -> False
startsWithDouble l | length l < 2 = False
| otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou"
cvc :: [Char] -> Bool
cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) &&
last word `notElem` ['w','x','y']
last word `notElem` "wxy"
where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement
| end `isSuffixOf` str = Just replaced
| otherwise = Nothing
......@@ -82,26 +40,17 @@ statefulReplace predicate str end replacement
replaced | predicate part = Right (part ++ replacement)
| otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement
return (either identity identity result)
return (either id id result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result
where
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result
where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
......@@ -111,27 +60,22 @@ beforeStep1b word = fromMaybe (Left word) result
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result
where
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz")
mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (init word)
`mplus` iif mEq1AndCvc (word ++ "e")
step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b
step1b = either id afterStep1b . beforeStep1b
step1c :: [Char] -> [Char]
step1c word = fromMaybe word result
where result = replaceEnd containsVowel word "y" "i"
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result
where
result = findStem (measureGT 0) word
......@@ -157,7 +101,6 @@ step2 word = fromMaybe word result
, ("biliti", "ble" )
, ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result
where
result = findStem (measureGT 0) word
......@@ -169,49 +112,37 @@ step3 word = fromMaybe word result
, ("ful" , "" )
, ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result
where
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st"
findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result
where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result
where
cond s = last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
stem s | length s < 3 = s
| otherwise = allSteps s
fixpoint :: Eq t => (t -> t) -> t -> t
fixpoint f x = let fx = f x in
if fx == x
then x
else fixpoint f fx
fixstem :: [Char] -> [Char]
fixstem = fixpoint stem'
fixstem = fixpoint stem
{-
......
......@@ -42,9 +42,8 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not
, fst, snd, toS
, elem, die, mod, div, const
, elem, die, mod, div
, curry, uncurry
, otherwise
)
-- TODO import functions optimized in Utils.Count
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment