Commit e21bad3e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT/STEM] implemenging Porter lib into Gargantext for English language.

parent dbff4c96
...@@ -40,6 +40,7 @@ library: ...@@ -40,6 +40,7 @@ library:
- Gargantext.Ngrams.CoreNLP - Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser - Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En - Gargantext.Ngrams.Lang.En
- Gargantext.Ngrams.Stem.En
- Gargantext.Ngrams.Lang.Fr - Gargantext.Ngrams.Lang.Fr
- Gargantext.Ngrams.Metrics - Gargantext.Ngrams.Metrics
- Gargantext.Ngrams.TextMining - Gargantext.Ngrams.TextMining
......
...@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters ...@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics , module Gargantext.Ngrams.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
, ListName(..), equivNgrams, isGram , ListName(..), equivNgrams, isGram, sentences
, ngramsTest
--, module Gargantext.Ngrams.Words --, module Gargantext.Ngrams.Words
) where ) where
...@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS ...@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.List (sort) import Data.List (sort)
import Data.Char (Char, isAlphaNum, isSpace) import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, words, filter, toLower) import Data.Text (Text, filter, toLower, split, lines, concat)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
import Data.Map.Strict (Map import Data.Map.Strict (Map
, empty , empty
, insertWith, unionWith , insertWith, unionWith
...@@ -80,11 +84,19 @@ type Occ = Int ...@@ -80,11 +84,19 @@ type Occ = Int
ngrams :: Text -> [Text] ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs ngrams xs = monograms $ toLower $ filter isGram xs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
monograms :: Text -> [Text] monograms :: Text -> [Text]
monograms = words monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/'] isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
-- | Compute the occurrences (occ) -- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ occ :: Ord a => [a] -> Map a Occ
...@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs) ...@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined --text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest = ws
where
txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
-- | Number of sentences
ls = sentences <$> txt
-- | Number of monograms used in the full text
ws = ngrams <$> txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs = occ <$> ws
...@@ -28,7 +28,7 @@ import Gargantext.Prelude ...@@ -28,7 +28,7 @@ import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix) import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text) import Data.Text (Text)
import Network.HTTP.Simple import Network.HTTP.Simple
data Token = Token { _tokenIndex :: Int data Token = Token { _tokenIndex :: Int
......
module Language.Porter (stem, fixstem) {-|
where Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Ngrams.Stem.En
where
import Control.Monad import Control.Monad
import Data.Either
import Data.Maybe import Data.Maybe
import Data.List import Data.Text (Text(), pack, unpack)
import Data.List hiding (map, head)
import Gargantext.Prelude
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i isConsonant str i
| c `elem` "aeiou" = False | c `elem` vowels = False
| c == 'y' = i == 0 || isVowel str (i - 1) | c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True | otherwise = True
where where
c = str !! i c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1] byIndex fun str = fun str [0..length str - 1]
measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant) containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel) containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . init . (True:)
. map fromJust . map head
. group . byIndex (map . isConsonant)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse endsWithDouble = startsWithDouble . reverse
where where
startsWithDouble l | length l < 2 = False startsWithDouble l = case l of
| otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou" (x:y:_) -> x == y && x `notElem` vowels
_ -> False
cvc :: [Char] -> Bool
cvc word | length word < 3 = False cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex && | otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) && isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) && isConsonant word (lastIndex - 2) &&
last word `notElem` "wxy" last word `notElem` ['w','x','y']
where lastIndex = length word - 1 where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement statefulReplace predicate str end replacement
| end `isSuffixOf` str = Just replaced | end `isSuffixOf` str = Just replaced
| otherwise = Nothing | otherwise = Nothing
...@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement ...@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
replaced | predicate part = Right (part ++ replacement) replaced | predicate part = Right (part ++ replacement)
| otherwise = Left str | otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement result <- statefulReplace predicate str end replacement
return (either id id result) return (either identity identity result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure) measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result step1a word = fromMaybe word result
where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")] where
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result beforeStep1b word = fromMaybe (Left word) result
where where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v } cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
...@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result ...@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus` cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" ) cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result afterStep1b word = fromMaybe word result
where where
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz") double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
mEq1AndCvc = measure word == 1 && cvc word mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")] result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (init word) `mplus` iif double (init word)
`mplus` iif mEq1AndCvc (word ++ "e") `mplus` iif mEq1AndCvc (word ++ "e")
step1b = either id afterStep1b . beforeStep1b step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b
step1c :: [Char] -> [Char]
step1c word = fromMaybe word result step1c word = fromMaybe word result
where result = replaceEnd containsVowel word "y" "i" where result = replaceEnd containsVowel word "y" "i"
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result step2 word = fromMaybe word result
where where
result = findStem (measureGT 0) word result = findStem (measureGT 0) word
...@@ -101,6 +157,7 @@ step2 word = fromMaybe word result ...@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
, ("biliti", "ble" ) , ("biliti", "ble" )
, ("logi", "log" ) ] , ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result step3 word = fromMaybe word result
where where
result = findStem (measureGT 0) word result = findStem (measureGT 0) word
...@@ -112,37 +169,49 @@ step3 word = fromMaybe word result ...@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
, ("ful" , "" ) , ("ful" , "" )
, ("ness" , "" ) ] , ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result step4 word = fromMaybe word result
where where
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st" gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
findGT1 = findStem (measureGT 1) word . map (flip (,) "") findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus` result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus` (findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"]) (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result step5a word = fromMaybe word result
where where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str)) test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" "" result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result step5b word = fromMaybe word result
where where
cond s = last s == 'l' && measureGT 1 s cond s = last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" "" result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1 allSteps = step5 . step4 . step3 . step2 . step1
stem s | length s < 3 = s stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
| otherwise = allSteps s | otherwise = allSteps s
fixpoint :: Eq t => (t -> t) -> t -> t
fixpoint f x = let fx = f x in fixpoint f x = let fx = f x in
if fx == x if fx == x
then x then x
else fixpoint f fx else fixpoint f fx
fixstem = fixpoint stem fixstem :: [Char] -> [Char]
fixstem = fixpoint stem'
{- {-
......
...@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer ...@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, Eq, (==), (>=), (<=), (<>), (/=) , Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not , (&&), (||), not
, fst, snd, toS , fst, snd, toS
, elem, die, mod, div , elem, die, mod, div, const
, curry, uncurry , curry, uncurry
, otherwise
) )
-- TODO import functions optimized in Utils.Count -- TODO import functions optimized in Utils.Count
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment