Commit 02202afe authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT/STEM] implementing Porter lib into Gargantext for English language.

parent 20b24e1b
......@@ -40,6 +40,7 @@ library:
- Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En
- Gargantext.Ngrams.Stem.En
- Gargantext.Ngrams.Lang.Fr
- Gargantext.Ngrams.Metrics
- Gargantext.Ngrams.TextMining
......
......@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis
, ListName(..), equivNgrams, isGram
, Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
, ListName(..), equivNgrams, isGram, sentences
, ngramsTest
--, module Gargantext.Ngrams.Words
) where
......@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.List (sort)
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Text (Text, filter, toLower, split, lines, concat)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
......@@ -80,11 +84,19 @@ type Occ = Int
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
monograms :: Text -> [Text]
monograms = words
monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/']
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
......@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest = ws
where
txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
-- | Number of sentences
ls = sentences <$> txt
-- | Number of monograms used in the full text
ws = ngrams <$> txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs = occ <$> ws
......@@ -28,7 +28,7 @@ import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)
import Network.HTTP.Simple
import Network.HTTP.Simple
data Token = Token { _tokenIndex :: Int
......
module Language.Porter (stem, fixstem)
where
{-|
Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Ngrams.Stem.En
where
import Control.Monad
import Data.Either
import Data.Maybe
import Data.List
import Data.Text (Text(), pack, unpack)
import Data.List hiding (map, head)
import Gargantext.Prelude
vowels :: [Char]
vowels = ['a','e','i','o','u']
isConsonant :: [Char] -> Int -> Bool
isConsonant str i
| c `elem` "aeiou" = False
| c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True
| c `elem` vowels = False
| c == 'y' = i == 0 || isVowel str (i - 1)
| otherwise = True
where
c = str !! i
isVowel :: [Char] -> Int -> Bool
isVowel = (not .) . isConsonant
byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
byIndex fun str = fun str [0..length str - 1]
measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant)
containsVowel :: [Char] -> Bool
containsVowel = byIndex (any . isVowel)
-- | /!\ unsafe fromJust
measure :: [Char] -> Int
measure = length . filter not . init . (True:)
. map fromJust . map head
. group . byIndex (map . isConsonant)
endsWithDouble :: [Char] -> Bool
endsWithDouble = startsWithDouble . reverse
where
startsWithDouble l | length l < 2 = False
| otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou"
startsWithDouble l = case l of
(x:y:_) -> x == y && x `notElem` vowels
_ -> False
cvc :: [Char] -> Bool
cvc word | length word < 3 = False
| otherwise = isConsonant word lastIndex &&
isVowel word (lastIndex - 1) &&
isConsonant word (lastIndex - 2) &&
last word `notElem` "wxy"
where lastIndex = length word - 1
last word `notElem` ['w','x','y']
where lastIndex = length word - 1
statefulReplace :: Eq a => ([a] -> Bool)
-> [a] -> [a] -> [a]
-> Maybe (Data.Either.Either [a] [a])
statefulReplace predicate str end replacement
| end `isSuffixOf` str = Just replaced
| otherwise = Nothing
......@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
replaced | predicate part = Right (part ++ replacement)
| otherwise = Left str
replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
replaceEnd predicate str end replacement = do
result <- statefulReplace predicate str end replacement
return (either id id result)
return (either identity identity result)
findStem
:: (Foldable t, Functor t, Eq a) =>
([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
measureGT :: Int -> [Char] -> Bool
measureGT = flip ((>) . measure)
step1a :: [Char] -> [Char]
step1a word = fromMaybe word result
where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
where
result = findStem (const True) word suffixes
suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")]
beforeStep1b :: [Char] -> Either [Char] [Char]
beforeStep1b word = fromMaybe (Left word) result
where
cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
......@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
cond23 (statefulReplace containsVowel word "ed" "" ) `mplus`
cond23 (statefulReplace containsVowel word "ing" "" )
afterStep1b :: [Char] -> [Char]
afterStep1b word = fromMaybe word result
where
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz")
double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
mEq1AndCvc = measure word == 1 && cvc word
iif cond val = if cond then Just val else Nothing
result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
`mplus` iif double (init word)
`mplus` iif mEq1AndCvc (word ++ "e")
step1b = either id afterStep1b . beforeStep1b
step1b :: [Char] -> [Char]
step1b = either identity afterStep1b . beforeStep1b
step1c :: [Char] -> [Char]
step1c word = fromMaybe word result
where result = replaceEnd containsVowel word "y" "i"
step1 :: [Char] -> [Char]
step1 = step1c . step1b . step1a
step2 :: [Char] -> [Char]
step2 word = fromMaybe word result
where
result = findStem (measureGT 0) word
......@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
, ("biliti", "ble" )
, ("logi", "log" ) ]
step3 :: [Char] -> [Char]
step3 word = fromMaybe word result
where
result = findStem (measureGT 0) word
......@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
, ("ful" , "" )
, ("ness" , "" ) ]
step4 :: [Char] -> [Char]
step4 word = fromMaybe word result
where
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st"
gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
findGT1 = findStem (measureGT 1) word . map (flip (,) "")
result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
(findStem gt1andST word [("ion","")]) `mplus`
(findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
step5a :: [Char] -> [Char]
step5a word = fromMaybe word result
where
test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
result = replaceEnd test word "e" ""
step5b :: [Char] -> [Char]
step5b word = fromMaybe word result
where
cond s = last s == 'l' && measureGT 1 s
result = replaceEnd cond word "l" ""
step5 :: [Char] -> [Char]
step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stem s | length s < 3 = s
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
| otherwise = allSteps s
fixpoint :: Eq t => (t -> t) -> t -> t
fixpoint f x = let fx = f x in
if fx == x
then x
else fixpoint f fx
fixstem = fixpoint stem
fixstem :: [Char] -> [Char]
fixstem = fixpoint stem'
{-
......
......@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, Eq, (==), (>=), (<=), (<>), (/=)
, (&&), (||), not
, fst, snd, toS
, elem, die, mod, div
, elem, die, mod, div, const
, curry, uncurry
, otherwise
)
-- TODO import functions optimized in Utils.Count
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment