[FEAT/STEM] implemenging Porter lib into Gargantext for English language.

e21bad3e · Alexandre Delanoë · dbff4c96 · e21bad3e · e21bad3e · e21bad3e
Commit e21bad3e authored May 02, 2018 by Alexandre Delanoë
5 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -40,6 +40,7 @@ library:
  - Gargantext.Ngrams.CoreNLP
  - Gargantext.Ngrams.Parser
  - Gargantext.Ngrams.Lang.En
+  - Gargantext.Ngrams.Stem.En
  - Gargantext.Ngrams.Lang.Fr
  - Gargantext.Ngrams.Metrics
  - Gargantext.Ngrams.TextMining

--- a/src/Gargantext/Ngrams.hs
+++ b/src/Gargantext/Ngrams.hs
@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                         , module Gargantext.Ngrams.Occurrences
                         , module Gargantext.Ngrams.TextMining
                         , module Gargantext.Ngrams.Metrics
-                         , Ngrams(..), ngrams, occ, sumOcc, text2fis
+                         , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
-                         , ListName(..), equivNgrams, isGram
+                         , ListName(..), equivNgrams, isGram, sentences
+                         , ngramsTest
                             --, module Gargantext.Ngrams.Words
                         ) where
@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
 import Data.List (sort)
 import Data.Char (Char, isAlphaNum, isSpace)
-import Data.Text (Text, words, filter, toLower)
+import Data.Text (Text, filter, toLower, split, lines, concat)
+import qualified Data.Text as DT
+import Data.Text.IO (readFile)
 import Data.Map.Strict  (Map
                        , empty
                        , insertWith, unionWith
@@ -80,11 +84,19 @@ type Occ     = Int
 ngrams :: Text -> [Text]
 ngrams xs = monograms $ toLower $ filter isGram xs
+clean :: Text -> Text
+clean txt = DT.map clean' txt
+  where
+    clean' '’' = '\''
+    clean' c  = c
 monograms :: Text -> [Text]
-monograms = words
+monograms txt = split isWord txt
+  where
+    isWord c = c `elem` [' ', '\'', ',', ';']
 isGram :: Char -> Bool
-isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/']
+isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
 -- | Compute the occurrences (occ)
 occ :: Ord a => [a] -> Map a Occ
@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 --text2fisWith = undefined
+-------------------------------------------------------------------
+-- Contexts of text
+sentences :: Text -> [Text]
+sentences txt = split isStop txt
+isStop :: Char -> Bool
+isStop c = c `elem` ['.','?','!']
+-- | Tests
+-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
+ngramsTest =  ws
+  where
+    txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt"
+    -- | Number of sentences
+    ls   = sentences <$> txt
+    -- | Number of monograms used in the full text
+    ws   = ngrams    <$> txt
+    -- | stem ngrams
+    -- TODO
+    -- group ngrams
+    ocs  = occ       <$> ws
--- a/src/Gargantext/Ngrams/CoreNLP.hs
+++ b/src/Gargantext/Ngrams/CoreNLP.hs
@@ -28,7 +28,7 @@ import Gargantext.Prelude
 import Gargantext.Utils.Prefix (unPrefix)
 import Data.Text (Text)
-import           Network.HTTP.Simple
+import Network.HTTP.Simple
 data Token = Token { _tokenIndex                :: Int

--- a/src/Gargantext/Ngrams/Stem/En.hs
+++ b/src/Gargantext/Ngrams/Stem/En.hs
-module Language.Porter (stem, fixstem)
+{-|
-where
+Module      : Gargantext.
+Description : Porter Algorithm Implementation purely Haskell
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Adapted from:
+  - source: https://hackage.haskell.org/package/porter
+  - [Char] -> [Text]
+  - adding Types signatures
+  - fixes unseen cases
+-}
+{-# LANGUAGE NoImplicitPrelude #-}
+module Gargantext.Ngrams.Stem.En
+  where
 import Control.Monad
+import Data.Either
 import Data.Maybe
-import Data.List
+import Data.Text (Text(), pack, unpack)
+import Data.List hiding (map, head)
+import Gargantext.Prelude
+vowels :: [Char]
+vowels = ['a','e','i','o','u']
+isConsonant :: [Char] -> Int -> Bool
 isConsonant str i
-    | c `elem` "aeiou"  = False
+    | c `elem` vowels  = False
-    | c == 'y'          = i == 0 || isVowel str (i - 1)
+    | c == 'y'         = i == 0 || isVowel str (i - 1)
-    | otherwise         = True
+    | otherwise        = True
    where
        c = str !! i
+isVowel :: [Char] -> Int -> Bool
 isVowel = (not .) . isConsonant
+byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2
 byIndex fun str = fun str [0..length str - 1]
-measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant)
+containsVowel :: [Char] -> Bool
 containsVowel = byIndex (any . isVowel)
+-- | /!\ unsafe fromJust
+measure :: [Char] -> Int
+measure = length . filter not . init . (True:) 
+                 . map fromJust . map head 
+                 . group . byIndex (map . isConsonant)
+endsWithDouble :: [Char] -> Bool
 endsWithDouble = startsWithDouble . reverse
    where
-        startsWithDouble l | length l < 2 = False
+        startsWithDouble l = case l of
-                           | otherwise    = let (x:y:_) = l in x == y && x `notElem` "aeiou"
+                               (x:y:_) -> x == y && x `notElem` vowels
+                               _       -> False
+cvc :: [Char] -> Bool
 cvc word | length word < 3 = False
         | otherwise       = isConsonant word lastIndex       &&
                             isVowel     word (lastIndex - 1) &&
                             isConsonant word (lastIndex - 2) &&
-                             last word `notElem` "wxy"
+                             last word `notElem` ['w','x','y']
-    where lastIndex = length word - 1
+              where lastIndex = length word - 1
+statefulReplace :: Eq a => ([a] -> Bool)
+                        -> [a] -> [a] -> [a] 
+                        -> Maybe (Data.Either.Either [a] [a])
 statefulReplace predicate str end replacement
    | end `isSuffixOf` str  = Just replaced
    | otherwise             = Nothing
@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
        replaced | predicate part = Right (part ++ replacement)
                 | otherwise      = Left str
+replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a]
 replaceEnd predicate str end replacement = do
            result <- statefulReplace predicate str end replacement
-            return (either id id result)
+            return (either identity identity result)
+findStem
+  :: (Foldable t, Functor t, Eq a) =>
+     ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a]
 findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs
+measureGT :: Int -> [Char] -> Bool
 measureGT = flip ((>) . measure)
+step1a :: [Char] -> [Char]
 step1a word = fromMaybe word result
-    where result = findStem (const True) word [("sses", "ss"), ("ies",  "i"), ("ss", "ss"), ("s", "")]
+    where 
+      result = findStem (const True) word suffixes
+      suffixes = [("sses", "ss"), ("ies",  "i"), ("ss", "ss"), ("s", "")]
+beforeStep1b :: [Char] -> Either [Char] [Char]
 beforeStep1b word = fromMaybe (Left word) result
    where
       cond23 x = do { v <- x; either (const Nothing) (return . Right) v }
@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
           cond23 (statefulReplace containsVowel word "ed"  ""  ) `mplus`
           cond23 (statefulReplace containsVowel word "ing" ""  )
+afterStep1b :: [Char] -> [Char]
 afterStep1b word = fromMaybe word result
    where
-        double        = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz")
+        double        = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z'])
        mEq1AndCvc    = measure word == 1 && cvc word
        iif cond val  = if cond then Just val else Nothing
        result        = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")]
                        `mplus` iif double (init word)
                        `mplus` iif mEq1AndCvc (word ++ "e")
-step1b = either id afterStep1b . beforeStep1b
+step1b :: [Char] -> [Char]
+step1b = either identity afterStep1b . beforeStep1b
+step1c :: [Char] -> [Char]
 step1c word = fromMaybe word result
    where result = replaceEnd containsVowel word "y" "i"
+step1 :: [Char] -> [Char]
 step1 = step1c . step1b . step1a
+step2 :: [Char] -> [Char]
 step2 word = fromMaybe word result
    where
       result = findStem (measureGT 0) word
@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
           , ("biliti",  "ble" )
           , ("logi",    "log" ) ]
+step3 :: [Char] -> [Char]
 step3 word = fromMaybe word result
    where
       result = findStem (measureGT 0) word
@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
           , ("ful"  , ""  )
           , ("ness" , ""  ) ]
+step4 :: [Char] -> [Char]
 step4 word = fromMaybe word result
    where
-        gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st"
+        gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t']
        findGT1      = findStem (measureGT 1) word . map (flip (,) "")
        result       = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus`
                       (findStem gt1andST word [("ion","")]) `mplus`
                       (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"])
+step5a :: [Char] -> [Char]
 step5a word = fromMaybe word result
    where
        test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str))
        result   = replaceEnd test word "e" ""
+step5b :: [Char] -> [Char]
 step5b word = fromMaybe word result
    where
       cond s = last s == 'l' && measureGT 1 s
       result = replaceEnd cond word "l" ""
+step5 :: [Char] -> [Char]
 step5 = step5b . step5a
+allSteps :: [Char] -> [Char]
 allSteps = step5 . step4 . step3 . step2 . step1
-stem s | length s < 3 = s
+stem :: Text -> Text
+stem s = pack (stem' $ unpack s)
+stem' :: [Char] -> [Char]
+stem' s | length s < 3 = s
       | otherwise    = allSteps s
+fixpoint :: Eq t => (t -> t) -> t -> t
 fixpoint f x = let fx = f x in
                   if fx == x
                      then x
                      else fixpoint f fx
-fixstem = fixpoint stem
+fixstem :: [Char] -> [Char]
+fixstem = fixpoint stem'
 {-

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , Eq, (==), (>=), (<=), (<>), (/=)
                 , (&&), (||), not
                 , fst, snd, toS
-                 , elem, die, mod, div
+                 , elem, die, mod, div, const
                 , curry, uncurry
+                 , otherwise
                 )
 -- TODO import functions optimized in Utils.Count