diff --git a/package.yaml b/package.yaml index de6001abc69c35d10c550677e25d4fa6358129d3..c77fb050d200c60f9f22c3772638f34751fa901f 100644 --- a/package.yaml +++ b/package.yaml @@ -40,6 +40,7 @@ library: - Gargantext.Ngrams.CoreNLP - Gargantext.Ngrams.Parser - Gargantext.Ngrams.Lang.En + - Gargantext.Ngrams.Stem.En - Gargantext.Ngrams.Lang.Fr - Gargantext.Ngrams.Metrics - Gargantext.Ngrams.TextMining diff --git a/src/Gargantext/Ngrams.hs b/src/Gargantext/Ngrams.hs index 2fe57c0c5dddd6cb0c57770e93be8729c1c346f3..9b9f9fa7b1cf07302cb1ec442cf76f4e795f1020 100644 --- a/src/Gargantext/Ngrams.hs +++ b/src/Gargantext/Ngrams.hs @@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters , module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.Metrics - , Ngrams(..), ngrams, occ, sumOcc, text2fis - , ListName(..), equivNgrams, isGram + , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean + , ListName(..), equivNgrams, isGram, sentences + , ngramsTest --, module Gargantext.Ngrams.Words ) where @@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS import Data.List (sort) import Data.Char (Char, isAlphaNum, isSpace) -import Data.Text (Text, words, filter, toLower) +import Data.Text (Text, filter, toLower, split, lines, concat) +import qualified Data.Text as DT +import Data.Text.IO (readFile) + import Data.Map.Strict (Map , empty , insertWith, unionWith @@ -80,11 +84,19 @@ type Occ = Int ngrams :: Text -> [Text] ngrams xs = monograms $ toLower $ filter isGram xs +clean :: Text -> Text +clean txt = DT.map clean' txt + where + clean' '’' = '\'' + clean' c = c + monograms :: Text -> [Text] -monograms = words +monograms txt = split isWord txt + where + isWord c = c `elem` [' ', '\'', ',', ';'] isGram :: Char -> Bool -isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/'] +isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\''] -- | Compute the occurrences (occ) occ :: Ord a => [a] -> Map a Occ @@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs) --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) --text2fisWith = undefined +------------------------------------------------------------------- +-- Contexts of text + +sentences :: Text -> [Text] +sentences txt = split isStop txt + +isStop :: Char -> Bool +isStop c = c `elem` ['.','?','!'] + + +-- | Tests +-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html +ngramsTest = ws + where + txt = concat <$> lines <$> clean <$> readFile "Giono-arbres.txt" + -- | Number of sentences + ls = sentences <$> txt + -- | Number of monograms used in the full text + ws = ngrams <$> txt + -- | stem ngrams + -- TODO + -- group ngrams + ocs = occ <$> ws + + + diff --git a/src/Gargantext/Ngrams/CoreNLP.hs b/src/Gargantext/Ngrams/CoreNLP.hs index 4ce14fc4de16a71cf223b6a078d15545858b37b5..69db9e21197f49106914c68513bd598b9b196969 100644 --- a/src/Gargantext/Ngrams/CoreNLP.hs +++ b/src/Gargantext/Ngrams/CoreNLP.hs @@ -28,7 +28,7 @@ import Gargantext.Prelude import Gargantext.Utils.Prefix (unPrefix) import Data.Text (Text) -import Network.HTTP.Simple +import Network.HTTP.Simple data Token = Token { _tokenIndex :: Int diff --git a/src/Gargantext/Ngrams/Stem/En.hs b/src/Gargantext/Ngrams/Stem/En.hs index 3d301b409caefabff1b7a38c653ac9bdd96da748..8d5beda52ca5b171e20eaa763f93bcd71aa8c1bd 100644 --- a/src/Gargantext/Ngrams/Stem/En.hs +++ b/src/Gargantext/Ngrams/Stem/En.hs @@ -1,37 +1,79 @@ -module Language.Porter (stem, fixstem) -where +{-| +Module : Gargantext. +Description : Porter Algorithm Implementation purely Haskell +Copyright : (c) CNRS, 2017-Present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +Adapted from: + - source: https://hackage.haskell.org/package/porter + - [Char] -> [Text] + - adding Types signatures + - fixes unseen cases + +-} + +{-# LANGUAGE NoImplicitPrelude #-} + +module Gargantext.Ngrams.Stem.En + where import Control.Monad +import Data.Either import Data.Maybe -import Data.List +import Data.Text (Text(), pack, unpack) + +import Data.List hiding (map, head) + +import Gargantext.Prelude +vowels :: [Char] +vowels = ['a','e','i','o','u'] + +isConsonant :: [Char] -> Int -> Bool isConsonant str i - | c `elem` "aeiou" = False - | c == 'y' = i == 0 || isVowel str (i - 1) - | otherwise = True + | c `elem` vowels = False + | c == 'y' = i == 0 || isVowel str (i - 1) + | otherwise = True where c = str !! i +isVowel :: [Char] -> Int -> Bool isVowel = (not .) . isConsonant +byIndex :: Foldable t1 => (t1 a -> [Int] -> t2) -> t1 a -> t2 byIndex fun str = fun str [0..length str - 1] -measure = length . filter not . init . (True:) . map head . group . byIndex (map . isConsonant) - +containsVowel :: [Char] -> Bool containsVowel = byIndex (any . isVowel) +-- | /!\ unsafe fromJust +measure :: [Char] -> Int +measure = length . filter not . init . (True:) + . map fromJust . map head + . group . byIndex (map . isConsonant) + + +endsWithDouble :: [Char] -> Bool endsWithDouble = startsWithDouble . reverse where - startsWithDouble l | length l < 2 = False - | otherwise = let (x:y:_) = l in x == y && x `notElem` "aeiou" + startsWithDouble l = case l of + (x:y:_) -> x == y && x `notElem` vowels + _ -> False +cvc :: [Char] -> Bool cvc word | length word < 3 = False | otherwise = isConsonant word lastIndex && isVowel word (lastIndex - 1) && isConsonant word (lastIndex - 2) && - last word `notElem` "wxy" - where lastIndex = length word - 1 + last word `notElem` ['w','x','y'] + where lastIndex = length word - 1 +statefulReplace :: Eq a => ([a] -> Bool) + -> [a] -> [a] -> [a] + -> Maybe (Data.Either.Either [a] [a]) statefulReplace predicate str end replacement | end `isSuffixOf` str = Just replaced | otherwise = Nothing @@ -40,17 +82,26 @@ statefulReplace predicate str end replacement replaced | predicate part = Right (part ++ replacement) | otherwise = Left str +replaceEnd :: Eq a => ([a] -> Bool) -> [a] -> [a] -> [a] -> Maybe [a] replaceEnd predicate str end replacement = do result <- statefulReplace predicate str end replacement - return (either id id result) + return (either identity identity result) +findStem + :: (Foldable t, Functor t, Eq a) => + ([a] -> Bool) -> [a] -> t ([a], [a]) -> Maybe [a] findStem f word pairs = msum $ map (uncurry (replaceEnd f word)) pairs +measureGT :: Int -> [Char] -> Bool measureGT = flip ((>) . measure) +step1a :: [Char] -> [Char] step1a word = fromMaybe word result - where result = findStem (const True) word [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")] + where + result = findStem (const True) word suffixes + suffixes = [("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")] +beforeStep1b :: [Char] -> Either [Char] [Char] beforeStep1b word = fromMaybe (Left word) result where cond23 x = do { v <- x; either (const Nothing) (return . Right) v } @@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result cond23 (statefulReplace containsVowel word "ed" "" ) `mplus` cond23 (statefulReplace containsVowel word "ing" "" ) +afterStep1b :: [Char] -> [Char] afterStep1b word = fromMaybe word result where - double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) "lsz") + double = endsWithDouble word && not (any ((`isSuffixOf` word) . return) ['l','s','z']) mEq1AndCvc = measure word == 1 && cvc word iif cond val = if cond then Just val else Nothing result = findStem (const True) word [("at", "ate"), ("bl", "ble"), ("iz", "ize")] `mplus` iif double (init word) `mplus` iif mEq1AndCvc (word ++ "e") -step1b = either id afterStep1b . beforeStep1b +step1b :: [Char] -> [Char] +step1b = either identity afterStep1b . beforeStep1b +step1c :: [Char] -> [Char] step1c word = fromMaybe word result where result = replaceEnd containsVowel word "y" "i" +step1 :: [Char] -> [Char] step1 = step1c . step1b . step1a +step2 :: [Char] -> [Char] step2 word = fromMaybe word result where result = findStem (measureGT 0) word @@ -101,6 +157,7 @@ step2 word = fromMaybe word result , ("biliti", "ble" ) , ("logi", "log" ) ] +step3 :: [Char] -> [Char] step3 word = fromMaybe word result where result = findStem (measureGT 0) word @@ -112,37 +169,49 @@ step3 word = fromMaybe word result , ("ful" , "" ) , ("ness" , "" ) ] +step4 :: [Char] -> [Char] step4 word = fromMaybe word result where - gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) "st" + gt1andST str = (measureGT 1) str && any ((`isSuffixOf` str) . return) ['s','t'] findGT1 = findStem (measureGT 1) word . map (flip (,) "") result = (findGT1 ["al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent"]) `mplus` (findStem gt1andST word [("ion","")]) `mplus` (findGT1 ["ou", "ism", "ate", "iti", "ous", "ive", "ize"]) +step5a :: [Char] -> [Char] step5a word = fromMaybe word result where test str = (measureGT 1 str) || ((measure str == 1) && (not $ cvc str)) result = replaceEnd test word "e" "" +step5b :: [Char] -> [Char] step5b word = fromMaybe word result where cond s = last s == 'l' && measureGT 1 s result = replaceEnd cond word "l" "" +step5 :: [Char] -> [Char] step5 = step5b . step5a +allSteps :: [Char] -> [Char] allSteps = step5 . step4 . step3 . step2 . step1 -stem s | length s < 3 = s +stem :: Text -> Text +stem s = pack (stem' $ unpack s) + +stem' :: [Char] -> [Char] +stem' s | length s < 3 = s | otherwise = allSteps s +fixpoint :: Eq t => (t -> t) -> t -> t fixpoint f x = let fx = f x in if fx == x then x else fixpoint f fx -fixstem = fixpoint stem +fixstem :: [Char] -> [Char] +fixstem = fixpoint stem' + {- diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index c8ac142bae0f69256d5bdbc1f0beb730de11d671..b967adf14595a156ffc619597d2f1ed23c42ea6f 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer , Eq, (==), (>=), (<=), (<>), (/=) , (&&), (||), not , fst, snd, toS - , elem, die, mod, div + , elem, die, mod, div, const , curry, uncurry + , otherwise ) -- TODO import functions optimized in Utils.Count