Words_hs 1.66 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
module Data.Gargantext.Ngrams.Words where
import Data.List (partition)
import Data.Set (fromList, notMember, member)
import Data.Char (isPunctuation, toLower, isAlpha, isSpace)

import NLP.Stemmer (stem, Stemmer(..))
import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
import Language.Aspell.Options (ACOption(..))

--import Data.Either.Utils (fromRight)
import Data.ByteString.Internal (packChars)


get_lang x = do
    let lang = Lang (packChars x)
    spell_lang <- spellCheckerWithOptions [lang]
    return spell_lang

check' lang x = check lang (packChars x)
suggest' lang x = suggest lang (packChars x)

--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x

-- stem French "naturelles"


-- paragraphes
-- lines
-- sentences

-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
repl :: Char -> Char
repl x
    | x == '\'' = ' '
    | x == '/' = ' '
    -- | x == '\t' = ' '
    -- | x == '\n' = ' '
    | otherwise = x

cleanText text = do
    -- pb avec \'
    --words $ filter (not . isPunctuation) $ Prelude.map toLower text
    words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text

isMiamWord word = do
    let miamWord_set = fromList ["salut", "phrase"]
    member word miamWord_set

isStopWord word = do
    let stopWord_set = fromList ["de", "la", "une", "avec"]
    member word stopWord_set

wordsMain = do
    let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
    print $ partition (not . isStopWord) $ cleanText text
    print $ filter (not . isStopWord) $ cleanText text
    --print $ filter isStopWord $ words $ filter (not . isPunctuation) text