Stem.hs 1.82 KB
Newer Older
1 2 3 4 5 6 7 8 9
{-|
Module      : Gargantext.Text.Terms.Mono.Stem
Description : Stem single words (English and French)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

In linguistic morphology and information retrieval, stemming is the
process of reducing inflected (or sometimes derived) words to their word
stem, base or root form—generally a written word form. The @stem@ need
not be identical to the morphological root of the word; it is usually
sufficient that related words map to the same stem, even if this stem is
not in itself a valid root.
Source : https://en.wikipedia.org/wiki/Stemming

-}

20
{-# LANGUAGE NoImplicitPrelude #-}
21

22
module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
23 24 25 26 27 28
  where

import Data.Text (Text)
import qualified Data.Text   as DT
import qualified NLP.Stemmer as N

29
import Gargantext.Prelude
30 31 32 33 34 35 36
import Gargantext.Core (Lang(..))

-- (stem, Stemmer(..))

--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))

37 38 39 40 41 42 43 44 45 46 47 48 49 50 51

-- | Reduce a word to its stem, using the stemming algorithm that matches
-- the given language.
--
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat", and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
-- root word, "fish". On the other hand, "argue", "argued", "argues",
-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
-- case where the stem is not itself a word or root) but "argument" and
-- "arguments" reduce to the stem "argument".
--
-- Only 'EN' and 'FR' are mapped to a stemming algorithm. For any other
-- 'Lang' value the result is a 'panic' carrying the message
-- @not implemented yet@, raised once the stemmed text is demanded.
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem (algorithmFor lang) . DT.unpack
  where
    -- 'N.stem' consumes and produces 'String', hence the unpack/pack
    -- round trip around it in the pipeline above.
    --
    -- Translate the project-level language tag into the stemmer
    -- library's algorithm selector.
    algorithmFor EN = N.English
    algorithmFor FR = N.French
    -- Every remaining language is unsupported for now.
    algorithmFor _  = panic $ DT.pack "not implemented yet"
59

60 61