Add unified stemming interface

This commit refactors ever so slightly the '.Stem' modules to create a unified interface; now Gargantext.Core.Text.Terms.Mono.Stem exports a single 'stem' function which can be used with different `StemmingAlgorithm`s, and now the other algorithms' functions have been shielded behind internal modules. This makes possible to see at glance which part of Garg are using which stemming algorithm, as it wasn't always clearer before. This also allows us to have a better control over the langugages, as not all the stemming algorithms supports all the languages that Garg supports.

Add unified stemming interface
This commit refactors ever so slightly the '.Stem' modules to create a unified interface; now Gargantext.Core.Text.Terms.Mono.Stem exports a single 'stem' function which can be used with different `StemmingAlgorithm`s, and now the other algorithms' functions have been shielded behind internal modules. This makes possible to see at glance which part of Garg are using which stemming algorithm, as it wasn't always clearer before. This also allows us to have a better control over the langugages, as not all the stemming algorithms supports all the languages that Garg supports.
2941ac6e · Alfredo Di Napoli · 63fcd605 · 2941ac6e · 2941ac6e · 2941ac6e
Commit 2941ac6e authored Mar 11, 2024 by Alfredo Di Napoli
14 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -158,8 +158,10 @@ library
      Gargantext.Core.Text.Terms
      Gargantext.Core.Text.Terms.Eleve
      Gargantext.Core.Text.Terms.Mono
-      Gargantext.Core.Text.Terms.Mono.Stem.En
-      Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
+      Gargantext.Core.Text.Terms.Mono.Stem
+      Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter
+      Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
+      Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
      Gargantext.Core.Text.Terms.Multi
      Gargantext.Core.Text.Terms.Multi.Lang.En
      Gargantext.Core.Text.Terms.Multi.Lang.Fr
@@ -327,7 +329,6 @@ library
      Gargantext.Core.Text.Samples.FR
      Gargantext.Core.Text.Samples.PL
      Gargantext.Core.Text.Samples.ZH
-      Gargantext.Core.Text.Terms.Mono.Stem
      Gargantext.Core.Text.Terms.Mono.Token
      Gargantext.Core.Text.Terms.Mono.Token.En
      Gargantext.Core.Text.Terms.Multi.Group

--- a/src/Gargantext/Core/Text/List/Group/WithStem.hs
+++ b/src/Gargantext/Core/Text/List/Group/WithStem.hs
@@ -32,7 +32,7 @@ import Gargantext.Core (Lang(..), Form, Lem, NLPServerConfig)
 import Gargantext.Core.Text.List.Group.Prelude
 import Gargantext.Core.Text.List.Social.Patch
 import Gargantext.Core.Text.List.Social.Prelude
-import Gargantext.Core.Text.Terms.Mono.Stem (stem)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Prelude

 ------------------------------------------------------------------------
@@ -73,7 +73,7 @@ groupWith GroupIdentity  t = identity t
 groupWith (GroupParams { unGroupParams_lang = l }) t =
                    NgramsTerm
                  $ Text.intercalate " "
-                  $ map (stem l)
+                  $ map (stem l PorterAlgorithm)
                  -- . take n
                  $ List.sort
                  -- \$ Set.toList

--- a/src/Gargantext/Core/Text/Search.hs
+++ b/src/Gargantext/Core/Text/Search.hs
@@ -57,7 +57,7 @@ docSearchConfig =

    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok =
-      let tokStem = ST.stem ST.EN
+      let tokStem = ST.stem ST.EN ST.PorterAlgorithm
       in \field -> case field of
                      TitleField    -> tokStem tok
                      AbstractField -> tokStem tok

--- a/src/Gargantext/Core/Text/Terms.hs
+++ b/src/Gargantext/Core/Text/Terms.hs
@@ -49,7 +49,7 @@ import Gargantext.Core
 import Gargantext.Core.Text (sentences, HasText(..))
 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
-import Gargantext.Core.Text.Terms.Mono.Stem (stem)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
 import Gargantext.Core.Text.Terms.Multi (multiterms)
 import Gargantext.Core.Types
@@ -211,7 +211,7 @@ uniText = map (List.filter (not . isPunctuation))

 text2term :: Lang -> [Text] -> Terms
 text2term _ [] = Terms [] Set.empty
-text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
+text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)

 isPunctuation :: Text -> Bool
 isPunctuation x = List.elem x $  (Text.pack . pure)

--- a/src/Gargantext/Core/Text/Terms/Mono.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono.hs
@@ -19,7 +19,7 @@ import Data.List qualified as L
 import Data.Set qualified as S
 import Data.Text qualified as T
 import Gargantext.Core
-import Gargantext.Core.Text.Terms.Mono.Stem (stem)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Types
 import Gargantext.Prelude hiding (words)
 import Prelude (String)
@@ -43,7 +43,7 @@ monoTexts = L.concat . monoTextsBySentence

 -- | TODO use text2term only
 monoText2term :: Lang -> Text -> Terms
-monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
+monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)

 monoTextsBySentence :: Text -> [[Text]]
 monoTextsBySentence = map T.words

--- a/src/Gargantext/Core/Text/Terms/Mono/Stem.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono/Stem.hs
 {-|
-Module      : Gargantext.Core.Text.Ngrams.Stem
-Description : 
+Module      : Gargantext.Core.Text.Terms.Mono.Stem
+Description : Stemming of mono (i.e. single word) terms.
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
@@ -14,45 +14,67 @@ not be identical to the morphological root of the word; it is usually
 sufficient that related words map to the same stem, even if this stem is
 not in itself a valid root.
 Source : https://en.wikipedia.org/wiki/Stemming
+A stemmer for English, for example, should identify the string "cats"
+(and possibly "catlike", "catty" etc.) as based on the root "cat", and
+"stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
+algorithm reduces the words "fishing", "fished", and "fisher" to the
+root word, "fish". On the other hand, "argue", "argued", "argues",
+"arguing", and "argus" reduce to the stem "argu" (illustrating the
+case where the stem is not itself a word or root) but "argument" and
+"arguments" reduce to the stem "argument".

 -}


-module Gargantext.Core.Text.Terms.Mono.Stem (stem, Lang(..))
-  where
-
-import Data.Text qualified as DT
-import Gargantext.Core (Lang(..))
-import Gargantext.Prelude
-import NLP.Stemmer qualified as N
-
-- (stem, Stemmer(..))
-
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
+module Gargantext.Core.Text.Terms.Mono.Stem (

+  -- * Types
+  StemmingAlgorithm(..),

-- | Stemmer
+  -- * Universal stemming function
+  stem,

-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
-- root word, "fish". On the other hand, "argue", "argued", "argues",
-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
-- case where the stem is not itself a word or root) but "argument" and
-- "arguments" reduce to the stem "argument".
-
-
-stem :: Lang -> Text -> Text
-stem lang = DT.pack . N.stem lang' . DT.unpack
-  where
-    lang' = case lang of
-              EN -> N.English
-              FR -> N.French
-              _  -> panicTrace $ DT.pack "not implemented yet"
+  -- * Handy re-exports
+  Lang(..)

+  ) where

+import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter qualified as Porter
+import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster qualified as Lancaster
+import Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter qualified as GargPorter
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude

+-- | A stemming algorithm. There are different stemming algorithm,
+-- each with different tradeoffs, strengths and weaknesses. Typically
+-- one uses one or the other based on the given task at hand.
+data StemmingAlgorithm
+  = -- | The porter algorithm is the classic stemming algorithm, possibly
+    -- one of the most widely used.
+    PorterAlgorithm
+    -- | Slight variation of the porter algorithm; it's more aggressive with
+    -- stemming, which might or might not be what you want. It also makes some
+    -- subtle chances to the stem; for example, the stemming of \"dancer\" using
+    -- Porter is simply \"dancer\" (i.e. it cannot be further stemmed). Using
+    -- Lancaster we would get \"dant\", which is not a prefix of the initial word anymore.
+  | LancasterAlgorithm
+    -- | A variation of the Porter algorithm tailored for Gargantext.
+  | GargPorterAlgorithm
+  deriving (Show, Eq, Ord)
+
+-- | Stems the input 'Text' based on the input 'Lang' and using the
+-- given 'StemmingAlgorithm'.
+stem :: Lang -> StemmingAlgorithm -> Text -> Text
+stem lang algo unstemmed = case algo of
+  PorterAlgorithm
+    -> Porter.stem lang unstemmed
+  LancasterAlgorithm
+    | EN <- lang
+    -> Lancaster.stem unstemmed
+    | otherwise
+    -> unstemmed -- Lancaster doesn't support any other language which is not english.
+  GargPorterAlgorithm
+    | EN <- lang
+    -> GargPorter.stem unstemmed
+    | otherwise
+    -> unstemmed -- Our garg porter doesn't support other languages other than english.
--- a/src/Gargantext/Core/Text/Terms/Mono/Stem/En.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono/Stem/En.hs
@@ -16,7 +16,7 @@ Adapted from:
 -}


-module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
+module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
  where

 import Control.Monad
@@ -194,8 +194,8 @@ step5 = step5b . step5a
 allSteps :: [Char] -> [Char]
 allSteps = step5 . step4 . step3 . step2 . step1

-stemIt :: Text -> Text
-stemIt s = pack (stem' $ unpack s)
+stem :: Text -> Text
+stem s = pack (stem' $ unpack s)

 stem' :: [Char] -> [Char]
 stem' s | length s < 3 = s

--- a/src/Gargantext/Core/Text/Terms/Mono/Stem/Lancaster.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono/Stem/Lancaster.hs
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
-  ( stemIt
+module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
+  ( stem
  ) where

 import Prelude
@@ -84,8 +84,8 @@ vowelsSet :: String
 vowelsSet = "aeiouy"
 {-# INLINE vowelsSet #-}

-stemIt :: Text -> Text
-stemIt inputText = lancasterStemmer inputText rulesPaper
+stem :: Text -> Text
+stem inputText = lancasterStemmer inputText rulesPaper

 -- Lancaster Stemmer
 lancasterStemmer :: Text -> RuleCollection -> Text
@@ -113,8 +113,8 @@ applyRules value isIntact rules =
            then Nothing
            else case T.stripSuffix m val of
                    Nothing -> Nothing
-                    Just stem ->
-                        let next = stem `T.append` r
+                    Just stm ->
+                        let next = stm `T.append` r
                        in if not (acceptable next)
                            then Nothing
                            else if t == cont || t == contint

--- a/src/Gargantext/Core/Text/Terms/Mono/Stem/Internal/Porter.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono/Stem/Internal/Porter.hs
+{-# LANGUAGE LambdaCase #-}
+module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
+  ( stem ) where
+
+import Prelude
+
+import Data.Text qualified as T
+import Gargantext.Core (Lang(..))
+import NLP.Stemmer qualified as N
+
+fromGargLang :: Lang -> N.Stemmer
+fromGargLang = \case
+  DE -> N.German
+  EL -> N.Porter -- no greek specialised algo, defaults to 'Porter'
+  EN -> N.English
+  ES -> N.Spanish
+  FR -> N.French
+  IT -> N.Italian
+  PL -> N.Porter -- no Polish specialised algo, defaults to 'Porter'
+  PT -> N.Portuguese
+  RU -> N.Russian
+  UK -> N.Porter -- no Ukraine specialised algo, defaults to 'Porter'
+  ZH -> N.Porter -- no chinese specialised algo, defaults to 'Porter'
+
+stem :: Lang -> T.Text -> T.Text
+stem lang = T.pack . N.stem (fromGargLang lang) . T.unpack
--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -76,7 +76,7 @@ import Gargantext.Core.Text.List (buildNgramsLists)
 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
 import Gargantext.Core.Text.Terms
-import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Types (HasValidationError, TermsCount)
 import Gargantext.Core.Types.Individu (User(..))
 import Gargantext.Core.Types.Main
@@ -138,12 +138,12 @@ getDataText (ExternalOrigin api) la q mPubmedAPIKey mAuthKey li = do
  cfg <- view hasConfig
  eRes <- liftBase $ API.get api (_tt_lang la) q mPubmedAPIKey mAuthKey (_gc_epo_api_url cfg) li
  pure $ DataNew <$> eRes
-getDataText (InternalOrigin _) _la q _ _ _li = do
+getDataText (InternalOrigin _) la q _ _ _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
-  ids <-  map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q)
+  ids <-  map fst <$> searchDocInDatabase cId (stem (_tt_lang la) GargPorterAlgorithm $ API.getRawQuery q)
  pure $ Right $ DataOld ids

 getDataText_Debug :: (HasNodeError err)

--- a/src/Gargantext/Database/Action/Search.hs
+++ b/src/Gargantext/Database/Action/Search.hs
@@ -33,7 +33,7 @@ import Data.Text qualified as T
 import Data.Time (UTCTime)
 import Gargantext.Core
 import Gargantext.Core.Text.Corpus.Query qualified as API
-import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Types
 import Gargantext.Core.Types.Query (IsTrash, Limit, Offset)
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
@@ -181,7 +181,7 @@ searchInCorpus :: HasDBid NodeType
 searchInCorpus cId t q o l order = runOpaQuery
                                 $ filterWith o l order
                                 $ queryInCorpus cId t
-                                 $ API.mapQuery (Term . stemIt . getTerm) q
+                                 $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q

 searchCountInCorpus :: HasDBid NodeType
                    => CorpusId
@@ -190,7 +190,7 @@ searchCountInCorpus :: HasDBid NodeType
                    -> DBCmd err Int
 searchCountInCorpus cId t q = runCountOpaQuery
                            $ queryInCorpus cId t
-                            $ API.mapQuery (Term . stemIt . getTerm) q
+                            $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q

 queryInCorpus :: HasDBid NodeType
              => CorpusId
@@ -233,7 +233,7 @@ searchInCorpusWithContacts cId aId q o l _order =
              $ offset'  o
              $ orderBy (desc _fp_score)
              $ selectGroup cId aId
-              $ API.mapQuery (Term . stemIt . getTerm) q
+              $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q

 selectGroup :: HasDBid NodeType
            => CorpusId

--- a/src/Gargantext/Database/Action/TSQuery.hs
+++ b/src/Gargantext/Database/Action/TSQuery.hs
@@ -19,7 +19,7 @@ import Data.String (IsString(..))
 import Database.PostgreSQL.Simple (Query)
 import Database.PostgreSQL.Simple.ToField
 import Gargantext.Core
-import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Types
 import Gargantext.Core.Types.Query (Limit, Offset)
 import Gargantext.Database.Prelude (DBCmd, runPGSQuery)
@@ -30,7 +30,7 @@ newtype TSQuery = UnsafeTSQuery [Text]

 -- | TODO [""] -> panic "error"
 toTSQuery :: [Text] -> TSQuery
-toTSQuery txt = UnsafeTSQuery $ map stemIt txt
+toTSQuery txt = UnsafeTSQuery $ map (stem EN GargPorterAlgorithm) txt


 instance IsString TSQuery

--- a/test/Test/Database/Operations/DocumentSearch.hs
+++ b/test/Test/Database/Operations/DocumentSearch.hs
@@ -34,7 +34,7 @@ import Gargantext.Database.Schema.Node (NodePoly(..))
 import Test.Database.Types
 import Test.Hspec.Expectations
 import Test.Tasty.HUnit
-import Gargantext.Core.Text.Terms.Mono.Stem.En
+import Gargantext.Core.Text.Terms.Mono.Stem
 import Gargantext.Database.Admin.Config (userMaster)
 import qualified Data.Text as T
 import qualified Gargantext.Core.Text.Corpus.Query as API
@@ -137,8 +137,8 @@ corpusAddDocuments env = do

 stemmingTest :: TestEnv -> Assertion
 stemmingTest _env = do
-  stemIt "Ajeje"    `shouldBe` "Ajeje"
-  stemIt "PyPlasm:" `shouldBe` "PyPlasm:"
+  stem EN GargPorterAlgorithm "Ajeje"    `shouldBe` "Ajeje"
+  stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:"

 mkQ :: T.Text -> API.Query
 mkQ txt = either (\e -> error $ "(query) = " <> T.unpack txt <> ": " <> e) id . API.parseQuery . API.RawQuery $ txt

--- a/test/Test/Offline/Stemming/Lancaster.hs
+++ b/test/Test/Offline/Stemming/Lancaster.hs
@@ -5,7 +5,7 @@ import Prelude

 import Data.ByteString.Char8 qualified as C8
 import Data.Text qualified as T
-import Gargantext.Core.Text.Terms.Mono.Stem.Lancaster (stemIt)
+import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster (stem)
 import Gargantext.Prelude (toS)
 import Test.Tasty
 import Test.Tasty.Golden (goldenVsStringDiff)
@@ -128,4 +128,4 @@ testWords = [
  ]

 mkTestVector :: IO BL.ByteString
-mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stemIt w)) testWords)
+mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stem w)) testWords)