Commit 2941ac6e authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Add unified stemming interface

This commit refactors ever so slightly the '.Stem' modules to create a
unified interface; now Gargantext.Core.Text.Terms.Mono.Stem exports a
single 'stem' function which can be used with different
`StemmingAlgorithm`s, and now the other algorithms' functions have been
shielded behind internal modules.

This makes possible to see at glance which part of Garg are using which
stemming algorithm, as it wasn't always clearer before.

This also allows us to have a better control over the langugages, as not
all the stemming algorithms supports all the languages that Garg
supports.
parent 63fcd605
......@@ -158,8 +158,10 @@ library
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter
Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
......@@ -327,7 +329,6 @@ library
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.PL
Gargantext.Core.Text.Samples.ZH
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi.Group
......
......@@ -32,7 +32,7 @@ import Gargantext.Core (Lang(..), Form, Lem, NLPServerConfig)
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Social.Patch
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Prelude
------------------------------------------------------------------------
......@@ -73,7 +73,7 @@ groupWith GroupIdentity t = identity t
groupWith (GroupParams { unGroupParams_lang = l }) t =
NgramsTerm
$ Text.intercalate " "
$ map (stem l)
$ map (stem l PorterAlgorithm)
-- . take n
$ List.sort
-- \$ Set.toList
......
......@@ -57,7 +57,7 @@ docSearchConfig =
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN
let tokStem = ST.stem ST.EN ST.PorterAlgorithm
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
......
......@@ -49,7 +49,7 @@ import Gargantext.Core
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types
......@@ -211,7 +211,7 @@ uniText = map (List.filter (not . isPunctuation))
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
......
......@@ -19,7 +19,7 @@ import Data.List qualified as L
import Data.Set qualified as S
import Data.Text qualified as T
import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types
import Gargantext.Prelude hiding (words)
import Prelude (String)
......@@ -43,7 +43,7 @@ monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words
......
{-|
Module : Gargantext.Core.Text.Ngrams.Stem
Description :
Module : Gargantext.Core.Text.Terms.Mono.Stem
Description : Stemming of mono (i.e. single word) terms.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -14,45 +14,67 @@ not be identical to the morphological root of the word; it is usually
sufficient that related words map to the same stem, even if this stem is
not in itself a valid root.
Source : https://en.wikipedia.org/wiki/Stemming
A stemmer for English, for example, should identify the string "cats"
(and possibly "catlike", "catty" etc.) as based on the root "cat", and
"stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
algorithm reduces the words "fishing", "fished", and "fisher" to the
root word, "fish". On the other hand, "argue", "argued", "argues",
"arguing", and "argus" reduce to the stem "argu" (illustrating the
case where the stem is not itself a word or root) but "argument" and
"arguments" reduce to the stem "argument".
-}
module Gargantext.Core.Text.Terms.Mono.Stem (stem, Lang(..))
where
import Data.Text qualified as DT
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import NLP.Stemmer qualified as N
-- (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
module Gargantext.Core.Text.Terms.Mono.Stem (
-- * Types
StemmingAlgorithm(..),
-- | Stemmer
-- * Universal stemming function
stem,
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
-- root word, "fish". On the other hand, "argue", "argued", "argues",
-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
-- case where the stem is not itself a word or root) but "argument" and
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
lang' = case lang of
EN -> N.English
FR -> N.French
_ -> panicTrace $ DT.pack "not implemented yet"
-- * Handy re-exports
Lang(..)
) where
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter qualified as Porter
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster qualified as Lancaster
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter qualified as GargPorter
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
-- | A stemming algorithm. There are different stemming algorithm,
-- each with different tradeoffs, strengths and weaknesses. Typically
-- one uses one or the other based on the given task at hand.
data StemmingAlgorithm
= -- | The porter algorithm is the classic stemming algorithm, possibly
-- one of the most widely used.
PorterAlgorithm
-- | Slight variation of the porter algorithm; it's more aggressive with
-- stemming, which might or might not be what you want. It also makes some
-- subtle chances to the stem; for example, the stemming of \"dancer\" using
-- Porter is simply \"dancer\" (i.e. it cannot be further stemmed). Using
-- Lancaster we would get \"dant\", which is not a prefix of the initial word anymore.
| LancasterAlgorithm
-- | A variation of the Porter algorithm tailored for Gargantext.
| GargPorterAlgorithm
deriving (Show, Eq, Ord)
-- | Stems the input 'Text' based on the input 'Lang' and using the
-- given 'StemmingAlgorithm'.
stem :: Lang -> StemmingAlgorithm -> Text -> Text
stem lang algo unstemmed = case algo of
PorterAlgorithm
-> Porter.stem lang unstemmed
LancasterAlgorithm
| EN <- lang
-> Lancaster.stem unstemmed
| otherwise
-> unstemmed -- Lancaster doesn't support any other language which is not english.
GargPorterAlgorithm
| EN <- lang
-> GargPorter.stem unstemmed
| otherwise
-> unstemmed -- Our garg porter doesn't support other languages other than english.
......@@ -16,7 +16,7 @@ Adapted from:
-}
module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
where
import Control.Monad
......@@ -194,8 +194,8 @@ step5 = step5b . step5a
allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1
stemIt :: Text -> Text
stemIt s = pack (stem' $ unpack s)
stem :: Text -> Text
stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char]
stem' s | length s < 3 = s
......
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
( stemIt
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
( stem
) where
import Prelude
......@@ -84,8 +84,8 @@ vowelsSet :: String
vowelsSet = "aeiouy"
{-# INLINE vowelsSet #-}
stemIt :: Text -> Text
stemIt inputText = lancasterStemmer inputText rulesPaper
stem :: Text -> Text
stem inputText = lancasterStemmer inputText rulesPaper
-- Lancaster Stemmer
lancasterStemmer :: Text -> RuleCollection -> Text
......@@ -113,8 +113,8 @@ applyRules value isIntact rules =
then Nothing
else case T.stripSuffix m val of
Nothing -> Nothing
Just stem ->
let next = stem `T.append` r
Just stm ->
let next = stm `T.append` r
in if not (acceptable next)
then Nothing
else if t == cont || t == contint
......
{-# LANGUAGE LambdaCase #-}
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
( stem ) where
import Prelude
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import NLP.Stemmer qualified as N
fromGargLang :: Lang -> N.Stemmer
fromGargLang = \case
DE -> N.German
EL -> N.Porter -- no greek specialised algo, defaults to 'Porter'
EN -> N.English
ES -> N.Spanish
FR -> N.French
IT -> N.Italian
PL -> N.Porter -- no Polish specialised algo, defaults to 'Porter'
PT -> N.Portuguese
RU -> N.Russian
UK -> N.Porter -- no Ukraine specialised algo, defaults to 'Porter'
ZH -> N.Porter -- no chinese specialised algo, defaults to 'Porter'
stem :: Lang -> T.Text -> T.Text
stem lang = T.pack . N.stem (fromGargLang lang) . T.unpack
......@@ -76,7 +76,7 @@ import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
......@@ -138,12 +138,12 @@ getDataText (ExternalOrigin api) la q mPubmedAPIKey mAuthKey li = do
cfg <- view hasConfig
eRes <- liftBase $ API.get api (_tt_lang la) q mPubmedAPIKey mAuthKey (_gc_epo_api_url cfg) li
pure $ DataNew <$> eRes
getDataText (InternalOrigin _) _la q _ _ _li = do
getDataText (InternalOrigin _) la q _ _ _li = do
(_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
(UserName userMaster)
(Left "")
(Nothing :: Maybe HyperdataCorpus)
ids <- map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q)
ids <- map fst <$> searchDocInDatabase cId (stem (_tt_lang la) GargPorterAlgorithm $ API.getRawQuery q)
pure $ Right $ DataOld ids
getDataText_Debug :: (HasNodeError err)
......
......@@ -33,7 +33,7 @@ import Data.Text qualified as T
import Data.Time (UTCTime)
import Gargantext.Core
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types
import Gargantext.Core.Types.Query (IsTrash, Limit, Offset)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
......@@ -181,7 +181,7 @@ searchInCorpus :: HasDBid NodeType
searchInCorpus cId t q o l order = runOpaQuery
$ filterWith o l order
$ queryInCorpus cId t
$ API.mapQuery (Term . stemIt . getTerm) q
$ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
searchCountInCorpus :: HasDBid NodeType
=> CorpusId
......@@ -190,7 +190,7 @@ searchCountInCorpus :: HasDBid NodeType
-> DBCmd err Int
searchCountInCorpus cId t q = runCountOpaQuery
$ queryInCorpus cId t
$ API.mapQuery (Term . stemIt . getTerm) q
$ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
queryInCorpus :: HasDBid NodeType
=> CorpusId
......@@ -233,7 +233,7 @@ searchInCorpusWithContacts cId aId q o l _order =
$ offset' o
$ orderBy (desc _fp_score)
$ selectGroup cId aId
$ API.mapQuery (Term . stemIt . getTerm) q
$ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
selectGroup :: HasDBid NodeType
=> CorpusId
......
......@@ -19,7 +19,7 @@ import Data.String (IsString(..))
import Database.PostgreSQL.Simple (Query)
import Database.PostgreSQL.Simple.ToField
import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types
import Gargantext.Core.Types.Query (Limit, Offset)
import Gargantext.Database.Prelude (DBCmd, runPGSQuery)
......@@ -30,7 +30,7 @@ newtype TSQuery = UnsafeTSQuery [Text]
-- | TODO [""] -> panic "error"
toTSQuery :: [Text] -> TSQuery
toTSQuery txt = UnsafeTSQuery $ map stemIt txt
toTSQuery txt = UnsafeTSQuery $ map (stem EN GargPorterAlgorithm) txt
instance IsString TSQuery
......
......@@ -34,7 +34,7 @@ import Gargantext.Database.Schema.Node (NodePoly(..))
import Test.Database.Types
import Test.Hspec.Expectations
import Test.Tasty.HUnit
import Gargantext.Core.Text.Terms.Mono.Stem.En
import Gargantext.Core.Text.Terms.Mono.Stem
import Gargantext.Database.Admin.Config (userMaster)
import qualified Data.Text as T
import qualified Gargantext.Core.Text.Corpus.Query as API
......@@ -137,8 +137,8 @@ corpusAddDocuments env = do
stemmingTest :: TestEnv -> Assertion
stemmingTest _env = do
stemIt "Ajeje" `shouldBe` "Ajeje"
stemIt "PyPlasm:" `shouldBe` "PyPlasm:"
stem EN GargPorterAlgorithm "Ajeje" `shouldBe` "Ajeje"
stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:"
mkQ :: T.Text -> API.Query
mkQ txt = either (\e -> error $ "(query) = " <> T.unpack txt <> ": " <> e) id . API.parseQuery . API.RawQuery $ txt
......
......@@ -5,7 +5,7 @@ import Prelude
import Data.ByteString.Char8 qualified as C8
import Data.Text qualified as T
import Gargantext.Core.Text.Terms.Mono.Stem.Lancaster (stemIt)
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster (stem)
import Gargantext.Prelude (toS)
import Test.Tasty
import Test.Tasty.Golden (goldenVsStringDiff)
......@@ -128,4 +128,4 @@ testWords = [
]
mkTestVector :: IO BL.ByteString
mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stemIt w)) testWords)
mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stem w)) testWords)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment