Commit 2941ac6e authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Add unified stemming interface

This commit refactors ever so slightly the '.Stem' modules to create a
unified interface; now Gargantext.Core.Text.Terms.Mono.Stem exports a
single 'stem' function which can be used with different
`StemmingAlgorithm`s, and now the other algorithms' functions have been
shielded behind internal modules.

This makes possible to see at glance which part of Garg are using which
stemming algorithm, as it wasn't always clearer before.

This also allows us to have a better control over the langugages, as not
all the stemming algorithms supports all the languages that Garg
supports.
parent 63fcd605
Pipeline #5724 passed with stages
in 113 minutes and 10 seconds
...@@ -158,8 +158,10 @@ library ...@@ -158,8 +158,10 @@ library
Gargantext.Core.Text.Terms Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Mono.Stem.En Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.Lancaster Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter
Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
Gargantext.Core.Text.Terms.Multi Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Lang.En Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr Gargantext.Core.Text.Terms.Multi.Lang.Fr
...@@ -327,7 +329,6 @@ library ...@@ -327,7 +329,6 @@ library
Gargantext.Core.Text.Samples.FR Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.PL Gargantext.Core.Text.Samples.PL
Gargantext.Core.Text.Samples.ZH Gargantext.Core.Text.Samples.ZH
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Token Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi.Group Gargantext.Core.Text.Terms.Multi.Group
......
...@@ -32,7 +32,7 @@ import Gargantext.Core (Lang(..), Form, Lem, NLPServerConfig) ...@@ -32,7 +32,7 @@ import Gargantext.Core (Lang(..), Form, Lem, NLPServerConfig)
import Gargantext.Core.Text.List.Group.Prelude import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Social.Patch import Gargantext.Core.Text.List.Social.Patch
import Gargantext.Core.Text.List.Social.Prelude import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Terms.Mono.Stem (stem) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Prelude import Gargantext.Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -73,7 +73,7 @@ groupWith GroupIdentity t = identity t ...@@ -73,7 +73,7 @@ groupWith GroupIdentity t = identity t
groupWith (GroupParams { unGroupParams_lang = l }) t = groupWith (GroupParams { unGroupParams_lang = l }) t =
NgramsTerm NgramsTerm
$ Text.intercalate " " $ Text.intercalate " "
$ map (stem l) $ map (stem l PorterAlgorithm)
-- . take n -- . take n
$ List.sort $ List.sort
-- \$ Set.toList -- \$ Set.toList
......
...@@ -57,7 +57,7 @@ docSearchConfig = ...@@ -57,7 +57,7 @@ docSearchConfig =
normaliseQueryToken :: Text -> DocField -> Text normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok = normaliseQueryToken tok =
let tokStem = ST.stem ST.EN let tokStem = ST.stem ST.EN ST.PorterAlgorithm
in \field -> case field of in \field -> case field of
TitleField -> tokStem tok TitleField -> tokStem tok
AbstractField -> tokStem tok AbstractField -> tokStem tok
......
...@@ -49,7 +49,7 @@ import Gargantext.Core ...@@ -49,7 +49,7 @@ import Gargantext.Core
import Gargantext.Core.Text (sentences, HasText(..)) import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken) import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms) import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms) import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types import Gargantext.Core.Types
...@@ -211,7 +211,7 @@ uniText = map (List.filter (not . isPunctuation)) ...@@ -211,7 +211,7 @@ uniText = map (List.filter (not . isPunctuation))
text2term :: Lang -> [Text] -> Terms text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt) text2term lang txt = Terms txt (Set.fromList $ map (stem lang PorterAlgorithm) txt)
isPunctuation :: Text -> Bool isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure) isPunctuation x = List.elem x $ (Text.pack . pure)
......
...@@ -19,7 +19,7 @@ import Data.List qualified as L ...@@ -19,7 +19,7 @@ import Data.List qualified as L
import Data.Set qualified as S import Data.Set qualified as S
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono.Stem (stem) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Prelude hiding (words) import Gargantext.Prelude hiding (words)
import Prelude (String) import Prelude (String)
...@@ -43,7 +43,7 @@ monoTexts = L.concat . monoTextsBySentence ...@@ -43,7 +43,7 @@ monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only -- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt) monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)
monoTextsBySentence :: Text -> [[Text]] monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words monoTextsBySentence = map T.words
......
{-| {-|
Module : Gargantext.Core.Text.Ngrams.Stem Module : Gargantext.Core.Text.Terms.Mono.Stem
Description : Description : Stemming of mono (i.e. single word) terms.
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
...@@ -14,45 +14,67 @@ not be identical to the morphological root of the word; it is usually ...@@ -14,45 +14,67 @@ not be identical to the morphological root of the word; it is usually
sufficient that related words map to the same stem, even if this stem is sufficient that related words map to the same stem, even if this stem is
not in itself a valid root. not in itself a valid root.
Source : https://en.wikipedia.org/wiki/Stemming Source : https://en.wikipedia.org/wiki/Stemming
A stemmer for English, for example, should identify the string "cats"
(and possibly "catlike", "catty" etc.) as based on the root "cat", and
"stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
algorithm reduces the words "fishing", "fished", and "fisher" to the
root word, "fish". On the other hand, "argue", "argued", "argues",
"arguing", and "argus" reduce to the stem "argu" (illustrating the
case where the stem is not itself a word or root) but "argument" and
"arguments" reduce to the stem "argument".
-} -}
module Gargantext.Core.Text.Terms.Mono.Stem (stem, Lang(..)) module Gargantext.Core.Text.Terms.Mono.Stem (
where
import Data.Text qualified as DT
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import NLP.Stemmer qualified as N
-- (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
-- * Types
StemmingAlgorithm(..),
-- | Stemmer -- * Universal stemming function
stem,
-- A stemmer for English, for example, should identify the string "cats" -- * Handy re-exports
-- (and possibly "catlike", "catty" etc.) as based on the root "cat". Lang(..)
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
-- root word, "fish". On the other hand, "argue", "argued", "argues",
-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
-- case where the stem is not itself a word or root) but "argument" and
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
lang' = case lang of
EN -> N.English
FR -> N.French
_ -> panicTrace $ DT.pack "not implemented yet"
) where
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter qualified as Porter
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster qualified as Lancaster
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter qualified as GargPorter
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
-- | A stemming algorithm. There are different stemming algorithm,
-- each with different tradeoffs, strengths and weaknesses. Typically
-- one uses one or the other based on the given task at hand.
data StemmingAlgorithm
= -- | The porter algorithm is the classic stemming algorithm, possibly
-- one of the most widely used.
PorterAlgorithm
-- | Slight variation of the porter algorithm; it's more aggressive with
-- stemming, which might or might not be what you want. It also makes some
-- subtle chances to the stem; for example, the stemming of \"dancer\" using
-- Porter is simply \"dancer\" (i.e. it cannot be further stemmed). Using
-- Lancaster we would get \"dant\", which is not a prefix of the initial word anymore.
| LancasterAlgorithm
-- | A variation of the Porter algorithm tailored for Gargantext.
| GargPorterAlgorithm
deriving (Show, Eq, Ord)
-- | Stems the input 'Text' based on the input 'Lang' and using the
-- given 'StemmingAlgorithm'.
stem :: Lang -> StemmingAlgorithm -> Text -> Text
stem lang algo unstemmed = case algo of
PorterAlgorithm
-> Porter.stem lang unstemmed
LancasterAlgorithm
| EN <- lang
-> Lancaster.stem unstemmed
| otherwise
-> unstemmed -- Lancaster doesn't support any other language which is not english.
GargPorterAlgorithm
| EN <- lang
-> GargPorter.stem unstemmed
| otherwise
-> unstemmed -- Our garg porter doesn't support other languages other than english.
...@@ -16,7 +16,7 @@ Adapted from: ...@@ -16,7 +16,7 @@ Adapted from:
-} -}
module Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) module Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter (stem)
where where
import Control.Monad import Control.Monad
...@@ -194,8 +194,8 @@ step5 = step5b . step5a ...@@ -194,8 +194,8 @@ step5 = step5b . step5a
allSteps :: [Char] -> [Char] allSteps :: [Char] -> [Char]
allSteps = step5 . step4 . step3 . step2 . step1 allSteps = step5 . step4 . step3 . step2 . step1
stemIt :: Text -> Text stem :: Text -> Text
stemIt s = pack (stem' $ unpack s) stem s = pack (stem' $ unpack s)
stem' :: [Char] -> [Char] stem' :: [Char] -> [Char]
stem' s | length s < 3 = s stem' s | length s < 3 = s
......
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Mono.Stem.Lancaster module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster
( stemIt ( stem
) where ) where
import Prelude import Prelude
...@@ -84,8 +84,8 @@ vowelsSet :: String ...@@ -84,8 +84,8 @@ vowelsSet :: String
vowelsSet = "aeiouy" vowelsSet = "aeiouy"
{-# INLINE vowelsSet #-} {-# INLINE vowelsSet #-}
stemIt :: Text -> Text stem :: Text -> Text
stemIt inputText = lancasterStemmer inputText rulesPaper stem inputText = lancasterStemmer inputText rulesPaper
-- Lancaster Stemmer -- Lancaster Stemmer
lancasterStemmer :: Text -> RuleCollection -> Text lancasterStemmer :: Text -> RuleCollection -> Text
...@@ -113,8 +113,8 @@ applyRules value isIntact rules = ...@@ -113,8 +113,8 @@ applyRules value isIntact rules =
then Nothing then Nothing
else case T.stripSuffix m val of else case T.stripSuffix m val of
Nothing -> Nothing Nothing -> Nothing
Just stem -> Just stm ->
let next = stem `T.append` r let next = stm `T.append` r
in if not (acceptable next) in if not (acceptable next)
then Nothing then Nothing
else if t == cont || t == contint else if t == cont || t == contint
......
{-# LANGUAGE LambdaCase #-}
module Gargantext.Core.Text.Terms.Mono.Stem.Internal.Porter
( stem ) where
import Prelude
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import NLP.Stemmer qualified as N
fromGargLang :: Lang -> N.Stemmer
fromGargLang = \case
DE -> N.German
EL -> N.Porter -- no greek specialised algo, defaults to 'Porter'
EN -> N.English
ES -> N.Spanish
FR -> N.French
IT -> N.Italian
PL -> N.Porter -- no Polish specialised algo, defaults to 'Porter'
PT -> N.Portuguese
RU -> N.Russian
UK -> N.Porter -- no Ukraine specialised algo, defaults to 'Porter'
ZH -> N.Porter -- no chinese specialised algo, defaults to 'Porter'
stem :: Lang -> T.Text -> T.Text
stem lang = T.pack . N.stem (fromGargLang lang) . T.unpack
...@@ -76,7 +76,7 @@ import Gargantext.Core.Text.List (buildNgramsLists) ...@@ -76,7 +76,7 @@ import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..)) import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types (HasValidationError, TermsCount) import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
...@@ -138,12 +138,12 @@ getDataText (ExternalOrigin api) la q mPubmedAPIKey mAuthKey li = do ...@@ -138,12 +138,12 @@ getDataText (ExternalOrigin api) la q mPubmedAPIKey mAuthKey li = do
cfg <- view hasConfig cfg <- view hasConfig
eRes <- liftBase $ API.get api (_tt_lang la) q mPubmedAPIKey mAuthKey (_gc_epo_api_url cfg) li eRes <- liftBase $ API.get api (_tt_lang la) q mPubmedAPIKey mAuthKey (_gc_epo_api_url cfg) li
pure $ DataNew <$> eRes pure $ DataNew <$> eRes
getDataText (InternalOrigin _) _la q _ _ _li = do getDataText (InternalOrigin _) la q _ _ _li = do
(_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
(UserName userMaster) (UserName userMaster)
(Left "") (Left "")
(Nothing :: Maybe HyperdataCorpus) (Nothing :: Maybe HyperdataCorpus)
ids <- map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q) ids <- map fst <$> searchDocInDatabase cId (stem (_tt_lang la) GargPorterAlgorithm $ API.getRawQuery q)
pure $ Right $ DataOld ids pure $ Right $ DataOld ids
getDataText_Debug :: (HasNodeError err) getDataText_Debug :: (HasNodeError err)
......
...@@ -33,7 +33,7 @@ import Data.Text qualified as T ...@@ -33,7 +33,7 @@ import Data.Text qualified as T
import Data.Time (UTCTime) import Data.Time (UTCTime)
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Text.Corpus.Query qualified as API import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Types.Query (IsTrash, Limit, Offset) import Gargantext.Core.Types.Query (IsTrash, Limit, Offset)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), HyperdataContact(..))
...@@ -181,7 +181,7 @@ searchInCorpus :: HasDBid NodeType ...@@ -181,7 +181,7 @@ searchInCorpus :: HasDBid NodeType
searchInCorpus cId t q o l order = runOpaQuery searchInCorpus cId t q o l order = runOpaQuery
$ filterWith o l order $ filterWith o l order
$ queryInCorpus cId t $ queryInCorpus cId t
$ API.mapQuery (Term . stemIt . getTerm) q $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
searchCountInCorpus :: HasDBid NodeType searchCountInCorpus :: HasDBid NodeType
=> CorpusId => CorpusId
...@@ -190,7 +190,7 @@ searchCountInCorpus :: HasDBid NodeType ...@@ -190,7 +190,7 @@ searchCountInCorpus :: HasDBid NodeType
-> DBCmd err Int -> DBCmd err Int
searchCountInCorpus cId t q = runCountOpaQuery searchCountInCorpus cId t q = runCountOpaQuery
$ queryInCorpus cId t $ queryInCorpus cId t
$ API.mapQuery (Term . stemIt . getTerm) q $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
queryInCorpus :: HasDBid NodeType queryInCorpus :: HasDBid NodeType
=> CorpusId => CorpusId
...@@ -233,7 +233,7 @@ searchInCorpusWithContacts cId aId q o l _order = ...@@ -233,7 +233,7 @@ searchInCorpusWithContacts cId aId q o l _order =
$ offset' o $ offset' o
$ orderBy (desc _fp_score) $ orderBy (desc _fp_score)
$ selectGroup cId aId $ selectGroup cId aId
$ API.mapQuery (Term . stemIt . getTerm) q $ API.mapQuery (Term . stem EN GargPorterAlgorithm . getTerm) q
selectGroup :: HasDBid NodeType selectGroup :: HasDBid NodeType
=> CorpusId => CorpusId
......
...@@ -19,7 +19,7 @@ import Data.String (IsString(..)) ...@@ -19,7 +19,7 @@ import Data.String (IsString(..))
import Database.PostgreSQL.Simple (Query) import Database.PostgreSQL.Simple (Query)
import Database.PostgreSQL.Simple.ToField import Database.PostgreSQL.Simple.ToField
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Types.Query (Limit, Offset) import Gargantext.Core.Types.Query (Limit, Offset)
import Gargantext.Database.Prelude (DBCmd, runPGSQuery) import Gargantext.Database.Prelude (DBCmd, runPGSQuery)
...@@ -30,7 +30,7 @@ newtype TSQuery = UnsafeTSQuery [Text] ...@@ -30,7 +30,7 @@ newtype TSQuery = UnsafeTSQuery [Text]
-- | TODO [""] -> panic "error" -- | TODO [""] -> panic "error"
toTSQuery :: [Text] -> TSQuery toTSQuery :: [Text] -> TSQuery
toTSQuery txt = UnsafeTSQuery $ map stemIt txt toTSQuery txt = UnsafeTSQuery $ map (stem EN GargPorterAlgorithm) txt
instance IsString TSQuery instance IsString TSQuery
......
...@@ -34,7 +34,7 @@ import Gargantext.Database.Schema.Node (NodePoly(..)) ...@@ -34,7 +34,7 @@ import Gargantext.Database.Schema.Node (NodePoly(..))
import Test.Database.Types import Test.Database.Types
import Test.Hspec.Expectations import Test.Hspec.Expectations
import Test.Tasty.HUnit import Test.Tasty.HUnit
import Gargantext.Core.Text.Terms.Mono.Stem.En import Gargantext.Core.Text.Terms.Mono.Stem
import Gargantext.Database.Admin.Config (userMaster) import Gargantext.Database.Admin.Config (userMaster)
import qualified Data.Text as T import qualified Data.Text as T
import qualified Gargantext.Core.Text.Corpus.Query as API import qualified Gargantext.Core.Text.Corpus.Query as API
...@@ -137,8 +137,8 @@ corpusAddDocuments env = do ...@@ -137,8 +137,8 @@ corpusAddDocuments env = do
stemmingTest :: TestEnv -> Assertion stemmingTest :: TestEnv -> Assertion
stemmingTest _env = do stemmingTest _env = do
stemIt "Ajeje" `shouldBe` "Ajeje" stem EN GargPorterAlgorithm "Ajeje" `shouldBe` "Ajeje"
stemIt "PyPlasm:" `shouldBe` "PyPlasm:" stem EN GargPorterAlgorithm "PyPlasm:" `shouldBe` "PyPlasm:"
mkQ :: T.Text -> API.Query mkQ :: T.Text -> API.Query
mkQ txt = either (\e -> error $ "(query) = " <> T.unpack txt <> ": " <> e) id . API.parseQuery . API.RawQuery $ txt mkQ txt = either (\e -> error $ "(query) = " <> T.unpack txt <> ": " <> e) id . API.parseQuery . API.RawQuery $ txt
......
...@@ -5,7 +5,7 @@ import Prelude ...@@ -5,7 +5,7 @@ import Prelude
import Data.ByteString.Char8 qualified as C8 import Data.ByteString.Char8 qualified as C8
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.Core.Text.Terms.Mono.Stem.Lancaster (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster (stem)
import Gargantext.Prelude (toS) import Gargantext.Prelude (toS)
import Test.Tasty import Test.Tasty
import Test.Tasty.Golden (goldenVsStringDiff) import Test.Tasty.Golden (goldenVsStringDiff)
...@@ -128,4 +128,4 @@ testWords = [ ...@@ -128,4 +128,4 @@ testWords = [
] ]
mkTestVector :: IO BL.ByteString mkTestVector :: IO BL.ByteString
mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stemIt w)) testWords) mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stem w)) testWords)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment