
module Test.Offline.Stemming.Lancaster where

import Prelude

import Data.ByteString.Char8 qualified as C8
import Data.Text qualified as T
import Gargantext.Core.Text.Terms.Mono.Stem.Internal.Lancaster (stem)
import Gargantext.Prelude (toS)
import Test.Tasty
import Test.Tasty.Golden (goldenVsString)
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Encoding as TE


-- | Golden test for the Lancaster stemmer: the output produced by
-- 'mkTestVector' is checked against the reference file stored in the
-- repository.
tests :: TestTree
tests =
  testGroup "Lancaster"
    [ goldenVsString "test vector works" goldenFile mkTestVector
    ]
  where
    -- Reference file the generated test vector is compared against.
    goldenFile :: FilePath
    goldenFile = "test-data/stemming/lancaster.txt"

-- | List of /unstemmed/ test words, each paired with its 1-based position.
--
-- The position is written next to the stem by 'mkTestVector', so new words
-- should be appended at the end: inserting one in the middle renumbers every
-- following entry.
testWords :: [(Int, T.Text)]
testWords =
  zip
    [1 ..]
    [ "collaboration"
    , "postpartum"
    , "cat"
    , "cats"
    , "dog"
    , "dogs"
    , "run"
    , "running"
    , "runner"
    , "jump"
    , "jumped"
    , "jumping"
    , "swim"
    , "swimming"
    , "swimmer"
    , "fish"
    , "fishing"
    , "fisher"
    , "eat"
    , "eating"
    , "eater"
    , "talk"
    , "talking"
    , "talks"
    , "walk"
    , "walking"
    , "walker"
    , "dance"
    , "dancing"
    , "dancer"
    , "sing"
    , "singing"
    , "singer"
    , "play"
    , "playing"
    , "player"
    , "work"
    , "working"
    , "worker"
    , "teach"
    , "teaching"
    , "teacher"
    , "learn"
    , "learning"
    , "learner"
    , "read"
    , "reading"
    , "reader"
    , "write"
    , "writing"
    , "writer"
    , "paint"
    , "painting"
    , "painter"
    , "draw"
    , "drawing"
    , "drawer"
    , "speak"
    , "speaking"
    , "speaker"
    , "think"
    , "thinking"
    , "thinker"
    , "see"
    , "seeing"
    , "seen"
    , "hear"
    , "hearing"
    , "heard"
    , "touch"
    , "touching"
    , "touched"
    , "smell"
    , "smelling"
    , "smelled"
    , "taste"
    , "tasting"
    , "tasted"
    , "laugh"
    , "laughing"
    , "laughed"
    , "cry"
    , "crying"
    , "cried"
    , "smile"
    , "smiling"
    , "smiled"
    , "frown"
    , "frowning"
    , "frowned"
    , "happy"
    , "happier"
    , "happiest"
    , "sad"
    , "sadder"
    , "saddest"
    , "angry"
    , "angrier"
    , "angriest"
    , "calm"
    , "calmer"
    , "calmest"
    , "corroborate"
    ]

-- | Build the test vector that is compared against the golden file.
--
-- Every entry of 'testWords' becomes one line of the form @index,stem@,
-- where the stem is the UTF-8 encoded result of running 'stem' on the word.
-- The lines are joined with 'C8.unlines' and the result is converted to a
-- lazy 'BL.ByteString' with 'toS'.
mkTestVector :: IO BL.ByteString
mkTestVector = pure . toS . C8.unlines $ renderLine <$> testWords
  where
    -- One output line: the word's index, a comma, then its stem.
    renderLine :: (Int, T.Text) -> C8.ByteString
    renderLine (position, word) =
      C8.pack (show position) <> "," <> TE.encodeUtf8 (stem word)
