Commit 77fe2ea6 authored by Alfredo Di Napoli

Test vectors and tests for Lancaster stemming

parent 396fbd52
Pipeline #5699 failed with stages
in 27 minutes and 10 seconds
module Main where
import Prelude
import Data.TreeDiff.Class
import Data.TreeDiff.Pretty
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import System.Environment (getArgs)
import System.Exit (exitFailure)
import Control.Monad (unless)
import qualified Data.List as L
-- | Renders in a pretty way the differences between two golden files.
-- The first file should contain the expected output, the second the
-- actual data generated by the test suite.
--
-- Usage: @garg-golden-file-diff EXPECTED ACTUAL@
--
-- The two files are compared line by line. When every line matches (and
-- both files have the same number of lines) the program terminates
-- normally; otherwise the differing lines are printed and the program
-- exits with a failure code.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (refPath:newPath:_) -> do
      ref <- T.lines <$> TIO.readFile refPath
      new <- T.lines <$> TIO.readFile newPath
      -- 'alignLines' (unlike 'zip') keeps the surplus lines of the longer
      -- file, so a file that merely gained or lost trailing lines is
      -- still reported as different.
      let differences = filter (uncurry (/=)) $ alignLines ref new
      unless (L.null differences) $ do
        putStrLn $ show $ ansiWlEditExpr $
          ediff' (map (render . fst) differences) (map (render . snd) differences)
        exitFailure
    -- Fewer than two arguments: fail loudly instead of relying on a
    -- pattern-match failure.
    _ -> ioError $ userError "usage: garg-golden-file-diff <expected-file> <actual-file>"
  where
    -- A line that exists in only one of the two files is displayed with
    -- a placeholder on the other side.
    render = maybe (T.pack "<missing line>") id

-- | Pairs up the elements of two lists position by position. Where one
-- list is longer than the other, the missing side is 'Nothing', so no
-- element of either input is ever dropped.
alignLines :: [a] -> [a] -> [(Maybe a, Maybe a)]
alignLines (x:xs) (y:ys) = (Just x, Just y) : alignLines xs ys
alignLines xs     []     = map (\x -> (Just x, Nothing)) xs
alignLines []     ys     = map (\y -> (Nothing, Just y)) ys
......@@ -38,6 +38,7 @@ data-files:
test-data/phylo/bpa_phylo_test.json
test-data/phylo/open_science.json
test-data/phylo/issue-290-small.golden.json
test-data/stemming/lancaster.txt
test-data/test_config.ini
gargantext-cors-settings.toml
.clippy.dhall
......@@ -862,6 +863,7 @@ test-suite garg-test-tasty
Test.Offline.Errors
Test.Offline.JSON
Test.Offline.Phylo
Test.Offline.Stemming.Lancaster
Test.Parsers.Date
Test.Parsers.Types
Test.Parsers.WOS
......@@ -907,6 +909,7 @@ test-suite garg-test-tasty
, patches-map ^>= 0.1.0.1
, postgres-options >= 0.2 && < 0.3
, postgresql-simple >= 0.6.4 && < 0.7
, pretty
, process ^>= 1.6.13.2
, quickcheck-instances ^>= 0.3.25.2
, raw-strings-qq
......@@ -921,6 +924,7 @@ test-suite garg-test-tasty
, shelly >= 1.9 && < 2
, stm ^>= 2.5.0.1
, tasty ^>= 1.4.2.1
, tasty-golden
, tasty-hspec
, tasty-hunit
, tasty-quickcheck
......@@ -929,6 +933,7 @@ test-suite garg-test-tasty
, text ^>= 1.2.4.1
, time ^>= 1.9.3
, tmp-postgres >= 1.34.1 && < 1.35
, tree-diff
, unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1
, wai
......@@ -1059,3 +1064,16 @@ executable gargantext-phylo-profile
, vector
, directory
default-language: Haskell2010
-- Helper executable that pretty-prints the differences between an
-- expected and an actual golden file; the Lancaster stemming golden
-- test invokes it by name through `cabal v2-run`.
executable garg-golden-file-diff
import:
defaults
, optimized
main-is: Main.hs
hs-source-dirs:
bin/gargantext-golden-file-diff
build-depends:
base
, text
, tree-diff
default-language: Haskell2010
......@@ -38,7 +38,7 @@ rulesPaper =
[ ('a', [ Rule "ia" "" intact, Rule "a" "" intact ])
, ('b', [ Rule "bb" "b" stop ])
, ('c', [ Rule "ytic" "ys" stop, Rule "ic" "" cont, Rule "nc" "nt" cont ])
, ('d', [ Rule "dd" "d" stop, Rule "ied" "y" cont, Rule "ceed" "cess" stop, Rule "eed" "ee" stop
, ('d', [ Rule "dd" "d" stop, Rule "ied" "i" stop, Rule "ceed" "cess" stop, Rule "eed" "ee" stop
, Rule "ed" "" cont, Rule "hood" "" cont ])
, ('e', [ Rule "e" "" cont ])
, ('f', [ Rule "lief" "liev" stop, Rule "if" "" cont ])
......@@ -121,7 +121,7 @@ applyRules value isIntact rules =
then Just $ applyRules next False rules
else Just next
-- | A stem is acceptable if
-- | Returns 'True' if a stem is acceptable.
acceptable :: Text -> Bool
acceptable val
| T.null val = False
......
1,collab
2,postpart
3,cat
4,cat
5,dog
6,dog
7,run
8,run
9,run
10,jump
11,jump
12,jump
13,swim
14,swim
15,swim
16,fish
17,fish
18,fish
19,eat
20,eat
21,eat
22,talk
23,talk
24,talk
25,walk
26,walk
27,walk
28,dant
29,dant
30,dant
31,sing
32,sing
33,sing
34,play
35,play
36,play
37,work
38,work
39,work
40,teach
41,teach
42,teach
43,learn
44,learn
45,learn
46,read
47,read
48,read
49,writ
50,writ
51,writ
52,paint
53,paint
54,paint
55,draw
56,draw
57,draw
58,speak
59,speak
60,speak
61,think
62,think
63,think
64,see
65,see
66,seen
67,hear
68,hear
69,heard
70,touch
71,touch
72,touch
73,smel
74,smel
75,smel
76,tast
77,tast
78,tast
79,laugh
80,laugh
81,laugh
82,cry
83,cry
84,cri
85,smil
86,smil
87,smil
88,frown
89,frown
90,frown
91,happy
92,happy
93,happiest
94,sad
95,sad
96,saddest
97,angry
98,angry
99,angriest
100,calm
101,calm
102,calmest
103,corrob
module Test.Offline.Stemming.Lancaster where
import Prelude
import Data.ByteString.Char8 qualified as C8
import Data.Text qualified as T
import Gargantext.Core.Text.Terms.Mono.Stem.Lancaster (stemIt)
import Gargantext.Prelude (toS)
import Test.Tasty
import Test.Tasty.Golden (goldenVsStringDiff)
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Encoding as TE
-- | Golden tests for the Lancaster stemmer: the output of 'mkTestVector'
-- is checked against the reference file
-- @test-data/stemming/lancaster.txt@.
tests :: TestTree
tests = testGroup "Lancaster" [ vectorTest ]
  where
    vectorTest :: TestTree
    vectorTest =
      goldenVsStringDiff
        "test vector works"
        diffCommand
        "test-data/stemming/lancaster.txt"
        mkTestVector

    -- Command line handed to 'goldenVsStringDiff', built from the two
    -- file paths it supplies: runs the @garg-golden-file-diff@
    -- executable via cabal on those paths.
    diffCommand :: FilePath -> FilePath -> [String]
    diffCommand expected actual =
      [ "cabal", "v2-run", "-v0", "garg-golden-file-diff", "--", expected, actual ]
-- | List of /unstemmed/ test words, each paired with its 1-based
-- position in the list. The position is what 'mkTestVector' prints in
-- front of the stem on each output line.
testWords :: [(Int, T.Text)]
testWords = zip [1 ..]
  [ "collaboration", "postpartum"      -- 1-2
  , "cat", "cats"                      -- 3-4
  , "dog", "dogs"                      -- 5-6
  , "run", "running", "runner"         -- 7-9
  , "jump", "jumped", "jumping"        -- 10-12
  , "swim", "swimming", "swimmer"      -- 13-15
  , "fish", "fishing", "fisher"        -- 16-18
  , "eat", "eating", "eater"           -- 19-21
  , "talk", "talking", "talks"         -- 22-24
  , "walk", "walking", "walker"        -- 25-27
  , "dance", "dancing", "dancer"       -- 28-30
  , "sing", "singing", "singer"        -- 31-33
  , "play", "playing", "player"        -- 34-36
  , "work", "working", "worker"        -- 37-39
  , "teach", "teaching", "teacher"     -- 40-42
  , "learn", "learning", "learner"     -- 43-45
  , "read", "reading", "reader"        -- 46-48
  , "write", "writing", "writer"       -- 49-51
  , "paint", "painting", "painter"     -- 52-54
  , "draw", "drawing", "drawer"        -- 55-57
  , "speak", "speaking", "speaker"     -- 58-60
  , "think", "thinking", "thinker"     -- 61-63
  , "see", "seeing", "seen"            -- 64-66
  , "hear", "hearing", "heard"         -- 67-69
  , "touch", "touching", "touched"     -- 70-72
  , "smell", "smelling", "smelled"     -- 73-75
  , "taste", "tasting", "tasted"       -- 76-78
  , "laugh", "laughing", "laughed"     -- 79-81
  , "cry", "crying", "cried"           -- 82-84
  , "smile", "smiling", "smiled"       -- 85-87
  , "frown", "frowning", "frowned"     -- 88-90
  , "happy", "happier", "happiest"     -- 91-93
  , "sad", "sadder", "saddest"         -- 94-96
  , "angry", "angrier", "angriest"     -- 97-99
  , "calm", "calmer", "calmest"        -- 100-102
  , "corroborate"                      -- 103
  ]
-- | Builds the actual test vector: one @index,stem@ line per entry of
-- 'testWords', where the stem is the result of applying 'stemIt' to the
-- word.
mkTestVector :: IO BL.ByteString
mkTestVector = pure . toS . C8.unlines $ map renderEntry testWords
  where
    -- Renders a single entry as @<index>,<stemmed word>@, with the stem
    -- UTF-8 encoded.
    renderEntry :: (Int, T.Text) -> C8.ByteString
    renderEntry (indx, w) = C8.pack (show indx) <> "," <> TE.encodeUtf8 (stemIt w)
......@@ -12,17 +12,18 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import Test.Tasty
import Test.Tasty.Hspec
......@@ -50,4 +51,5 @@ main = do
, JSON.tests
, Errors.tests
, Phylo.tests
, testGroup "Stemming" [ Lancaster.tests ]
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment