Commit ddfedc92 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/adinapoli/improve-half-words-stemming' into dev

parents efbe327f 052ee0a7
module Main where
import Prelude
import Data.TreeDiff.Class
import Data.TreeDiff.Pretty
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import System.Environment (getArgs)
import System.Exit (exitFailure)
import Control.Monad (unless)
import qualified Data.List as L
-- | Renders in a pretty way the content of two golden files. The
-- first file should contain the expected output, the second the
-- actual data generated by the test suite.
main :: IO ()
main = do
(refPath:newPath:_) <- getArgs
ref <- T.lines <$> TIO.readFile refPath
new <- T.lines <$> TIO.readFile newPath
let differences = filter (\(r,n) -> r /= n) $ zip ref new
unless (L.null differences) $ do
putStrLn $ show $ ansiWlEditExpr $ ediff' (map fst differences) (map snd differences)
exitFailure
......@@ -19,7 +19,7 @@ fi
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash="c9fe39301e8b60bfd183e60e7e25a14cd1c9c66d8739bf9041ca3f4db89db7c6"
expected_cabal_project_freeze_hash="db24c7d3006167102532e3101e2b49bae13d478003459c7d3f1d66590e57740a"
expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal --store-dir=$STORE_DIR v2-build --dry-run
cabal2stack --system-ghc --allow-newer --resolver lts-21.17 --resolver-file devops/stack/lts-21.17.yaml -o stack.yaml
......
......@@ -585,6 +585,8 @@ constraints: any.Cabal ==3.8.1.0,
tasty +unix,
any.tasty-bench ==0.3.5,
tasty-bench -debug +tasty,
any.tasty-golden ==2.3.5,
tasty-golden -build-example,
any.tasty-hspec ==1.2.0.3,
any.tasty-hunit ==0.10.1,
any.tasty-quickcheck ==0.10.2,
......@@ -637,6 +639,7 @@ constraints: any.Cabal ==3.8.1.0,
transformers-base +orphaninstances,
any.transformers-compat ==0.7.2,
transformers-compat -five +five-three -four +generic-deriving +mtl -three -two,
any.tree-diff ==0.3.0.1,
any.tuple ==0.3.0.2,
any.type-equality ==1,
any.typed-process ==0.2.11.1,
......
......@@ -38,6 +38,7 @@ data-files:
test-data/phylo/bpa_phylo_test.json
test-data/phylo/open_science.json
test-data/phylo/issue-290-small.golden.json
test-data/stemming/lancaster.txt
test-data/test_config.ini
gargantext-cors-settings.toml
.clippy.dhall
......@@ -157,6 +158,7 @@ library
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
......@@ -862,6 +864,7 @@ test-suite garg-test-tasty
Test.Offline.Errors
Test.Offline.JSON
Test.Offline.Phylo
Test.Offline.Stemming.Lancaster
Test.Parsers.Date
Test.Parsers.Types
Test.Parsers.WOS
......@@ -907,6 +910,7 @@ test-suite garg-test-tasty
, patches-map ^>= 0.1.0.1
, postgres-options >= 0.2 && < 0.3
, postgresql-simple >= 0.6.4 && < 0.7
, pretty
, process ^>= 1.6.13.2
, quickcheck-instances ^>= 0.3.25.2
, raw-strings-qq
......@@ -921,6 +925,7 @@ test-suite garg-test-tasty
, shelly >= 1.9 && < 2
, stm ^>= 2.5.0.1
, tasty ^>= 1.4.2.1
, tasty-golden
, tasty-hspec
, tasty-hunit
, tasty-quickcheck
......@@ -929,6 +934,7 @@ test-suite garg-test-tasty
, text ^>= 1.2.4.1
, time ^>= 1.9.3
, tmp-postgres >= 1.34.1 && < 1.35
, tree-diff
, unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1
, wai
......@@ -1059,3 +1065,16 @@ executable gargantext-phylo-profile
, vector
, directory
default-language: Haskell2010
executable garg-golden-file-diff
import:
defaults
, optimized
main-is: Main.hs
hs-source-dirs:
bin/gargantext-golden-file-diff
build-depends:
base
, text
, tree-diff
default-language: Haskell2010
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Mono.Stem.Lancaster
( stemIt
) where
import Prelude
import Data.Text (Text)
import qualified Data.Text as T
data Rule = Rule
{ _match :: Text
, _replacement :: Text
, _ruleType :: RuleType
} deriving (Show, Eq)
data RuleType
= Intact
| Continue
| Contint
| Stop
| Protect
deriving (Show, Eq)
type RuleCollection = [(Char, [Rule])]
stop, intact, cont, protect, contint :: RuleType
stop = Stop
intact = Intact
cont = Continue
protect = Protect
contint = Contint
-- Define rules
rulesPaper :: RuleCollection
rulesPaper =
[ ('a', [ Rule "ia" "" intact, Rule "a" "" intact ])
, ('b', [ Rule "bb" "b" stop ])
, ('c', [ Rule "ytic" "ys" stop, Rule "ic" "" cont, Rule "nc" "nt" cont ])
, ('d', [ Rule "dd" "d" stop, Rule "ied" "i" stop, Rule "ceed" "cess" stop, Rule "eed" "ee" stop
, Rule "ed" "" cont, Rule "hood" "" cont ])
, ('e', [ Rule "e" "" cont ])
, ('f', [ Rule "lief" "liev" stop, Rule "if" "" cont ])
, ('g', [ Rule "ing" "" cont, Rule "iag" "y" stop, Rule "ag" "" cont, Rule "gg" "g" stop ])
, ('h', [ Rule "th" "" intact, Rule "guish" "ct" stop, Rule "ish" "" cont ])
, ('i', [ Rule "i" "" intact, Rule "i" "y" cont ])
, ('j', [ Rule "ij" "id" stop, Rule "fuj" "fus" stop, Rule "uj" "ud" stop, Rule "oj" "od" stop
, Rule "hej" "her" stop, Rule "verj" "vert" stop, Rule "misj" "mit" stop, Rule "nj" "nd" stop
, Rule "j" "s" stop ])
, ('l', [ Rule "ifiabl" "" stop, Rule "iabl" "y" stop, Rule "abl" "" cont, Rule "ibl" "" stop
, Rule "bil" "bl" cont, Rule "cl" "c" stop, Rule "iful" "y" stop, Rule "ful" "" cont
, Rule "ul" "" stop, Rule "ial" "" cont, Rule "ual" "" cont, Rule "al" "" cont
, Rule "ll" "l" stop ])
, ('m', [ Rule "ium" "" stop, Rule "um" "" intact, Rule "ism" "" cont, Rule "mm" "m" stop ])
, ('n', [ Rule "sion" "j" cont, Rule "xion" "ct" stop, Rule "ion" "" cont, Rule "ian" "" cont
, Rule "an" "" cont, Rule "een" "" protect, Rule "en" "" cont, Rule "nn" "n" stop ])
, ('p', [ Rule "ship" "" cont, Rule "pp" "p" stop ])
, ('r', [ Rule "er" "" cont, Rule "ear" "" protect, Rule "ar" "" stop, Rule "or" "" cont
, Rule "ur" "" cont, Rule "rr" "r" stop, Rule "tr" "t" cont, Rule "ier" "y" cont ])
, ('s', [ Rule "ies" "y" cont, Rule "sis" "s" stop, Rule "is" "" cont, Rule "ness" "" cont
, Rule "ss" "" protect, Rule "ous" "" cont, Rule "us" "" intact, Rule "s" "" contint
, Rule "s" "" protect ])
, ('t', [ Rule "plicat" "ply" stop, Rule "at" "" cont, Rule "ment" "" cont, Rule "ent" "" cont
, Rule "ant" "" cont, Rule "ript" "rib" stop, Rule "orpt" "orb" stop, Rule "duct" "duc" stop
, Rule "sumpt" "sum" stop, Rule "cept" "ceiv" stop, Rule "olut" "olv" stop
, Rule "sist" "" protect, Rule "ist" "" cont, Rule "tt" "t" stop ])
, ('u', [ Rule "iqu" "" stop, Rule "ogu" "og" stop ])
, ('v', [ Rule "siv" "j" cont, Rule "eiv" "" protect, Rule "iv" "" cont ])
, ('y', [ Rule "bly" "bl" cont, Rule "ily" "y" cont, Rule "ply" "" protect, Rule "ly" "" cont
, Rule "ogy" "og" stop, Rule "phy" "ph" stop, Rule "omy" "om" stop, Rule "opy" "op" stop
, Rule "ity" "" cont, Rule "ety" "" cont, Rule "lty" "l" stop, Rule "istry" "" stop
, Rule "ary" "" cont, Rule "ory" "" cont, Rule "ify" "" stop, Rule "ncy" "nt" cont
, Rule "acy" "" cont ])
, ('z', [ Rule "iz" "" cont, Rule "yz" "ys" stop ])
]
-- Returns 'True' if the input character is a vowel.
isVowel :: Char -> Bool
isVowel c = c `elem` vowelsSet
{-# INLINE isVowel #-}
vowelsSet :: String
vowelsSet = "aeiouy"
{-# INLINE vowelsSet #-}
stemIt :: Text -> Text
stemIt inputText = lancasterStemmer inputText rulesPaper
-- Lancaster Stemmer
lancasterStemmer :: Text -> RuleCollection -> Text
lancasterStemmer inputText rules = applyRules (T.toLower inputText) True rules
applyRules :: Text -> Bool -> RuleCollection -> Text
applyRules value isIntact rules =
case T.unsnoc value of
Nothing -> value
Just (_, lastChar) ->
case lookup lastChar rules of
Nothing -> value
Just ruleset -> applyRuleSet value isIntact ruleset
where
applyRuleSet :: Text -> Bool -> [Rule] -> Text
applyRuleSet val _ [] = val
applyRuleSet val isIntact' (rule:rest) =
case ruleApplication value isIntact' rule of
Just res -> res
Nothing -> applyRuleSet val isIntact' rest
ruleApplication :: Text -> Bool -> Rule -> Maybe Text
ruleApplication val isIntact' (Rule m r t) =
if not isIntact' && (t == intact || t == contint)
then Nothing
else case T.stripSuffix m val of
Nothing -> Nothing
Just stem ->
let next = stem `T.append` r
in if not (acceptable next)
then Nothing
else if t == cont || t == contint
then Just $ applyRules next False rules
else Just next
-- | Returns 'True' if a stem is acceptable.
acceptable :: Text -> Bool
acceptable val
| T.null val = False
| otherwise
= if isVowel (T.head val)
then T.length val > 1
else T.length val > 2 && T.any isVowel val
......@@ -323,7 +323,7 @@ flags:
"full-text-search":
"build-search-demo": false
gargantext:
"disable-db-obfuscation-executable": false
"disable-db-obfuscation-executable": true
"no-phylo-debug-logs": false
"test-crypto": false
"generic-deriving":
......@@ -562,6 +562,8 @@ flags:
"tasty-bench":
debug: false
tasty: true
"tasty-golden":
"build-example": false
texmath:
executable: false
server: false
......
1,collab
2,postpart
3,cat
4,cat
5,dog
6,dog
7,run
8,run
9,run
10,jump
11,jump
12,jump
13,swim
14,swim
15,swim
16,fish
17,fish
18,fish
19,eat
20,eat
21,eat
22,talk
23,talk
24,talk
25,walk
26,walk
27,walk
28,dant
29,dant
30,dant
31,sing
32,sing
33,sing
34,play
35,play
36,play
37,work
38,work
39,work
40,teach
41,teach
42,teach
43,learn
44,learn
45,learn
46,read
47,read
48,read
49,writ
50,writ
51,writ
52,paint
53,paint
54,paint
55,draw
56,draw
57,draw
58,speak
59,speak
60,speak
61,think
62,think
63,think
64,see
65,see
66,seen
67,hear
68,hear
69,heard
70,touch
71,touch
72,touch
73,smel
74,smel
75,smel
76,tast
77,tast
78,tast
79,laugh
80,laugh
81,laugh
82,cry
83,cry
84,cri
85,smil
86,smil
87,smil
88,frown
89,frown
90,frown
91,happy
92,happy
93,happiest
94,sad
95,sad
96,saddest
97,angry
98,angry
99,angriest
100,calm
101,calm
102,calmest
103,corrob
module Test.Offline.Stemming.Lancaster where
import Prelude
import Data.ByteString.Char8 qualified as C8
import Data.Text qualified as T
import Gargantext.Core.Text.Terms.Mono.Stem.Lancaster (stemIt)
import Gargantext.Prelude (toS)
import Test.Tasty
import Test.Tasty.Golden (goldenVsStringDiff)
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Encoding as TE
tests :: TestTree
tests = testGroup "Lancaster" [
goldenVsStringDiff "test vector works" (\ref new -> ["cabal", "v2-run", "-v0", "garg-golden-file-diff", "--", ref, new]) "test-data/stemming/lancaster.txt" mkTestVector
]
-- | List un /unstemmed/ test words
testWords :: [(Int, T.Text)]
testWords = [
(1, "collaboration")
, (2, "postpartum")
, (3, "cat")
, (4, "cats")
, (5, "dog")
, (6, "dogs")
, (7, "run")
, (8, "running")
, (9, "runner")
, (10, "jump")
, (11, "jumped")
, (12, "jumping")
, (13, "swim")
, (14, "swimming")
, (15, "swimmer")
, (16, "fish")
, (17, "fishing")
, (18, "fisher")
, (19, "eat")
, (20, "eating")
, (21, "eater")
, (22, "talk")
, (23, "talking")
, (24, "talks")
, (25, "walk")
, (26, "walking")
, (27, "walker")
, (28, "dance")
, (29, "dancing")
, (30, "dancer")
, (31, "sing")
, (32, "singing")
, (33, "singer")
, (34, "play")
, (35, "playing")
, (36, "player")
, (37, "work")
, (38, "working")
, (39, "worker")
, (40, "teach")
, (41, "teaching")
, (42, "teacher")
, (43, "learn")
, (44, "learning")
, (45, "learner")
, (46, "read")
, (47, "reading")
, (48, "reader")
, (49, "write")
, (50, "writing")
, (51, "writer")
, (52, "paint")
, (53, "painting")
, (54, "painter")
, (55, "draw")
, (56, "drawing")
, (57, "drawer")
, (58, "speak")
, (59, "speaking")
, (60, "speaker")
, (61, "think")
, (62, "thinking")
, (63, "thinker")
, (64, "see")
, (65, "seeing")
, (66, "seen")
, (67, "hear")
, (68, "hearing")
, (69, "heard")
, (70, "touch")
, (71, "touching")
, (72, "touched")
, (73, "smell")
, (74, "smelling")
, (75, "smelled")
, (76, "taste")
, (77, "tasting")
, (78, "tasted")
, (79, "laugh")
, (80, "laughing")
, (81, "laughed")
, (82, "cry")
, (83, "crying")
, (84, "cried")
, (85, "smile")
, (86, "smiling")
, (87, "smiled")
, (88, "frown")
, (89, "frowning")
, (90, "frowned")
, (91, "happy")
, (92, "happier")
, (93, "happiest")
, (94, "sad")
, (95, "sadder")
, (96, "saddest")
, (97, "angry")
, (98, "angrier")
, (99, "angriest")
, (100, "calm")
, (101, "calmer")
, (102, "calmest")
, (103, "corroborate")
]
mkTestVector :: IO BL.ByteString
mkTestVector = pure $ toS $ C8.unlines (map (\(indx, w) -> (C8.pack $ show indx) <> "," <> TE.encodeUtf8 (stemIt w)) testWords)
......@@ -12,17 +12,18 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import Test.Tasty
import Test.Tasty.Hspec
......@@ -50,4 +51,5 @@ main = do
, JSON.tests
, Errors.tests
, Phylo.tests
, testGroup "Stemming" [ Lancaster.tests ]
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment