Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
43901901
Commit
43901901
authored
Oct 14, 2024
by
Alfredo Di Napoli
2
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix bug in GargPorter implementation
parent
d44831d0
Pipeline
#6830
failed with stages
in 45 minutes and 57 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
230 additions
and
222 deletions
+230
-222
GargPorter.hs
...rgantext/Core/Text/Terms/Mono/Stem/Internal/GargPorter.hs
+226
-221
Table.hs
test/Test/API/Private/Table.hs
+2
-1
DocumentSearch.hs
test/Test/Database/Operations/DocumentSearch.hs
+2
-0
No files found.
src/Gargantext/Core/Text/Terms/Mono/Stem/Internal/GargPorter.hs
View file @
43901901
{-|
Module : Gargantext.Core.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
module
Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter
(
stem
)
where
import
Control.Monad
import
Data.Either
import
Data.List
((
!!
))
import
Data.List
qualified
as
List
hiding
(
map
,
head
)
import
Data.Maybe
import
Data.Text
(
pack
,
unpack
)
import
Gargantext.Prelude
vowels
::
[
Char
]
vowels
=
[
'a'
,
'e'
,
'i'
,
'o'
,
'u'
]
isConsonant
::
[
Char
]
->
Int
->
Bool
isConsonant
str
i
|
c
`
elem
`
vowels
=
False
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
otherwise
=
True
where
c
=
str
!!
i
isVowel
::
[
Char
]
->
Int
->
Bool
isVowel
=
(
not
.
)
.
isConsonant
byIndex
::
Foldable
t1
=>
(
t1
a
->
[
Int
]
->
t2
)
->
t1
a
->
t2
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
containsVowel
::
[
Char
]
->
Bool
containsVowel
=
byIndex
(
any
.
isVowel
)
-- | /!\ unsafe fromJust
measure
::
[
Char
]
->
Int
measure
=
length
.
filter
not
.
List
.
init
.
(
True
:
)
.
map
fromJust
.
map
head
.
List
.
group
.
byIndex
(
map
.
isConsonant
)
endsWithDouble
::
[
Char
]
->
Bool
endsWithDouble
=
startsWithDouble
.
reverse
where
startsWithDouble
l
=
case
l
of
(
x
:
y
:
_
)
->
x
==
y
&&
x
`
List
.
notElem
`
vowels
_
->
False
cvc
::
[
Char
]
->
Bool
cvc
word
|
length
word
<
3
=
False
|
otherwise
=
isConsonant
word
lastIndex
&&
isVowel
word
(
lastIndex
-
1
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
List
.
last
word
`
List
.
notElem
`
[
'w'
,
'x'
,
'y'
]
where
lastIndex
=
length
word
-
1
statefulReplace
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
(
Data
.
Either
.
Either
[
a
]
[
a
])
statefulReplace
predicate
str
end
replacement
|
end
`
List
.
isSuffixOf
`
str
=
Just
replaced
|
otherwise
=
Nothing
where
part
=
take
(
length
str
-
length
end
)
str
replaced
|
predicate
part
=
Right
(
part
<>
replacement
)
|
otherwise
=
Left
str
replaceEnd
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
[
a
]
replaceEnd
predicate
str
end
replacement
=
do
result
<-
statefulReplace
predicate
str
end
replacement
pure
(
either
identity
identity
result
)
findStem
::
(
Foldable
t
,
Functor
t
,
Eq
a
)
=>
([
a
]
->
Bool
)
->
[
a
]
->
t
([
a
],
[
a
])
->
Maybe
[
a
]
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
measureGT
::
Int
->
[
Char
]
->
Bool
measureGT
=
flip
((
>
)
.
measure
)
step1a
::
[
Char
]
->
[
Char
]
step1a
word
=
fromMaybe
word
result
where
result
=
findStem
(
const
True
)
word
suffixes
suffixes
=
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
beforeStep1b
::
[
Char
]
->
Either
[
Char
]
[
Char
]
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
where
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
cond1
x
=
do
{
v
<-
x
;
pure
(
Left
v
)
}
result
=
cond1
(
replaceEnd
(
measureGT
0
)
word
"eed"
"ee"
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
afterStep1b
::
[
Char
]
->
[
Char
]
afterStep1b
word
=
fromMaybe
word
result
where
double
=
endsWithDouble
word
&&
not
(
any
((`
List
.
isSuffixOf
`
word
)
.
return
)
[
'l'
,
's'
,
'z'
])
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
`
mplus
`
iif
double
(
List
.
init
word
)
`
mplus
`
iif
mEq1AndCvc
(
word
<>
"e"
)
step1b
::
[
Char
]
->
[
Char
]
step1b
=
either
identity
afterStep1b
.
beforeStep1b
step1c
::
[
Char
]
->
[
Char
]
step1c
word
=
fromMaybe
word
result
where
result
=
replaceEnd
containsVowel
word
"y"
"i"
step1
::
[
Char
]
->
[
Char
]
step1
=
step1c
.
step1b
.
step1a
step2
::
[
Char
]
->
[
Char
]
step2
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
[
(
"ational"
,
"ate"
)
,
(
"tional"
,
"tion"
)
,
(
"enci"
,
"ence"
)
,
(
"anci"
,
"ance"
)
,
(
"izer"
,
"ize"
)
,
(
"bli"
,
"ble"
)
,
(
"alli"
,
"al"
)
,
(
"entli"
,
"ent"
)
,
(
"eli"
,
"e"
)
,
(
"ousli"
,
"ous"
)
,
(
"ization"
,
"ize"
)
,
(
"ation"
,
"ate"
)
,
(
"ator"
,
"ate"
)
,
(
"alism"
,
"al"
)
,
(
"iveness"
,
"ive"
)
,
(
"fulness"
,
"ful"
)
,
(
"ousness"
,
"ous"
)
,
(
"aliti"
,
"al"
)
,
(
"iviti"
,
"ive"
)
,
(
"biliti"
,
"ble"
)
,
(
"logi"
,
"log"
)
]
step3
::
[
Char
]
->
[
Char
]
step3
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
[
(
"icate"
,
"ic"
)
,
(
"ative"
,
""
)
,
(
"alize"
,
"al"
)
,
(
"iciti"
,
"ic"
)
,
(
"ical"
,
"ic"
)
,
(
"ful"
,
""
)
,
(
"ness"
,
""
)
]
step4
::
[
Char
]
->
[
Char
]
step4
word
=
fromMaybe
word
result
where
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
List
.
isSuffixOf
`
str
)
.
return
)
[
's'
,
't'
]
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
step5a
::
[
Char
]
->
[
Char
]
step5a
word
=
fromMaybe
word
result
where
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
result
=
replaceEnd
test
word
"e"
""
step5b
::
[
Char
]
->
[
Char
]
step5b
word
=
fromMaybe
word
result
where
cond
s
=
List
.
last
s
==
'l'
&&
measureGT
1
s
result
=
replaceEnd
cond
word
"l"
""
step5
::
[
Char
]
->
[
Char
]
step5
=
step5b
.
step5a
allSteps
::
[
Char
]
->
[
Char
]
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
stem
::
Text
->
Text
stem
s
=
pack
(
stem'
$
unpack
s
)
stem'
::
[
Char
]
->
[
Char
]
stem'
s
|
length
s
<
3
=
s
|
otherwise
=
allSteps
s
--fixpoint :: Eq t => (t -> t) -> t -> t
--fixpoint f x = let fx = f x in
-- if fx == x
-- then x
-- else fixpoint f fx
--
--fixstem :: [Char] -> [Char]
--fixstem = fixpoint stem'
{-
main :: IO ()
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content
-}
{-|
Module : Gargantext.Core.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
module
Gargantext.Core.Text.Terms.Mono.Stem.Internal.GargPorter
(
stem
)
where
import
Control.Monad
import
Data.Either
import
Data.List
((
!!
))
import
Data.List
qualified
as
List
hiding
(
map
,
head
)
import
Data.Maybe
import
Data.Text
(
pack
,
unpack
)
import
Gargantext.Prelude
vowels
::
[
Char
]
vowels
=
[
'a'
,
'e'
,
'i'
,
'o'
,
'u'
]
isConsonant
::
[
Char
]
->
Int
->
Bool
isConsonant
str
i
|
c
`
elem
`
vowels
=
False
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
otherwise
=
True
where
c
=
str
!!
i
isVowel
::
[
Char
]
->
Int
->
Bool
isVowel
=
(
not
.
)
.
isConsonant
byIndex
::
Foldable
t1
=>
(
t1
a
->
[
Int
]
->
t2
)
->
t1
a
->
t2
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
containsVowel
::
[
Char
]
->
Bool
containsVowel
=
byIndex
(
any
.
isVowel
)
-- | /!\ unsafe fromJust
measure
::
[
Char
]
->
Int
measure
=
length
.
filter
not
.
List
.
init
.
(
True
:
)
.
map
fromJust
.
map
head
.
List
.
group
.
byIndex
(
map
.
isConsonant
)
endsWithDouble
::
[
Char
]
->
Bool
endsWithDouble
=
startsWithDouble
.
reverse
where
startsWithDouble
l
=
case
l
of
(
x
:
y
:
_
)
->
x
==
y
&&
x
`
List
.
notElem
`
vowels
_
->
False
cvc
::
[
Char
]
->
Bool
cvc
word
|
length
word
<
3
=
False
|
otherwise
=
isConsonant
word
lastIndex
&&
isVowel
word
(
lastIndex
-
1
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
List
.
last
word
`
List
.
notElem
`
[
'w'
,
'x'
,
'y'
]
where
lastIndex
=
length
word
-
1
statefulReplace
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
(
Data
.
Either
.
Either
[
a
]
[
a
])
statefulReplace
predicate
str
end
replacement
|
end
`
List
.
isSuffixOf
`
str
=
Just
replaced
|
otherwise
=
Nothing
where
part
=
take
(
length
str
-
length
end
)
str
replaced
|
predicate
part
=
Right
(
part
<>
replacement
)
|
otherwise
=
Left
str
replaceEnd
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
[
a
]
replaceEnd
predicate
str
end
replacement
=
do
result
<-
statefulReplace
predicate
str
end
replacement
pure
(
either
identity
identity
result
)
findStem
::
(
Foldable
t
,
Functor
t
,
Eq
a
)
=>
([
a
]
->
Bool
)
->
[
a
]
->
t
([
a
],
[
a
])
->
Maybe
[
a
]
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
measureGT
::
Int
->
[
Char
]
->
Bool
measureGT
=
flip
((
>
)
.
measure
)
step1a
::
[
Char
]
->
[
Char
]
step1a
word
=
fromMaybe
word
result
where
result
=
findStem
(
const
True
)
word
suffixes
suffixes
=
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
beforeStep1b
::
[
Char
]
->
Either
[
Char
]
[
Char
]
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
where
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
cond1
x
=
do
{
v
<-
x
;
pure
(
Left
v
)
}
result
=
cond1
(
replaceEnd
(
measureGT
0
)
word
"eed"
"ee"
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
afterStep1b
::
[
Char
]
->
[
Char
]
afterStep1b
word
=
fromMaybe
word
result
where
double
=
endsWithDouble
word
&&
not
(
any
((`
List
.
isSuffixOf
`
word
)
.
return
)
[
'l'
,
's'
,
'z'
])
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
`
mplus
`
iif
double
(
List
.
init
word
)
`
mplus
`
iif
mEq1AndCvc
(
word
<>
"e"
)
step1b
::
[
Char
]
->
[
Char
]
step1b
=
either
identity
afterStep1b
.
beforeStep1b
-- Issue #415: According to the Porter stemming rules, we need to replace `y` with `i` only if there
-- are no other vocals at the end.
step1c
::
[
Char
]
->
[
Char
]
step1c
word
|
length
word
>
2
&&
List
.
last
word
==
'y'
&&
isConsonant
word
(
List
.
length
word
-
2
)
=
fromMaybe
word
$
replaceEnd
containsVowel
word
"y"
"i"
|
otherwise
=
word
step1
::
[
Char
]
->
[
Char
]
step1
=
step1c
.
step1b
.
step1a
step2
::
[
Char
]
->
[
Char
]
step2
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
[
(
"ational"
,
"ate"
)
,
(
"tional"
,
"tion"
)
,
(
"enci"
,
"ence"
)
,
(
"anci"
,
"ance"
)
,
(
"izer"
,
"ize"
)
,
(
"bli"
,
"ble"
)
,
(
"alli"
,
"al"
)
,
(
"entli"
,
"ent"
)
,
(
"eli"
,
"e"
)
,
(
"ousli"
,
"ous"
)
,
(
"ization"
,
"ize"
)
,
(
"ation"
,
"ate"
)
,
(
"ator"
,
"ate"
)
,
(
"alism"
,
"al"
)
,
(
"iveness"
,
"ive"
)
,
(
"fulness"
,
"ful"
)
,
(
"ousness"
,
"ous"
)
,
(
"aliti"
,
"al"
)
,
(
"iviti"
,
"ive"
)
,
(
"biliti"
,
"ble"
)
,
(
"logi"
,
"log"
)
]
step3
::
[
Char
]
->
[
Char
]
step3
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
[
(
"icate"
,
"ic"
)
,
(
"ative"
,
""
)
,
(
"alize"
,
"al"
)
,
(
"iciti"
,
"ic"
)
,
(
"ical"
,
"ic"
)
,
(
"ful"
,
""
)
,
(
"ness"
,
""
)
]
step4
::
[
Char
]
->
[
Char
]
step4
word
=
fromMaybe
word
result
where
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
List
.
isSuffixOf
`
str
)
.
return
)
[
's'
,
't'
]
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
step5a
::
[
Char
]
->
[
Char
]
step5a
word
=
fromMaybe
word
result
where
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
result
=
replaceEnd
test
word
"e"
""
step5b
::
[
Char
]
->
[
Char
]
step5b
word
=
fromMaybe
word
result
where
cond
s
=
List
.
last
s
==
'l'
&&
measureGT
1
s
result
=
replaceEnd
cond
word
"l"
""
step5
::
[
Char
]
->
[
Char
]
step5
=
step5b
.
step5a
allSteps
::
[
Char
]
->
[
Char
]
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
stem
::
Text
->
Text
stem
s
=
pack
(
stem'
$
unpack
s
)
stem'
::
[
Char
]
->
[
Char
]
stem'
s
|
length
s
<
3
=
s
|
otherwise
=
allSteps
s
--fixpoint :: Eq t => (t -> t) -> t -> t
--fixpoint f x = let fx = f x in
-- if fx == x
-- then x
-- else fixpoint f fx
--
--fixstem :: [Char] -> [Char]
--fixstem = fixpoint stem'
{-
main :: IO ()
main = do
content <- readFile "input.txt"
writeFile "output.txt" $ unlines $ map stem $ lines content
-}
test/Test/API/Private/Table.hs
View file @
43901901
...
...
@@ -52,6 +52,7 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
let
corpusId
=
_sctx_data
withApplication
_sctx_app
$
do
withValidLogin
_sctx_port
"alice"
(
GargPassword
"alice"
)
$
\
clientEnv
token
->
do
liftIO
$
do
(
HashedResponse
_
tr1
)
<-
checkEither
$
runClientM
(
get_table
token
...
...
@@ -63,8 +64,8 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
(
Just
$
RawQuery
"soy"
)
Nothing
)
clientEnv
length
(
tr_docs
tr1
)
`
shouldBe
`
4
length
(
tr_docs
tr1
)
`
shouldBe
`
3
createSoySauceCorpus
::
SpecContext
()
->
IO
(
SpecContext
CorpusId
)
createSoySauceCorpus
ctx
@
SpecContext
{
..
}
=
do
...
...
test/Test/Database/Operations/DocumentSearch.hs
View file @
43901901
...
...
@@ -135,6 +135,8 @@ stemmingTest :: TestEnv -> Assertion
stemmingTest
_env
=
do
stem
EN
GargPorterAlgorithm
"Ajeje"
`
shouldBe
`
"Ajeje"
stem
EN
GargPorterAlgorithm
"PyPlasm:"
`
shouldBe
`
"PyPlasm:"
stem
EN
GargPorterAlgorithm
"soy"
`
shouldBe
`
"soy"
stem
EN
GargPorterAlgorithm
"cry"
`
shouldBe
`
"cri"
-- This test outlines the main differences between Porter and Lancaster.
stem
EN
GargPorterAlgorithm
"dancer"
`
shouldBe
`
"dancer"
stem
EN
LancasterAlgorithm
"dancer"
`
shouldBe
`
"dant"
...
...
Przemyslaw Kaminski
@cgenie
mentioned in commit
84a773b4
·
Oct 16, 2024
mentioned in commit
84a773b4
mentioned in commit 84a773b4b001adde02d6543c0ed7d16acd289079
Toggle commit list
Przemyslaw Kaminski
@cgenie
mentioned in commit
83c8708f
·
Nov 07, 2024
mentioned in commit
83c8708f
mentioned in commit 83c8708f08e563243a0ff361e51a46c7d7822bb7
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment