Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
145
Issues
145
List
Board
Labels
Milestones
Merge Requests
6
Merge Requests
6
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
9ef8d007
Commit
9ef8d007
authored
May 02, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT/STEM] implementing Porter lib into Gargantext for English language.
parent
dbff4c96
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
135 additions
and
26 deletions
+135
-26
package.yaml
package.yaml
+1
-0
Ngrams.hs
src/Gargantext/Ngrams.hs
+43
-5
CoreNLP.hs
src/Gargantext/Ngrams/CoreNLP.hs
+1
-1
En.hs
src/Gargantext/Ngrams/Stem/En.hs
+88
-19
Prelude.hs
src/Gargantext/Prelude.hs
+2
-1
No files found.
package.yaml
View file @
9ef8d007
...
...
@@ -40,6 +40,7 @@ library:
-
Gargantext.Ngrams.CoreNLP
-
Gargantext.Ngrams.Parser
-
Gargantext.Ngrams.Lang.En
-
Gargantext.Ngrams.Stem.En
-
Gargantext.Ngrams.Lang.Fr
-
Gargantext.Ngrams.Metrics
-
Gargantext.Ngrams.TextMining
...
...
src/Gargantext/Ngrams.hs
View file @
9ef8d007
...
...
@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
Metrics
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
ListName
(
..
),
equivNgrams
,
isGram
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
clean
,
ListName
(
..
),
equivNgrams
,
isGram
,
sentences
,
ngramsTest
--, module Gargantext.Ngrams.Words
)
where
...
...
@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import
Data.List
(
sort
)
import
Data.Char
(
Char
,
isAlphaNum
,
isSpace
)
import
Data.Text
(
Text
,
words
,
filter
,
toLower
)
import
Data.Text
(
Text
,
filter
,
toLower
,
split
,
lines
,
concat
)
import
qualified
Data.Text
as
DT
import
Data.Text.IO
(
readFile
)
import
Data.Map.Strict
(
Map
,
empty
,
insertWith
,
unionWith
...
...
@@ -80,11 +84,19 @@ type Occ = Int
ngrams
::
Text
->
[
Text
]
ngrams
xs
=
monograms
$
toLower
$
filter
isGram
xs
clean
::
Text
->
Text
clean
txt
=
DT
.
map
clean'
txt
where
clean'
'’'
=
'
\'
'
clean'
c
=
c
monograms
::
Text
->
[
Text
]
monograms
=
words
monograms
txt
=
split
isWord
txt
where
isWord
c
=
c
`
elem
`
[
' '
,
'
\'
'
,
','
,
';'
]
isGram
::
Char
->
Bool
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
]
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
,
'
\'
'
]
-- | Compute the occurrences (occ)
occ
::
Ord
a
=>
[
a
]
->
Map
a
Occ
...
...
@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences
::
Text
->
[
Text
]
sentences
txt
=
split
isStop
txt
isStop
::
Char
->
Bool
isStop
c
=
c
`
elem
`
[
'.'
,
'?'
,
'!'
]
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest
=
ws
where
txt
=
concat
<$>
lines
<$>
clean
<$>
readFile
"Giono-arbres.txt"
-- | Number of sentences
ls
=
sentences
<$>
txt
-- | Number of monograms used in the full text
ws
=
ngrams
<$>
txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs
=
occ
<$>
ws
src/Gargantext/Ngrams/CoreNLP.hs
View file @
9ef8d007
src/Gargantext/Ngrams/Stem/En.hs
View file @
9ef8d007
module
Language.Porter
(
stem
,
fixstem
)
where
{-|
Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Ngrams.Stem.En
where
import
Control.Monad
import
Data.Either
import
Data.Maybe
import
Data.List
import
Data.Text
(
Text
(),
pack
,
unpack
)
import
Data.List
hiding
(
map
,
head
)
import
Gargantext.Prelude
vowels
::
[
Char
]
vowels
=
[
'a'
,
'e'
,
'i'
,
'o'
,
'u'
]
isConsonant
::
[
Char
]
->
Int
->
Bool
isConsonant
str
i
|
c
`
elem
`
"aeiou"
=
False
|
c
`
elem
`
vowels
=
False
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
otherwise
=
True
where
c
=
str
!!
i
isVowel
::
[
Char
]
->
Int
->
Bool
isVowel
=
(
not
.
)
.
isConsonant
byIndex
::
Foldable
t1
=>
(
t1
a
->
[
Int
]
->
t2
)
->
t1
a
->
t2
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
containsVowel
::
[
Char
]
->
Bool
containsVowel
=
byIndex
(
any
.
isVowel
)
-- | /!\ unsafe fromJust
measure
::
[
Char
]
->
Int
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
fromJust
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
endsWithDouble
::
[
Char
]
->
Bool
endsWithDouble
=
startsWithDouble
.
reverse
where
startsWithDouble
l
|
length
l
<
2
=
False
|
otherwise
=
let
(
x
:
y
:
_
)
=
l
in
x
==
y
&&
x
`
notElem
`
"aeiou"
startsWithDouble
l
=
case
l
of
(
x
:
y
:
_
)
->
x
==
y
&&
x
`
notElem
`
vowels
_
->
False
cvc
::
[
Char
]
->
Bool
cvc
word
|
length
word
<
3
=
False
|
otherwise
=
isConsonant
word
lastIndex
&&
isVowel
word
(
lastIndex
-
1
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
last
word
`
notElem
`
"wxy"
last
word
`
notElem
`
[
'w'
,
'x'
,
'y'
]
where
lastIndex
=
length
word
-
1
statefulReplace
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
(
Data
.
Either
.
Either
[
a
]
[
a
])
statefulReplace
predicate
str
end
replacement
|
end
`
isSuffixOf
`
str
=
Just
replaced
|
otherwise
=
Nothing
...
...
@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
replaced
|
predicate
part
=
Right
(
part
++
replacement
)
|
otherwise
=
Left
str
replaceEnd
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
[
a
]
replaceEnd
predicate
str
end
replacement
=
do
result
<-
statefulReplace
predicate
str
end
replacement
return
(
either
id
id
result
)
return
(
either
id
entity
identity
result
)
findStem
::
(
Foldable
t
,
Functor
t
,
Eq
a
)
=>
([
a
]
->
Bool
)
->
[
a
]
->
t
([
a
],
[
a
])
->
Maybe
[
a
]
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
measureGT
::
Int
->
[
Char
]
->
Bool
measureGT
=
flip
((
>
)
.
measure
)
step1a
::
[
Char
]
->
[
Char
]
step1a
word
=
fromMaybe
word
result
where
result
=
findStem
(
const
True
)
word
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
where
result
=
findStem
(
const
True
)
word
suffixes
suffixes
=
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
beforeStep1b
::
[
Char
]
->
Either
[
Char
]
[
Char
]
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
where
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
...
...
@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
afterStep1b
::
[
Char
]
->
[
Char
]
afterStep1b
word
=
fromMaybe
word
result
where
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
"lsz"
)
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
[
'l'
,
's'
,
'z'
]
)
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
`
mplus
`
iif
double
(
init
word
)
`
mplus
`
iif
mEq1AndCvc
(
word
++
"e"
)
step1b
=
either
id
afterStep1b
.
beforeStep1b
step1b
::
[
Char
]
->
[
Char
]
step1b
=
either
identity
afterStep1b
.
beforeStep1b
step1c
::
[
Char
]
->
[
Char
]
step1c
word
=
fromMaybe
word
result
where
result
=
replaceEnd
containsVowel
word
"y"
"i"
step1
::
[
Char
]
->
[
Char
]
step1
=
step1c
.
step1b
.
step1a
step2
::
[
Char
]
->
[
Char
]
step2
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
...
...
@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
,
(
"biliti"
,
"ble"
)
,
(
"logi"
,
"log"
)
]
step3
::
[
Char
]
->
[
Char
]
step3
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
...
...
@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
,
(
"ful"
,
""
)
,
(
"ness"
,
""
)
]
step4
::
[
Char
]
->
[
Char
]
step4
word
=
fromMaybe
word
result
where
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
"st"
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
[
's'
,
't'
]
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
step5a
::
[
Char
]
->
[
Char
]
step5a
word
=
fromMaybe
word
result
where
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
result
=
replaceEnd
test
word
"e"
""
step5b
::
[
Char
]
->
[
Char
]
step5b
word
=
fromMaybe
word
result
where
cond
s
=
last
s
==
'l'
&&
measureGT
1
s
result
=
replaceEnd
cond
word
"l"
""
step5
::
[
Char
]
->
[
Char
]
step5
=
step5b
.
step5a
allSteps
::
[
Char
]
->
[
Char
]
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
stem
s
|
length
s
<
3
=
s
stem
::
Text
->
Text
stem
s
=
pack
(
stem'
$
unpack
s
)
stem'
::
[
Char
]
->
[
Char
]
stem'
s
|
length
s
<
3
=
s
|
otherwise
=
allSteps
s
fixpoint
::
Eq
t
=>
(
t
->
t
)
->
t
->
t
fixpoint
f
x
=
let
fx
=
f
x
in
if
fx
==
x
then
x
else
fixpoint
f
fx
fixstem
=
fixpoint
stem
fixstem
::
[
Char
]
->
[
Char
]
fixstem
=
fixpoint
stem'
{-
...
...
src/Gargantext/Prelude.hs
View file @
9ef8d007
...
...
@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
(
&&
),
(
||
),
not
,
fst
,
snd
,
toS
,
elem
,
die
,
mod
,
div
,
elem
,
die
,
mod
,
div
,
const
,
curry
,
uncurry
,
otherwise
)
-- TODO import functions optimized in Utils.Count
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment