Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
7ff65f23
Commit
7ff65f23
authored
May 02, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT/STEM] implemenging Porter lib into Gargantext for English language.
parent
ba5c75fb
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
135 additions
and
26 deletions
+135
-26
package.yaml
package.yaml
+1
-0
Ngrams.hs
src/Gargantext/Ngrams.hs
+43
-5
CoreNLP.hs
src/Gargantext/Ngrams/CoreNLP.hs
+1
-1
En.hs
src/Gargantext/Ngrams/Stem/En.hs
+88
-19
Prelude.hs
src/Gargantext/Prelude.hs
+2
-1
No files found.
package.yaml
View file @
7ff65f23
...
...
@@ -40,6 +40,7 @@ library:
-
Gargantext.Ngrams.CoreNLP
-
Gargantext.Ngrams.Parser
-
Gargantext.Ngrams.Lang.En
-
Gargantext.Ngrams.Stem.En
-
Gargantext.Ngrams.Lang.Fr
-
Gargantext.Ngrams.Metrics
-
Gargantext.Ngrams.TextMining
...
...
src/Gargantext/Ngrams.hs
View file @
7ff65f23
...
...
@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
Metrics
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
ListName
(
..
),
equivNgrams
,
isGram
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
clean
,
ListName
(
..
),
equivNgrams
,
isGram
,
sentences
,
ngramsTest
--, module Gargantext.Ngrams.Words
)
where
...
...
@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import
Data.List
(
sort
)
import
Data.Char
(
Char
,
isAlphaNum
,
isSpace
)
import
Data.Text
(
Text
,
words
,
filter
,
toLower
)
import
Data.Text
(
Text
,
filter
,
toLower
,
split
,
lines
,
concat
)
import
qualified
Data.Text
as
DT
import
Data.Text.IO
(
readFile
)
import
Data.Map.Strict
(
Map
,
empty
,
insertWith
,
unionWith
...
...
@@ -80,11 +84,19 @@ type Occ = Int
ngrams
::
Text
->
[
Text
]
ngrams
xs
=
monograms
$
toLower
$
filter
isGram
xs
clean
::
Text
->
Text
clean
txt
=
DT
.
map
clean'
txt
where
clean'
'’'
=
'
\'
'
clean'
c
=
c
monograms
::
Text
->
[
Text
]
monograms
=
words
monograms
txt
=
split
isWord
txt
where
isWord
c
=
c
`
elem
`
[
' '
,
'
\'
'
,
','
,
';'
]
isGram
::
Char
->
Bool
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
]
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
,
'
\'
'
]
-- | Compute the occurrences (occ)
occ
::
Ord
a
=>
[
a
]
->
Map
a
Occ
...
...
@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences
::
Text
->
[
Text
]
sentences
txt
=
split
isStop
txt
isStop
::
Char
->
Bool
isStop
c
=
c
`
elem
`
[
'.'
,
'?'
,
'!'
]
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest
=
ws
where
txt
=
concat
<$>
lines
<$>
clean
<$>
readFile
"Giono-arbres.txt"
-- | Number of sentences
ls
=
sentences
<$>
txt
-- | Number of monograms used in the full text
ws
=
ngrams
<$>
txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs
=
occ
<$>
ws
src/Gargantext/Ngrams/CoreNLP.hs
View file @
7ff65f23
src/Gargantext/Ngrams/Stem/En.hs
View file @
7ff65f23
module
Language.Porter
(
stem
,
fixstem
)
where
{-|
Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Ngrams.Stem.En
where
import
Control.Monad
import
Data.Either
import
Data.Maybe
import
Data.List
import
Data.Text
(
Text
(),
pack
,
unpack
)
import
Data.List
hiding
(
map
,
head
)
import
Gargantext.Prelude
vowels
::
[
Char
]
vowels
=
[
'a'
,
'e'
,
'i'
,
'o'
,
'u'
]
isConsonant
::
[
Char
]
->
Int
->
Bool
isConsonant
str
i
|
c
`
elem
`
"aeiou"
=
False
|
c
`
elem
`
vowels
=
False
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
otherwise
=
True
where
c
=
str
!!
i
isVowel
::
[
Char
]
->
Int
->
Bool
isVowel
=
(
not
.
)
.
isConsonant
byIndex
::
Foldable
t1
=>
(
t1
a
->
[
Int
]
->
t2
)
->
t1
a
->
t2
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
containsVowel
::
[
Char
]
->
Bool
containsVowel
=
byIndex
(
any
.
isVowel
)
-- | /!\ unsafe fromJust
measure
::
[
Char
]
->
Int
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
fromJust
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
endsWithDouble
::
[
Char
]
->
Bool
endsWithDouble
=
startsWithDouble
.
reverse
where
startsWithDouble
l
|
length
l
<
2
=
False
|
otherwise
=
let
(
x
:
y
:
_
)
=
l
in
x
==
y
&&
x
`
notElem
`
"aeiou"
startsWithDouble
l
=
case
l
of
(
x
:
y
:
_
)
->
x
==
y
&&
x
`
notElem
`
vowels
_
->
False
cvc
::
[
Char
]
->
Bool
cvc
word
|
length
word
<
3
=
False
|
otherwise
=
isConsonant
word
lastIndex
&&
isVowel
word
(
lastIndex
-
1
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
last
word
`
notElem
`
"wxy"
last
word
`
notElem
`
[
'w'
,
'x'
,
'y'
]
where
lastIndex
=
length
word
-
1
statefulReplace
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
(
Data
.
Either
.
Either
[
a
]
[
a
])
statefulReplace
predicate
str
end
replacement
|
end
`
isSuffixOf
`
str
=
Just
replaced
|
otherwise
=
Nothing
...
...
@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
replaced
|
predicate
part
=
Right
(
part
++
replacement
)
|
otherwise
=
Left
str
replaceEnd
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
[
a
]
replaceEnd
predicate
str
end
replacement
=
do
result
<-
statefulReplace
predicate
str
end
replacement
return
(
either
id
id
result
)
return
(
either
id
entity
identity
result
)
findStem
::
(
Foldable
t
,
Functor
t
,
Eq
a
)
=>
([
a
]
->
Bool
)
->
[
a
]
->
t
([
a
],
[
a
])
->
Maybe
[
a
]
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
measureGT
::
Int
->
[
Char
]
->
Bool
measureGT
=
flip
((
>
)
.
measure
)
step1a
::
[
Char
]
->
[
Char
]
step1a
word
=
fromMaybe
word
result
where
result
=
findStem
(
const
True
)
word
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
where
result
=
findStem
(
const
True
)
word
suffixes
suffixes
=
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
beforeStep1b
::
[
Char
]
->
Either
[
Char
]
[
Char
]
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
where
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
...
...
@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
afterStep1b
::
[
Char
]
->
[
Char
]
afterStep1b
word
=
fromMaybe
word
result
where
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
"lsz"
)
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
[
'l'
,
's'
,
'z'
]
)
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
`
mplus
`
iif
double
(
init
word
)
`
mplus
`
iif
mEq1AndCvc
(
word
++
"e"
)
step1b
=
either
id
afterStep1b
.
beforeStep1b
step1b
::
[
Char
]
->
[
Char
]
step1b
=
either
identity
afterStep1b
.
beforeStep1b
step1c
::
[
Char
]
->
[
Char
]
step1c
word
=
fromMaybe
word
result
where
result
=
replaceEnd
containsVowel
word
"y"
"i"
step1
::
[
Char
]
->
[
Char
]
step1
=
step1c
.
step1b
.
step1a
step2
::
[
Char
]
->
[
Char
]
step2
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
...
...
@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
,
(
"biliti"
,
"ble"
)
,
(
"logi"
,
"log"
)
]
step3
::
[
Char
]
->
[
Char
]
step3
word
=
fromMaybe
word
result
where
result
=
findStem
(
measureGT
0
)
word
...
...
@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
,
(
"ful"
,
""
)
,
(
"ness"
,
""
)
]
step4
::
[
Char
]
->
[
Char
]
step4
word
=
fromMaybe
word
result
where
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
"st"
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
[
's'
,
't'
]
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
step5a
::
[
Char
]
->
[
Char
]
step5a
word
=
fromMaybe
word
result
where
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
result
=
replaceEnd
test
word
"e"
""
step5b
::
[
Char
]
->
[
Char
]
step5b
word
=
fromMaybe
word
result
where
cond
s
=
last
s
==
'l'
&&
measureGT
1
s
result
=
replaceEnd
cond
word
"l"
""
step5
::
[
Char
]
->
[
Char
]
step5
=
step5b
.
step5a
allSteps
::
[
Char
]
->
[
Char
]
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
stem
s
|
length
s
<
3
=
s
stem
::
Text
->
Text
stem
s
=
pack
(
stem'
$
unpack
s
)
stem'
::
[
Char
]
->
[
Char
]
stem'
s
|
length
s
<
3
=
s
|
otherwise
=
allSteps
s
fixpoint
::
Eq
t
=>
(
t
->
t
)
->
t
->
t
fixpoint
f
x
=
let
fx
=
f
x
in
if
fx
==
x
then
x
else
fixpoint
f
fx
fixstem
=
fixpoint
stem
fixstem
::
[
Char
]
->
[
Char
]
fixstem
=
fixpoint
stem'
{-
...
...
src/Gargantext/Prelude.hs
View file @
7ff65f23
...
...
@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
(
&&
),
(
||
),
not
,
fst
,
snd
,
toS
,
elem
,
die
,
mod
,
div
,
elem
,
die
,
mod
,
div
,
const
,
curry
,
uncurry
,
otherwise
)
-- TODO import functions optimized in Utils.Count
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment