Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
e21bad3e
Commit
e21bad3e
authored
May 02, 2018
by
Alexandre Delanoë
2
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT/STEM] implemenging Porter lib into Gargantext for English language.
parent
dbff4c96
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
135 additions
and
26 deletions
+135
-26
package.yaml
package.yaml
+1
-0
Ngrams.hs
src/Gargantext/Ngrams.hs
+43
-5
CoreNLP.hs
src/Gargantext/Ngrams/CoreNLP.hs
+1
-1
En.hs
src/Gargantext/Ngrams/Stem/En.hs
+88
-19
Prelude.hs
src/Gargantext/Prelude.hs
+2
-1
No files found.
package.yaml
View file @
e21bad3e
...
@@ -40,6 +40,7 @@ library:
...
@@ -40,6 +40,7 @@ library:
-
Gargantext.Ngrams.CoreNLP
-
Gargantext.Ngrams.CoreNLP
-
Gargantext.Ngrams.Parser
-
Gargantext.Ngrams.Parser
-
Gargantext.Ngrams.Lang.En
-
Gargantext.Ngrams.Lang.En
-
Gargantext.Ngrams.Stem.En
-
Gargantext.Ngrams.Lang.Fr
-
Gargantext.Ngrams.Lang.Fr
-
Gargantext.Ngrams.Metrics
-
Gargantext.Ngrams.Metrics
-
Gargantext.Ngrams.TextMining
-
Gargantext.Ngrams.TextMining
...
...
src/Gargantext/Ngrams.hs
View file @
e21bad3e
...
@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
...
@@ -23,8 +23,9 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
Occurrences
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
TextMining
,
module
Gargantext
.
Ngrams
.
Metrics
,
module
Gargantext
.
Ngrams
.
Metrics
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
Ngrams
(
..
),
ngrams
,
occ
,
sumOcc
,
text2fis
,
clean
,
ListName
(
..
),
equivNgrams
,
isGram
,
ListName
(
..
),
equivNgrams
,
isGram
,
sentences
,
ngramsTest
--, module Gargantext.Ngrams.Words
--, module Gargantext.Ngrams.Words
)
where
)
where
...
@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
...
@@ -43,7 +44,10 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import
Data.List
(
sort
)
import
Data.List
(
sort
)
import
Data.Char
(
Char
,
isAlphaNum
,
isSpace
)
import
Data.Char
(
Char
,
isAlphaNum
,
isSpace
)
import
Data.Text
(
Text
,
words
,
filter
,
toLower
)
import
Data.Text
(
Text
,
filter
,
toLower
,
split
,
lines
,
concat
)
import
qualified
Data.Text
as
DT
import
Data.Text.IO
(
readFile
)
import
Data.Map.Strict
(
Map
import
Data.Map.Strict
(
Map
,
empty
,
empty
,
insertWith
,
unionWith
,
insertWith
,
unionWith
...
@@ -80,11 +84,19 @@ type Occ = Int
...
@@ -80,11 +84,19 @@ type Occ = Int
ngrams
::
Text
->
[
Text
]
ngrams
::
Text
->
[
Text
]
ngrams
xs
=
monograms
$
toLower
$
filter
isGram
xs
ngrams
xs
=
monograms
$
toLower
$
filter
isGram
xs
clean
::
Text
->
Text
clean
txt
=
DT
.
map
clean'
txt
where
clean'
'’'
=
'
\'
'
clean'
c
=
c
monograms
::
Text
->
[
Text
]
monograms
::
Text
->
[
Text
]
monograms
=
words
monograms
txt
=
split
isWord
txt
where
isWord
c
=
c
`
elem
`
[
' '
,
'
\'
'
,
','
,
';'
]
isGram
::
Char
->
Bool
isGram
::
Char
->
Bool
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
]
isGram
c
=
isAlphaNum
c
||
isSpace
c
||
c
`
elem
`
[
'-'
,
'/'
,
'
\'
'
]
-- | Compute the occurrences (occ)
-- | Compute the occurrences (occ)
occ
::
Ord
a
=>
[
a
]
->
Map
a
Occ
occ
::
Ord
a
=>
[
a
]
->
Map
a
Occ
...
@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
...
@@ -129,4 +141,30 @@ text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences
::
Text
->
[
Text
]
sentences
txt
=
split
isStop
txt
isStop
::
Char
->
Bool
isStop
c
=
c
`
elem
`
[
'.'
,
'?'
,
'!'
]
-- | Tests
-- TODO http://hackage.haskell.org/package/tokenize-0.3.0/docs/NLP-Tokenize-Text.html
ngramsTest
=
ws
where
txt
=
concat
<$>
lines
<$>
clean
<$>
readFile
"Giono-arbres.txt"
-- | Number of sentences
ls
=
sentences
<$>
txt
-- | Number of monograms used in the full text
ws
=
ngrams
<$>
txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs
=
occ
<$>
ws
src/Gargantext/Ngrams/CoreNLP.hs
View file @
e21bad3e
src/Gargantext/Ngrams/Stem/En.hs
View file @
e21bad3e
module
Language.Porter
(
stem
,
fixstem
)
{-|
where
Module : Gargantext.
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adapted from:
- source: https://hackage.haskell.org/package/porter
- [Char] -> [Text]
- adding Types signatures
- fixes unseen cases
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Ngrams.Stem.En
where
import
Control.Monad
import
Control.Monad
import
Data.Either
import
Data.Maybe
import
Data.Maybe
import
Data.List
import
Data.Text
(
Text
(),
pack
,
unpack
)
import
Data.List
hiding
(
map
,
head
)
import
Gargantext.Prelude
vowels
::
[
Char
]
vowels
=
[
'a'
,
'e'
,
'i'
,
'o'
,
'u'
]
isConsonant
::
[
Char
]
->
Int
->
Bool
isConsonant
str
i
isConsonant
str
i
|
c
`
elem
`
"aeiou"
=
False
|
c
`
elem
`
vowels
=
False
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
c
==
'y'
=
i
==
0
||
isVowel
str
(
i
-
1
)
|
otherwise
=
True
|
otherwise
=
True
where
where
c
=
str
!!
i
c
=
str
!!
i
isVowel
::
[
Char
]
->
Int
->
Bool
isVowel
=
(
not
.
)
.
isConsonant
isVowel
=
(
not
.
)
.
isConsonant
byIndex
::
Foldable
t1
=>
(
t1
a
->
[
Int
]
->
t2
)
->
t1
a
->
t2
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
byIndex
fun
str
=
fun
str
[
0
..
length
str
-
1
]
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
containsVowel
::
[
Char
]
->
Bool
containsVowel
=
byIndex
(
any
.
isVowel
)
containsVowel
=
byIndex
(
any
.
isVowel
)
-- | /!\ unsafe fromJust
measure
::
[
Char
]
->
Int
measure
=
length
.
filter
not
.
init
.
(
True
:
)
.
map
fromJust
.
map
head
.
group
.
byIndex
(
map
.
isConsonant
)
endsWithDouble
::
[
Char
]
->
Bool
endsWithDouble
=
startsWithDouble
.
reverse
endsWithDouble
=
startsWithDouble
.
reverse
where
where
startsWithDouble
l
|
length
l
<
2
=
False
startsWithDouble
l
=
case
l
of
|
otherwise
=
let
(
x
:
y
:
_
)
=
l
in
x
==
y
&&
x
`
notElem
`
"aeiou"
(
x
:
y
:
_
)
->
x
==
y
&&
x
`
notElem
`
vowels
_
->
False
cvc
::
[
Char
]
->
Bool
cvc
word
|
length
word
<
3
=
False
cvc
word
|
length
word
<
3
=
False
|
otherwise
=
isConsonant
word
lastIndex
&&
|
otherwise
=
isConsonant
word
lastIndex
&&
isVowel
word
(
lastIndex
-
1
)
&&
isVowel
word
(
lastIndex
-
1
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
isConsonant
word
(
lastIndex
-
2
)
&&
last
word
`
notElem
`
"wxy"
last
word
`
notElem
`
[
'w'
,
'x'
,
'y'
]
where
lastIndex
=
length
word
-
1
where
lastIndex
=
length
word
-
1
statefulReplace
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
(
Data
.
Either
.
Either
[
a
]
[
a
])
statefulReplace
predicate
str
end
replacement
statefulReplace
predicate
str
end
replacement
|
end
`
isSuffixOf
`
str
=
Just
replaced
|
end
`
isSuffixOf
`
str
=
Just
replaced
|
otherwise
=
Nothing
|
otherwise
=
Nothing
...
@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
...
@@ -40,17 +82,26 @@ statefulReplace predicate str end replacement
replaced
|
predicate
part
=
Right
(
part
++
replacement
)
replaced
|
predicate
part
=
Right
(
part
++
replacement
)
|
otherwise
=
Left
str
|
otherwise
=
Left
str
replaceEnd
::
Eq
a
=>
([
a
]
->
Bool
)
->
[
a
]
->
[
a
]
->
[
a
]
->
Maybe
[
a
]
replaceEnd
predicate
str
end
replacement
=
do
replaceEnd
predicate
str
end
replacement
=
do
result
<-
statefulReplace
predicate
str
end
replacement
result
<-
statefulReplace
predicate
str
end
replacement
return
(
either
id
id
result
)
return
(
either
id
entity
identity
result
)
findStem
::
(
Foldable
t
,
Functor
t
,
Eq
a
)
=>
([
a
]
->
Bool
)
->
[
a
]
->
t
([
a
],
[
a
])
->
Maybe
[
a
]
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
findStem
f
word
pairs
=
msum
$
map
(
uncurry
(
replaceEnd
f
word
))
pairs
measureGT
::
Int
->
[
Char
]
->
Bool
measureGT
=
flip
((
>
)
.
measure
)
measureGT
=
flip
((
>
)
.
measure
)
step1a
::
[
Char
]
->
[
Char
]
step1a
word
=
fromMaybe
word
result
step1a
word
=
fromMaybe
word
result
where
result
=
findStem
(
const
True
)
word
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
where
result
=
findStem
(
const
True
)
word
suffixes
suffixes
=
[(
"sses"
,
"ss"
),
(
"ies"
,
"i"
),
(
"ss"
,
"ss"
),
(
"s"
,
""
)]
beforeStep1b
::
[
Char
]
->
Either
[
Char
]
[
Char
]
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
beforeStep1b
word
=
fromMaybe
(
Left
word
)
result
where
where
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
cond23
x
=
do
{
v
<-
x
;
either
(
const
Nothing
)
(
return
.
Right
)
v
}
...
@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
...
@@ -60,22 +111,27 @@ beforeStep1b word = fromMaybe (Left word) result
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ed"
""
)
`
mplus
`
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
cond23
(
statefulReplace
containsVowel
word
"ing"
""
)
afterStep1b
::
[
Char
]
->
[
Char
]
afterStep1b
word
=
fromMaybe
word
result
afterStep1b
word
=
fromMaybe
word
result
where
where
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
"lsz"
)
double
=
endsWithDouble
word
&&
not
(
any
((`
isSuffixOf
`
word
)
.
return
)
[
'l'
,
's'
,
'z'
]
)
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
mEq1AndCvc
=
measure
word
==
1
&&
cvc
word
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
iif
cond
val
=
if
cond
then
Just
val
else
Nothing
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
result
=
findStem
(
const
True
)
word
[(
"at"
,
"ate"
),
(
"bl"
,
"ble"
),
(
"iz"
,
"ize"
)]
`
mplus
`
iif
double
(
init
word
)
`
mplus
`
iif
double
(
init
word
)
`
mplus
`
iif
mEq1AndCvc
(
word
++
"e"
)
`
mplus
`
iif
mEq1AndCvc
(
word
++
"e"
)
step1b
=
either
id
afterStep1b
.
beforeStep1b
step1b
::
[
Char
]
->
[
Char
]
step1b
=
either
identity
afterStep1b
.
beforeStep1b
step1c
::
[
Char
]
->
[
Char
]
step1c
word
=
fromMaybe
word
result
step1c
word
=
fromMaybe
word
result
where
result
=
replaceEnd
containsVowel
word
"y"
"i"
where
result
=
replaceEnd
containsVowel
word
"y"
"i"
step1
::
[
Char
]
->
[
Char
]
step1
=
step1c
.
step1b
.
step1a
step1
=
step1c
.
step1b
.
step1a
step2
::
[
Char
]
->
[
Char
]
step2
word
=
fromMaybe
word
result
step2
word
=
fromMaybe
word
result
where
where
result
=
findStem
(
measureGT
0
)
word
result
=
findStem
(
measureGT
0
)
word
...
@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
...
@@ -101,6 +157,7 @@ step2 word = fromMaybe word result
,
(
"biliti"
,
"ble"
)
,
(
"biliti"
,
"ble"
)
,
(
"logi"
,
"log"
)
]
,
(
"logi"
,
"log"
)
]
step3
::
[
Char
]
->
[
Char
]
step3
word
=
fromMaybe
word
result
step3
word
=
fromMaybe
word
result
where
where
result
=
findStem
(
measureGT
0
)
word
result
=
findStem
(
measureGT
0
)
word
...
@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
...
@@ -112,37 +169,49 @@ step3 word = fromMaybe word result
,
(
"ful"
,
""
)
,
(
"ful"
,
""
)
,
(
"ness"
,
""
)
]
,
(
"ness"
,
""
)
]
step4
::
[
Char
]
->
[
Char
]
step4
word
=
fromMaybe
word
result
step4
word
=
fromMaybe
word
result
where
where
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
"st"
gt1andST
str
=
(
measureGT
1
)
str
&&
any
((`
isSuffixOf
`
str
)
.
return
)
[
's'
,
't'
]
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
findGT1
=
findStem
(
measureGT
1
)
word
.
map
(
flip
(,)
""
)
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
result
=
(
findGT1
[
"al"
,
"ance"
,
"ence"
,
"er"
,
"ic"
,
"able"
,
"ible"
,
"ant"
,
"ement"
,
"ment"
,
"ent"
])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findStem
gt1andST
word
[(
"ion"
,
""
)])
`
mplus
`
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
(
findGT1
[
"ou"
,
"ism"
,
"ate"
,
"iti"
,
"ous"
,
"ive"
,
"ize"
])
step5a
::
[
Char
]
->
[
Char
]
step5a
word
=
fromMaybe
word
result
step5a
word
=
fromMaybe
word
result
where
where
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
test
str
=
(
measureGT
1
str
)
||
((
measure
str
==
1
)
&&
(
not
$
cvc
str
))
result
=
replaceEnd
test
word
"e"
""
result
=
replaceEnd
test
word
"e"
""
step5b
::
[
Char
]
->
[
Char
]
step5b
word
=
fromMaybe
word
result
step5b
word
=
fromMaybe
word
result
where
where
cond
s
=
last
s
==
'l'
&&
measureGT
1
s
cond
s
=
last
s
==
'l'
&&
measureGT
1
s
result
=
replaceEnd
cond
word
"l"
""
result
=
replaceEnd
cond
word
"l"
""
step5
::
[
Char
]
->
[
Char
]
step5
=
step5b
.
step5a
step5
=
step5b
.
step5a
allSteps
::
[
Char
]
->
[
Char
]
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
allSteps
=
step5
.
step4
.
step3
.
step2
.
step1
stem
s
|
length
s
<
3
=
s
stem
::
Text
->
Text
stem
s
=
pack
(
stem'
$
unpack
s
)
stem'
::
[
Char
]
->
[
Char
]
stem'
s
|
length
s
<
3
=
s
|
otherwise
=
allSteps
s
|
otherwise
=
allSteps
s
fixpoint
::
Eq
t
=>
(
t
->
t
)
->
t
->
t
fixpoint
f
x
=
let
fx
=
f
x
in
fixpoint
f
x
=
let
fx
=
f
x
in
if
fx
==
x
if
fx
==
x
then
x
then
x
else
fixpoint
f
fx
else
fixpoint
f
fx
fixstem
=
fixpoint
stem
fixstem
::
[
Char
]
->
[
Char
]
fixstem
=
fixpoint
stem'
{-
{-
...
...
src/Gargantext/Prelude.hs
View file @
e21bad3e
...
@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
...
@@ -42,8 +42,9 @@ import Protolude ( Bool(True, False), Int, Double, Integer
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
(
&&
),
(
||
),
not
,
(
&&
),
(
||
),
not
,
fst
,
snd
,
toS
,
fst
,
snd
,
toS
,
elem
,
die
,
mod
,
div
,
elem
,
die
,
mod
,
div
,
const
,
curry
,
uncurry
,
curry
,
uncurry
,
otherwise
)
)
-- TODO import functions optimized in Utils.Count
-- TODO import functions optimized in Utils.Count
...
...
delanoe
@anoe
mentioned in commit
fa0ed7df
·
May 02, 2018
mentioned in commit
fa0ed7df
mentioned in commit fa0ed7df012f276f23670d3972da35fc866237de
Toggle commit list
delanoe
@anoe
mentioned in commit
20b24e1b
·
May 04, 2018
mentioned in commit
20b24e1b
mentioned in commit 20b24e1b94487f956168afadae4a7087d21c6962
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment