Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
a3490841
Commit
a3490841
authored
Sep 29, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'bayes'
parents
88585c12
3cbf7e51
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
283 additions
and
79 deletions
+283
-79
package.yaml
package.yaml
+3
-0
Core.hs
src/Gargantext/Core.hs
+2
-1
Prelude.hs
src/Gargantext/Prelude.hs
+1
-1
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+3
-0
FrequentItemSet.hs
src/Gargantext/Text/Metrics/FrequentItemSet.hs
+3
-3
Date.hs
src/Gargantext/Text/Parsers/Date.hs
+1
-1
Stem.hs
src/Gargantext/Text/Terms/Mono/Stem.hs
+1
-0
Multi.hs
src/Gargantext/Text/Terms/Multi.hs
+1
-1
PosTagging.hs
src/Gargantext/Text/Terms/Multi/PosTagging.hs
+1
-0
RAKE.hs
src/Gargantext/Text/Terms/Multi/RAKE.hs
+11
-72
Stop.hs
src/Gargantext/Text/Terms/Stop.hs
+256
-0
No files found.
package.yaml
View file @
a3490841
...
...
@@ -53,6 +53,7 @@ library:
-
Gargantext.Text.Parsers.WOS
-
Gargantext.Text.Search
-
Gargantext.Text.Terms
-
Gargantext.Text.Terms.Stop
-
Gargantext.Text.Terms.Mono
-
Gargantext.Text.Terms.Multi.Lang.En
-
Gargantext.Text.Terms.Multi.Lang.Fr
...
...
@@ -112,6 +113,7 @@ library:
-
path-io
-
postgresql-simple
-
pretty
-
probability
-
product-profunctors
-
profunctors
-
protolude
...
...
@@ -165,6 +167,7 @@ executables:
-
-with-rtsopts=-N
-
-O2
-
-Wmissing-signatures
-
-Wcompat
dependencies
:
-
base
-
containers
...
...
src/Gargantext/Core.hs
View file @
a3490841
...
...
@@ -25,4 +25,5 @@ module Gargantext.Core
-- - SP == spanish (not implemented yet)
--
-- ... add your language and help us to implement it (:
data
Lang
=
EN
|
FR
data
Lang
=
EN
|
FR
|
DE
|
SP
|
CH
deriving
(
Show
,
Eq
,
Ord
)
src/Gargantext/Prelude.hs
View file @
a3490841
...
...
@@ -46,7 +46,7 @@ import Protolude ( Bool(True, False), Int, Int64, Double, Integer
,
abs
,
min
,
max
,
maximum
,
minimum
,
return
,
snd
,
truncate
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
$
),
(
&
),
(
**
),
(
^
),
(
<
),
(
>
),
log
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
(
&&
),
(
||
),
not
,
any
,
(
&&
),
(
||
),
not
,
any
,
all
,
fst
,
snd
,
toS
,
elem
,
die
,
mod
,
div
,
const
,
either
,
curry
,
uncurry
,
repeat
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
a3490841
...
...
@@ -144,6 +144,9 @@ occurrences = occurrencesOn _terms_stem
occurrencesOn
::
(
Ord
a
,
Ord
b
)
=>
(
a
->
b
)
->
[
a
]
->
Map
b
(
Map
a
Int
)
occurrencesOn
f
=
foldl'
(
\
m
a
->
insertWith
(
unionWith
(
+
))
(
f
a
)
(
singleton
a
1
)
m
)
empty
occurrencesWith
::
(
Foldable
list
,
Ord
k
,
Num
a
)
=>
(
b
->
k
)
->
list
b
->
Map
k
a
occurrencesWith
f
xs
=
foldl'
(
\
x
y
->
insertWith
(
+
)
(
f
y
)
1
x
)
empty
xs
-- TODO add groups and filter stops
sumOcc
::
Ord
a
=>
[
Occ
a
]
->
Occ
a
...
...
src/Gargantext/Text/Metrics/FrequentItemSet.hs
View file @
a3490841
...
...
@@ -16,7 +16,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
module
Gargantext.Text.Metrics.FrequentItemSet
(
Fis
,
Size
(
..
)
,
occ_hlcm
,
cooc_hlcm
,
all
,
between
,
all
Fis
,
between
,
fisWithSize
,
fisWith
,
fisWithSizePoly
...
...
@@ -51,8 +51,8 @@ occ_hlcm = fisWithSize (Point 1)
cooc_hlcm
::
Frequency
->
[[
Item
]]
->
[
Fis
]
cooc_hlcm
=
fisWithSize
(
Point
2
)
all
::
Frequency
->
[[
Item
]]
->
[
Fis
]
all
=
fisWith
Nothing
all
Fis
::
Frequency
->
[[
Item
]]
->
[
Fis
]
all
Fis
=
fisWith
Nothing
------------------------------------------------------------------------
between
::
(
Int
,
Int
)
->
Frequency
->
[[
Item
]]
->
[
Fis
]
...
...
src/Gargantext/Text/Parsers/Date.hs
View file @
a3490841
...
...
@@ -68,7 +68,7 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt)
parserLang
::
Lang
->
DC
.
Lang
parserLang
FR
=
DC
.
FR
parserLang
EN
=
DC
.
EN
parserLang
_
=
panic
"not implemented"
-- | Final Date parser API
-- IO can be avoided here:
...
...
src/Gargantext/Text/Terms/Mono/Stem.hs
View file @
a3490841
...
...
@@ -55,6 +55,7 @@ stem lang = DT.pack . N.stem lang' . DT.unpack
lang'
=
case
lang
of
EN
->
N
.
English
FR
->
N
.
French
_
->
panic
$
DT
.
pack
"not implemented yet"
src/Gargantext/Text/Terms/Multi.hs
View file @
a3490841
...
...
@@ -57,4 +57,4 @@ tokenTags' lang t = map tokens2tokensTags
group
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
group
EN
=
En
.
group
group
FR
=
Fr
.
group
group
_
=
panic
$
pack
"group :: Lang not implemeted yet"
src/Gargantext/Text/Terms/Multi/PosTagging.hs
View file @
a3490841
...
...
@@ -124,6 +124,7 @@ corenlp' lang txt = do
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
parse.model
\"
:
\"
edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz
\"
,
\"
pos.model
\"
:
\"
edu/stanford/nlp/models/pos-tagger/french/french.tagger
\"
,
\"
tokenize.language
\"
:
\"
fr
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
_
->
panic
$
pack
"not implemented yet"
url
<-
parseRequest
$
"POST http://localhost:9000/?properties="
<>
properties
let
request
=
setRequestBodyLBS
(
cs
txt
)
url
httpJSON
request
...
...
src/Gargantext/Text/Terms/Multi/RAKE.hs
View file @
a3490841
...
...
@@ -29,13 +29,22 @@ list quality in time.
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Terms.Multi.RAKE
(
multiterms_rake
)
module
Gargantext.Text.Terms.Multi.RAKE
(
multiterms_rake
,
select
,
hardStopList
)
where
import
GHC.Real
(
round
)
import
Data.Text
(
Text
)
import
NLP.RAKE.Text
import
Gargantext.Text.Terms.Stop
(
stopList
)
import
Gargantext.Prelude
select
::
Double
->
[
a
]
->
[
a
]
select
part
ns
=
take
n
ns
where
n
=
round
$
part
*
(
fromIntegral
$
length
ns
)
multiterms_rake
::
Text
->
[
WordScore
]
multiterms_rake
=
candidates
hardStopList
defaultNosplit
...
...
@@ -43,74 +52,4 @@ multiterms_rake = candidates hardStopList
-- | StopList
hardStopList
::
StopwordsMap
hardStopList
=
mkStopwordsStr
[
"a"
,
"a's"
,
"able"
,
"about"
,
"above"
,
"apply"
,
"according"
,
"accordingly"
,
"across"
,
"actually"
,
"after"
,
"afterwards"
,
"again"
,
"against"
,
"ain't"
,
"all"
,
"allow"
,
"allows"
,
"almost"
,
"alone"
,
"along"
,
"already"
,
"also"
,
"although"
,
"always"
,
"am"
,
"among"
,
"amongst"
,
"an"
,
"and"
,
"another"
,
"any"
,
"anybody"
,
"anyhow"
,
"anyone"
,
"anything"
,
"anyway"
,
"anyways"
,
"anywhere"
,
"analyze"
,
"apart"
,
"appear"
,
"appreciate"
,
"appropriate"
,
"are"
,
"aren't"
,
"around"
,
"as"
,
"aside"
,
"ask"
,
"asking"
,
"associated"
,
"at"
,
"available"
,
"away"
,
"awfully"
,
"based"
,
"b"
,
"be"
,
"became"
,
"because"
,
"become"
,
"becomes"
,
"becoming"
,
"been"
,
"before"
,
"beforehand"
,
"behind"
,
"being"
,
"believe"
,
"below"
,
"beside"
,
"besides"
,
"best"
,
"better"
,
"between"
,
"beyond"
,
"both"
,
"brief"
,
"but"
,
"by"
,
"c"
,
"c'mon"
,
"c's"
,
"came"
,
"can"
,
"can't"
,
"cannot"
,
"cant"
,
"cause"
,
"causes"
,
"certain"
,
"certainly"
,
"changes"
,
"clearly"
,
"co"
,
"com"
,
"come"
,
"comes"
,
"common"
,
"concerning"
,
"consequently"
,
"consider"
,
"considering"
,
"contain"
,
"containing"
,
"contains"
,
"corresponding"
,
"could"
,
"couldn't"
,
"course"
,
"currently"
,
"d"
,
"definitely"
,
"described"
,
"detects"
,
"detecting"
,
"despite"
,
"did"
,
"didn't"
,
"different"
,
"do"
,
"does"
,
"doesn't"
,
"doing"
,
"don't"
,
"done"
,
"down"
,
"downwards"
,
"during"
,
"e"
,
"each"
,
"edu"
,
"eg"
,
"eight"
,
"either"
,
"else"
,
"elsewhere"
,
"enough"
,
"entirely"
,
"especially"
,
"et"
,
"etc"
,
"even"
,
"ever"
,
"every"
,
"everybody"
,
"everyone"
,
"everything"
,
"everywhere"
,
"ex"
,
"exactly"
,
"example"
,
"except"
,
"f"
,
"far"
,
"few"
,
"find"
,
"fifth"
,
"first"
,
"five"
,
"followed"
,
"following"
,
"follows"
,
"for"
,
"former"
,
"formerly"
,
"forth"
,
"four"
,
"from"
,
"further"
,
"furthermore"
,
"g"
,
"get"
,
"gets"
,
"getting"
,
"given"
,
"gives"
,
"go"
,
"goes"
,
"going"
,
"gone"
,
"got"
,
"gotten"
,
"greetings"
,
"h"
,
"had"
,
"hadn't"
,
"happens"
,
"hardly"
,
"has"
,
"hasn't"
,
"have"
,
"haven't"
,
"having"
,
"he"
,
"he's"
,
"hello"
,
"help"
,
"hence"
,
"her"
,
"here"
,
"here's"
,
"hereafter"
,
"hereby"
,
"herein"
,
"hereupon"
,
"hers"
,
"herself"
,
"hi"
,
"him"
,
"himself"
,
"his"
,
"hither"
,
"hopefully"
,
"how"
,
"howbeit"
,
"however"
,
"i"
,
"i'd"
,
"identify"
,
"i'll"
,
"i'm"
,
"i've"
,
"ie"
,
"if"
,
"ignored"
,
"immediate"
,
"in"
,
"inasmuch"
,
"inc"
,
"indeed"
,
"indicate"
,
"indicated"
,
"indicates"
,
"inner"
,
"insofar"
,
"instead"
,
"into"
,
"inward"
,
"is"
,
"isn't"
,
"it"
,
"it'd"
,
"it'll"
,
"it's"
,
"its"
,
"itself"
,
"j"
,
"just"
,
"k"
,
"keep"
,
"keeps"
,
"kept"
,
"know"
,
"known"
,
"knows"
,
"l"
,
"last"
,
"lately"
,
"later"
,
"latter"
,
"latterly"
,
"least"
,
"less"
,
"lest"
,
"let"
,
"let's"
,
"like"
,
"liked"
,
"likely"
,
"little"
,
"look"
,
"looking"
,
"looks"
,
"ltd"
,
"m"
,
"mainly"
,
"many"
,
"may"
,
"maybe"
,
"me"
,
"mean"
,
"meanwhile"
,
"merely"
,
"might"
,
"more"
,
"moreover"
,
"most"
,
"mostly"
,
"much"
,
"must"
,
"my"
,
"myself"
,
"n"
,
"name"
,
"namely"
,
"nd"
,
"near"
,
"nearly"
,
"necessary"
,
"need"
,
"needs"
,
"neither"
,
"never"
,
"nevertheless"
,
"new"
,
"next"
,
"nine"
,
"no"
,
"nobody"
,
"non"
,
"none"
,
"noone"
,
"nor"
,
"normally"
,
"not"
,
"nothing"
,
"novel"
,
"now"
,
"nowhere"
,
"o"
,
"obviously"
,
"of"
,
"off"
,
"often"
,
"oh"
,
"ok"
,
"okay"
,
"old"
,
"on"
,
"once"
,
"one"
,
"ones"
,
"only"
,
"onto"
,
"or"
,
"other"
,
"others"
,
"otherwise"
,
"ought"
,
"our"
,
"ours"
,
"ourselves"
,
"out"
,
"outside"
,
"over"
,
"overall"
,
"own"
,
"p"
,
"particular"
,
"particularly"
,
"per"
,
"perhaps"
,
"placed"
,
"please"
,
"plus"
,
"possible"
,
"presents"
,
"presumably"
,
"probably"
,
"provides"
,
"q"
,
"que"
,
"quite"
,
"qv"
,
"r"
,
"rather"
,
"rd"
,
"re"
,
"really"
,
"reasonably"
,
"regarding"
,
"regardless"
,
"regards"
,
"relatively"
,
"respectively"
,
"right"
,
"s"
,
"said"
,
"same"
,
"saw"
,
"say"
,
"saying"
,
"says"
,
"second"
,
"secondly"
,
"see"
,
"seeing"
,
"seem"
,
"seemed"
,
"seeming"
,
"seems"
,
"seen"
,
"self"
,
"selves"
,
"sensible"
,
"sent"
,
"serious"
,
"seriously"
,
"seven"
,
"several"
,
"shall"
,
"she"
,
"should"
,
"shouldn't"
,
"since"
,
"six"
,
"so"
,
"some"
,
"somebody"
,
"somehow"
,
"someone"
,
"something"
,
"sometime"
,
"sometimes"
,
"somewhat"
,
"somewhere"
,
"soon"
,
"sorry"
,
"specified"
,
"specify"
,
"specifying"
,
"still"
,
"sub"
,
"such"
,
"sup"
,
"sure"
,
"t"
,
"t's"
,
"take"
,
"taken"
,
"tell"
,
"tends"
,
"th"
,
"than"
,
"thank"
,
"thanks"
,
"thanx"
,
"that"
,
"that's"
,
"thats"
,
"the"
,
"their"
,
"theirs"
,
"them"
,
"themselves"
,
"then"
,
"thence"
,
"there"
,
"there's"
,
"thereafter"
,
"thereby"
,
"therefore"
,
"therein"
,
"theres"
,
"thereupon"
,
"these"
,
"they"
,
"they'd"
,
"they'll"
,
"they're"
,
"they've"
,
"think"
,
"third"
,
"this"
,
"thorough"
,
"thoroughly"
,
"those"
,
"though"
,
"three"
,
"through"
,
"throughout"
,
"thru"
,
"thus"
,
"to"
,
"together"
,
"too"
,
"took"
,
"toward"
,
"towards"
,
"tried"
,
"tries"
,
"truly"
,
"try"
,
"trying"
,
"twice"
,
"two"
,
"u"
,
"un"
,
"under"
,
"unfortunately"
,
"unless"
,
"unlikely"
,
"until"
,
"unto"
,
"up"
,
"upon"
,
"us"
,
"use"
,
"used"
,
"useful"
,
"uses"
,
"using"
,
"usually"
,
"uucp"
,
"v"
,
"value"
,
"various"
,
"very"
,
"via"
,
"viz"
,
"vs"
,
"w"
,
"want"
,
"wants"
,
"was"
,
"wasn't"
,
"way"
,
"we"
,
"we'd"
,
"we'll"
,
"we're"
,
"we've"
,
"welcome"
,
"well"
,
"went"
,
"were"
,
"weren't"
,
"what"
,
"what's"
,
"whatever"
,
"when"
,
"whence"
,
"whenever"
,
"where"
,
"where's"
,
"whereafter"
,
"whereas"
,
"whereby"
,
"wherein"
,
"whereupon"
,
"wherever"
,
"whether"
,
"which"
,
"while"
,
"whither"
,
"who"
,
"who's"
,
"whoever"
,
"whole"
,
"whom"
,
"whose"
,
"why"
,
"will"
,
"willing"
,
"wish"
,
"with"
,
"within"
,
"without"
,
"won't"
,
"wonder"
,
"would"
,
"wouldn't"
,
"x"
,
"y"
,
"yes"
,
"yet"
,
"you"
,
"you'd"
,
"you'll"
,
"you're"
,
"you've"
,
"your"
,
"yours"
,
"yourself"
,
"yourselves"
,
"z"
,
"zero"
]
hardStopList
=
mkStopwordsStr
stopList
src/Gargantext/Text/Terms/Stop.hs
0 → 100644
View file @
a3490841
{-|
Module : Gargantext.Text.Terms.Stop
Description : Mono Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Stop words and (how to learn it).
Main type here is String.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Terms.Stop
where
import
Numeric.Probability.Distribution
((
??
))
import
qualified
Numeric.Probability.Distribution
as
D
import
Data.Char
(
toLower
)
import
qualified
Data.List
as
DL
import
Data.Maybe
(
maybe
)
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Map.Strict
as
DM
import
Data.String
(
String
)
import
Data.Text
(
pack
,
unpack
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Text.Terms.Mono
(
words
)
import
Gargantext.Text.Metrics.Count
(
occurrencesWith
)
import
Gargantext.Text.Samples.FR
as
FR
import
Gargantext.Text.Samples.EN
as
EN
import
Gargantext.Text.Samples.DE
as
DE
import
Gargantext.Text.Samples.SP
as
SP
import
Gargantext.Text.Samples.CH
as
CH
------------------------------------------------------------------------
data
Candidate
=
Candidate
{
stop
::
Double
,
noStop
::
Double
}
deriving
(
Show
)
-- * String preparation
-- | String prepare
blanks
::
String
->
String
blanks
[]
=
[]
blanks
xs
=
[
' '
]
<>
xs
<>
[
' '
]
-- | Blocks increase the size of the word to ease computations
-- some border and unexepected effects can happen, need to be tested
blockOf
::
Int
->
String
->
String
blockOf
n
st
=
DL
.
concat
$
DL
.
take
n
$
DL
.
repeat
st
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks
::
Int
->
Int
->
String
->
[
String
]
chunks
n
m
=
DL
.
take
m
.
filter
(
not
.
all
(
==
' '
))
.
chunkAlong
(
n
+
1
)
1
.
DL
.
concat
.
DL
.
take
1000
.
DL
.
repeat
.
blanks
allChunks
::
[
Int
]
->
Int
->
String
->
[
String
]
allChunks
ns
m
st
=
DL
.
concat
$
map
(
\
n
->
chunks
n
m
st
)
ns
allChunks'
::
[
Int
]
->
Int
->
String
->
[[
String
]]
allChunks'
ns
m
st
=
map
(
\
n
->
chunks
n
m
st
)
ns
------------------------------------------------------------------------
-- * Analyze candidate
type
StringSize
=
Int
type
TotalFreq
=
Int
type
Freq
=
Int
type
Word
=
String
data
LangWord
=
LangWord
Lang
Word
type
LangProba
=
Map
Lang
Double
------------------------------------------------------------------------
detectLangs
::
String
->
LangProba
detectLangs
s
=
detect
(
wordsToBook
[
0
..
2
]
s
)
testEL
testEL
::
EventLang
testEL
=
toEventLangs
[
0
..
2
]
[
LangWord
EN
EN
.
textMining
,
LangWord
FR
FR
.
textMining
,
LangWord
DE
DE
.
textMining
,
LangWord
SP
SP
.
textMining
,
LangWord
CH
CH
.
textMining
]
detect
::
EventBook
->
EventLang
->
LangProba
detect
(
EventBook
mapFreq
_
)
el
=
DM
.
unionsWith
(
+
)
$
map
(
\
(
s
,
n
)
->
DM
.
map
(
\
eb
->
(
fromIntegral
n
)
*
peb
s
eb
)
el
)
$
filter
(
\
x
->
fst
x
/=
" "
)
$
DM
.
toList
mapFreq
------------------------------------------------------------------------
-- | TODO: monoids
type
EventLang
=
Map
Lang
EventBook
toEventLangs
::
[
Int
]
->
[
LangWord
]
->
EventLang
toEventLangs
ns
=
foldl'
(
opLang
(
+
))
(
emptyEventLang
ns
)
.
map
(
toLang
ns
)
emptyEventLang
::
[
Int
]
->
EventLang
emptyEventLang
ns
=
toLang
ns
(
LangWord
FR
""
)
toLang
::
[
Int
]
->
LangWord
->
EventLang
toLang
ns
(
LangWord
l
txt
)
=
DM
.
fromList
[(
l
,
wordsToBook
ns
txt
)]
opLang
::
(
Freq
->
Freq
->
Freq
)
->
EventLang
->
EventLang
->
EventLang
opLang
f
=
DM
.
unionWith
(
op
f
)
------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)
peb
::
String
->
EventBook
->
Double
peb
st
(
EventBook
mapFreq
mapN
)
=
(
fromIntegral
a
)
/
(
fromIntegral
b
)
where
a
=
maybe
0
identity
$
DM
.
lookup
st
mapFreq
b
=
maybe
1
identity
$
DM
.
lookup
(
length
st
)
mapN
data
EventBook
=
EventBook
{
events_freq
::
Map
String
Freq
,
events_n
::
Map
StringSize
TotalFreq
}
deriving
(
Show
)
emptyEventBook
::
[
Int
]
->
EventBook
emptyEventBook
ns
=
wordToBook
ns
" "
wordsToBook
::
[
Int
]
->
String
->
EventBook
wordsToBook
ns
txt
=
foldl'
(
op
(
+
))
(
emptyEventBook
ns
)
eventsBook
where
ws
=
map
unpack
$
words
$
pack
txt
eventsBook
=
map
(
wordToBook
ns
)
ws
wordToBook
::
[
Int
]
->
Word
->
EventBook
wordToBook
ns
txt
=
EventBook
ef
en
where
chks
=
allChunks'
ns
10
txt
en
=
DM
.
fromList
$
map
(
\
(
n
,
ns'
)
->
(
n
,
length
ns'
))
$
zip
ns
chks
ef
=
foldl'
DM
.
union
DM
.
empty
$
map
(
occurrencesWith
identity
)
chks
op
::
(
Freq
->
Freq
->
Freq
)
->
EventBook
->
EventBook
->
EventBook
op
f
(
EventBook
ef1
en1
)
(
EventBook
ef2
en2
)
=
EventBook
(
DM
.
unionWith
f
ef1
ef2
)
(
DM
.
unionWith
f
en1
en2
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- * Make the distributions
makeDist
::
[
String
]
->
D
.
T
Double
String
makeDist
=
D
.
uniform
.
DL
.
concat
.
map
(
allChunks
[
0
,
2
]
10
)
stopDist
::
D
.
T
Double
String
stopDist
=
makeDist
stopList
candDist
::
D
.
T
Double
String
candDist
=
makeDist
candList
------------------------------------------------------------------------
sumProba
::
Num
a
=>
D
.
T
a
String
->
[
Char
]
->
a
sumProba
ds
x
=
sum
$
map
((
~?
)
ds
)
$
allChunks
[
0
,
2
]
10
$
map
toLower
x
-- | Get probability according a distribution
(
~?
)
::
(
Num
prob
,
Eq
a
)
=>
D
.
T
prob
a
->
a
->
prob
(
~?
)
ds
x
=
(
==
x
)
??
ds
------------------------------------------------------------------------
candidate
::
[
Char
]
->
Candidate
candidate
x
=
Candidate
(
sumProba
stopDist
x
)
(
sumProba
candDist
x
)
------------------------------------------------------------------------
candList
::
[
String
]
candList
=
[
"france"
,
"alexandre"
,
"mael"
,
"constitution"
,
"etats-unis"
,
"associes"
,
"car"
,
"train"
,
"spam"
]
stopList
::
[
String
]
stopList
=
map
show
([
0
..
9
]
::
[
Int
])
<>
[
"a"
,
"a's"
,
"able"
,
"about"
,
"above"
,
"apply"
,
"according"
,
"accordingly"
,
"across"
,
"actually"
,
"after"
,
"afterwards"
,
"again"
,
"against"
,
"ain't"
,
"all"
,
"allow"
,
"allows"
,
"almost"
,
"alone"
,
"along"
,
"involves"
,
"already"
,
"also"
,
"although"
,
"always"
,
"am"
,
"among"
,
"amongst"
,
"an"
,
"and"
,
"another"
,
"any"
,
"anybody"
,
"anyhow"
,
"anyone"
,
"anything"
,
"anyway"
,
"anyways"
,
"anywhere"
,
"analyze"
,
"apart"
,
"appear"
,
"appreciate"
,
"appropriate"
,
"are"
,
"aren't"
,
"around"
,
"as"
,
"aside"
,
"ask"
,
"asking"
,
"associated"
,
"at"
,
"available"
,
"away"
,
"awfully"
,
"based"
,
"b"
,
"be"
,
"became"
,
"because"
,
"become"
,
"becomes"
,
"becoming"
,
"been"
,
"before"
,
"beforehand"
,
"behind"
,
"being"
,
"believe"
,
"below"
,
"beside"
,
"besides"
,
"best"
,
"better"
,
"between"
,
"beyond"
,
"both"
,
"brief"
,
"but"
,
"by"
,
"c"
,
"c'mon"
,
"c's"
,
"came"
,
"can"
,
"can't"
,
"cannot"
,
"cant"
,
"cause"
,
"causes"
,
"certain"
,
"certainly"
,
"changes"
,
"clearly"
,
"co"
,
"com"
,
"come"
,
"comes"
,
"common"
,
"concerning"
,
"consequently"
,
"consider"
,
"considering"
,
"contain"
,
"containing"
,
"contains"
,
"corresponding"
,
"could"
,
"couldn't"
,
"course"
,
"currently"
,
"d"
,
"definitely"
,
"described"
,
"detects"
,
"detecting"
,
"despite"
,
"did"
,
"didn't"
,
"different"
,
"do"
,
"does"
,
"doesn't"
,
"doing"
,
"don't"
,
"done"
,
"down"
,
"downwards"
,
"during"
,
"e"
,
"each"
,
"edu"
,
"eg"
,
"eight"
,
"either"
,
"else"
,
"elsewhere"
,
"enough"
,
"entirely"
,
"especially"
,
"et"
,
"etc"
,
"even"
,
"ever"
,
"every"
,
"everybody"
,
"everyone"
,
"everything"
,
"everywhere"
,
"ex"
,
"exactly"
,
"example"
,
"except"
,
"f"
,
"far"
,
"few"
,
"find"
,
"fifth"
,
"first"
,
"five"
,
"followed"
,
"following"
,
"follows"
,
"for"
,
"former"
,
"formerly"
,
"forth"
,
"four"
,
"from"
,
"further"
,
"furthermore"
,
"g"
,
"get"
,
"gets"
,
"getting"
,
"given"
,
"gives"
,
"go"
,
"goes"
,
"going"
,
"gone"
,
"got"
,
"gotten"
,
"greetings"
,
"h"
,
"had"
,
"hadn't"
,
"happens"
,
"hardly"
,
"has"
,
"hasn't"
,
"have"
,
"haven't"
,
"having"
,
"he"
,
"he's"
,
"hello"
,
"help"
,
"hence"
,
"her"
,
"here"
,
"here's"
,
"hereafter"
,
"hereby"
,
"herein"
,
"hereupon"
,
"hers"
,
"herself"
,
"hi"
,
"him"
,
"himself"
,
"his"
,
"hither"
,
"hopefully"
,
"how"
,
"howbeit"
,
"however"
,
"i"
,
"i'd"
,
"identify"
,
"i'll"
,
"i'm"
,
"i've"
,
"ie"
,
"if"
,
"ignored"
,
"immediate"
,
"in"
,
"inasmuch"
,
"inc"
,
"indeed"
,
"indicate"
,
"indicated"
,
"indicates"
,
"inner"
,
"insofar"
,
"instead"
,
"into"
,
"inward"
,
"is"
,
"isn't"
,
"it"
,
"it'd"
,
"it'll"
,
"it's"
,
"its"
,
"itself"
,
"j"
,
"just"
,
"k"
,
"keep"
,
"keeps"
,
"kept"
,
"know"
,
"known"
,
"knows"
,
"l"
,
"last"
,
"lately"
,
"later"
,
"latter"
,
"latterly"
,
"least"
,
"less"
,
"lest"
,
"let"
,
"let's"
,
"like"
,
"liked"
,
"likely"
,
"little"
,
"look"
,
"looking"
,
"looks"
,
"ltd"
,
"m"
,
"mainly"
,
"many"
,
"may"
,
"maybe"
,
"me"
,
"mean"
,
"meanwhile"
,
"merely"
,
"might"
,
"more"
,
"moreover"
,
"most"
,
"mostly"
,
"much"
,
"must"
,
"my"
,
"myself"
,
"n"
,
"name"
,
"namely"
,
"nd"
,
"near"
,
"nearly"
,
"necessary"
,
"need"
,
"needs"
,
"neither"
,
"never"
,
"nevertheless"
,
"new"
,
"next"
,
"nine"
,
"no"
,
"nobody"
,
"non"
,
"none"
,
"noone"
,
"nor"
,
"normally"
,
"not"
,
"nothing"
,
"novel"
,
"now"
,
"nowhere"
,
"o"
,
"obviously"
,
"of"
,
"off"
,
"often"
,
"oh"
,
"ok"
,
"okay"
,
"old"
,
"on"
,
"once"
,
"one"
,
"ones"
,
"only"
,
"onto"
,
"or"
,
"other"
,
"others"
,
"otherwise"
,
"ought"
,
"our"
,
"ours"
,
"ourselves"
,
"out"
,
"outside"
,
"over"
,
"overall"
,
"own"
,
"p"
,
"particular"
,
"particularly"
,
"per"
,
"perhaps"
,
"placed"
,
"please"
,
"plus"
,
"possible"
,
"presents"
,
"presumably"
,
"probably"
,
"provides"
,
"q"
,
"que"
,
"quite"
,
"qv"
,
"r"
,
"rather"
,
"rd"
,
"re"
,
"really"
,
"reasonably"
,
"regarding"
,
"regardless"
,
"regards"
,
"relatively"
,
"respectively"
,
"right"
,
"s"
,
"said"
,
"same"
,
"saw"
,
"say"
,
"saying"
,
"says"
,
"second"
,
"secondly"
,
"see"
,
"seeing"
,
"seem"
,
"seemed"
,
"seeming"
,
"seems"
,
"seen"
,
"self"
,
"selves"
,
"sensible"
,
"sent"
,
"serious"
,
"seriously"
,
"seven"
,
"several"
,
"shall"
,
"she"
,
"should"
,
"shouldn't"
,
"since"
,
"six"
,
"so"
,
"some"
,
"somebody"
,
"somehow"
,
"someone"
,
"something"
,
"sometime"
,
"sometimes"
,
"somewhat"
,
"somewhere"
,
"soon"
,
"sorry"
,
"specified"
,
"specify"
,
"specifying"
,
"still"
,
"sub"
,
"such"
,
"sup"
,
"sure"
,
"t"
,
"t's"
,
"take"
,
"taken"
,
"tell"
,
"tends"
,
"th"
,
"than"
,
"thank"
,
"thanks"
,
"thanx"
,
"that"
,
"that's"
,
"thats"
,
"the"
,
"their"
,
"theirs"
,
"them"
,
"themselves"
,
"then"
,
"thence"
,
"there"
,
"there's"
,
"thereafter"
,
"thereby"
,
"therefore"
,
"therein"
,
"theres"
,
"thereupon"
,
"these"
,
"they"
,
"they'd"
,
"they'll"
,
"they're"
,
"they've"
,
"think"
,
"third"
,
"this"
,
"thorough"
,
"thoroughly"
,
"those"
,
"though"
,
"three"
,
"through"
,
"throughout"
,
"thru"
,
"thus"
,
"to"
,
"together"
,
"too"
,
"took"
,
"toward"
,
"towards"
,
"tried"
,
"tries"
,
"truly"
,
"try"
,
"trying"
,
"twice"
,
"two"
,
"u"
,
"un"
,
"under"
,
"unfortunately"
,
"unless"
,
"unlikely"
,
"until"
,
"unto"
,
"up"
,
"upon"
,
"us"
,
"use"
,
"used"
,
"useful"
,
"uses"
,
"using"
,
"usually"
,
"uucp"
,
"v"
,
"value"
,
"various"
,
"very"
,
"via"
,
"viz"
,
"vs"
,
"w"
,
"want"
,
"wants"
,
"was"
,
"wasn't"
,
"way"
,
"we"
,
"we'd"
,
"we'll"
,
"we're"
,
"we've"
,
"welcome"
,
"well"
,
"went"
,
"were"
,
"weren't"
,
"what"
,
"what's"
,
"whatever"
,
"when"
,
"whence"
,
"whenever"
,
"where"
,
"where's"
,
"whereafter"
,
"whereas"
,
"whereby"
,
"wherein"
,
"whereupon"
,
"wherever"
,
"whether"
,
"which"
,
"while"
,
"whither"
,
"who"
,
"who's"
,
"whoever"
,
"whole"
,
"whom"
,
"whose"
,
"why"
,
"will"
,
"willing"
,
"wish"
,
"with"
,
"within"
,
"without"
,
"won't"
,
"wonder"
,
"would"
,
"wouldn't"
,
"x"
,
"y"
,
"yes"
,
"yet"
,
"you"
,
"you'd"
,
"you'll"
,
"you're"
,
"you've"
,
"your"
,
"yours"
,
"yourself"
,
"yourselves"
,
"z"
,
"zero"
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment