Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
10
Merge Requests
10
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
704fe86f
Commit
704fe86f
authored
Sep 24, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Lang detect.
parent
652975a0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
105 additions
and
10 deletions
+105
-10
Core.hs
src/Gargantext/Core.hs
+2
-0
Prelude.hs
src/Gargantext/Prelude.hs
+1
-1
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+3
-0
FrequentItemSet.hs
src/Gargantext/Text/Metrics/FrequentItemSet.hs
+3
-3
Stop.hs
src/Gargantext/Text/Terms/Stop.hs
+96
-6
No files found.
src/Gargantext/Core.hs
View file @
704fe86f
...
...
@@ -12,6 +12,7 @@ Portability : POSIX
module
Gargantext.Core
where
import
Gargantext.Prelude
------------------------------------------------------------------------
-- | Language of a Text
-- For simplicity, we suppose text has an homogenous language
...
...
@@ -26,3 +27,4 @@ module Gargantext.Core
--
-- ... add your language and help us to implement it (:
data
Lang
=
EN
|
FR
deriving
(
Show
,
Eq
,
Ord
)
src/Gargantext/Prelude.hs
View file @
704fe86f
...
...
@@ -46,7 +46,7 @@ import Protolude ( Bool(True, False), Int, Int64, Double, Integer
,
abs
,
min
,
max
,
maximum
,
minimum
,
return
,
snd
,
truncate
,
(
+
),
(
*
),
(
/
),
(
-
),
(
.
),
(
$
),
(
&
),
(
**
),
(
^
),
(
<
),
(
>
),
log
,
Eq
,
(
==
),
(
>=
),
(
<=
),
(
<>
),
(
/=
)
,
(
&&
),
(
||
),
not
,
any
,
(
&&
),
(
||
),
not
,
any
,
all
,
fst
,
snd
,
toS
,
elem
,
die
,
mod
,
div
,
const
,
either
,
curry
,
uncurry
,
repeat
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
704fe86f
...
...
@@ -144,6 +144,9 @@ occurrences = occurrencesOn _terms_stem
occurrencesOn
::
(
Ord
a
,
Ord
b
)
=>
(
a
->
b
)
->
[
a
]
->
Map
b
(
Map
a
Int
)
occurrencesOn
f
=
foldl'
(
\
m
a
->
insertWith
(
unionWith
(
+
))
(
f
a
)
(
singleton
a
1
)
m
)
empty
occurrencesWith
::
(
Foldable
list
,
Ord
k
,
Num
a
)
=>
(
b
->
k
)
->
list
b
->
Map
k
a
occurrencesWith
f
xs
=
foldl'
(
\
x
y
->
insertWith
(
+
)
(
f
y
)
1
x
)
empty
xs
-- TODO add groups and filter stops
sumOcc
::
Ord
a
=>
[
Occ
a
]
->
Occ
a
...
...
src/Gargantext/Text/Metrics/FrequentItemSet.hs
View file @
704fe86f
...
...
@@ -16,7 +16,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
module
Gargantext.Text.Metrics.FrequentItemSet
(
Fis
,
Size
(
..
)
,
occ_hlcm
,
cooc_hlcm
,
all
,
between
,
all
Fis
,
between
,
fisWithSize
,
fisWith
,
fisWithSizePoly
...
...
@@ -51,8 +51,8 @@ occ_hlcm = fisWithSize (Point 1)
cooc_hlcm
::
Frequency
->
[[
Item
]]
->
[
Fis
]
cooc_hlcm
=
fisWithSize
(
Point
2
)
all
::
Frequency
->
[[
Item
]]
->
[
Fis
]
all
=
fisWith
Nothing
all
Fis
::
Frequency
->
[[
Item
]]
->
[
Fis
]
all
Fis
=
fisWith
Nothing
------------------------------------------------------------------------
between
::
(
Int
,
Int
)
->
Frequency
->
[[
Item
]]
->
[
Fis
]
...
...
src/Gargantext/Text/Terms/Stop.hs
View file @
704fe86f
...
...
@@ -21,14 +21,23 @@ module Gargantext.Text.Terms.Stop
import
Numeric.Probability.Distribution
((
??
))
import
qualified
Numeric.Probability.Distribution
as
D
import
Data.String
(
String
)
import
Data.Char
(
toLower
)
import
qualified
Data.List
as
DL
-- import qualified Data.Map as M
import
Data.Maybe
(
maybe
)
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Map.Strict
as
DM
import
Data.String
(
String
)
import
Data.Text
(
pack
,
unpack
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Text.Terms.Mono
(
words
)
import
Gargantext.Text.Metrics.Count
(
occurrencesWith
)
------------------------------------------------------------------------
data
Candidate
=
Candidate
{
stop
::
Double
,
noStop
::
Double
}
deriving
(
Show
)
...
...
@@ -48,11 +57,92 @@ blockOf n st = DL.concat $ DL.take n $ DL.repeat st
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks
::
Int
->
Int
->
String
->
[
String
]
chunks
n
m
=
DL
.
take
m
.
chunkAlong
(
n
+
1
)
1
.
DL
.
concat
.
DL
.
take
1000
.
DL
.
repeat
.
blanks
chunks
n
m
=
DL
.
take
m
.
filter
(
not
.
all
(
==
' '
))
.
chunkAlong
(
n
+
1
)
1
.
DL
.
concat
.
DL
.
take
1000
.
DL
.
repeat
.
blanks
allChunks
::
[
Int
]
->
Int
->
String
->
[
String
]
allChunks
ns
m
st
=
DL
.
concat
$
map
(
\
n
->
chunks
n
m
st
)
ns
allChunks'
::
[
Int
]
->
Int
->
String
->
[[
String
]]
allChunks'
ns
m
st
=
map
(
\
n
->
chunks
n
m
st
)
ns
------------------------------------------------------------------------
-- * Analyze candidate
type
StringSize
=
Int
type
TotalFreq
=
Int
type
Freq
=
Int
type
Word
=
String
data
LangWord
=
LangWord
Lang
Word
type
LangProba
=
Map
Lang
Double
------------------------------------------------------------------------
estimeTest
::
String
->
LangProba
estimeTest
s
=
estime
(
wordsToBook
[
0
..
2
]
s
)
testEL
testEL
::
EventLang
testEL
=
toEventLangs
[
0
,
1
,
2
]
[
LangWord
EN
"Lovely day. This day."
,
LangWord
FR
"Belle journée, j'y vais."
,
LangWord
EN
"Hello Sir, how are you doing? I am fine thank you, good bye"
,
LangWord
FR
"Bonjour Monsieur, comment allez-vous? Je vais bien merci."
]
estime
::
EventBook
->
EventLang
->
LangProba
estime
(
EventBook
mapFreq
_
)
el
=
DM
.
unionsWith
(
+
)
$
map
(
\
(
s
,
n
)
->
DM
.
map
(
\
eb
->
(
fromIntegral
n
)
*
peb
s
eb
)
el
)
$
filter
(
\
x
->
fst
x
/=
" "
)
$
DM
.
toList
mapFreq
------------------------------------------------------------------------
-- | TODO: monoids
type
EventLang
=
Map
Lang
EventBook
toEventLangs
::
[
Int
]
->
[
LangWord
]
->
EventLang
toEventLangs
ns
=
foldl'
(
opLang
(
+
))
(
emptyEventLang
ns
)
.
map
(
toLang
ns
)
emptyEventLang
::
[
Int
]
->
EventLang
emptyEventLang
ns
=
toLang
ns
(
LangWord
FR
""
)
toLang
::
[
Int
]
->
LangWord
->
EventLang
toLang
ns
(
LangWord
l
txt
)
=
DM
.
fromList
[(
l
,
wordsToBook
ns
txt
)]
opLang
::
(
Freq
->
Freq
->
Freq
)
->
EventLang
->
EventLang
->
EventLang
opLang
f
=
DM
.
unionWith
(
op
f
)
------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)
peb
::
String
->
EventBook
->
Double
peb
st
(
EventBook
mapFreq
mapN
)
=
(
fromIntegral
a
)
/
(
fromIntegral
b
)
where
a
=
maybe
0
identity
$
DM
.
lookup
st
mapFreq
b
=
maybe
1
identity
$
DM
.
lookup
(
length
st
)
mapN
data
EventBook
=
EventBook
{
events_freq
::
Map
String
Freq
,
events_n
::
Map
StringSize
TotalFreq
}
deriving
(
Show
)
emptyEventBook
::
[
Int
]
->
EventBook
emptyEventBook
ns
=
wordToBook
ns
" "
wordsToBook
::
[
Int
]
->
String
->
EventBook
wordsToBook
ns
txt
=
foldl'
(
op
(
+
))
(
emptyEventBook
ns
)
eventsBook
where
ws
=
map
unpack
$
words
$
pack
txt
eventsBook
=
map
(
wordToBook
ns
)
ws
wordToBook
::
[
Int
]
->
Word
->
EventBook
wordToBook
ns
txt
=
EventBook
ef
en
where
chks
=
allChunks'
ns
10
txt
en
=
DM
.
fromList
$
map
(
\
(
n
,
ns
)
->
(
n
,
length
ns
))
$
zip
ns
chks
ef
=
foldl'
DM
.
union
DM
.
empty
$
map
(
occurrencesWith
identity
)
chks
op
::
(
Freq
->
Freq
->
Freq
)
->
EventBook
->
EventBook
->
EventBook
op
f
(
EventBook
ef1
en1
)
(
EventBook
ef2
en2
)
=
EventBook
(
DM
.
unionWith
f
ef1
ef2
)
(
DM
.
unionWith
f
en1
en2
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- * Make the distributions
makeDist
::
[
String
]
->
D
.
T
Double
String
...
...
@@ -65,7 +155,6 @@ candDist :: D.T Double String
candDist
=
makeDist
candList
------------------------------------------------------------------------
-- * Analyze candidate
sumProba
::
Num
a
=>
D
.
T
a
String
->
[
Char
]
->
a
sumProba
ds
x
=
sum
$
map
((
~?
)
ds
)
$
allChunks
[
0
,
2
]
10
$
map
toLower
x
...
...
@@ -78,7 +167,8 @@ candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
------------------------------------------------------------------------
candList
::
[
String
]
candList
=
[
"france"
,
"alexandre"
,
"mael"
,
"constitution"
,
"delanoe"
,
"etats-unis"
,
"associes"
,
"car"
,
"train"
,
"spam"
]
candList
=
[
"france"
,
"alexandre"
,
"mael"
,
"constitution"
,
"etats-unis"
,
"associes"
,
"car"
,
"train"
,
"spam"
]
stopList
::
[
String
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment