Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
147
Issues
147
List
Board
Labels
Milestones
Merge Requests
10
Merge Requests
10
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
8e40d912
Commit
8e40d912
authored
May 29, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into features/acp-init-graph
parents
5863fee7
bd4e8f10
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
106 additions
and
62 deletions
+106
-62
Eleve.hs
src/Gargantext/Text/Eleve.hs
+100
-57
Terms.hs
src/Gargantext/Text/Terms.hs
+6
-5
No files found.
src/Gargantext/Text/Eleve.hs
View file @
8e40d912
...
...
@@ -10,8 +10,9 @@ Portability : POSIX
# Implementation of Unsupervized Word Segmentation
References:
- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
[git repo](https://github.com/kodexlab/eleve.git)
- Python implementation (Korantin August, Emmanuel Navarro):
[EleVe](https://github.com/kodexlab/eleve.git)
- Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
...
...
@@ -19,9 +20,8 @@ References:
, pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation:
- The node count is correct; TODO AD add tests to keep track of it
- NP fix normalization
- NP extract longer ngrams (see paper above, viterbi algo can be used)
- TODO fix normalization
- TODO extract longer ngrams (see paper above, viterbi algo can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f
- AD: Real ngrams extraction test
...
...
@@ -31,7 +31,6 @@ Notes for current implementation:
$ catMaybes
$ Gargantext.map _hyperdataDocument_abstract docs
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
...
...
@@ -40,11 +39,11 @@ Notes for current implementation:
module
Gargantext.Text.Eleve
where
import
Debug.Trace
(
trace
)
--
import Debug.Trace (trace)
-- import Debug.SimpleReflect
import
Control.Lens
(
Lens
,
Lens
'
,
ASetter
,
Getting
,
(
^.
),
(
^?
),
(
&
),
(
.~
),
(
%~
),
view
,
makeLenses
,
_Just
)
import
Control.Monad
(
foldM
)
import
Control.Lens
(
Lens
'
,
Getting
,
(
^.
),
(
^?
),
(
%~
),
view
,
makeLenses
,
_Just
)
import
Control.Monad
(
foldM
,
mapM_
,
forM_
)
import
Data.Ord
(
Ord
)
import
qualified
Data.List
as
L
import
Data.Monoid
...
...
@@ -56,7 +55,7 @@ import qualified Data.Map as Map
import
Gargantext.Prelude
hiding
(
cs
)
import
qualified
Data.Tree
as
Tree
import
Data.Tree
(
Tree
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
,
String
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
)
------------------------------------------------------------------------
-- | Example and tests for development
...
...
@@ -76,32 +75,6 @@ type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy
::
ModEntropy
e
(
I
e
)
e
setNormEntropy
f
e
=
I
e
(
f
e
)
e
-- (panic "setNormEntropy")
testEleve
n
example
=
do
let
ex
=
toToken
n
<$>
example
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
ex
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
{-
P.putStrLn $ Tree.drawTree
$ fmap show
$ toTree (NonTerminal "") nt'
--}
pure
$
map
unToken
$
split
info_entropy
nt'
$
L
.
concat
ex
-- NP: here we use the entropy to split
-- instead we should use either:
-- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed.
-- | TODO real data is a list of tokenized sentences
example0
=
[
T
.
words
"New York is New York and New York"
]
example1
=
[
T
.
words
"to be or not to be"
]
example2
=
[
T
.
words
"to be or not to be or"
]
example3
=
example0
<>
example0
-- > TEST: Should not have York New in the trie
example4
=
map
(
T
.
pack
.
pure
)
(
"abcdefabcdegabcde"
::
P
.
String
)
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Token
=
NonTerminal
Text
|
Terminal
deriving
(
Ord
,
Eq
,
Show
)
...
...
@@ -125,6 +98,7 @@ data Trie k e
|
Leaf
{
_node_count
::
Int
}
deriving
(
Show
)
makeLenses
''
T
rie
insertTries
::
Ord
k
=>
[[
k
]]
->
Trie
k
()
insertTries
=
L
.
foldr
insertTrie
emptyTrie
...
...
@@ -171,13 +145,16 @@ normalizeEntropy :: (Fractional e, Floating e, Show e)
normalizeEntropy
inE
modE
=
go
$
modE
identity
where
go
_
(
Leaf
c
)
=
Leaf
c
go
f
(
Node
c
i
children
)
|
not
(
Map
.
null
children
)
=
-- trace (show $ L.length es) $
Node
c
(
f
i
)
$
go
(
modE
$
normalizeLevel
m
v
)
<$>
children
where
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
m
=
mean
es
v
=
deviation
es
go
f
(
Node
c
i
children
)
|
Map
.
null
children
=
panic
"normalizeEntropy: impossible"
|
otherwise
=
-- trace (show $ L.length es) $
Node
c
(
f
i
)
$
go
(
modE
$
normalizeLevel
m
v
)
<$>
children
where
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
m
=
mean
es
v
=
deviation
es
------------------------------------------------------------------------
normalizeLevel
::
(
Fractional
e
,
Floating
e
,
Show
e
)
...
...
@@ -195,6 +172,13 @@ nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren
(
Node
_
_
cs
)
=
cs
nodeChildren
(
Leaf
_
)
=
Map
.
empty
nodeChild
::
Ord
k
=>
k
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
nodeChild
k
(
Node
_
_
cs
)
=
Map
.
lookup
k
cs
nodeChild
_
(
Leaf
_
)
=
Nothing
findTrie
::
Ord
k
=>
[
k
]
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
findTrie
ks
t
=
foldM
(
flip
nodeChild
)
t
ks
levels
::
Trie
k
e
->
[[
Trie
k
e
]]
levels
=
L
.
takeWhile
(
not
.
L
.
null
)
.
L
.
iterate
(
L
.
concatMap
subForest
)
.
pure
where
...
...
@@ -229,29 +213,88 @@ split inE t0 = go t0 []
consRev
[]
xss
=
xss
consRev
xs
xss
=
reverse
xs
:
xss
go
_
pref
[]
=
[
reverse
pref
]
go
_
pref
[]
=
[
reverse
pref
]
go
_
pref
(
Terminal
:
_
)
=
[
reverse
pref
]
go
t
pref
(
x
:
xs
)
=
case
nodeChild
x
t
of
Nothing
->
consRev
pref
$
go
t0
[
x
]
xs
Just
xt
->
case
nodeChild
x
t0
of
Nothing
->
panic
"TODO"
Nothing
->
panic
$
"TODO"
Just
xt0
->
let
et
=
ne
(
panic
"t"
)
t
-- ^ entropy of the current prefix
ext0
=
ne
(
panic
"xt0"
)
xt0
-- ^ entropy of [x]
ext
=
ne
0
xt
-- ^ entropy of the current prefix plus x
in
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
case
et
{-+ ext0-}
<
ext
of
-- NP: here we must take ext0 in account however currently it
-- makes it worse.
-- For instance it currently works well to 2-grams but not more.
-- PASS: test 4 example1
-- FAIL: test 4 example2
True
->
go
xt
(
x
:
pref
)
xs
False
->
consRev
pref
$
go
xt0
[
x
]
xs
if
ext
+
ext0
>
et
then
go
xt
(
x
:
pref
)
xs
else
consRev
pref
$
go
xt0
[
x
]
xs
nodeChild
::
Ord
k
=>
k
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
nodeChild
k
(
Node
_
_
cs
)
=
Map
.
lookup
k
cs
nodeChild
_
(
Leaf
_
)
=
Nothing
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
------------------------------------------------------------------------
------------------------------------------------------------------------
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
mainEleve
::
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleve
n
input
=
map
unToken
.
split
identity
(
t
::
Trie
Token
Double
)
<$>
inp
where
inp
=
toToken
(
n
-
1
)
<$>
input
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
inp
-- NP: here we use the entropy to split
-- instead we should use either:
-- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed.
testEleve
::
Bool
->
Int
->
[
Text
]
->
IO
Bool
testEleve
debug
n
output
=
do
let
out
=
T
.
words
<$>
output
expected
=
fmap
(
T
.
splitOn
"-"
)
<$>
out
input
=
(
T
.
splitOn
"-"
=<<
)
<$>
out
inp
=
toToken
(
n
-
1
)
<$>
input
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
inp
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
pss
=
[
(
ps
,
findTrie
ps
t
^?
_Just
.
node_entropy
)
-- . info_entropy)
|
ps
<-
L
.
nub
$
[
c
|
m
<-
[
1
..
n
]
,
cs
<-
chunkAlong
m
1
<$>
inp
,
c
<-
cs
]
]
res
=
map
unToken
.
split
identity
t
<$>
inp
when
debug
$
do
P
.
putStrLn
(
show
input
)
mapM_
(
P
.
putStrLn
.
show
)
pss
P
.
putStrLn
$
Tree
.
drawTree
$
fmap
show
$
toTree
(
NonTerminal
""
)
nt'
P
.
putStrLn
$
show
res
pure
$
expected
==
res
-- | TODO real data is a list of tokenized sentences
example0
,
example1
,
example2
,
example3
,
example4
,
example5
::
[
Text
]
example0
=
[
"New-York is New-York and New-York"
]
example1
=
[
"to-be or not to-be"
]
example2
=
[
"to-be-or not to-be-or NOT to-be and"
]
example3
=
example0
<>
example0
-- > TEST: Should not have York New in the trie
example4
=
[
"a-b-c-d e a-b-c-d f"
]
example5
=
[
"a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"
]
runTests
::
IO
()
runTests
=
forM_
[(
"example0"
,
2
,
example0
)
,(
"example1"
,
2
,
example1
)
,(
"example2"
,
3
,
example2
)
,(
"example3"
,
2
,
example3
)
,(
"example4"
,
4
,
example4
)
,(
"example5"
,
5
,
example5
)
]
(
\
(
name
,
n
,
ex
)
->
do
b
<-
testEleve
False
n
ex
P
.
putStrLn
$
name
<>
" "
<>
show
n
<>
" "
<>
if
b
then
"PASS"
else
"FAIL"
)
src/Gargantext/Text/Terms.hs
View file @
8e40d912
...
...
@@ -49,7 +49,7 @@ import qualified Data.List as List
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Eleve
(
test
Eleve
)
import
Gargantext.Text.Eleve
(
main
Eleve
)
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
...
...
@@ -89,11 +89,12 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- TODO: remove IO
-- TODO: BlockText
extractTermsUnsupervised
::
Int
->
Text
->
IO
[[
Text
]]
extractTermsUnsupervised
::
Int
->
Text
->
[[
Text
]]
extractTermsUnsupervised
n
=
fmap
List
.
nub
.
fmap
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
testEleve
n
List
.
nub
.
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
List
.
concat
.
mainEleve
n
.
map
(
map
Text
.
toLower
)
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment