Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
195
Issues
195
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
ec5bb213
Commit
ec5bb213
authored
May 26, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Ngrams extraction unsupervized (doc, type, function).
parent
d1b3f47c
Pipeline
#413
failed with stage
Changes
3
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
98 additions
and
46 deletions
+98
-46
Node.hs
src/Gargantext/Database/Schema/Node.hs
+0
-2
Eleve.hs
src/Gargantext/Text/Eleve.hs
+72
-44
Terms.hs
src/Gargantext/Text/Terms.hs
+26
-0
No files found.
src/Gargantext/Database/Schema/Node.hs
View file @
ec5bb213
...
@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire
...
@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire
mk
n
h
p
u
=
insertNodesR
[
nodeAnnuaireW
n
h
p
u
]
mk
n
h
p
u
=
insertNodesR
[
nodeAnnuaireW
n
h
p
u
]
getOrMkList
::
HasNodeError
err
=>
ParentId
->
UserId
->
Cmd
err
ListId
getOrMkList
::
HasNodeError
err
=>
ParentId
->
UserId
->
Cmd
err
ListId
getOrMkList
pId
uId
=
getOrMkList
pId
uId
=
maybe
(
mkList'
pId
uId
)
(
pure
.
view
node_id
)
.
headMay
=<<
getListsWithParentId
pId
maybe
(
mkList'
pId
uId
)
(
pure
.
view
node_id
)
.
headMay
=<<
getListsWithParentId
pId
...
...
src/Gargantext/Text/Eleve.hs
View file @
ec5bb213
...
@@ -3,12 +3,31 @@
...
@@ -3,12 +3,31 @@
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TemplateHaskell #-}
{-
{-
# Implementation of Unsupervized Word Segmentation
Implementation of EleVe Python version of papers:
References:
- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
[git repo](https://github.com/kodexlab/eleve.git)
- Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
the 50th Annual Meeting of the Association for Computational Linguistics
, pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation:
- The node count is correct; TODO add tests to keep track of it
- NP fix normalization
- NP extract longer ngrams (see paper above, viterbi algo can be used)
- TODO TEST: prop (Node c _e f) = c == Map.size f
- AD: Real ngrams extraction test
from Gargantext.Text.Terms import extractTermsUnsupervised
docs <- runCmdRepl $ selectDocs 1004
extractTermsUnsupervised 3 $ DT.intercalate " "
$ catMaybes
$ Gargantext.map _hyperdataDocument_abstract docs
NP:
* The node count is correct and we should not regress on this front.
-}
-}
module
Gargantext.Text.Eleve
where
module
Gargantext.Text.Eleve
where
...
@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree
...
@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree
import
Data.Tree
(
Tree
)
import
Data.Tree
(
Tree
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
,
String
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
,
String
)
-- prop (Node c _e f) = c == Map.size f
------------------------------------------------------------------------
-- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
data
I
e
=
I
data
I
e
=
I
{
_info_entropy
::
e
{
_info_entropy
::
e
,
_info_norm_entropy
::
e
,
_info_norm_entropy
::
e
...
@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o
...
@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy
::
ModEntropy
e
(
I
e
)
e
setNormEntropy
::
ModEntropy
e
(
I
e
)
e
setNormEntropy
f
e
=
I
e
(
f
e
)
e
-- (panic "setNormEntropy")
setNormEntropy
f
e
=
I
e
(
f
e
)
e
-- (panic "setNormEntropy")
test
n
example
=
do
test
Eleve
n
example
=
do
let
let
ex
=
toToken
n
example
ex
=
toToken
n
<$>
example
t
=
buildTrie
$
chunkAlong
n
1
ex
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
ex
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
{-
P.putStrLn $ Tree.drawTree
P.putStrLn $ Tree.drawTree
$ fmap show
$ fmap show
$ toTree (NonTerminal "") nt'
$ toTree (NonTerminal "") nt'
--}
pure
$
map
unToken
$
split
info_entropy
nt'
ex
pure
$
map
unToken
$
split
info_entropy
nt'
$
L
.
concat
ex
-- NP: here we use the entropy to split
-- NP: here we use the entropy to split
-- instead we should use either:
-- instead we should use either:
-- info_norm_entropy or info_norm_entropy'
-- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed.
-- However they should first be fixed.
-- | TODO real data is a list of tokenized sentences
example0
=
[
T
.
words
"New York is New York and New York"
]
example1
=
[
T
.
words
"to be or not to be"
]
example2
=
[
T
.
words
"to be or not to be or"
]
example3
=
example0
<>
example0
-- > TEST: Should not have York New in the trie
example4
=
map
(
T
.
pack
.
pure
)
(
"abcdefabcdegabcde"
::
P
.
String
)
example0
=
T
.
words
"New York is New York and New York"
------------------------------------------------------------------------
example1
=
T
.
words
"to be or not to be"
------------------------------------------------------------------------
example2
=
T
.
words
"to be or not to be or"
example3
=
map
(
T
.
pack
.
pure
)
(
"abcdefabcdegabcde"
::
P
.
String
)
data
Token
=
NonTerminal
Text
data
Token
=
NonTerminal
Text
|
Terminal
|
Terminal
deriving
(
Ord
,
Eq
,
Show
)
deriving
(
Ord
,
Eq
,
Show
)
...
@@ -87,6 +105,7 @@ unToken = map f
...
@@ -87,6 +105,7 @@ unToken = map f
f
(
NonTerminal
x
)
=
x
f
(
NonTerminal
x
)
=
x
f
Terminal
=
""
f
Terminal
=
""
------------------------------------------------------------------------
data
Trie
k
e
data
Trie
k
e
=
Node
{
_node_count
::
Int
=
Node
{
_node_count
::
Int
...
@@ -96,9 +115,16 @@ data Trie k e
...
@@ -96,9 +115,16 @@ data Trie k e
|
Leaf
{
_node_count
::
Int
}
|
Leaf
{
_node_count
::
Int
}
deriving
(
Show
)
deriving
(
Show
)
toTree
::
k
->
Trie
k
e
->
Tree
(
k
,
Int
,
Maybe
e
)
toTree
k
(
Leaf
c
)
=
Tree
.
Node
(
k
,
c
,
Nothing
)
[]
insertTries
::
Ord
k
=>
[[
k
]]
->
Trie
k
()
toTree
k
(
Node
c
e
cs
)
=
Tree
.
Node
(
k
,
c
,
Just
e
)
(
map
(
uncurry
toTree
)
$
Map
.
toList
cs
)
insertTries
=
L
.
foldr
insertTrie
emptyTrie
insertTrie
::
Ord
k
=>
[
k
]
->
Trie
k
()
->
Trie
k
()
insertTrie
[]
n
=
n
{
_node_count
=
_node_count
n
+
1
}
insertTrie
(
x
:
xs
)
(
Leaf
c
)
=
mkTrie
(
c
+
1
)
$
Map
.
singleton
x
$
insertTrie
xs
emptyTrie
insertTrie
(
x
:
xs
)
(
Node
c
_e
children
)
=
mkTrie
(
c
+
1
)
$
Map
.
alter
f
x
children
where
f
=
Just
.
insertTrie
xs
.
fromMaybe
emptyTrie
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
-- emptyTrie = Node 0 mempty mempty
...
@@ -110,25 +136,25 @@ mkTrie c children
...
@@ -110,25 +136,25 @@ mkTrie c children
|
Map
.
null
children
=
Leaf
c
|
Map
.
null
children
=
Leaf
c
|
otherwise
=
Node
c
mempty
children
|
otherwise
=
Node
c
mempty
children
insertTrie
::
Ord
k
=>
[
k
]
->
Trie
k
()
->
Trie
k
()
-----------------------------
insertTrie
[]
n
=
n
{
_node_count
=
_node_count
n
+
1
}
insertTrie
(
x
:
xs
)
(
Leaf
c
)
=
mkTrie
(
c
+
1
)
$
Map
.
singleton
x
$
insertTrie
xs
emptyTrie
insertTrie
(
x
:
xs
)
(
Node
c
_e
children
)
=
mkTrie
(
c
+
1
)
$
Map
.
alter
f
x
children
where
f
=
Just
.
insertTrie
xs
.
fromMaybe
emptyTrie
insertTries
::
Ord
k
=>
[[
k
]]
->
Trie
k
()
-- | Trie to Tree since Tree as nice print function
insertTries
=
L
.
foldr
insertTrie
emptyTrie
toTree
::
k
->
Trie
k
e
->
Tree
(
k
,
Int
,
Maybe
e
)
toTree
k
(
Leaf
c
)
=
Tree
.
Node
(
k
,
c
,
Nothing
)
[]
toTree
k
(
Node
c
e
cs
)
=
Tree
.
Node
(
k
,
c
,
Just
e
)
(
map
(
uncurry
toTree
)
$
Map
.
toList
cs
)
------------------------------------------------------------------------
------------------------------------------------------------------------
entropyTrie
::
(
Num
e
,
Floating
e
)
=>
(
k
->
Bool
)
->
Trie
k
()
->
Trie
k
e
entropyTrie
::
(
Num
e
,
Floating
e
)
=>
(
k
->
Bool
)
->
Trie
k
()
->
Trie
k
e
entropyTrie
_
(
Leaf
c
)
=
Leaf
c
entropyTrie
_
(
Leaf
c
)
=
Leaf
c
entropyTrie
pred
(
Node
c
_e
children
)
=
Node
c
e
(
map
(
entropyTrie
pred
)
children
)
entropyTrie
pred
(
Node
c
_e
children
)
=
Node
c
e
(
map
(
entropyTrie
pred
)
children
)
where
where
e
=
sum
$
map
f
$
Map
.
toList
children
e
=
sum
$
map
f
$
Map
.
toList
children
f
(
k
,
child
)
=
if
pred
k
then
c
f
c
*
P
.
logBase
2
(
fromIntegral
c
)
f
(
k
,
child
)
=
if
pred
k
then
c
h
c
*
P
.
logBase
2
(
fromIntegral
c
)
else
-
c
fc
*
P
.
logBase
2
cf
c
else
-
c
hc
*
P
.
logBase
2
ch
c
where
where
c
f
c
=
fromIntegral
(
_node_count
child
)
/
fromIntegral
c
c
h
c
=
fromIntegral
(
_node_count
child
)
/
fromIntegral
c
normalizeEntropy
::
(
Fractional
e
,
Floating
e
,
Show
e
)
normalizeEntropy
::
(
Fractional
e
,
Floating
e
,
Show
e
)
=>
Getting
e
i
e
->
ModEntropy
i
o
e
->
Trie
k
i
->
Trie
k
o
=>
Getting
e
i
e
->
ModEntropy
i
o
e
->
Trie
k
i
->
Trie
k
o
...
@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity
...
@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
m
=
mean
es
m
=
mean
es
v
=
deviation
es
v
=
deviation
es
------------------------------------------------------------------------
normalizeLevel
::
(
Fractional
e
,
Floating
e
,
Show
e
)
normalizeLevel
::
(
Fractional
e
,
Floating
e
,
Show
e
)
=>
e
->
e
->
e
->
e
=>
e
->
e
->
e
->
e
...
@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v
...
@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v
buildTrie
::
(
Floating
e
,
Show
e
)
=>
[[
Token
]]
->
Trie
Token
e
buildTrie
::
(
Floating
e
,
Show
e
)
=>
[[
Token
]]
->
Trie
Token
e
buildTrie
=
entropyTrie
(
==
Terminal
)
.
insertTries
buildTrie
=
entropyTrie
(
==
Terminal
)
.
insertTries
subForest
::
Trie
k
e
->
[
Trie
k
e
]
subForest
(
Leaf
_
)
=
[]
subForest
(
Node
_
_
children
)
=
Map
.
elems
children
nodeEntropy
::
Trie
k
e
->
Maybe
e
nodeEntropy
::
Trie
k
e
->
Maybe
e
nodeEntropy
(
Node
_
e
_
)
=
Just
e
nodeEntropy
(
Node
_
e
_
)
=
Just
e
nodeEntropy
(
Leaf
_
)
=
Nothing
nodeEntropy
(
Leaf
_
)
=
Nothing
...
@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing
...
@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing
levels
::
Trie
k
e
->
[[
Trie
k
e
]]
levels
::
Trie
k
e
->
[[
Trie
k
e
]]
levels
=
L
.
takeWhile
(
not
.
L
.
null
)
.
L
.
iterate
(
L
.
concatMap
subForest
)
.
pure
levels
=
L
.
takeWhile
(
not
.
L
.
null
)
.
L
.
iterate
(
L
.
concatMap
subForest
)
.
pure
where
subForest
::
Trie
k
e
->
[
Trie
k
e
]
subForest
(
Leaf
_
)
=
[]
subForest
(
Node
_
_
children
)
=
Map
.
elems
children
entropyLevels
::
Getting
e
i
e
->
Trie
k
i
->
[[
e
]]
entropyLevels
::
Getting
e
i
e
->
Trie
k
i
->
[[
e
]]
entropyLevels
inE
=
fmap
(
fmap
(
view
inE
)
.
catMaybes
.
fmap
nodeEntropy
)
.
levels
entropyLevels
inE
=
fmap
(
fmap
(
view
inE
)
.
catMaybes
.
fmap
nodeEntropy
)
.
levels
...
@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
...
@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
v
=
deviation
es
v
=
deviation
es
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
split
::
(
Num
e
,
Ord
e
,
Show
e
)
=>
Lens'
i
e
->
Trie
Token
i
->
[
Token
]
->
[[
Token
]]
split
::
(
Num
e
,
Ord
e
,
Show
e
)
=>
Lens'
i
e
->
Trie
Token
i
->
[
Token
]
->
[[
Token
]]
split
inE
t0
=
go
t0
[]
split
inE
t0
=
go
t0
[]
where
where
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
consRev
[]
xss
=
xss
consRev
[]
xss
=
xss
consRev
xs
xss
=
reverse
xs
:
xss
consRev
xs
xss
=
reverse
xs
:
xss
...
@@ -209,10 +235,12 @@ split inE t0 = go t0 []
...
@@ -209,10 +235,12 @@ split inE t0 = go t0 []
in
in
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
case
et
{-+ ext0-}
<
ext
of
case
et
{-+ ext0-}
<
ext
of
-- NP: here we must take ext0 in account how
o
ver currently it
-- NP: here we must take ext0 in account how
e
ver currently it
-- makes it worse.
-- makes it worse.
-- For instance it currently works well to 2-grams but not more.
-- For instance it currently works well to 2-grams but not more.
-- PASS: test 4 example1
-- PASS: test 4 example1
-- FAIL: test 4 example2
-- FAIL: test 4 example2
True
->
go
xt
(
x
:
pref
)
xs
True
->
go
xt
(
x
:
pref
)
xs
False
->
consRev
pref
$
go
xt0
[
x
]
xs
False
->
consRev
pref
$
go
xt0
[
x
]
xs
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
src/Gargantext/Text/Terms.hs
View file @
ec5bb213
...
@@ -37,6 +37,7 @@ module Gargantext.Text.Terms
...
@@ -37,6 +37,7 @@ module Gargantext.Text.Terms
import
Control.Lens
import
Control.Lens
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.Traversable
import
Data.Traversable
import
GHC.Base
(
String
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core
import
Gargantext.Core
...
@@ -44,6 +45,11 @@ import Gargantext.Core.Types
...
@@ -44,6 +45,11 @@ import Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Eleve
(
testEleve
)
data
TermType
lang
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
=
Mono
{
_tt_lang
::
lang
}
...
@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt
...
@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
------------------------------------------------------------------------
isPunctuation
::
Text
->
Bool
isPunctuation
x
=
List
.
elem
x
$
(
Text
.
pack
.
pure
)
<$>
(
"!?(),;."
::
String
)
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: BlockText
extractTermsUnsupervised
::
Int
->
Text
->
IO
[[
Text
]]
extractTermsUnsupervised
n
=
fmap
List
.
nub
.
fmap
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
testEleve
n
.
map
(
map
Text
.
toLower
)
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
.
sentences
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment