Christian Merten / haskell-gargantext / Commits / ec5bb213

Commit ec5bb213 authored May 26, 2019 by Alexandre Delanoë
[FEAT] Ngrams extraction unsupervized (doc, type, function).
parent d1b3f47c
Showing 3 changed files with 98 additions and 46 deletions (+98 -46)
Files changed:

  Node.hs   src/Gargantext/Database/Schema/Node.hs   +0  -2
  Eleve.hs  src/Gargantext/Text/Eleve.hs             +72 -44
  Terms.hs  src/Gargantext/Text/Terms.hs             +26 -0
src/Gargantext/Database/Schema/Node.hs

@@ -578,8 +578,6 @@ instance MkCorpus HyperdataAnnuaire

mk n h p u = insertNodesR [nodeAnnuaireW n h p u]

getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId
getOrMkList pId uId =
  maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId
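The getOrMkList line above is the usual get-or-create idiom: fetch the existing lists under a parent, take the first one if there is any, otherwise create one. A minimal standalone sketch of the same pattern, not part of the diff, with IO standing in for Cmd err and Int standing in for the node id (all names here are illustrative):

import Data.Maybe (listToMaybe)

-- | Return the first existing id, otherwise run the creation action
--   (stands in for getListsWithParentId / mkList' / view node_id).
getOrCreate :: IO [Int] -> IO Int -> IO Int
getOrCreate fetch create = maybe create pure . listToMaybe =<< fetch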
src/Gargantext/Text/Eleve.hs

@@ -3,12 +3,31 @@

{-# LANGUAGE RankNTypes      #-}
{-# LANGUAGE TemplateHaskell #-}

{-
# Implementation of Unsupervized Word Segmentation

Implementation of the EleVe Python version of the papers below.

References:

- EleVe Python implementation and discussions with Korantin August and Bruno Gaume,
  [git repo](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes on the current implementation:

- The node count is correct and we should not regress on this front;
  TODO: add tests to keep track of it.
- NP: fix normalization.
- NP: extract longer ngrams (see the paper above; the Viterbi algorithm can be used).
- TODO TEST: prop (Node c _e f) = c == Map.size f
- AD: real ngrams extraction test:

    from Gargantext.Text.Terms import extractTermsUnsupervised
    docs <- runCmdRepl $ selectDocs 1004
    extractTermsUnsupervised 3 $ DT.intercalate " "
                               $ catMaybes
                               $ Gargantext.map _hyperdataDocument_abstract docs
-}
module Gargantext.Text.Eleve where
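An aside for orientation, not part of the diff: the quantity EleVe builds on is the branching entropy of a trie node, i.e. the Shannon entropy H = - sum_c p(c) * log2 p(c) of the relative frequencies p(c) of the node's children (terminal tokens get a separate treatment in entropyTrie further down). A minimal standalone sketch of that quantity, assuming only base and containers; the names and example counts are illustrative, not the module's API:

import qualified Data.Map.Strict as Map

-- | Shannon entropy (in bits) of a node's successor distribution,
--   given the occurrence counts of its children.
branchingEntropy :: Map.Map String Int -> Double
branchingEntropy children = negate . sum $ map term ps
  where
    counts = map fromIntegral (Map.elems children)
    total  = sum counts
    ps     = [ c / total | c <- counts ]
    term p = p * logBase 2 p

main :: IO ()
main = print $ branchingEntropy (Map.fromList [("a", 2), ("b", 1), ("c", 1)])
-- prints 1.5 for the counts 2, 1, 1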
@@ -30,10 +49,7 @@ import qualified Data.Tree as Tree

import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)

-- prop (Node c _e f) = c == Map.size f
-- TODO maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
------------------------------------------------------------------------
data I e = I
  { _info_entropy      :: e
  , _info_norm_entropy :: e
@@ -50,30 +66,32 @@ type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")

-test n example = do
+testEleve n example = do
   let
-    ex = toToken n example
-    t  = buildTrie $ chunkAlong n 1 ex
+    ex = toToken n <$> example
+    t  = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
     nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
     nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
   {-
   P.putStrLn $ Tree.drawTree
              $ fmap show
              $ toTree (NonTerminal "") nt'
   pure $ map unToken $ split info_entropy nt' ex
   --}
   pure $ map unToken $ split info_entropy nt' $ L.concat ex
   -- NP: here we use the entropy to split
   -- instead we should use either:
   -- info_norm_entropy or info_norm_entropy'
   -- However they should first be fixed.
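testEleve feeds buildTrie with overlapping windows produced by chunkAlong n 1. The real chunkAlong lives in Gargantext.Prelude; the sketch below, not part of the diff, only assumes the behaviour the call above seems to rely on, namely windows of length n advancing one token at a time:

-- Overlapping windows of length n, advancing one token at a time
-- (a stand-in for `chunkAlong n 1`; check Gargantext.Prelude for the real behaviour).
windows :: Int -> [a] -> [[a]]
windows n xs
  | length xs < n = []
  | otherwise     = take n xs : windows n (drop 1 xs)

main :: IO ()
main = print $ windows 2 (words "to be or not to be")
-- [["to","be"],["be","or"],["or","not"],["not","to"],["to","be"]]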
-- | TODO real data is a list of tokenized sentences
-example0 = T.words "New York is New York and New York"
-example1 = T.words "to be or not to be"
-example2 = T.words "to be or not to be or"
-example3 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
+example0 = [T.words "New York is New York and New York"]
+example1 = [T.words "to be or not to be"]
+example2 = [T.words "to be or not to be or"]
+example3 = example0 <> example0 -- > TEST: Should not have York New in the trie
+example4 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)

------------------------------------------------------------------------
------------------------------------------------------------------------
data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)
@@ -85,8 +103,9 @@ unToken :: [Token] -> [Text]

unToken = map f
  where
    f (NonTerminal x) = x
    f Terminal        = ""

------------------------------------------------------------------------
data Trie k e
  = Node { _node_count :: Int
@@ -96,39 +115,46 @@ data Trie k e

  | Leaf { _node_count :: Int }
  deriving (Show)

-toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
-toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
-toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
-
-insertTries :: Ord k => [[k]] -> Trie k ()
-insertTries = L.foldr insertTrie emptyTrie
-
-insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
-insertTrie []     n                    = n { _node_count = _node_count n + 1 }
-insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
-insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
-  where
-    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

+insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
+insertTrie []     n                    = n { _node_count = _node_count n + 1 }
+insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
+insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
+  where
+    f = Just . insertTrie xs . fromMaybe emptyTrie
+
+-----------------------------
+insertTries :: Ord k => [[k]] -> Trie k ()
+insertTries = L.foldr insertTrie emptyTrie
+
+-- | Trie to Tree since Tree as nice print function
+toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
+toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
+toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
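To make the shape of these tries concrete, here is a self-contained sketch, not part of the diff, that mirrors the count trie above (same Leaf/Node split, entropy field dropped) and prints it with Data.Tree.drawTree; it assumes only base and containers:

import           Data.Maybe      (fromMaybe)
import qualified Data.Map.Strict as Map
import qualified Data.Tree       as Tree

-- A count-only mirror of the Trie above.
data CTrie k = CLeaf Int | CNode Int (Map.Map k (CTrie k)) deriving Show

insertSeq :: Ord k => [k] -> CTrie k -> CTrie k
insertSeq []     (CLeaf c)    = CLeaf (c + 1)
insertSeq []     (CNode c cs) = CNode (c + 1) cs
insertSeq (x:xs) (CLeaf c)    = CNode (c + 1) (Map.singleton x (insertSeq xs (CLeaf 0)))
insertSeq (x:xs) (CNode c cs) = CNode (c + 1) (Map.alter (Just . insertSeq xs . fromMaybe (CLeaf 0)) x cs)

toRose :: Show k => k -> CTrie k -> Tree.Tree String
toRose k (CLeaf c)    = Tree.Node (show (k, c)) []
toRose k (CNode c cs) = Tree.Node (show (k, c)) (map (uncurry toRose) (Map.toList cs))

main :: IO ()
main = putStrLn . Tree.drawTree . toRose "<root>"
     $ foldr insertSeq (CLeaf 0) (windows 2 (words "New York is New York and New York"))
  where
    windows n xs | length xs < n = []
                 | otherwise     = take n xs : windows n (drop 1 xs)

Running it on 2-token windows of example0's sentence shows, as the TEST comment above expects, that no "York New" path appears, because no window starts with "York" followed by "New".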
------------------------------------------------------------------------
------------------------------------------------------------------------
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
-   f (k, child) = if pred k then   cfc * P.logBase 2 (fromIntegral c)
-                            else - cfc * P.logBase 2 cfc
+   f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
+                            else - chc * P.logBase 2 chc
      where
-       cfc = fromIntegral (_node_count child) / fromIntegral c
+       chc = fromIntegral (_node_count child) / fromIntegral c

normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
@@ -142,6 +168,7 @@ normalizeEntropy inE modE = go $ modE identity

    es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
    m  = mean      es
    v  = deviation es

------------------------------------------------------------------------
normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
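The normalisation used here is a plain z-score: an entropy e is recentred by the mean m and rescaled by the deviation v of its cohort (a node's children in normalizeEntropy, whole trie levels in normalizeEntropy'), as normalizeLevel m v e = (e - m) / v in the next hunk shows. A standalone sketch of that step, not part of the diff; the project presumably takes mean and deviation from Gargantext.Prelude and its deviation may be defined slightly differently, so treat the definitions below as illustrative:

-- Mean, (population) standard deviation and the z-score shape used by normalizeLevel.
mean :: [Double] -> Double
mean xs = sum xs / fromIntegral (length xs)

deviation :: [Double] -> Double
deviation xs = sqrt (mean [ (x - m) ** 2 | x <- xs ])
  where m = mean xs

zscore :: Double -> Double -> Double -> Double
zscore m v e = (e - m) / v   -- same shape as: normalizeLevel m v e = (e - m) / v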
@@ -150,10 +177,6 @@ normalizeLevel m v e = (e - m) / v

buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries

-subForest :: Trie k e -> [Trie k e]
-subForest (Leaf _)            = []
-subForest (Node _ _ children) = Map.elems children

nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _)     = Nothing
@@ -168,6 +191,10 @@ nodeChild _ (Leaf _) = Nothing

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
+  where
+    subForest :: Trie k e -> [Trie k e]
+    subForest (Leaf _)            = []
+    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
@@ -189,11 +216,10 @@ normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t

    v = deviation es

------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
  where
-   ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss
@@ -203,16 +229,18 @@ split inE t0 = go t0 []

      Just xt -> case nodeChild x t0 of
        Nothing  -> panic "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
              ext0 = ne (panic "xt0") xt0
              ext  = ne 0 xt
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          case et {-+ ext0-} < ext of
            -- NP: here we must take ext0 in account however currently it
            -- makes it worse.
            -- For instance it currently works well to 2-grams but not more.
            -- PASS: test 4 example1
            -- FAIL: test 4 example2
            True  -> go xt (x : pref) xs
            False -> consRev pref $ go xt0 [x] xs

+   ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
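The decision split makes (the et < ext comparison above) can be read on its own: keep extending the current group as long as the entropy after adding the next token is higher than the entropy of the current prefix, otherwise close the group and start again from that token. A standalone sketch of that greedy rule, not part of the diff, with a plain lookup table standing in for the Trie, its lenses and the panic defaults (table contents and all names are hypothetical):

import qualified Data.Map.Strict as Map
import           Data.Maybe      (fromMaybe)

-- Entropies keyed by prefix: a toy stand-in for the Trie.
type EntropyTable = Map.Map [String] Double

entropyOf :: EntropyTable -> [String] -> Double
entropyOf table prefix = fromMaybe 0 (Map.lookup prefix table)

-- Greedy segmentation: extend the current group while the extended prefix
-- still has higher entropy than the current one, otherwise close the group.
segment :: EntropyTable -> [String] -> [[String]]
segment table = go []
  where
    go acc []     = [ reverse acc | not (null acc) ]
    go []  (x:xs) = go [x] xs
    go acc (x:xs)
      | entropyOf table (reverse acc) < entropyOf table (reverse (x : acc)) = go (x : acc) xs
      | otherwise                                                           = reverse acc : go [x] xs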
src/Gargantext/Text/Terms.hs

@@ -37,6 +37,7 @@ module Gargantext.Text.Terms

import Control.Lens
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import Gargantext.Prelude
import Gargantext.Core

@@ -44,6 +45,11 @@ import Gargantext.Core.Types

import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono  (monoTerms)

import qualified Data.List as List
import qualified Data.Text as Text
import Gargantext.Text (sentences)
import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Eleve (testEleve)

data TermType lang
  = Mono { _tt_lang :: lang }

@@ -75,3 +81,23 @@ terms (MonoMulti lang) txt = terms (Multi lang) txt

-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------

isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $  (Text.pack . pure)
                             <$> ("!?(),;." :: String)

-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: BlockText
extractTermsUnsupervised :: Int -> Text -> IO [[Text]]
extractTermsUnsupervised n =
    fmap List.nub
  . fmap (List.filter (\l -> List.length l > 1))
  . testEleve n
  . map (map Text.toLower)
  . map (List.filter (not . isPunctuation))
  . map tokenize
  . sentences
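A usage sketch for the new entry point, not part of the diff. It assumes extractTermsUnsupervised is exported from Gargantext.Text.Terms (as the module comment in Eleve.hs suggests) and that the surrounding project builds; the resulting groups depend on the entropy model built from the text, so no output is shown:

{-# LANGUAGE OverloadedStrings #-}
import Gargantext.Text.Terms (extractTermsUnsupervised)

main :: IO ()
main = do
  -- groups of at least two lowercased tokens, punctuation stripped
  groups <- extractTermsUnsupervised 3
              "New York is New York and New York. To be or not to be."
  mapM_ print groups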