Przemyslaw Kaminski / haskell-gargantext / Commits

Commit 8f285b42, authored Jun 15, 2019 by Alexandre Delanoë

    Merge branch 'dev-eleve' into dev

Parents: c9ce9c6f, 842cbf68

Showing 1 changed file with 349 additions and 146 deletions.

src/Gargantext/Text/Eleve.hs  (+349 / -146)
@@ -20,7 +20,6 @@ References:
 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
 
 Notes for current implementation:
-- TODO fix normalization
 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
@@ -32,62 +31,107 @@ Notes for current implementation:
   $ Gargantext.map _hyperdataDocument_abstract docs
 -}
+{-# LANGUAGE ConstraintKinds   #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RankNTypes        #-}
 {-# LANGUAGE TemplateHaskell   #-}
+{-# LANGUAGE TypeFamilies      #-}
 
 module Gargantext.Text.Eleve where
 
 -- import Debug.Trace (trace)
 -- import Debug.SimpleReflect
 
-import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
+import Control.Lens hiding (levels, children)
-import Control.Monad (foldM, mapM_, forM_)
+import Control.Monad (forM_)
 import Data.Ord (Ord)
 import qualified Data.List as L
 import Data.Monoid
 import Data.Text (Text)
 import qualified Data.Text as T
 import Data.Map (Map)
-import Data.Maybe (fromMaybe, catMaybes)
+import Data.Maybe (fromMaybe)
 import qualified Data.Map as Map
 import Gargantext.Prelude hiding (cs)
 import qualified Data.Tree as Tree
 import Data.Tree (Tree)
-import qualified Prelude as P (putStrLn, logBase)
+import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
+
+nan :: Floating e => e
+nan = 0 / 0
+
+noNaNs :: P.RealFloat e => [e] -> [e]
+noNaNs = filter (not . P.isNaN)
+
+updateIfDefined :: P.RealFloat e => e -> e -> e
+updateIfDefined e0 e | P.isNaN e = e0
+                     | otherwise = e
+
+sim :: Entropy e => e -> e -> Bool
+sim x y = x == y || (P.isNaN x && P.isNaN y)
+
+subst :: Entropy e => (e, e) -> e -> e
+subst (src, dst) x | sim src x = dst
+                   | otherwise = x
+------------------------------------------------------------------------
+type Entropy e =
+  ( Fractional e
+  , Floating e
+  , P.RealFloat e
+  , Show e
+  -- ^ TODO: only used for debugging
+  )
 ------------------------------------------------------------------------
 -- | Example and tests for development
 data I e = I
   { _info_entropy     :: e
-  , _info_norm_entropy  :: e
-  , _info_norm_entropy' :: e
+  , _info_entropy_var :: e
+  , _info_autonomy    :: e
   }
 
 instance Show e => Show (I e) where
-  show (I e n n') = show (e, n, n')
+  show (I e ev a) = show (e, ev, a)
 
 makeLenses ''I
 
 type ModEntropy i o e = (e -> e) -> i -> o
 
-setNormEntropy :: ModEntropy e (I e) e
-setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
+set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
+set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
+
+set_entropy_var :: Entropy e => Setter e (I e) e e
+set_entropy_var f e = (\ev -> I e ev nan) <$> f e
+
+data StartStop = Start | Stop
+  deriving (Ord, Eq, Show)
 
 data Token = NonTerminal Text
-           | Terminal
+           | Terminal StartStop
   deriving (Ord, Eq, Show)
 
-toToken :: Int -> [Text] -> [Token]
-toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
+isTerminal :: Token -> Bool
+isTerminal (Terminal    _) = True
+isTerminal (NonTerminal _) = False
 
-unToken :: [Token] -> [Text]
-unToken = map f
-  where
-    f (NonTerminal x) = x
-    f Terminal        = ""
+nonTerminals :: [Token] -> [Text]
+nonTerminals ts = [nt | NonTerminal nt <- ts]
+
+parseToken :: Text -> Token
+parseToken "<start>" = Terminal Start
+parseToken "<stop>"  = Terminal Stop
+parseToken t         = NonTerminal t
+
+toToken :: [Text] -> [Token]
+toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
+
+printToken :: Token -> Text
+printToken = f
+  where
+    f (NonTerminal x)  = x
+    f (Terminal Start) = "<start>"
+    f (Terminal Stop)  = "<stop>"
 ------------------------------------------------------------------------
 
 data Trie k e
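
Side note (not part of the commit): IEEE comparison on NaN is always False, which is why the new sim helper is used for NaN-aware equality in the checks added further down. A minimal GHCi sketch of how the helpers introduced above behave, assuming the module loads as-is:

    >>> nan == (nan :: Double)
    False
    >>> sim (nan :: Double) nan        -- sim treats two NaNs as equal
    True
    >>> noNaNs [1, nan, 2 :: Double]   -- drop undefined entropies
    [1.0,2.0]
    >>> subst (nan, 0) (nan :: Double) -- substitute a NaN entropy by 0
    0.0
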
@@ -100,9 +144,6 @@ data Trie k e
 makeLenses ''Trie
 
-insertTries :: Ord k => [[k]] -> Trie k ()
-insertTries = L.foldr insertTrie emptyTrie
-
 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 insertTrie []     n           = n { _node_count = _node_count n + 1 }
 insertTrie (x:xs) (Leaf c)    = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
@@ -121,7 +162,6 @@ mkTrie c children
   | otherwise   = Node c mempty children
 
 -----------------------------
 -- | Trie to Tree since Tree as nice print function
 toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
@@ -129,55 +169,94 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+normalizeLevel :: Entropy e => e -> e -> e -> e
-entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
-entropyTrie _    (Leaf c)             = Leaf c
-entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
-  where
-    e = sum $ map f $ Map.toList children
-    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
-                             else - chc * P.logBase 2 chc
-      where
-        chc = fromIntegral (_node_count child) / fromIntegral c
-
-normalizeEntropy :: (Fractional e, Floating e, Show e)
-                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
-normalizeEntropy inE modE = go $ modE identity
-  where
-    go _ (Leaf c) = Leaf c
-    go f (Node c i children)
-      | Map.null children =
-          panic "normalizeEntropy: impossible"
-      | otherwise         =
-          -- trace (show $ L.length es) $
-          Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
-      where
-        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-        m  = mean      es
-        v  = deviation es
-------------------------------------------------------------------------
-normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> e -> e
 normalizeLevel m v e = (e - m) / v
 
-buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
-buildTrie = entropyTrie (== Terminal) . insertTries
-
-nodeEntropy :: Trie k e -> Maybe e
-nodeEntropy (Node _ e _) = Just e
-nodeEntropy (Leaf _)     = Nothing
+{- Unused
 
 nodeChildren :: Trie k e -> Map k (Trie k e)
 nodeChildren (Node _ _ cs) = cs
 nodeChildren (Leaf _) = Map.empty
 
-nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
-nodeChild k (Node _ _ cs) = Map.lookup k cs
-nodeChild _ (Leaf _)      = Nothing
-
-findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
-findTrie ks t = foldM (flip nodeChild) t ks
+-}
+
+chunkAlongEleve :: Int -> [a] -> [[a]]
+chunkAlongEleve n xs = L.take n <$> L.tails xs
+
+data Direction = Backward | Forward
+
+buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
+buildTrie d n sentences
+  = L.foldr insertTrie emptyTrie
+  . L.concat
+  $ (filter (/= [Terminal (term d)]) . chunkAlongEleve (n + 1) . order d) <$> sentences
+  where
+    order Forward  = identity
+    order Backward = reverse
+    term  Forward  = Stop
+    term  Backward = Start
+
+class IsTrie trie where
+  entropyTrie      :: Entropy e => (k -> Bool) -> trie k () -> trie k e
+  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
+  nodeChild        :: Ord k => k -> trie k e -> trie k e
+  findTrie         :: Ord k => [k] -> trie k e -> trie k e
+  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
+  evTrie           :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
+  normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> trie k i -> trie k o
+
+instance IsTrie Trie where
+  entropyTrie _    (Leaf c)             = Leaf c
+  entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
+    where
+      children' = Map.toList children
+      sum_count = sum $ _node_count . snd <$> children'
+      e | sum_count == 0 = nan
+        | otherwise      = sum $ f <$> children'
+      f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
+                               else - chc * P.logBase 2 chc
+        where
+          chc = fromIntegral (_node_count child) / fromIntegral c
+
+  nodeEntropy inE (Node _ e _) = e ^. inE
+  nodeEntropy _   (Leaf _)     = nan
+
+  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
+  nodeChild _ (Leaf _)      = emptyTrie
+
+  findTrie ks t = L.foldl (flip nodeChild) t ks
+
+  printTrie inE t = do
+    P.putStrLn . Tree.drawTree
+               . fmap show
+               $ toTree (NonTerminal "") t
+    P.putStrLn " Levels:"
+    forM_ (normalizationLevels inE t) $ \level ->
+      P.putStrLn $ " " <> show level
+
+  evTrie inE setEV = go nan
+    where
+      go _  (Leaf c)            = Leaf c
+      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
+        where e1 = i ^. inE
+
+      ev 0  0  = nan
+      ev i0 i1 = i1 - i0
+
+  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
+    where
+      go _ _                 (Leaf c)            = Leaf c
+      go _ []                _                   = panic "normalizeEntropy' empty levels"
+      go f ((m, v, _) : ess) (Node c i children) =
+        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
+------------------------------------------------------------------------
 
 levels :: Trie k e -> [[Trie k e]]
 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
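
Side note (not part of the commit): in the IsTrie instance above, evTrie stores at each node the difference between its own entropy and its parent's (ev i0 i1 = i1 - i0), and normalizeEntropy then rescales those values level by level through normalizeLevel m v e = (e - m) / v, a z-score against the mean m and deviation v of that depth level. A standalone sketch with made-up numbers:

    -- Hypothetical values, only to illustrate the (e - m) / v step.
    levelMean, levelDeviation, nodeEV :: Double
    levelMean      = 0.5    -- mean entropy variation on this level
    levelDeviation = 0.25   -- deviation of entropy variations on this level
    nodeEV         = 1.0    -- this node's entropy variation

    -- The node's normalized value (its autonomy): (1.0 - 0.5) / 0.25 = 2.0
    autonomySketch :: Double
    autonomySketch = (nodeEV - levelMean) / levelDeviation
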
@@ -186,96 +265,173 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 subForest (Leaf _)            = []
 subForest (Node _ _ children) = Map.elems children
 
-entropyLevels :: Getting e i e -> Trie k i -> [[e]]
-entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
+entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
+entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
 
---fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
---fwd inE outE s = s & outE .~ (s ^. inE)
-
-normalizeEntropy' :: (Fractional e, Floating e, Show e)
-                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
-normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
-  where
-    go _ []         _                   = panic "normalizeEntropy' empty levels"
-    go _ _          (Leaf c)            = Leaf c
-    go _ ([]:_)     _                   = panic "normalizeEntropy': empty level"
-    go f (es : ess) (Node c i children) =
-        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
-      where
-        m  = mean      es
-        v  = deviation es
+normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
+normalizationLevels inE = fmap f . entropyLevels inE
+  where
+    f es = (mean es, deviation es, length es)
 
 ------------------------------------------------------------------------
+data Tries k e = Tries
+  { _fwd :: Trie k e
+  , _bwd :: Trie k e
+  }
+
+makeLenses ''Tries
+
+buildTries :: Int -> [[Token]] -> Tries Token ()
+buildTries n sentences = Tries
+  { _fwd = buildTrie Forward  n sentences
+  , _bwd = buildTrie Backward n sentences
+  }
+
+instance IsTrie Tries where
+  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
+
+  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
+
+  nodeChild = onTries . nodeChild
+
+  entropyTrie = onTries . entropyTrie
+
+  evTrie inE setEV = onTries $ evTrie inE setEV
+
+  normalizeEntropy inE = onTries . normalizeEntropy inE
+
+  printTrie inE (Tries f b) = do
+    P.putStrLn "Forward:"
+    printTrie inE f
+    P.putStrLn ""
+    P.putStrLn "Backward:"
+    printTrie inE b
+
+onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
+onTries h (Tries f b) = Tries (h f) (h b)
 
 ------------------------------------------------------------------------
-split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
-split inE t0 = go t0 []
-  where
-    consRev [] xss = xss
-    consRev xs xss = reverse xs : xss
-
-    go _ pref []            = [reverse pref]
-    go _ pref (Terminal:_)  = [reverse pref]
-    go t pref (x:xs) = case nodeChild x t of
-      Nothing -> consRev pref $ go t0 [x] xs
-      Just xt -> case nodeChild x t0 of
-        Nothing  -> panic $ "TODO"
-        Just xt0 ->
-          let et   = ne (panic "t") t
-          -- ^ entropy of the current prefix
-              ext0 = ne (panic "xt0") xt0
-          -- ^ entropy of [x]
-              ext  = ne 0 xt
-          -- ^ entropy of the current prefix plus x
-          in
-          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
-          if ext + ext0 > et
-            then go xt  (x:pref) xs
-            else consRev pref $ go xt0 [x] xs
-
-    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
+mayCons :: [a] -> [[a]] -> [[a]]
+mayCons [] xss = xss
+mayCons xs xss = xs : xss
+
+{-
+split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
+split _ _ [] = []
+split inE t (Terminal Start:xs) = split inE t xs
+split inE t (x0:xs0) = go [x0] xs0
+  where
+    go pref []                 = [pref]
+    go pref (Terminal Stop:_)  = [pref]
+    go _    (Terminal Start:_) = panic "split impossible"
+    go pref (x:xs) =
+      -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
+      if acc
+        then go prefx xs
+        else mayCons pref $ go [x] xs
+      where
+        prefx = pref <> [x]
+        pt   = findTrie pref  t
+        pxt  = findTrie prefx t
+        xt   = findTrie [x]   t
+        ept  = ne pt
+         -- ^ entropy of the current prefix
+        ext  = ne xt
+         -- ^ entropy of [x]
+        epxt = ne pxt
+         -- ^ entropy of the current prefix plus x
+        acc  = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
+
+        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
+        ne = nodeEntropy inE
+-}
+
+split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
+split _ _ _ []  = []
+split _ _ _ [t] = pure <$> nonTerminals [t]
+split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
+  where
+    pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
+                       (L.tail . L.inits . take n $ ts)
+
+{-
+split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
+split inE t0 ts =
+  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
+-}
 
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+------------------------------------------------------------------------
 mainEleve :: Int -> [[Text]] -> [[[Text]]]
-mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
+mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
   where
-    inp = toToken (n - 1) <$> input
-    t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-          -- NP: here we use the entropy to split
-          -- instead we should use either:
-          --   info_norm_entropy or info_norm_entropy'
-          -- However they should first be fixed.
+    inp = toToken <$> input
+    t = normalizeEntropy info_entropy_var set_autonomy
+      . evTrie identity set_entropy_var
+      . entropyTrie isTerminal
+      $ buildTries n inp
 
-testEleve :: Bool -> Int -> [Text] -> IO Bool
-testEleve debug n output = do
+---------------------------------------------
+type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
+
+testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
+testEleve debug n output checks = do
   let
-    out      = T.words <$> output
-    expected = fmap (T.splitOn "-") <$> out
-    input    = (T.splitOn "-" =<<) <$> out
-    inp      = toToken (n - 1) <$> input
-    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-    nt       = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
-    nt'      = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
-    pss      = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
-               | ps <- L.nub $ [ c | m <- [1..n], cs <- chunkAlong m 1 <$> inp, c <- cs ]
-               ]
-    res = map unToken . split identity t <$> inp
+    res = split n info_autonomy nt <$> inp
   when debug $ do
-    P.putStrLn (show input)
-    mapM_ (P.putStrLn . show) pss
-    P.putStrLn $ Tree.drawTree
-               $ fmap show
-               $ toTree (NonTerminal "") nt'
+    P.putStrLn $ show input
+    P.putStrLn ""
+    printTrie info_entropy nt
+    P.putStrLn ""
+    P.putStrLn "Splitting:"
     P.putStrLn $ show res
+  forM_ checks checker
   pure $ expected == res
+  where
+    out      = T.words <$> output
+    expected = fmap (T.splitOn "-") <$> out
+    input    = (T.splitOn "-" =<<) <$> out
+    inp      = toToken <$> input
+
+    nt :: Tries Token (I Double)
+    nt = normalizeEntropy info_entropy_var set_autonomy
+       . evTrie identity set_entropy_var
+       . entropyTrie isTerminal
+       $ buildTries n inp
+
+    check f msg ref my =
+      if f ref my
+        then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
+        else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
+
+    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
+      let ns  = parseToken <$> T.words ngram
+          nt' = findTrie ns nt
+
+      P.putStrLn $ " " <> T.unpack ngram <> ":"
+      check (==) "count"        count        (_node_count (_fwd nt'))
+      check sim  "entropy"      entropy      (nodeEntropy info_entropy     nt')
+      check sim  "ev"           ev           (nodeEntropy info_entropy_var nt')
+      check sim  "autonomy"     autonomy     (nodeEntropy info_autonomy    nt')
+      check sim  "fwd_entropy"  fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
+      check sim  "fwd_ev"       fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
+      check sim  "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))
+      check sim  "bwd_entropy"  bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
+      check sim  "bwd_ev"       bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
+      check sim  "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))
 
 -- | TODO real data is a list of tokenized sentences
-example0, example1, example2, example3, example4, example5 :: [Text]
+example0, example1, example2, example3, example4, example5, example6 :: [Text]
 
 example0 = ["New-York is New-York and New-York"]
 example1 = ["to-be or not to-be"]
 example2 = ["to-be-or not to-be-or NOT to-be and"]
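
Side note (not part of the commit): with the new signatures, mainEleve n input consumes pre-tokenized sentences and returns, per sentence, the segments selected by the autonomy measure of the forward and backward tries. A hypothetical call (the sentence below is invented, only to show the shapes involved):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Text (Text)
    import Gargantext.Text.Eleve (mainEleve)

    -- One input sentence, already tokenized into words.
    demoSegments :: [[[Text]]]
    demoSegments = mainEleve 5 [["the", "cat", "sat", "on", "the", "mat"]]
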
@@ -283,18 +439,65 @@ example3 = example0 <> example0
 -- > TEST: Should not have York New in the trie
 example4 = ["a-b-c-d e a-b-c-d f"]
 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
+example6 = ["le-petit chat"
+           ,"le-petit chien"
+           ,"le-petit rat"
+           ,"le gros rat"
+           ]
+
+checks0, checks2 :: Checks Double
+
+checks0 =
+-- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
+  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
+  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
+  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
+  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
+  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
+  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
+  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
+  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
+  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
+  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
+  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
+  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
+  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
+  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
+  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
+  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
+  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
+  ]
+
+checks2 = []
+{-
+ [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
+ ,("be or", 2, 0.5, nan, nan, nan, 1.0)
+ ,("or not", 1, 0.0, nan, nan, nan, 0.0)
+ ,("not to", 1, 0.0, nan, nan, nan, 0.0)
+ ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
+ ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
+ ,("be and", 1, 0.0, nan, nan, nan, 0.0)
+ ]
+-}
 
 runTests :: IO ()
 runTests =
   forM_
-    [("example0", 2, example0)
-    ,("example1", 2, example1)
-    ,("example2", 3, example2)
-    ,("example3", 2, example3)
-    ,("example4", 4, example4)
-    ,("example5", 5, example5)
+    [("example0", 3, example0, checks0)
+    ,("example0", 2, example0, [])
+    ,("example1", 2, example1, [])
+    ,("example2", 3, example2, checks2)
+    ,("example3", 2, example3, [])
+    ,("example4", 4, example4, [])
+    ,("example5", 5, example5, [])
+    ,("example6", 2, example6, [])
     ]
-    (\(name, n, ex) -> do
-      b <- testEleve False n ex
-      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
+    (\(name, n, ex, checks) -> do
+      P.putStrLn $ name <> " " <> show n
+      b <- testEleve False n ex checks
+      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
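
Side note (not part of the commit): the Checks lists are consumed by runTests, which for each (name, n, example, checks) entry prints the example name and n, one PASS/FAIL line per checked field (count, entropy, ev, autonomy and their forward/backward variants), and a final "splitting: PASS/FAIL" line. A minimal driver, assuming the package's usual build environment:

    module Main where

    import Gargantext.Text.Eleve (runTests)

    main :: IO ()
    main = runTests
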