Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
197
Issues
197
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
79f3006c
Unverified
Commit
79f3006c
authored
May 28, 2019
by
Nicolas Pouillard
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Eleve updates
parent
9ba0327c
Pipeline
#418
failed with stage
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
101 additions
and
56 deletions
+101
-56
Eleve.hs
src/Gargantext/Text/Eleve.hs
+95
-51
Terms.hs
src/Gargantext/Text/Terms.hs
+6
-5
No files found.
src/Gargantext/Text/Eleve.hs
View file @
79f3006c
...
@@ -40,11 +40,11 @@ Notes for current implementation:
...
@@ -40,11 +40,11 @@ Notes for current implementation:
module
Gargantext.Text.Eleve
where
module
Gargantext.Text.Eleve
where
import
Debug.Trace
(
trace
)
--
import Debug.Trace (trace)
-- import Debug.SimpleReflect
-- import Debug.SimpleReflect
import
Control.Lens
(
Lens
,
Lens
'
,
ASetter
,
Getting
,
(
^.
),
(
^?
),
(
&
),
(
.~
),
(
%~
),
view
,
makeLenses
,
_Just
)
import
Control.Lens
(
Lens
'
,
Getting
,
(
^.
),
(
^?
),
(
%~
),
view
,
makeLenses
,
_Just
)
import
Control.Monad
(
foldM
)
import
Control.Monad
(
foldM
,
mapM_
,
forM_
)
import
Data.Ord
(
Ord
)
import
Data.Ord
(
Ord
)
import
qualified
Data.List
as
L
import
qualified
Data.List
as
L
import
Data.Monoid
import
Data.Monoid
...
@@ -56,7 +56,7 @@ import qualified Data.Map as Map
...
@@ -56,7 +56,7 @@ import qualified Data.Map as Map
import
Gargantext.Prelude
hiding
(
cs
)
import
Gargantext.Prelude
hiding
(
cs
)
import
qualified
Data.Tree
as
Tree
import
qualified
Data.Tree
as
Tree
import
Data.Tree
(
Tree
)
import
Data.Tree
(
Tree
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
,
String
)
import
qualified
Prelude
as
P
(
putStrLn
,
logBase
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Example and tests for development
-- | Example and tests for development
...
@@ -76,32 +76,6 @@ type ModEntropy i o e = (e -> e) -> i -> o
...
@@ -76,32 +76,6 @@ type ModEntropy i o e = (e -> e) -> i -> o
setNormEntropy
::
ModEntropy
e
(
I
e
)
e
setNormEntropy
::
ModEntropy
e
(
I
e
)
e
setNormEntropy
f
e
=
I
e
(
f
e
)
e
-- (panic "setNormEntropy")
setNormEntropy
f
e
=
I
e
(
f
e
)
e
-- (panic "setNormEntropy")
testEleve
n
example
=
do
let
ex
=
toToken
n
<$>
example
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
ex
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
{-
P.putStrLn $ Tree.drawTree
$ fmap show
$ toTree (NonTerminal "") nt'
--}
pure
$
map
unToken
$
split
info_entropy
nt'
$
L
.
concat
ex
-- NP: here we use the entropy to split
-- instead we should use either:
-- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed.
-- | TODO real data is a list of tokenized sentences
example0
=
[
T
.
words
"New York is New York and New York"
]
example1
=
[
T
.
words
"to be or not to be"
]
example2
=
[
T
.
words
"to be or not to be or"
]
example3
=
example0
<>
example0
-- > TEST: Should not have York New in the trie
example4
=
map
(
T
.
pack
.
pure
)
(
"abcdefabcdegabcde"
::
P
.
String
)
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Token
=
NonTerminal
Text
data
Token
=
NonTerminal
Text
|
Terminal
|
Terminal
deriving
(
Ord
,
Eq
,
Show
)
deriving
(
Ord
,
Eq
,
Show
)
...
@@ -125,6 +99,7 @@ data Trie k e
...
@@ -125,6 +99,7 @@ data Trie k e
|
Leaf
{
_node_count
::
Int
}
|
Leaf
{
_node_count
::
Int
}
deriving
(
Show
)
deriving
(
Show
)
makeLenses
''
T
rie
insertTries
::
Ord
k
=>
[[
k
]]
->
Trie
k
()
insertTries
::
Ord
k
=>
[[
k
]]
->
Trie
k
()
insertTries
=
L
.
foldr
insertTrie
emptyTrie
insertTries
=
L
.
foldr
insertTrie
emptyTrie
...
@@ -171,13 +146,16 @@ normalizeEntropy :: (Fractional e, Floating e, Show e)
...
@@ -171,13 +146,16 @@ normalizeEntropy :: (Fractional e, Floating e, Show e)
normalizeEntropy
inE
modE
=
go
$
modE
identity
normalizeEntropy
inE
modE
=
go
$
modE
identity
where
where
go
_
(
Leaf
c
)
=
Leaf
c
go
_
(
Leaf
c
)
=
Leaf
c
go
f
(
Node
c
i
children
)
|
not
(
Map
.
null
children
)
=
go
f
(
Node
c
i
children
)
-- trace (show $ L.length es) $
|
Map
.
null
children
=
Node
c
(
f
i
)
$
go
(
modE
$
normalizeLevel
m
v
)
<$>
children
panic
"normalizeEntropy: impossible"
where
|
otherwise
=
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
-- trace (show $ L.length es) $
m
=
mean
es
Node
c
(
f
i
)
$
go
(
modE
$
normalizeLevel
m
v
)
<$>
children
v
=
deviation
es
where
es
=
[
i'
^.
inE
|
Node
_
i'
_
<-
Map
.
elems
children
]
m
=
mean
es
v
=
deviation
es
------------------------------------------------------------------------
------------------------------------------------------------------------
normalizeLevel
::
(
Fractional
e
,
Floating
e
,
Show
e
)
normalizeLevel
::
(
Fractional
e
,
Floating
e
,
Show
e
)
...
@@ -195,6 +173,13 @@ nodeChildren :: Trie k e -> Map k (Trie k e)
...
@@ -195,6 +173,13 @@ nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren
(
Node
_
_
cs
)
=
cs
nodeChildren
(
Node
_
_
cs
)
=
cs
nodeChildren
(
Leaf
_
)
=
Map
.
empty
nodeChildren
(
Leaf
_
)
=
Map
.
empty
nodeChild
::
Ord
k
=>
k
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
nodeChild
k
(
Node
_
_
cs
)
=
Map
.
lookup
k
cs
nodeChild
_
(
Leaf
_
)
=
Nothing
findTrie
::
Ord
k
=>
[
k
]
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
findTrie
ks
t
=
foldM
(
flip
nodeChild
)
t
ks
levels
::
Trie
k
e
->
[[
Trie
k
e
]]
levels
::
Trie
k
e
->
[[
Trie
k
e
]]
levels
=
L
.
takeWhile
(
not
.
L
.
null
)
.
L
.
iterate
(
L
.
concatMap
subForest
)
.
pure
levels
=
L
.
takeWhile
(
not
.
L
.
null
)
.
L
.
iterate
(
L
.
concatMap
subForest
)
.
pure
where
where
...
@@ -229,29 +214,88 @@ split inE t0 = go t0 []
...
@@ -229,29 +214,88 @@ split inE t0 = go t0 []
consRev
[]
xss
=
xss
consRev
[]
xss
=
xss
consRev
xs
xss
=
reverse
xs
:
xss
consRev
xs
xss
=
reverse
xs
:
xss
go
_
pref
[]
=
[
reverse
pref
]
go
_
pref
[]
=
[
reverse
pref
]
go
_
pref
(
Terminal
:
_
)
=
[
reverse
pref
]
go
t
pref
(
x
:
xs
)
=
case
nodeChild
x
t
of
go
t
pref
(
x
:
xs
)
=
case
nodeChild
x
t
of
Nothing
->
consRev
pref
$
go
t0
[
x
]
xs
Nothing
->
consRev
pref
$
go
t0
[
x
]
xs
Just
xt
->
case
nodeChild
x
t0
of
Just
xt
->
case
nodeChild
x
t0
of
Nothing
->
panic
"TODO"
Nothing
->
panic
$
"TODO"
Just
xt0
->
Just
xt0
->
let
et
=
ne
(
panic
"t"
)
t
let
et
=
ne
(
panic
"t"
)
t
-- ^ entropy of the current prefix
ext0
=
ne
(
panic
"xt0"
)
xt0
ext0
=
ne
(
panic
"xt0"
)
xt0
-- ^ entropy of [x]
ext
=
ne
0
xt
ext
=
ne
0
xt
-- ^ entropy of the current prefix plus x
in
in
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
-- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
case
et
{-+ ext0-}
<
ext
of
if
ext
+
ext0
>
et
-- NP: here we must take ext0 in account however currently it
then
go
xt
(
x
:
pref
)
xs
-- makes it worse.
else
consRev
pref
$
go
xt0
[
x
]
xs
-- For instance it currently works well to 2-grams but not more.
-- PASS: test 4 example1
-- FAIL: test 4 example2
True
->
go
xt
(
x
:
pref
)
xs
False
->
consRev
pref
$
go
xt0
[
x
]
xs
nodeChild
::
Ord
k
=>
k
->
Trie
k
e
->
Maybe
(
Trie
k
e
)
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
nodeChild
k
(
Node
_
_
cs
)
=
Map
.
lookup
k
cs
nodeChild
_
(
Leaf
_
)
=
Nothing
------------------------------------------------------------------------
------------------------------------------------------------------------
ne
d
t
=
fromMaybe
d
(
nodeEntropy
t
^?
_Just
.
inE
)
mainEleve
::
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleve
n
input
=
map
unToken
.
split
identity
(
t
::
Trie
Token
Double
)
<$>
inp
where
inp
=
toToken
(
n
-
1
)
<$>
input
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
inp
-- NP: here we use the entropy to split
-- instead we should use either:
-- info_norm_entropy or info_norm_entropy'
-- However they should first be fixed.
testEleve
::
Bool
->
Int
->
[
Text
]
->
IO
Bool
testEleve
debug
n
output
=
do
let
out
=
T
.
words
<$>
output
expected
=
fmap
(
T
.
splitOn
"-"
)
<$>
out
input
=
(
T
.
splitOn
"-"
=<<
)
<$>
out
inp
=
toToken
(
n
-
1
)
<$>
input
t
=
buildTrie
$
L
.
concat
$
chunkAlong
n
1
<$>
inp
nt
=
normalizeEntropy
identity
setNormEntropy
(
t
::
Trie
Token
Double
)
nt'
=
normalizeEntropy'
info_entropy
(
\
f
->
info_norm_entropy'
%~
f
)
nt
pss
=
[
(
ps
,
findTrie
ps
t
^?
_Just
.
node_entropy
)
-- . info_entropy)
|
ps
<-
L
.
nub
$
[
c
|
m
<-
[
1
..
n
]
,
cs
<-
chunkAlong
m
1
<$>
inp
,
c
<-
cs
]
]
res
=
map
unToken
.
split
identity
t
<$>
inp
when
debug
$
do
P
.
putStrLn
(
show
input
)
mapM_
(
P
.
putStrLn
.
show
)
pss
P
.
putStrLn
$
Tree
.
drawTree
$
fmap
show
$
toTree
(
NonTerminal
""
)
nt'
P
.
putStrLn
$
show
res
pure
$
expected
==
res
-- | TODO real data is a list of tokenized sentences
example0
,
example1
,
example2
,
example3
,
example4
,
example5
::
[
Text
]
example0
=
[
"New-York is New-York and New-York"
]
example1
=
[
"to-be or not to-be"
]
example2
=
[
"to-be-or not to-be-or NOT to-be and"
]
example3
=
example0
<>
example0
-- > TEST: Should not have York New in the trie
example4
=
[
"a-b-c-d e a-b-c-d f"
]
example5
=
[
"a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"
]
runTests
::
IO
()
runTests
=
forM_
[(
"example0"
,
2
,
example0
)
,(
"example1"
,
2
,
example1
)
,(
"example2"
,
3
,
example2
)
,(
"example3"
,
2
,
example3
)
,(
"example4"
,
4
,
example4
)
,(
"example5"
,
5
,
example5
)
]
(
\
(
name
,
n
,
ex
)
->
do
b
<-
testEleve
False
n
ex
P
.
putStrLn
$
name
<>
" "
<>
show
n
<>
" "
<>
if
b
then
"PASS"
else
"FAIL"
)
src/Gargantext/Text/Terms.hs
View file @
79f3006c
...
@@ -49,7 +49,7 @@ import qualified Data.List as List
...
@@ -49,7 +49,7 @@ import qualified Data.List as List
import
qualified
Data.Text
as
Text
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Eleve
(
test
Eleve
)
import
Gargantext.Text.Eleve
(
main
Eleve
)
data
TermType
lang
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
=
Mono
{
_tt_lang
::
lang
}
...
@@ -89,11 +89,12 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
...
@@ -89,11 +89,12 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: remove IO
-- TODO: BlockText
-- TODO: BlockText
extractTermsUnsupervised
::
Int
->
Text
->
IO
[[
Text
]]
extractTermsUnsupervised
::
Int
->
Text
->
[[
Text
]]
extractTermsUnsupervised
n
=
extractTermsUnsupervised
n
=
fmap
List
.
nub
List
.
nub
.
fmap
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
testEleve
n
.
List
.
concat
.
mainEleve
n
.
map
(
map
Text
.
toLower
)
.
map
(
map
Text
.
toLower
)
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
.
map
tokenize
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment