Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
d746a99b
Commit
d746a99b
authored
Jun 28, 2019
by
qlobbe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
still some refactoring
parent
eea8b990
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
523 additions
and
0 deletions
+523
-0
Aggregates.hs
src/Gargantext/Viz/Phylo/Aggregates.hs
+223
-0
Cluster.hs
src/Gargantext/Viz/Phylo/Cluster.hs
+157
-0
Metrics.hs
src/Gargantext/Viz/Phylo/Metrics.hs
+143
-0
No files found.
src/Gargantext/Viz/Phylo/Aggregates.hs
0 → 100644
View file @
d746a99b
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Viz.Phylo.Aggregates
where
import
Control.Lens
hiding
(
makeLenses
,
both
,
Level
)
import
Gargantext.Prelude
hiding
(
elem
)
import
Gargantext.Text.Context
(
TermList
)
import
Gargantext.Text.Metrics.FrequentItemSet
(
fisWithSizePolyMap
,
Size
(
..
))
import
Gargantext.Text.Terms.Mono
(
monoTexts
)
import
Gargantext.Viz.Phylo
import
Gargantext.Viz.Phylo.Tools
import
Debug.Trace
(
trace
)
import
Data.List
(
partition
,
concat
,
nub
,
elem
,
sort
,
(
++
),
null
)
import
Data.Map
(
Map
,
fromList
,
fromListWith
,
adjust
,
filterWithKey
,
toList
,
elems
,
keys
,
unionWith
,
mapWithKey
)
import
Data.Set
(
size
)
import
Data.Text
(
Text
,
unwords
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
Vector
---------------------
-- | Foundations | --
---------------------
-- | Extract all the labels of a termList
termListToNgrams
::
TermList
->
[
Ngrams
]
termListToNgrams
l
=
map
(
\
(
lbl
,
_
)
->
unwords
lbl
)
l
-------------------
-- | Documents | --
-------------------
-- | To group a list of Documents by fixed periods
groupDocsByPeriod
::
(
Ord
date
,
Enum
date
)
=>
(
doc
->
date
)
->
[(
date
,
date
)]
->
[
doc
]
->
Map
(
date
,
date
)
[
doc
]
groupDocsByPeriod
_
_
[]
=
panic
"[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
groupDocsByPeriod
f
pds
es
=
trace
(
"----
\n
Group docs by periods
\n
"
)
$
fromList
$
zip
pds
$
map
(
inPeriode
f
es
)
pds
where
--------------------------------------
inPeriode
::
Ord
b
=>
(
t
->
b
)
->
[
t
]
->
(
b
,
b
)
->
[
t
]
inPeriode
f'
h
(
start
,
end
)
=
fst
$
partition
(
\
d
->
f'
d
>=
start
&&
f'
d
<=
end
)
h
--------------------------------------
-- | To parse a list of Documents by filtering on a Vector of Ngrams
parseDocs
::
Vector
Ngrams
->
[(
Date
,
Text
)]
->
[
Document
]
parseDocs
roots
c
=
map
(
\
(
d
,
t
)
->
Document
d
(
filter
(
\
x
->
Vector
.
elem
x
roots
)
$
monoTexts
t
))
c
-- | To count the number of documents by year
countDocs
::
[(
Date
,
a
)]
->
Map
Date
Double
countDocs
corpus
=
fromListWith
(
+
)
$
map
(
\
(
d
,
_
)
->
(
d
,
1
))
corpus
-----------------
-- | Periods | --
-----------------
-- | To init a list of Periods framed by a starting Date and an ending Date
initPeriods
::
(
Eq
date
,
Enum
date
)
=>
Grain
->
Step
->
(
date
,
date
)
->
[(
date
,
date
)]
initPeriods
g
s
(
start
,
end
)
=
map
(
\
l
->
(
head'
"Doc"
l
,
last'
"Doc"
l
))
$
chunkAlong
g
s
[
start
..
end
]
--------------
-- | Cooc | --
--------------
-- | To transform a tuple of group's information into a coocurency Matrix
toCooc
::
[([
Int
],
Double
)]
->
Map
(
Int
,
Int
)
Double
toCooc
l
=
map
(
/
docs
)
$
foldl
(
\
mem
x
->
adjust
(
+
1
)
x
mem
)
cooc
$
concat
$
map
(
\
x
->
listToFullCombi
$
fst
x
)
l
where
--------------------------------------
idx
::
[
Int
]
idx
=
nub
$
concat
$
map
fst
l
--------------------------------------
docs
::
Double
docs
=
sum
$
map
snd
l
--------------------------------------
cooc
::
Map
(
Int
,
Int
)
(
Double
)
cooc
=
fromList
$
map
(
\
x
->
(
x
,
0
))
$
listToFullCombi
idx
--------------------------------------
-- | To reduce a coocurency Matrix to some keys
getSubCooc
::
[
Int
]
->
Map
(
Int
,
Int
)
Double
->
Map
(
Int
,
Int
)
Double
getSubCooc
idx
cooc
=
filterWithKey
(
\
k
_
->
(
elem
(
fst
k
)
idx
)
&&
(
elem
(
snd
k
)
idx
))
cooc
-- | To get a coocurency Matrix related to a given list of Periods
getCooc
::
[
PhyloPeriodId
]
->
Phylo
->
Map
(
Int
,
Int
)
Double
getCooc
prds
p
=
toCooc
$
map
(
\
g
->
(
getGroupNgrams
g
,
getGroupMeta
"support"
g
))
gs
where
--------------------------------------
-- | Here we need to go back to the level 1 (aka : the Fis level)
gs
::
[
PhyloGroup
]
gs
=
filter
(
\
g
->
elem
(
getGroupPeriod
g
)
prds
)
$
getGroupsWithLevel
1
p
--------------------------------------
-- | To transform a list of index into a cooc matrix
listToCooc
::
[
Int
]
->
Map
(
Int
,
Int
)
Double
listToCooc
lst
=
fromList
$
map
(
\
combi
->
(
combi
,
1
))
$
listToFullCombi
lst
-- | To build the cooc matrix by years out of the corpus
docsToCooc
::
[
Document
]
->
Vector
Ngrams
->
Map
Date
(
Map
(
Int
,
Int
)
Double
)
docsToCooc
docs
fdt
=
fromListWith
sumCooc
$
map
(
\
(
d
,
l
)
->
(
d
,
listToCooc
l
))
$
map
(
\
doc
->
(
date
doc
,
ngramsToIdx
(
text
doc
)
fdt
))
docs
-------------
-- | Fis | --
-------------
-- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
filterFis
::
Bool
->
Int
->
(
Int
->
[
PhyloFis
]
->
[
PhyloFis
])
->
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
filterFis
keep
thr
f
m
=
case
keep
of
False
->
map
(
\
l
->
f
thr
l
)
m
True
->
map
(
\
l
->
keepFilled
(
f
)
thr
l
)
m
-- | To filter Fis with small Support
filterFisBySupport
::
Int
->
[
PhyloFis
]
->
[
PhyloFis
]
filterFisBySupport
thr
l
=
filter
(
\
fis
->
getSupport
fis
>=
thr
)
l
-- | To filter Fis with small Clique size
filterFisByClique
::
Int
->
[
PhyloFis
]
->
[
PhyloFis
]
filterFisByClique
thr
l
=
filter
(
\
fis
->
(
size
$
getClique
fis
)
>=
thr
)
l
-- | To filter nested Fis
filterFisByNested
::
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
filterFisByNested
=
map
(
\
l
->
let
cliqueMax
=
filterNestedSets
(
head'
"Fis"
$
map
getClique
l
)
(
map
getClique
l
)
[]
in
filter
(
\
fis
->
elem
(
getClique
fis
)
cliqueMax
)
l
)
-- | Choose if we use a set of Fis from a file or if we have to create them
docsToFis
::
Map
(
Date
,
Date
)
[
Document
]
->
Phylo
->
Phylo
docsToFis
m
p
=
if
(
null
$
getPhyloFis
p
)
then
trace
(
"----
\n
Rebuild the Fis from scratch
\n
"
)
$
p
&
phylo_fis
.~
mapWithKey
(
\
k
docs
->
let
fis
=
toList
$
fisWithSizePolyMap
(
Segment
1
20
)
1
(
map
text
docs
)
in
map
(
\
f
->
PhyloFis
(
fst
f
)
(
snd
f
)
k
)
fis
)
m
else
trace
(
"----
\n
Use Fis from an existing file
\n
"
)
$
p
&
phylo_fis
%~
(
unionWith
(
++
)
(
fromList
$
map
(
\
k
->
(
k
,
[]
))
$
keys
m
))
-- | Process some filters on top of a set of Fis
refineFis
::
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Bool
->
Support
->
Int
->
Map
(
Date
,
Date
)
[
PhyloFis
]
refineFis
fis
k
s
t
=
traceFis
"----
\n
Filtered Fis by clique size :
\n
"
$
filterFis
k
t
(
filterFisByClique
)
$
traceFis
"----
\n
Filtered Fis by nested :
\n
"
$
filterFisByNested
$
traceFis
"----
\n
Filtered Fis by support :
\n
"
$
filterFis
k
s
(
filterFisBySupport
)
$
traceFis
"----
\n
Unfiltered Fis :
\n
"
fis
-----------------
-- | Tracers | --
-----------------
traceFis
::
[
Char
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
traceFis
lbl
m
=
trace
(
lbl
<>
"count : "
<>
show
(
sum
$
map
length
$
elems
m
)
<>
" Fis
\n
"
<>
"support : "
<>
show
(
countSup
1
supps
)
<>
" (>1) "
<>
show
(
countSup
2
supps
)
<>
" (>2) "
<>
show
(
countSup
3
supps
)
<>
" (>3) "
<>
show
(
countSup
4
supps
)
<>
" (>4) "
<>
show
(
countSup
5
supps
)
<>
" (>5) "
<>
show
(
countSup
6
supps
)
<>
" (>6)
\n
"
<>
"clique size : "
<>
show
(
countSup
1
ngrms
)
<>
" (>1) "
<>
show
(
countSup
2
ngrms
)
<>
" (>2) "
<>
show
(
countSup
3
ngrms
)
<>
" (>3) "
<>
show
(
countSup
4
ngrms
)
<>
" (>4) "
<>
show
(
countSup
5
ngrms
)
<>
" (>5) "
<>
show
(
countSup
6
ngrms
)
<>
" (>6)
\n
"
)
m
where
--------------------------------------
countSup
::
Double
->
[
Double
]
->
Int
countSup
s
l
=
length
$
filter
(
>
s
)
l
--------------------------------------
supps
::
[
Double
]
supps
=
sort
$
map
(
fromIntegral
.
_phyloFis_support
)
$
concat
$
elems
m
--------------------------------------
ngrms
::
[
Double
]
ngrms
=
sort
$
map
(
\
f
->
fromIntegral
$
size
$
_phyloFis_clique
f
)
$
concat
$
elems
m
--------------------------------------
\ No newline at end of file
src/Gargantext/Viz/Phylo/Cluster.hs
0 → 100644
View file @
d746a99b
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Viz.Phylo.Cluster
where
import
Control.Parallel.Strategies
import
Data.Graph.Clustering.Louvain.CplusPlus
import
Data.List
(
null
,
concat
,
sort
,
intersect
,(
++
),
elemIndex
,
groupBy
,
nub
,
union
,
(
\\
),
(
!!
))
import
Data.Map
(
Map
,
fromList
,
mapKeys
)
import
Data.Tuple
(
fst
)
import
Gargantext.Prelude
import
Gargantext.Viz.Phylo
import
Gargantext.Viz.Phylo.Tools
import
Gargantext.Viz.Phylo.Metrics
import
Gargantext.Viz.Phylo.LinkMaker
import
qualified
Data.Map
as
Map
import
qualified
Data.Vector.Storable
as
VS
import
Debug.Trace
(
trace
)
import
Numeric.Statistics
(
percentile
)
--------------
-- | Algo | --
--------------
relatedComp
::
Eq
a
=>
[[
a
]]
->
[[
a
]]
relatedComp
graphs
=
foldl'
(
\
mem
groups
->
if
(
null
mem
)
then
mem
++
[
groups
]
else
let
related
=
filter
(
\
groups'
->
(
not
.
null
)
$
intersect
groups
groups'
)
mem
in
if
(
null
related
)
then
mem
++
[
groups
]
else
(
mem
\\
related
)
++
[
union
groups
(
nub
$
concat
related
)]
)
[]
graphs
louvain
::
([
GroupNode
],[
GroupEdge
])
->
IO
[[
PhyloGroup
]]
louvain
(
nodes
,
edges
)
=
map
(
\
community
->
map
(
\
node
->
nodes
!!
(
l_node_id
node
))
community
)
<$>
groupBy
(
\
a
b
->
(
l_community_id
a
)
==
(
l_community_id
b
))
<$>
(
cLouvain
$
mapKeys
(
\
(
x
,
y
)
->
(
idx
x
,
idx
y
))
$
fromList
edges
)
where
--------------------------------------
idx
::
PhyloGroup
->
Int
idx
e
=
case
elemIndex
e
nodes
of
Nothing
->
panic
"[ERR][Gargantext.Viz.Phylo.Metrics.Clustering] a node is missing"
Just
i
->
i
--------------------------------------
-----------------------
-- | Cluster Maker | --
-----------------------
-- | Optimisation to filter only relevant candidates
getCandidates
::
[
PhyloGroup
]
->
[(
PhyloGroup
,
PhyloGroup
)]
getCandidates
gs
=
filter
(
\
(
g
,
g'
)
->
(
not
.
null
)
$
intersect
(
getGroupNgrams
g
)
(
getGroupNgrams
g'
))
$
filter
(
\
(
g
,
g'
)
->
g
/=
g'
)
$
listToDirectedCombi
gs
-- | To transform a Graph into Clusters
graphToClusters
::
Cluster
->
GroupGraph
->
[
PhyloCluster
]
graphToClusters
clust
(
nodes
,
edges
)
=
case
clust
of
Louvain
(
LouvainParams
_
)
->
undefined
RelatedComponents
(
RCParams
_
)
->
relatedComp
$
((
map
(
\
((
g
,
g'
),
_
)
->
[
g
,
g'
])
edges
)
++
(
map
(
\
g
->
[
g
])
nodes
))
_
->
panic
"[ERR][Viz.Phylo.Aggregates.Cluster.graphToClusters] not implemented"
-- | To transform a list of PhyloGroups into a Graph of Proximity
groupsToGraph
::
Double
->
Proximity
->
[
PhyloGroup
]
->
([
GroupNode
],[
GroupEdge
])
groupsToGraph
nbDocs
prox
gs
=
case
prox
of
WeightedLogJaccard
(
WLJParams
_
sens
)
->
(
gs
,
let
candidates
=
map
(
\
(
x
,
y
)
->
((
x
,
y
),
weightedLogJaccard
sens
nbDocs
(
getGroupCooc
x
)
(
getGroupCooc
y
)
(
getGroupNgrams
x
)
(
getGroupNgrams
y
)))
$
getCandidates
gs
candidates'
=
candidates
`
using
`
parList
rdeepseq
in
candidates'
)
Hamming
(
HammingParams
_
)
->
(
gs
,
map
(
\
(
x
,
y
)
->
((
x
,
y
),
hamming
(
getGroupCooc
x
)
(
getGroupCooc
y
)))
$
getCandidates
gs
)
_
->
undefined
-- | To filter a Graph of Proximity using a given threshold
filterGraph
::
Proximity
->
([
GroupNode
],[
GroupEdge
])
->
([
GroupNode
],[
GroupEdge
])
filterGraph
prox
(
ns
,
es
)
=
case
prox
of
WeightedLogJaccard
(
WLJParams
thr
_
)
->
(
ns
,
filter
(
\
(
_
,
v
)
->
v
>=
thr
)
es
)
Hamming
(
HammingParams
thr
)
->
(
ns
,
filter
(
\
(
_
,
v
)
->
v
<=
thr
)
es
)
_
->
undefined
-- | To clusterise a Phylo
phyloToClusters
::
Level
->
Cluster
->
Phylo
->
Map
(
Date
,
Date
)
[
PhyloCluster
]
phyloToClusters
lvl
clus
p
=
Map
.
fromList
$
zip
periods
$
map
(
\
g
->
if
null
(
fst
g
)
then
[]
else
graphToClusters
clus
g
)
graphs'
where
--------------------------------------
graphs'
::
[([
GroupNode
],[
GroupEdge
])]
graphs'
=
traceGraphFiltered
lvl
$
map
(
\
g
->
filterGraph
prox
g
)
graphs
--------------------------------------
graphs
::
[([
GroupNode
],[
GroupEdge
])]
graphs
=
traceGraph
lvl
(
getThreshold
prox
)
$
let
gs
=
map
(
\
prd
->
groupsToGraph
(
periodsToNbDocs
[
prd
]
p
)
prox
(
getGroupsWithFilters
lvl
prd
p
))
periods
gs'
=
gs
`
using
`
parList
rdeepseq
in
gs'
--------------------------------------
prox
::
Proximity
prox
=
getProximity
clus
--------------------------------------
periods
::
[
PhyloPeriodId
]
periods
=
getPhyloPeriods
p
--------------------------------------
----------------
-- | Tracer | --
----------------
traceGraph
::
Level
->
Double
->
[([
GroupNode
],[
GroupEdge
])]
->
[([
GroupNode
],[
GroupEdge
])]
traceGraph
lvl
thr
g
=
trace
(
"----
\n
Unfiltered clustering in Phylo"
<>
show
(
lvl
)
<>
" :
\n
"
<>
"count : "
<>
show
(
length
lst
)
<>
" potential edges ("
<>
show
(
length
$
filter
(
>=
thr
)
lst
)
<>
" >= "
<>
show
(
thr
)
<>
")
\n
"
<>
"similarity : "
<>
show
(
percentile
25
(
VS
.
fromList
lst
))
<>
" (25%) "
<>
show
(
percentile
50
(
VS
.
fromList
lst
))
<>
" (50%) "
<>
show
(
percentile
75
(
VS
.
fromList
lst
))
<>
" (75%) "
<>
show
(
percentile
90
(
VS
.
fromList
lst
))
<>
" (90%)
\n
"
)
g
where
lst
=
sort
$
map
snd
$
concat
$
map
snd
g
traceGraphFiltered
::
Level
->
[([
GroupNode
],[
GroupEdge
])]
->
[([
GroupNode
],[
GroupEdge
])]
traceGraphFiltered
lvl
g
=
trace
(
"----
\n
Clustering in Phylo"
<>
show
(
lvl
)
<>
" :
\n
"
<>
"count : "
<>
show
(
length
lst
)
<>
" edges
\n
"
<>
"similarity : "
<>
show
(
percentile
25
(
VS
.
fromList
lst
))
<>
" (25%) "
<>
show
(
percentile
50
(
VS
.
fromList
lst
))
<>
" (50%) "
<>
show
(
percentile
75
(
VS
.
fromList
lst
))
<>
" (75%) "
<>
show
(
percentile
90
(
VS
.
fromList
lst
))
<>
" (90%)
\n
"
)
g
where
lst
=
sort
$
map
snd
$
concat
$
map
snd
g
src/Gargantext/Viz/Phylo/Metrics.hs
0 → 100644
View file @
d746a99b
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Viz.Phylo.Metrics
where
import
Gargantext.Prelude
import
Data.List
((
\\
),
sortOn
,
concat
,
nub
,
take
,
union
,
intersect
,
null
)
import
Data.Map
(
Map
,
foldlWithKey
,
toList
,
size
,
unionWith
,
intersection
,
intersectionWith
,
filterWithKey
,
elems
,
fromList
,
findWithDefault
)
import
Data.Text
(
Text
)
-- import Debug.Trace (trace)
----------------
-- | Ngrams | --
----------------
-- | Return the conditional probability of i knowing j
conditional
::
Ord
a
=>
Map
(
a
,
a
)
Double
->
a
->
a
->
Double
conditional
m
i
j
=
(
findWithDefault
0
(
i
,
j
)
m
)
/
foldlWithKey
(
\
s
(
x
,
_
)
v
->
if
x
==
j
then
s
+
v
else
s
)
0
m
-- | Return the genericity score of a given ngram
genericity
::
Map
(
Int
,
Int
)
Double
->
[
Int
]
->
Int
->
Double
genericity
m
l
i
=
(
(
sum
$
map
(
\
j
->
conditional
m
i
j
)
l
)
-
(
sum
$
map
(
\
j
->
conditional
m
j
i
)
l
))
/
2
-- | Return the specificity score of a given ngram
specificity
::
Map
(
Int
,
Int
)
Double
->
[
Int
]
->
Int
->
Double
specificity
m
l
i
=
(
(
sum
$
map
(
\
j
->
conditional
m
j
i
)
l
)
-
(
sum
$
map
(
\
j
->
conditional
m
i
j
)
l
))
/
2
-- | Return the coverage score of a given ngram
coverage
::
Map
(
Int
,
Int
)
Double
->
[
Int
]
->
Int
->
Double
coverage
m
l
i
=
(
(
sum
$
map
(
\
j
->
conditional
m
j
i
)
l
)
+
(
sum
$
map
(
\
j
->
conditional
m
i
j
)
l
))
/
2
-- | Process some metrics on top of ngrams
getNgramsMeta
::
Map
(
Int
,
Int
)
Double
->
[
Int
]
->
Map
Text
[
Double
]
getNgramsMeta
m
ngrams
=
fromList
[
(
"genericity"
,
map
(
\
n
->
genericity
m
(
ngrams
\\
[
n
])
n
)
ngrams
),
(
"specificity"
,
map
(
\
n
->
specificity
m
(
ngrams
\\
[
n
])
n
)
ngrams
),
(
"coverage"
,
map
(
\
n
->
coverage
m
(
ngrams
\\
[
n
])
n
)
ngrams
)]
-- | To get the nth most occurent elems in a coocurency matrix
getNthMostOcc
::
Int
->
Map
(
Int
,
Int
)
Double
->
[
Int
]
getNthMostOcc
nth
cooc
=
(
nub
.
concat
)
$
map
(
\
((
idx
,
idx'
),
_
)
->
[
idx
,
idx'
])
$
take
nth
$
reverse
$
sortOn
snd
$
toList
cooc
-------------------------
-- | Ngrams Dynamics | --
-------------------------
-------------------
-- | Proximity | --
-------------------
-- | Process the inverse sumLog
sumInvLog
::
Double
->
[
Double
]
->
Double
sumInvLog
s
l
=
foldl
(
\
mem
x
->
mem
+
(
1
/
log
(
s
+
x
)))
0
l
-- | Process the sumLog
sumLog
::
Double
->
[
Double
]
->
Double
sumLog
s
l
=
foldl
(
\
mem
x
->
mem
+
log
(
s
+
x
))
0
l
-- | To compute a jaccard similarity between two lists
jaccard
::
[
Int
]
->
[
Int
]
->
Double
jaccard
inter'
union'
=
((
fromIntegral
.
length
)
$
inter'
)
/
((
fromIntegral
.
length
)
$
union'
)
-- | To get the diagonal of a matrix
toDiago
::
Map
(
Int
,
Int
)
Double
->
[
Double
]
toDiago
cooc
=
elems
$
filterWithKey
(
\
(
x
,
x'
)
_
->
x
==
x'
)
cooc
-- | To process WeighedLogJaccard distance between to coocurency matrix
weightedLogJaccard
::
Double
->
Double
->
Map
(
Int
,
Int
)
Double
->
Map
(
Int
,
Int
)
Double
->
[
Int
]
->
[
Int
]
->
Double
weightedLogJaccard
sens
nbDocs
cooc
cooc'
ngrams
ngrams'
|
null
gInter
=
0
|
gInter
==
gUnion
=
1
|
sens
==
0
=
jaccard
gInter
gUnion
|
sens
>
0
=
(
sumInvLog
sens
wInter
)
/
(
sumInvLog
sens
wUnion
)
|
otherwise
=
(
sumLog
sens
wInter
)
/
(
sumLog
sens
wUnion
)
where
--------------------------------------
gInter
::
[
Int
]
gInter
=
intersect
ngrams
ngrams'
--------------------------------------
gUnion
::
[
Int
]
gUnion
=
union
ngrams
ngrams'
--------------------------------------
wInter
::
[
Double
]
wInter
=
toDiago
$
map
(
/
nbDocs
)
$
intersectionWith
(
+
)
cooc
cooc'
--------------------------------------
wUnion
::
[
Double
]
wUnion
=
toDiago
$
map
(
/
nbDocs
)
$
unionWith
(
+
)
cooc
cooc'
--------------------------------------
-- | To process the Hamming distance between two PhyloGroup fields
hamming
::
Map
(
Int
,
Int
)
Double
->
Map
(
Int
,
Int
)
Double
->
Double
hamming
f1
f2
=
fromIntegral
$
max
((
size
inter
)
-
(
size
f1
))
((
size
inter
)
-
(
size
f2
))
where
--------------------------------------
inter
::
Map
(
Int
,
Int
)
Double
inter
=
intersection
f1
f2
--------------------------------------
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment