Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
70a2339e
Commit
70a2339e
authored
Feb 20, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev-phylo' into dev
parents
01d762cb
ddeafce7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
873 additions
and
500 deletions
+873
-500
Main.hs
bin/gargantext-adaptative-phylo/Main.hs
+1
-1
AdaptativePhylo.hs
src/Gargantext/Viz/AdaptativePhylo.hs
+63
-31
PhyloExample.hs
src/Gargantext/Viz/Phylo/PhyloExample.hs
+19
-7
PhyloExport.hs
src/Gargantext/Viz/Phylo/PhyloExport.hs
+101
-39
PhyloMaker.hs
src/Gargantext/Viz/Phylo/PhyloMaker.hs
+110
-45
PhyloTools.hs
src/Gargantext/Viz/Phylo/PhyloTools.hs
+69
-55
SynchronicClustering.hs
src/Gargantext/Viz/Phylo/SynchronicClustering.hs
+105
-42
TemporalMatching.hs
src/Gargantext/Viz/Phylo/TemporalMatching.hs
+405
-280
No files found.
bin/gargantext-adaptative-phylo/Main.hs
View file @
70a2339e
...
...
@@ -167,7 +167,7 @@ main = do
printIOMsg
"End of reconstruction, start the export"
let
dot
=
toPhyloExport
phylo
let
dot
=
toPhyloExport
phylo
let
output
=
(
outputPath
config
)
<>
(
unpack
$
phyloName
config
)
...
...
src/Gargantext/Viz/AdaptativePhylo.hs
View file @
70a2339e
...
...
@@ -57,22 +57,39 @@ data CorpusParser =
|
Csv
{
_csv_limit
::
Int
}
deriving
(
Show
,
Generic
,
Eq
)
data
SeaElevation
=
Constante
{
_cons_start
::
Double
,
_cons_step
::
Double
}
|
Adaptative
{
_adap_granularity
::
Double
}
deriving
(
Show
,
Generic
,
Eq
)
data
Proximity
=
WeightedLogJaccard
{
_wlj_sensibility
::
Double
,
_wlj_thresholdInit
::
Double
,
_wlj_thresholdStep
::
Double
}
-- , _wlj_thresholdInit :: Double
-- , _wlj_thresholdStep :: Double
-- | max height for sea level in temporal matching
-- , _wlj_elevation :: Double
}
|
Hamming
deriving
(
Show
,
Generic
,
Eq
)
data
SynchronyScope
=
SingleBranch
|
SiblingBranches
|
AllBranches
deriving
(
Show
,
Generic
,
Eq
)
data
SynchronyStrategy
=
MergeRegularGroups
|
MergeAllGroups
deriving
(
Show
,
Generic
,
Eq
)
data
Synchrony
=
ByProximityThreshold
ByProximityThreshold
{
_bpt_threshold
::
Double
,
_bpt_sensibility
::
Double
}
,
_bpt_sensibility
::
Double
,
_bpt_scope
::
SynchronyScope
,
_bpt_strategy
::
SynchronyStrategy
}
|
ByProximityDistribution
{
_bpd_sensibility
::
Double
}
{
_bpd_sensibility
::
Double
,
_bpd_strategy
::
SynchronyStrategy
}
deriving
(
Show
,
Generic
,
Eq
)
...
...
@@ -84,16 +101,18 @@ data TimeUnit =
deriving
(
Show
,
Generic
,
Eq
)
data
C
ontextualUnit
=
data
C
lique
=
Fis
{
_fis_support
::
Int
,
_fis_size
::
Int
}
|
MaxClique
{
_mcl_size
::
Int
}
deriving
(
Show
,
Generic
,
Eq
)
data
Quality
=
Quality
{
_qua_
relevance
::
Double
,
_qua_minBranch
::
Int
}
Quality
{
_qua_
granularity
::
Double
,
_qua_minBranch
::
Int
}
deriving
(
Show
,
Generic
,
Eq
)
...
...
@@ -105,10 +124,11 @@ data Config =
,
phyloName
::
Text
,
phyloLevel
::
Int
,
phyloProximity
::
Proximity
,
seaElevation
::
SeaElevation
,
phyloSynchrony
::
Synchrony
,
phyloQuality
::
Quality
,
timeUnit
::
TimeUnit
,
c
ontextualUnit
::
ContextualUnit
,
c
lique
::
Clique
,
exportLabel
::
[
PhyloLabel
]
,
exportSort
::
Sort
,
exportFilter
::
[
Filter
]
...
...
@@ -123,11 +143,12 @@ defaultConfig =
,
corpusParser
=
Csv
1000
,
phyloName
=
pack
"Default Phylo"
,
phyloLevel
=
2
,
phyloProximity
=
WeightedLogJaccard
10
0
0.1
,
phyloSynchrony
=
ByProximityDistribution
0
,
phyloQuality
=
Quality
0.1
1
,
phyloProximity
=
WeightedLogJaccard
10
,
seaElevation
=
Constante
0
0.1
,
phyloSynchrony
=
ByProximityThreshold
0.5
10
SiblingBranches
MergeAllGroups
,
phyloQuality
=
Quality
0.6
1
,
timeUnit
=
Year
3
1
5
,
c
ontextualUnit
=
Fis
1
5
,
c
lique
=
Fis
1
5
,
exportLabel
=
[
BranchLabel
MostInclusive
2
,
GroupLabel
MostEmergentInclusive
2
]
,
exportSort
=
ByHierarchy
,
exportFilter
=
[
ByBranchSize
2
]
...
...
@@ -139,10 +160,12 @@ instance FromJSON CorpusParser
instance
ToJSON
CorpusParser
instance
FromJSON
Proximity
instance
ToJSON
Proximity
instance
FromJSON
SeaElevation
instance
ToJSON
SeaElevation
instance
FromJSON
TimeUnit
instance
ToJSON
TimeUnit
instance
FromJSON
C
ontextualUnit
instance
ToJSON
C
ontextualUnit
instance
FromJSON
C
lique
instance
ToJSON
C
lique
instance
FromJSON
PhyloLabel
instance
ToJSON
PhyloLabel
instance
FromJSON
Tagger
...
...
@@ -153,6 +176,10 @@ instance FromJSON Order
instance
ToJSON
Order
instance
FromJSON
Filter
instance
ToJSON
Filter
instance
FromJSON
SynchronyScope
instance
ToJSON
SynchronyScope
instance
FromJSON
SynchronyStrategy
instance
ToJSON
SynchronyStrategy
instance
FromJSON
Synchrony
instance
ToJSON
Synchrony
instance
FromJSON
Quality
...
...
@@ -239,6 +266,8 @@ data Phylo =
Phylo
{
_phylo_foundations
::
PhyloFoundations
,
_phylo_timeCooc
::
!
(
Map
Date
Cooc
)
,
_phylo_timeDocs
::
!
(
Map
Date
Double
)
,
_phylo_termFreq
::
!
(
Map
Int
Double
)
,
_phylo_groupsProxi
::
!
(
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
)
,
_phylo_param
::
PhyloParam
,
_phylo_periods
::
Map
PhyloPeriodId
PhyloPeriod
}
...
...
@@ -310,21 +339,17 @@ data Filiation = ToParents | ToChilds deriving (Generic, Show)
data
PointerType
=
TemporalPointer
|
LevelPointer
deriving
(
Generic
,
Show
)
---------------------------
-- | Frequent Item Set | --
---------------------------
-- | Clique : Set of ngrams cooccurring in the same Document
type
Clique
=
Set
Ngrams
----------------------
-- | Phylo Clique | --
----------------------
-- | Support : Number of Documents where a Clique occurs
type
Support
=
Int
-- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
data
PhyloFis
=
PhyloFis
{
_phyloFis_clique
::
Clique
,
_phyloFis_support
::
Support
,
_phyloFis_period
::
(
Date
,
Date
)
data
PhyloClique
=
PhyloClique
{
_phyloClique_nodes
::
Set
Ngrams
,
_phyloClique_support
::
Support
,
_phyloClique_period
::
(
Date
,
Date
)
}
deriving
(
Generic
,
NFData
,
Show
,
Eq
)
...
...
@@ -356,9 +381,15 @@ data PhyloLabel =
data
PhyloBranch
=
PhyloBranch
{
_branch_id
::
PhyloBranchId
,
_branch_label
::
Text
,
_branch_meta
::
Map
Text
[
Double
]
}
deriving
(
Generic
,
Show
)
,
_branch_canonId
::
[
Int
]
,
_branch_seaLevel
::
[
Double
]
,
_branch_x
::
Double
,
_branch_y
::
Double
,
_branch_w
::
Double
,
_branch_t
::
Double
,
_branch_label
::
Text
,
_branch_meta
::
Map
Text
[
Double
]
}
deriving
(
Generic
,
Show
,
Eq
)
data
PhyloExport
=
PhyloExport
...
...
@@ -372,12 +403,13 @@ data PhyloExport =
makeLenses
''
C
onfig
makeLenses
''
P
roximity
makeLenses
''
S
eaElevation
makeLenses
''
Q
uality
makeLenses
''
C
ontextualUnit
makeLenses
''
C
lique
makeLenses
''
P
hyloLabel
makeLenses
''
T
imeUnit
makeLenses
''
P
hyloFoundations
makeLenses
''
P
hylo
Fis
makeLenses
''
P
hylo
Clique
makeLenses
''
P
hylo
makeLenses
''
P
hyloPeriod
makeLenses
''
P
hyloLevel
...
...
src/Gargantext/Viz/Phylo/PhyloExample.hs
View file @
70a2339e
...
...
@@ -30,7 +30,7 @@ import Gargantext.Viz.AdaptativePhylo
import
Gargantext.Viz.Phylo.PhyloTools
import
Gargantext.Viz.Phylo.PhyloMaker
import
Gargantext.Viz.Phylo.PhyloExport
import
Gargantext.Viz.Phylo.TemporalMatching
(
t
emporalMatching
)
import
Gargantext.Viz.Phylo.TemporalMatching
(
adaptativeTemporalMatching
,
constanteT
emporalMatching
)
import
Gargantext.Viz.Phylo.SynchronicClustering
(
synchronicClustering
)
import
Control.Lens
...
...
@@ -38,6 +38,9 @@ import Data.GraphViz.Types.Generalised (DotGraph)
import
qualified
Data.Vector
as
Vector
---------------------------------
-- | STEP 5 | -- Export the phylo
---------------------------------
phyloExport
::
IO
()
phyloExport
=
dotToFile
"/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot"
phyloDot
...
...
@@ -45,6 +48,10 @@ phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot"
phyloDot
::
DotGraph
DotId
phyloDot
=
toPhyloExport
phylo2
--------------------------------------------------
-- | STEP 4 | -- Process the synchronic clustering
--------------------------------------------------
phylo2
::
Phylo
phylo2
=
synchronicClustering
phylo1
...
...
@@ -53,17 +60,22 @@ phylo2 = synchronicClustering phylo1
-----------------------------------------------
phylo1
::
Phylo
phylo1
=
temporalMatching
$
appendGroups
fisToGroup
1
phyloFis
phyloBase
phylo1
=
case
(
getSeaElevation
phyloBase
)
of
Constante
s
g
->
constanteTemporalMatching
s
g
$
toGroupsProxi
1
$
appendGroups
cliqueToGroup
1
phyloClique
phyloBase
Adaptative
s
->
adaptativeTemporalMatching
s
$
toGroupsProxi
1
$
appendGroups
cliqueToGroup
1
phyloClique
phyloBase
---------------------------------------------
-- | STEP 2 | -- Build the
frequent items set
-- | STEP 2 | -- Build the
cliques
---------------------------------------------
phylo
Fis
::
Map
(
Date
,
Date
)
[
PhyloFis
]
phylo
Fis
=
toPhyloFis
docsByPeriods
(
getFisSupport
$
contextualUnit
config
)
(
getFisSize
$
contextualUnit
config
)
phylo
Clique
::
Map
(
Date
,
Date
)
[
PhyloClique
]
phylo
Clique
=
toPhyloClique
phyloBase
docsByPeriods
docsByPeriods
::
Map
(
Date
,
Date
)
[
Document
]
...
...
@@ -96,7 +108,7 @@ config =
defaultConfig
{
phyloName
=
"Cesar et Cleopatre"
,
phyloLevel
=
2
,
exportFilter
=
[
ByBranchSize
0
]
,
c
ontextualUnit
=
Fis
0
0
}
,
c
lique
=
Fis
0
0
}
docs
::
[
Document
]
...
...
src/Gargantext/Viz/Phylo/PhyloExport.hs
View file @
70a2339e
...
...
@@ -18,7 +18,7 @@ Portability : POSIX
module
Gargantext.Viz.Phylo.PhyloExport
where
import
Data.Map
(
Map
,
fromList
,
empty
,
fromListWith
,
insert
,
(
!
),
elems
,
unionWith
,
findWithDefault
,
toList
)
import
Data.List
((
++
),
sort
,
nub
,
concat
,
sortOn
,
reverse
,
groupBy
,
union
,
(
\\
),
(
!!
),
init
,
partition
,
unwords
,
nubBy
)
import
Data.List
((
++
),
sort
,
nub
,
concat
,
sortOn
,
reverse
,
groupBy
,
union
,
(
\\
),
(
!!
),
init
,
partition
,
unwords
,
nubBy
,
inits
,
elemIndex
)
import
Data.Vector
(
Vector
)
import
Prelude
(
writeFile
)
...
...
@@ -36,6 +36,7 @@ import System.FilePath
import
Debug.Trace
(
trace
)
import
qualified
Data.Text
as
Text
import
qualified
Data.Vector
as
Vector
import
qualified
Data.Text.Lazy
as
Lazy
import
qualified
Data.GraphViz.Attributes.HTML
as
H
...
...
@@ -105,44 +106,51 @@ groupToTable fdt g = H.Table H.HTable
<>
(
pack
$
show
(
fst
$
g
^.
phylo_groupPeriod
))
<>
(
fromStrict
" , "
)
<>
(
pack
$
show
(
snd
$
g
^.
phylo_groupPeriod
))
<>
(
fromStrict
" ) "
))]]
<>
(
fromStrict
" ) "
)
<>
(
pack
$
show
(
getGroupId
g
)))]]
--------------------------------------
branchToDotNode
::
PhyloBranch
->
Dot
DotId
branchToDotNode
b
=
branchToDotNode
::
PhyloBranch
->
Int
->
Dot
DotId
branchToDotNode
b
bId
=
node
(
branchIdToDotId
$
b
^.
branch_id
)
([
FillColor
[
toWColor
CornSilk
],
FontName
"Arial"
,
FontSize
40
,
Shape
Egg
,
Style
[
SItem
Bold
[]
],
Label
(
toDotLabel
$
b
^.
branch_label
)]
<>
(
metaToAttr
$
b
^.
branch_meta
)
<>
[
toAttr
"nodeType"
"branch"
,
toAttr
"branchId"
(
pack
$
unwords
(
map
show
$
snd
$
b
^.
branch_id
))
])
,
toAttr
"bId"
(
pack
$
show
bId
)
,
toAttr
"branchId"
(
pack
$
unwords
(
map
show
$
snd
$
b
^.
branch_id
))
,
toAttr
"branch_x"
(
fromStrict
$
Text
.
pack
$
(
show
$
b
^.
branch_x
))
,
toAttr
"branch_y"
(
fromStrict
$
Text
.
pack
$
(
show
$
b
^.
branch_y
))
,
toAttr
"label"
(
pack
$
show
$
b
^.
branch_label
)
])
periodToDotNode
::
(
Date
,
Date
)
->
Dot
DotId
periodToDotNode
prd
=
node
(
periodIdToDotId
prd
)
([
Shape
Squar
e
,
FontSize
50
,
Label
(
toDotLabel
$
Text
.
pack
(
show
(
fst
prd
)
<>
" "
<>
show
(
snd
prd
)))]
([
Shape
BoxShap
e
,
FontSize
50
,
Label
(
toDotLabel
$
Text
.
pack
(
show
(
fst
prd
)
<>
" "
<>
show
(
snd
prd
)))]
<>
[
toAttr
"nodeType"
"period"
,
toAttr
"from"
(
fromStrict
$
Text
.
pack
$
(
show
$
fst
prd
))
,
toAttr
"to"
(
fromStrict
$
Text
.
pack
$
(
show
$
snd
prd
))])
groupToDotNode
::
Vector
Ngrams
->
PhyloGroup
->
Dot
DotId
groupToDotNode
fdt
g
=
groupToDotNode
::
Vector
Ngrams
->
PhyloGroup
->
Int
->
Dot
DotId
groupToDotNode
fdt
g
bId
=
node
(
groupIdToDotId
$
getGroupId
g
)
([
FontName
"Arial"
,
Shape
Square
,
toLabel
(
groupToTable
fdt
g
)]
([
FontName
"Arial"
,
Shape
Square
,
penWidth
4
,
toLabel
(
groupToTable
fdt
g
)]
<>
[
toAttr
"nodeType"
"group"
,
toAttr
"from"
(
pack
$
show
(
fst
$
g
^.
phylo_groupPeriod
))
,
toAttr
"to"
(
pack
$
show
(
snd
$
g
^.
phylo_groupPeriod
))
,
toAttr
"branchId"
(
pack
$
unwords
(
init
$
map
show
$
snd
$
g
^.
phylo_groupBranchId
))
,
toAttr
"bId"
(
pack
$
show
bId
)
,
toAttr
"support"
(
pack
$
show
(
g
^.
phylo_groupSupport
))])
toDotEdge
::
DotId
->
DotId
->
Text
.
Text
->
EdgeType
->
Dot
DotId
toDotEdge
source
target
lbl
edgeType
=
edge
source
target
(
case
edgeType
of
GroupToGroup
->
[
Width
10
,
Color
[
toWColor
Black
],
Constraint
True
,
Label
(
StrLabel
$
fromStrict
lbl
)]
GroupToGroup
->
[
Width
3
,
penWidth
4
,
Color
[
toWColor
Black
],
Constraint
True
,
Label
(
StrLabel
$
fromStrict
lbl
)]
<>
[
toAttr
"edgeType"
"link"
]
BranchToGroup
->
[
Width
3
,
Color
[
toWColor
Black
],
ArrowHead
(
AType
[(
ArrMod
FilledArrow
RightSide
,
DotArrow
)])
,
Label
(
StrLabel
$
fromStrict
lbl
)]
,
Label
(
StrLabel
$
fromStrict
lbl
)]
<>
[
toAttr
"edgeType"
"branchLink"
]
BranchToBranch
->
[
Width
2
,
Color
[
toWColor
Black
],
Style
[
SItem
Dashed
[]
],
ArrowHead
(
AType
[(
ArrMod
FilledArrow
BothSides
,
DotArrow
)])
,
Label
(
StrLabel
$
fromStrict
lbl
)]
PeriodToPeriod
->
[
Width
5
,
Color
[
toWColor
Black
]])
...
...
@@ -155,10 +163,17 @@ mergePointers groups =
in
unionWith
(
\
w
w'
->
max
w
w'
)
toChilds
toParents
toBid
::
PhyloGroup
->
[
PhyloBranch
]
->
Int
toBid
g
bs
=
let
b'
=
head'
"toBid"
(
filter
(
\
b
->
b
^.
branch_id
==
g
^.
phylo_groupBranchId
)
bs
)
in
fromJust
$
elemIndex
b'
bs
exportToDot
::
Phylo
->
PhyloExport
->
DotGraph
DotId
exportToDot
phylo
export
=
trace
(
"
\n
-- | Convert "
<>
show
(
length
$
export
^.
export_branches
)
<>
" branches and "
<>
show
(
length
$
export
^.
export_groups
)
<>
" groups to a dot file
\n
"
)
$
<>
show
(
length
$
export
^.
export_groups
)
<>
" groups "
<>
show
(
length
$
nub
$
concat
$
map
(
\
g
->
g
^.
phylo_groupNgrams
)
$
export
^.
export_groups
)
<>
" terms to a dot file
\n\n
"
<>
"##########################"
)
$
digraph
((
Str
.
fromStrict
)
$
(
phyloName
$
getConfig
phylo
))
$
do
-- | 1) init the dot graph
...
...
@@ -167,11 +182,12 @@ exportToDot phylo export =
,
Ratio
FillRatio
,
Style
[
SItem
Filled
[]
],
Color
[
toWColor
White
]]
-- | home made attributes
<>
[(
toAttr
(
fromStrict
"nbDocs"
)
$
pack
$
show
(
sum
$
elems
$
phylo
^.
phylo_timeDocs
))
,(
toAttr
(
fromStrict
"proxiName"
)
$
pack
$
show
(
getProximityName
$
phyloProximity
$
getConfig
phylo
))
,(
toAttr
(
fromStrict
"proxiInit"
)
$
pack
$
show
(
getProximityInit
$
phyloProximity
$
getConfig
phylo
))
,(
toAttr
(
fromStrict
"proxiStep"
)
$
pack
$
show
(
getProximityStep
$
phyloProximity
$
getConfig
phylo
))
,(
toAttr
(
fromStrict
"quaFactor"
)
$
pack
$
show
(
_qua_relevance
$
phyloQuality
$
getConfig
phylo
))
<>
[(
toAttr
(
fromStrict
"phyloFoundations"
)
$
pack
$
show
(
length
$
Vector
.
toList
$
getRoots
phylo
))
,(
toAttr
(
fromStrict
"phyloTerms"
)
$
pack
$
show
(
length
$
nub
$
concat
$
map
(
\
g
->
g
^.
phylo_groupNgrams
)
$
export
^.
export_groups
))
,(
toAttr
(
fromStrict
"phyloDocs"
)
$
pack
$
show
(
sum
$
elems
$
phylo
^.
phylo_timeDocs
))
,(
toAttr
(
fromStrict
"phyloPeriods"
)
$
pack
$
show
(
length
$
elems
$
phylo
^.
phylo_periods
))
,(
toAttr
(
fromStrict
"phyloBranches"
)
$
pack
$
show
(
length
$
export
^.
export_branches
))
,(
toAttr
(
fromStrict
"phyloGroups"
)
$
pack
$
show
(
length
$
export
^.
export_groups
))
])
...
...
@@ -191,7 +207,7 @@ exportToDot phylo export =
-- mapM branchToDotNode branches
-- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
mapM
branchToDotNode
$
export
^.
export_branches
mapM
(
\
b
->
branchToDotNode
b
(
fromJust
$
elemIndex
b
(
export
^.
export_branches
)))
$
export
^.
export_branches
-- | 5) create a layer for each period
_
<-
mapM
(
\
period
->
...
...
@@ -200,7 +216,7 @@ exportToDot phylo export =
periodToDotNode
period
-- | 6) create a node for each group
mapM
(
\
g
->
groupToDotNode
(
getRoots
phylo
)
g
)
(
filter
(
\
g
->
g
^.
phylo_groupPeriod
==
period
)
$
export
^.
export_groups
)
mapM
(
\
g
->
groupToDotNode
(
getRoots
phylo
)
g
(
toBid
g
(
export
^.
export_branches
))
)
(
filter
(
\
g
->
g
^.
phylo_groupPeriod
==
period
)
$
export
^.
export_groups
)
)
$
getPeriodIds
phylo
-- | 7) create the edges between a branch and its first groups
...
...
@@ -224,12 +240,12 @@ exportToDot phylo export =
)
$
nubBy
(
\
combi
combi'
->
fst
combi
==
fst
combi'
)
$
listToCombi'
$
getPeriodIds
phylo
-- | 8) create the edges between the branches
_
<-
mapM
(
\
(
bId
,
bId'
)
->
toDotEdge
(
branchIdToDotId
bId
)
(
branchIdToDotId
bId'
)
(
Text
.
pack
$
show
(
branchIdsToProximity
bId
bId'
(
getThresholdInit
$
phyloProximity
$
getConfig
phylo
)
(
getThresholdStep
$
phyloProximity
$
getConfig
phylo
)))
BranchToBranch
)
$
nubBy
(
\
combi
combi'
->
fst
combi
==
fst
combi'
)
$
listToCombi'
$
map
_branch_id
$
export
^.
export_branches
--
_ <- mapM (\(bId,bId') ->
--
toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
--
(Text.pack $ show(branchIdsToProximity bId bId'
--
(getThresholdInit $ phyloProximity $ getConfig phylo)
--
(getThresholdStep $ phyloProximity $ getConfig phylo))) BranchToBranch
--
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
graphAttrs
[
Rank
SameRank
]
...
...
@@ -261,11 +277,25 @@ processFilters filters qua export =
-- | Sort | --
--------------
branchToIso
::
[
PhyloBranch
]
->
[
PhyloBranch
]
branchToIso
branches
=
let
steps
=
map
sum
$
inits
$
map
(
\
(
b
,
x
)
->
b
^.
branch_y
+
0.05
-
x
)
$
zip
branches
$
([
0
]
++
(
map
(
\
(
b
,
b'
)
->
let
idx
=
length
$
commonPrefix
(
b
^.
branch_canonId
)
(
b'
^.
branch_canonId
)
[]
in
(
b'
^.
branch_seaLevel
)
!!
(
idx
-
1
)
)
$
listToSeq
branches
))
in
map
(
\
(
x
,
b
)
->
b
&
branch_x
.~
x
)
$
zip
steps
branches
sortByHierarchy
::
Int
->
[
PhyloBranch
]
->
[
PhyloBranch
]
sortByHierarchy
depth
branches
=
if
(
length
branches
==
1
)
then
branches
else
concat
then
branch
ToIso
branch
es
else
branchToIso
$
concat
$
map
(
\
branches'
->
let
partitions
=
partition
(
\
b
->
depth
+
1
==
((
length
.
snd
)
$
b
^.
branch_id
))
branches'
in
(
sortOn
(
\
b
->
(
b
^.
branch_meta
)
!
"birth"
)
(
fst
partitions
))
...
...
@@ -338,11 +368,12 @@ branchDating export =
$
foldl'
(
\
acc
g
->
if
(
g
^.
phylo_groupBranchId
==
b
^.
branch_id
)
then
acc
++
[
g
^.
phylo_groupPeriod
]
else
acc
)
[]
$
export
^.
export_groups
periods
=
nub
groups
birth
=
fst
$
head'
"birth"
groups
age
=
(
snd
$
last'
"age"
groups
)
-
birth
in
b
&
branch_meta
%~
insert
"birth"
[
fromIntegral
birth
]
&
branch_meta
%~
insert
"age"
[
fromIntegral
age
]
&
branch_meta
%~
insert
"size"
[
fromIntegral
$
length
group
s
]
)
export
&
branch_meta
%~
insert
"size"
[
fromIntegral
$
length
period
s
]
)
export
processMetrics
::
PhyloExport
->
PhyloExport
processMetrics
export
=
ngramsMetrics
...
...
@@ -409,8 +440,8 @@ processLabels labels foundations export =
toDynamics
::
Int
->
[
PhyloGroup
]
->
PhyloGroup
->
Map
Int
(
Date
,
Date
)
->
Double
toDynamics
n
parents
g
roup
m
=
let
prd
=
g
roup
^.
phylo_groupPeriod
toDynamics
n
parents
g
m
=
let
prd
=
g
^.
phylo_groupPeriod
end
=
last'
"dynamics"
(
sort
$
map
snd
$
elems
m
)
in
if
(((
snd
prd
)
==
(
snd
$
m
!
n
))
&&
(
snd
prd
/=
end
))
-- | decrease
...
...
@@ -429,7 +460,7 @@ toDynamics n parents group m =
processDynamics
::
[
PhyloGroup
]
->
[
PhyloGroup
]
processDynamics
groups
=
processDynamics
groups
=
map
(
\
g
->
let
parents
=
filter
(
\
g'
->
(
g
^.
phylo_groupBranchId
==
g'
^.
phylo_groupBranchId
)
&&
((
fst
$
g
^.
phylo_groupPeriod
)
>
(
fst
$
g'
^.
phylo_groupPeriod
)))
groups
...
...
@@ -449,7 +480,6 @@ processDynamics groups =
-- | phyloExport | --
---------------------
toPhyloExport
::
Phylo
->
DotGraph
DotId
toPhyloExport
phylo
=
exportToDot
phylo
$
processFilters
(
exportFilter
$
getConfig
phylo
)
(
phyloQuality
$
getConfig
phylo
)
...
...
@@ -458,15 +488,47 @@ toPhyloExport phylo = exportToDot phylo
$
processMetrics
export
where
export
::
PhyloExport
export
=
PhyloExport
groups
branches
export
=
PhyloExport
groups
branches
--------------------------------------
branches
::
[
PhyloBranch
]
branches
=
traceExportBranches
$
map
(
\
bId
->
PhyloBranch
bId
""
empty
)
$
nub
$
map
_phylo_groupBranchId
groups
branches
::
[
PhyloBranch
]
branches
=
map
(
\
g
->
let
seaLvl
=
(
g
^.
phylo_groupMeta
)
!
"seaLevels"
breaks
=
(
g
^.
phylo_groupMeta
)
!
"breaks"
canonId
=
take
(
round
$
(
last'
"export"
breaks
)
+
2
)
(
snd
$
g
^.
phylo_groupBranchId
)
in
PhyloBranch
(
g
^.
phylo_groupBranchId
)
canonId
seaLvl
0
(
last'
"export"
(
take
(
round
$
(
last'
"export"
breaks
)
+
1
)
seaLvl
))
0
0
""
empty
)
$
map
(
\
gs
->
head'
"export"
gs
)
$
groupBy
(
\
g
g'
->
g
^.
phylo_groupBranchId
==
g'
^.
phylo_groupBranchId
)
$
sortOn
(
\
g
->
g
^.
phylo_groupBranchId
)
groups
--------------------------------------
groups
::
[
PhyloGroup
]
groups
=
processDynamics
$
getGroupsFromLevel
(
phyloLevel
$
getConfig
phylo
)
phylo
groups
=
traceExportGroups
$
processDynamics
$
getGroupsFromLevel
(
phyloLevel
$
getConfig
phylo
)
$
tracePhyloInfo
phylo
traceExportBranches
::
[
PhyloBranch
]
->
[
PhyloBranch
]
traceExportBranches
branches
=
trace
(
"
\n
"
<>
"-- | Export "
<>
show
(
length
branches
)
<>
" branches"
)
branches
traceExportBranches
branches
=
trace
(
"
\n
"
<>
"-- | Export "
<>
show
(
length
branches
)
<>
" branches"
)
branches
tracePhyloInfo
::
Phylo
->
Phylo
tracePhyloInfo
phylo
=
trace
(
"
\n
"
<>
"##########################"
<>
"
\n\n
"
<>
"-- | Phylo with β = "
<>
show
(
_qua_granularity
$
phyloQuality
$
getConfig
phylo
)
<>
" applied to "
<>
show
(
length
$
Vector
.
toList
$
getRoots
phylo
)
<>
" foundations"
)
phylo
traceExportGroups
::
[
PhyloGroup
]
->
[
PhyloGroup
]
traceExportGroups
groups
=
trace
(
"
\n
"
<>
"-- | Export "
<>
show
(
length
$
nub
$
map
(
\
g
->
g
^.
phylo_groupBranchId
)
groups
)
<>
" branches, "
<>
show
(
length
groups
)
<>
" groups and "
<>
show
(
length
$
nub
$
concat
$
map
(
\
g
->
g
^.
phylo_groupNgrams
)
groups
)
<>
" terms"
)
groups
src/Gargantext/Viz/Phylo/PhyloMaker.hs
View file @
70a2339e
...
...
@@ -15,15 +15,15 @@ Portability : POSIX
module
Gargantext.Viz.Phylo.PhyloMaker
where
import
Data.List
(
concat
,
nub
,
partition
,
sort
,
(
++
))
import
Data.Map
(
Map
,
fromListWith
,
keys
,
unionWith
,
fromList
,
empty
,
toList
,
elems
,
(
!
),
restrictKeys
)
import
Data.List
(
concat
,
nub
,
partition
,
sort
,
(
++
)
,
group
,
intersect
,
null
,
sortOn
,
groupBy
)
import
Data.Map
(
Map
,
fromListWith
,
keys
,
unionWith
,
fromList
,
empty
,
toList
,
elems
,
(
!
),
restrictKeys
,
foldlWithKey
)
import
Data.Set
(
size
)
import
Data.Vector
(
Vector
)
import
Gargantext.Prelude
import
Gargantext.Viz.AdaptativePhylo
import
Gargantext.Viz.Phylo.PhyloTools
import
Gargantext.Viz.Phylo.TemporalMatching
(
temporalMatching
)
import
Gargantext.Viz.Phylo.TemporalMatching
(
adaptativeTemporalMatching
,
constanteTemporalMatching
,
getNextPeriods
,
filterDocs
,
filterDiago
,
reduceDiagos
,
toProximity
)
import
Gargantext.Viz.Phylo.SynchronicClustering
(
synchronicClustering
)
import
Gargantext.Text.Context
(
TermList
)
import
Gargantext.Text.Metrics.FrequentItemSet
(
fisWithSizePolyMap
,
Size
(
..
))
...
...
@@ -43,7 +43,8 @@ import qualified Data.Set as Set
toPhylo
::
[
Document
]
->
TermList
->
Config
->
Phylo
toPhylo
docs
lst
conf
=
traceToPhylo
(
phyloLevel
conf
)
$
toPhylo
docs
lst
conf
=
trace
(
"# phylo1 groups "
<>
show
(
length
$
getGroupsFromLevel
1
phylo1
))
$
traceToPhylo
(
phyloLevel
conf
)
$
if
(
phyloLevel
conf
)
>
1
then
foldl'
(
\
phylo'
_
->
synchronicClustering
phylo'
)
phylo1
[
2
..
(
phyloLevel
conf
)]
else
phylo1
...
...
@@ -62,6 +63,33 @@ toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
-- | To Phylo 1 | --
--------------------
toGroupsProxi
::
Level
->
Phylo
->
Phylo
toGroupsProxi
lvl
phylo
=
let
proximity
=
phyloProximity
$
getConfig
phylo
groupsProxi
=
foldlWithKey
(
\
acc
pId
pds
->
-- 1) process period by period
let
egos
=
map
(
\
g
->
(
getGroupId
g
,
g
^.
phylo_groupNgrams
))
$
elems
$
view
(
phylo_periodLevels
.
traverse
.
filtered
(
\
phyloLvl
->
phyloLvl
^.
phylo_levelLevel
==
lvl
)
.
phylo_levelGroups
)
pds
next
=
getNextPeriods
ToParents
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
pId
(
keys
$
phylo
^.
phylo_periods
)
targets
=
map
(
\
g
->
(
getGroupId
g
,
g
^.
phylo_groupNgrams
))
$
getGroupsFromLevelPeriods
lvl
next
phylo
docs
=
filterDocs
(
phylo
^.
phylo_timeDocs
)
([
pId
]
++
next
)
diagos
=
filterDiago
(
phylo
^.
phylo_timeCooc
)
([
pId
]
++
next
)
-- 2) compute the pairs in parallel
pairs
=
map
(
\
(
id
,
ngrams
)
->
map
(
\
(
id'
,
ngrams'
)
->
let
nbDocs
=
(
sum
.
elems
)
$
filterDocs
docs
([
idToPrd
id
,
idToPrd
id'
])
diago
=
reduceDiagos
$
filterDiago
diagos
([
idToPrd
id
,
idToPrd
id'
])
in
((
id
,
id'
),
toProximity
nbDocs
diago
proximity
ngrams
ngrams'
ngrams'
)
)
$
filter
(
\
(
_
,
ngrams'
)
->
(
not
.
null
)
$
intersect
ngrams
ngrams'
)
targets
)
egos
pairs'
=
pairs
`
using
`
parList
rdeepseq
in
acc
++
(
concat
pairs'
)
)
[]
$
phylo
^.
phylo_periods
in
phylo
&
phylo_groupsProxi
.~
((
traceGroupsProxi
.
fromList
)
groupsProxi
)
appendGroups
::
(
a
->
PhyloPeriodId
->
Level
->
Int
->
Vector
Ngrams
->
[
Cooc
]
->
PhyloGroup
)
->
Level
->
Map
(
Date
,
Date
)
[
a
]
->
Phylo
->
Phylo
appendGroups
f
lvl
m
phylo
=
trace
(
"
\n
"
<>
"-- | Append "
<>
show
(
length
$
concat
$
elems
m
)
<>
" groups to Level "
<>
show
(
lvl
)
<>
"
\n
"
)
...
...
@@ -72,40 +100,44 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
(
\
phyloLvl
->
if
lvl
==
(
phyloLvl
^.
phylo_levelLevel
)
then
let
pId
=
phyloLvl
^.
phylo_levelPeriod
phylo
Fis
=
m
!
pId
phylo
CUnit
=
m
!
pId
in
phyloLvl
&
phylo_levelGroups
.~
(
fromList
$
foldl
(
\
groups
obj
->
groups
++
[
(((
pId
,
lvl
),
length
groups
)
,
f
obj
pId
lvl
(
length
groups
)
(
getRoots
phylo
)
(
elems
$
restrictKeys
(
phylo
^.
phylo_timeCooc
)
$
periodsToYears
[
pId
]))
]
)
[]
phylo
Fis
)
]
)
[]
phylo
CUnit
)
else
phyloLvl
)
phylo
fisToGroup
::
PhyloFis
->
PhyloPeriodId
->
Level
->
Int
->
Vector
Ngrams
->
[
Cooc
]
->
PhyloGroup
fis
ToGroup
fis
pId
lvl
idx
fdt
coocs
=
let
ngrams
=
ngramsToIdx
(
Set
.
toList
$
fis
^.
phylo
Fis_clique
)
fdt
cliqueToGroup
::
PhyloClique
->
PhyloPeriodId
->
Level
->
Int
->
Vector
Ngrams
->
[
Cooc
]
->
PhyloGroup
clique
ToGroup
fis
pId
lvl
idx
fdt
coocs
=
let
ngrams
=
ngramsToIdx
(
Set
.
toList
$
fis
^.
phylo
Clique_nodes
)
fdt
in
PhyloGroup
pId
lvl
idx
""
(
fis
^.
phylo
Fis
_support
)
(
fis
^.
phylo
Clique
_support
)
ngrams
(
ngramsToCooc
ngrams
coocs
)
(
1
,[
0
])
empty
(
1
,[
0
])
-- | branchid (lvl,[path in the branching tree])
(
fromList
[(
"breaks"
,[
0
]),(
"seaLevels"
,[
0
])])
[]
[]
[]
[]
toPhylo1
::
[
Document
]
->
Phylo
->
Phylo
toPhylo1
docs
phyloBase
=
temporalMatching
$
appendGroups
fisToGroup
1
phyloFis
phyloBase
toPhylo1
docs
phyloBase
=
case
(
getSeaElevation
phyloBase
)
of
Constante
start
gap
->
constanteTemporalMatching
start
gap
$
appendGroups
cliqueToGroup
1
phyloClique
phyloBase
Adaptative
steps
->
adaptativeTemporalMatching
steps
$
toGroupsProxi
1
$
appendGroups
cliqueToGroup
1
phyloClique
phyloBase
where
--------------------------------------
phylo
Fis
::
Map
(
Date
,
Date
)
[
PhyloFis
]
phylo
Fis
=
toPhyloFis
docs'
(
getFisSupport
$
contextualUnit
$
getConfig
phyloBase
)
(
getFisSize
$
contextualUnit
$
getConfig
phyloBase
)
phylo
Clique
::
Map
(
Date
,
Date
)
[
PhyloClique
]
phylo
Clique
=
toPhyloClique
phyloBase
docs'
--------------------------------------
docs'
::
Map
(
Date
,
Date
)
[
Document
]
docs'
=
groupDocsByPeriod
date
(
getPeriodIds
phyloBase
)
docs
docs'
=
groupDocsByPeriod
'
date
(
getPeriodIds
phyloBase
)
docs
--------------------------------------
...
...
@@ -115,54 +147,59 @@ toPhylo1 docs phyloBase = temporalMatching
-- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
filter
Fis
::
Bool
->
Int
->
(
Int
->
[
PhyloFis
]
->
[
PhyloFis
])
->
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
filter
Fis
keep
thr
f
m
=
case
keep
of
filter
Clique
::
Bool
->
Int
->
(
Int
->
[
PhyloClique
]
->
[
PhyloClique
])
->
Map
(
Date
,
Date
)
[
PhyloClique
]
->
Map
(
Date
,
Date
)
[
PhyloClique
]
filter
Clique
keep
thr
f
m
=
case
keep
of
False
->
map
(
\
l
->
f
thr
l
)
m
True
->
map
(
\
l
->
keepFilled
(
f
)
thr
l
)
m
-- | To filter Fis with small Support
filter
FisBySupport
::
Int
->
[
PhyloFis
]
->
[
PhyloFis
]
filter
FisBySupport
thr
l
=
filter
(
\
fis
->
(
fis
^.
phyloFis
_support
)
>=
thr
)
l
filter
CliqueBySupport
::
Int
->
[
PhyloClique
]
->
[
PhyloClique
]
filter
CliqueBySupport
thr
l
=
filter
(
\
clq
->
(
clq
^.
phyloClique
_support
)
>=
thr
)
l
-- | To filter Fis with small Clique size
filter
FisByClique
::
Int
->
[
PhyloFis
]
->
[
PhyloFis
]
filter
FisByClique
thr
l
=
filter
(
\
fis
->
(
size
$
fis
^.
phyloFis_clique
)
>=
thr
)
l
filter
CliqueBySize
::
Int
->
[
PhyloClique
]
->
[
PhyloClique
]
filter
CliqueBySize
thr
l
=
filter
(
\
clq
->
(
size
$
clq
^.
phyloClique_nodes
)
>=
thr
)
l
-- | To filter nested Fis
filter
FisByNested
::
Map
(
Date
,
Date
)
[
PhyloFis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
filter
Fis
ByNested
m
=
let
fis
=
map
(
\
l
->
foldl
(
\
mem
f
->
if
(
any
(
\
f'
->
isNested
(
Set
.
toList
$
f'
^.
phylo
Fis_clique
)
(
Set
.
toList
$
f
^.
phyloFis_clique
))
mem
)
filter
CliqueByNested
::
Map
(
Date
,
Date
)
[
PhyloClique
]
->
Map
(
Date
,
Date
)
[
PhyloClique
]
filter
Clique
ByNested
m
=
let
clq
=
map
(
\
l
->
foldl
(
\
mem
f
->
if
(
any
(
\
f'
->
isNested
(
Set
.
toList
$
f'
^.
phylo
Clique_nodes
)
(
Set
.
toList
$
f
^.
phyloClique_nodes
))
mem
)
then
mem
else
let
fMax
=
filter
(
\
f'
->
not
$
isNested
(
Set
.
toList
$
f
^.
phylo
Fis_clique
)
(
Set
.
toList
$
f'
^.
phyloFis_clique
))
mem
let
fMax
=
filter
(
\
f'
->
not
$
isNested
(
Set
.
toList
$
f
^.
phylo
Clique_nodes
)
(
Set
.
toList
$
f'
^.
phyloClique_nodes
))
mem
in
fMax
++
[
f
]
)
[]
l
)
$
elems
m
fis'
=
fis
`
using
`
parList
rdeepseq
in
fromList
$
zip
(
keys
m
)
fis
'
clq'
=
clq
`
using
`
parList
rdeepseq
in
fromList
$
zip
(
keys
m
)
clq
'
-- | To transform a time map of docs innto a time map of Fis with some filters
toPhyloFis
::
Map
(
Date
,
Date
)
[
Document
]
->
Int
->
Int
->
Map
(
Date
,
Date
)
[
PhyloFis
]
toPhyloFis
phyloDocs
support
clique
=
traceFis
"Filtered Fis"
$
filterFisByNested
$
traceFis
"Filtered by clique size"
$
filterFis
True
clique
(
filterFisByClique
)
$
traceFis
"Filtered by support"
$
filterFis
True
support
(
filterFisBySupport
)
$
traceFis
"Unfiltered Fis"
phyloFis
toPhyloClique
::
Phylo
->
Map
(
Date
,
Date
)
[
Document
]
->
Map
(
Date
,
Date
)
[
PhyloClique
]
toPhyloClique
phylo
phyloDocs
=
case
(
clique
$
getConfig
phylo
)
of
Fis
s
s'
->
-- traceFis "Filtered Fis"
filterCliqueByNested
-- $ traceFis "Filtered by clique size"
$
filterClique
True
s'
(
filterCliqueBySize
)
-- $ traceFis "Filtered by support"
$
filterClique
True
s
(
filterCliqueBySupport
)
-- $ traceFis "Unfiltered Fis"
phyloClique
MaxClique
_
->
undefined
where
--------------------------------------
phyloFis
::
Map
(
Date
,
Date
)
[
PhyloFis
]
phyloFis
=
let
fis
=
map
(
\
(
prd
,
docs
)
->
let
lst
=
toList
$
fisWithSizePolyMap
(
Segment
1
20
)
1
(
map
text
docs
)
in
(
prd
,
map
(
\
f
->
PhyloFis
(
fst
f
)
(
snd
f
)
prd
)
lst
))
$
toList
phyloDocs
fis'
=
fis
`
using
`
parList
rdeepseq
in
fromList
fis'
phyloClique
::
Map
(
Date
,
Date
)
[
PhyloClique
]
phyloClique
=
case
(
clique
$
getConfig
phylo
)
of
Fis
_
_
->
let
fis
=
map
(
\
(
prd
,
docs
)
->
let
lst
=
toList
$
fisWithSizePolyMap
(
Segment
1
20
)
1
(
map
text
docs
)
in
(
prd
,
map
(
\
f
->
PhyloClique
(
fst
f
)
(
snd
f
)
prd
)
lst
))
$
toList
phyloDocs
fis'
=
fis
`
using
`
parList
rdeepseq
in
fromList
fis'
MaxClique
_
->
undefined
--------------------------------------
...
...
@@ -188,6 +225,21 @@ docsToTimeScaleCooc docs fdt =
-- | to Phylo Base | --
-----------------------
-- | To group a list of Documents by fixed periods
groupDocsByPeriod'
::
(
NFData
doc
,
Ord
date
,
Enum
date
)
=>
(
doc
->
date
)
->
[(
date
,
date
)]
->
[
doc
]
->
Map
(
date
,
date
)
[
doc
]
groupDocsByPeriod'
f
pds
docs
=
let
docs'
=
groupBy
(
\
d
d'
->
f
d
==
f
d'
)
$
sortOn
f
docs
periods
=
map
(
inPeriode
f
docs'
)
pds
periods'
=
periods
`
using
`
parList
rdeepseq
in
trace
(
"
\n
"
<>
"-- | Group "
<>
show
(
length
docs
)
<>
" docs by "
<>
show
(
length
pds
)
<>
" periods"
<>
"
\n
"
)
$
fromList
$
zip
pds
periods'
where
--------------------------------------
inPeriode
::
Ord
b
=>
(
t
->
b
)
->
[[
t
]]
->
(
b
,
b
)
->
[
t
]
inPeriode
f'
h
(
start
,
end
)
=
concat
$
fst
$
partition
(
\
d
->
f'
(
head'
"inPeriode"
d
)
>=
start
&&
f'
(
head'
"inPeriode"
d
)
<=
end
)
h
-- | To group a list of Documents by fixed periods
groupDocsByPeriod
::
(
NFData
doc
,
Ord
date
,
Enum
date
)
=>
(
doc
->
date
)
->
[(
date
,
date
)]
->
[
doc
]
->
Map
(
date
,
date
)
[
doc
]
...
...
@@ -206,6 +258,17 @@ groupDocsByPeriod f pds es =
--------------------------------------
docsToTermFreq
::
[
Document
]
->
Vector
Ngrams
->
Map
Int
Double
docsToTermFreq
docs
fdt
=
let
nbDocs
=
fromIntegral
$
length
docs
freqs
=
map
(
/
(
nbDocs
))
$
fromList
$
map
(
\
lst
->
(
head'
"docsToTermFreq"
lst
,
fromIntegral
$
length
lst
))
$
group
$
sort
$
concat
$
map
(
\
d
->
nub
$
ngramsToIdx
(
text
d
)
fdt
)
docs
sumFreqs
=
sum
$
elems
freqs
in
map
(
/
sumFreqs
)
freqs
-- | To count the number of docs by unit of time
docsToTimeScaleNb
::
[
Document
]
->
Map
Date
Double
docsToTimeScaleNb
docs
=
...
...
@@ -230,5 +293,7 @@ toPhyloBase docs lst conf =
$
Phylo
foundations
(
docsToTimeScaleCooc
docs
(
foundations
^.
foundations_roots
))
(
docsToTimeScaleNb
docs
)
(
docsToTermFreq
docs
(
foundations
^.
foundations_roots
))
empty
params
(
fromList
$
map
(
\
prd
->
(
prd
,
PhyloPeriod
prd
(
initPhyloLevels
1
prd
)))
periods
)
src/Gargantext/Viz/Phylo/PhyloTools.hs
View file @
70a2339e
...
...
@@ -34,6 +34,7 @@ import Control.Lens hiding (Level)
import
qualified
Data.Vector
as
Vector
import
qualified
Data.List
as
List
import
qualified
Data.Set
as
Set
import
qualified
Data.Map
as
Map
------------
-- | Io | --
...
...
@@ -66,6 +67,7 @@ roundToStr = printf "%0.*f"
countSup
::
Double
->
[
Double
]
->
Int
countSup
s
l
=
length
$
filter
(
>
s
)
l
dropByIdx
::
Int
->
[
a
]
->
[
a
]
dropByIdx
k
l
=
take
k
l
++
drop
(
k
+
1
)
l
...
...
@@ -76,6 +78,15 @@ elemIndex' e l = case (List.elemIndex e l) of
Just
i
->
i
commonPrefix
::
Eq
a
=>
[
a
]
->
[
a
]
->
[
a
]
->
[
a
]
commonPrefix
lst
lst'
acc
=
if
(
null
lst
||
null
lst'
)
then
acc
else
if
(
head'
"commonPrefix"
lst
==
head'
"commonPrefix"
lst'
)
then
commonPrefix
(
tail
lst
)
(
tail
lst'
)
(
acc
++
[
head'
"commonPrefix"
lst
])
else
acc
---------------------
-- | Foundations | --
---------------------
...
...
@@ -162,44 +173,44 @@ keepFilled f thr l = if (null $ f thr l) && (not $ null l)
else
f
thr
l
traceClique
::
Map
(
Date
,
Date
)
[
Phylo
Fis
]
->
String
traceClique
::
Map
(
Date
,
Date
)
[
Phylo
Clique
]
->
String
traceClique
mFis
=
foldl
(
\
msg
cpt
->
msg
<>
show
(
countSup
cpt
cliques
)
<>
" (>"
<>
show
(
cpt
)
<>
") "
)
""
[
1
..
6
]
where
--------------------------------------
cliques
::
[
Double
]
cliques
=
sort
$
map
(
fromIntegral
.
size
.
_phylo
Fis_clique
)
$
concat
$
elems
mFis
cliques
=
sort
$
map
(
fromIntegral
.
size
.
_phylo
Clique_nodes
)
$
concat
$
elems
mFis
--------------------------------------
traceSupport
::
Map
(
Date
,
Date
)
[
Phylo
Fis
]
->
String
traceSupport
::
Map
(
Date
,
Date
)
[
Phylo
Clique
]
->
String
traceSupport
mFis
=
foldl
(
\
msg
cpt
->
msg
<>
show
(
countSup
cpt
supports
)
<>
" (>"
<>
show
(
cpt
)
<>
") "
)
""
[
1
..
6
]
where
--------------------------------------
supports
::
[
Double
]
supports
=
sort
$
map
(
fromIntegral
.
_phylo
Fis
_support
)
$
concat
$
elems
mFis
supports
=
sort
$
map
(
fromIntegral
.
_phylo
Clique
_support
)
$
concat
$
elems
mFis
--------------------------------------
traceFis
::
[
Char
]
->
Map
(
Date
,
Date
)
[
Phylo
Fis
]
->
Map
(
Date
,
Date
)
[
PhyloFis
]
traceFis
::
[
Char
]
->
Map
(
Date
,
Date
)
[
Phylo
Clique
]
->
Map
(
Date
,
Date
)
[
PhyloClique
]
traceFis
msg
mFis
=
trace
(
"
\n
"
<>
"-- | "
<>
msg
<>
" : "
<>
show
(
sum
$
map
length
$
elems
mFis
)
<>
"
\n
"
<>
"Support : "
<>
(
traceSupport
mFis
)
<>
"
\n
"
<>
"
Clique
: "
<>
(
traceClique
mFis
)
<>
"
\n
"
)
mFis
<>
"
Nb Ngrams
: "
<>
(
traceClique
mFis
)
<>
"
\n
"
)
mFis
---------------
----------
-- | C
ontextual unit
| --
---------------
----------
---------------
-- | C
lique
| --
---------------
get
FisSupport
::
ContextualUnit
->
Int
get
Fis
Support
unit
=
case
unit
of
get
CliqueSupport
::
Clique
->
Int
get
Clique
Support
unit
=
case
unit
of
Fis
s
_
->
s
-- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
MaxClique
_
->
0
get
FisSize
::
ContextualUnit
->
Int
get
Fis
Size
unit
=
case
unit
of
get
CliqueSize
::
Clique
->
Int
get
Clique
Size
unit
=
case
unit
of
Fis
_
s
->
s
-- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
MaxClique
s
->
s
--------------
...
...
@@ -227,6 +238,8 @@ sumCooc cooc cooc' = unionWith (+) cooc cooc'
getTrace
::
Cooc
->
Double
getTrace
cooc
=
sum
$
elems
$
filterWithKey
(
\
(
k
,
k'
)
_
->
k
==
k'
)
cooc
coocToDiago
::
Cooc
->
Cooc
coocToDiago
cooc
=
filterWithKey
(
\
(
k
,
k'
)
_
->
k
==
k'
)
cooc
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc
::
[
Int
]
->
[
Cooc
]
->
Cooc
...
...
@@ -243,6 +256,12 @@ ngramsToCooc ngrams coocs =
getGroupId
::
PhyloGroup
->
PhyloGroupId
getGroupId
group
=
((
group
^.
phylo_groupPeriod
,
group
^.
phylo_groupLevel
),
group
^.
phylo_groupIndex
)
idToPrd
::
PhyloGroupId
->
PhyloPeriodId
idToPrd
id
=
(
fst
.
fst
)
id
getGroupThr
::
PhyloGroup
->
Double
getGroupThr
group
=
last'
"getGroupThr"
((
group
^.
phylo_groupMeta
)
!
"breaks"
)
groupByField
::
Ord
a
=>
(
PhyloGroup
->
a
)
->
[
PhyloGroup
]
->
Map
a
[
PhyloGroup
]
groupByField
toField
groups
=
fromListWith
(
++
)
$
map
(
\
g
->
(
toField
g
,
[
g
]))
groups
...
...
@@ -255,34 +274,21 @@ getPeriodPointers fil group =
filterProximity
::
Proximity
->
Double
->
Double
->
Bool
filterProximity
proximity
thr
local
=
case
proximity
of
WeightedLogJaccard
_
_
_
->
local
>=
thr
WeightedLogJaccard
_
->
local
>=
thr
Hamming
->
undefined
getProximityName
::
Proximity
->
String
getProximityName
proximity
=
case
proximity
of
WeightedLogJaccard
_
_
_
->
"WLJaccard"
Hamming
->
"Hamming"
getProximityInit
::
Proximity
->
Double
getProximityInit
proximity
=
case
proximity
of
WeightedLogJaccard
_
i
_
->
i
Hamming
->
undefined
getProximityStep
::
Proximity
->
Double
getProximityStep
proximity
=
case
proximity
of
WeightedLogJaccard
_
_
s
->
s
Hamming
->
undefined
WeightedLogJaccard
_
->
"WLJaccard"
Hamming
->
"Hamming"
---------------
-- | Phylo | --
---------------
addPointers
::
PhyloGroup
->
Filiation
->
PointerType
->
[
Pointer
]
->
PhyloGroup
addPointers
group
fil
pty
pointers
=
addPointers
::
Filiation
->
PointerType
->
[
Pointer
]
->
PhyloGroup
->
PhyloGroup
addPointers
fil
pty
pointers
group
=
case
pty
of
TemporalPointer
->
case
fil
of
ToChilds
->
group
&
phylo_groupPeriodChilds
.~
pointers
...
...
@@ -310,6 +316,9 @@ getLevels phylo = nub
.
traverse
.
phylo_periodLevels
)
phylo
getSeaElevation
::
Phylo
->
SeaElevation
getSeaElevation
phylo
=
seaElevation
(
getConfig
phylo
)
getConfig
::
Phylo
->
Config
getConfig
phylo
=
(
phylo
^.
phylo_param
)
^.
phyloParam_config
...
...
@@ -334,6 +343,26 @@ getGroupsFromLevel lvl phylo =
.
phylo_levelGroups
)
phylo
getGroupsFromLevelPeriods
::
Level
->
[
PhyloPeriodId
]
->
Phylo
->
[
PhyloGroup
]
getGroupsFromLevelPeriods
lvl
periods
phylo
=
elems
$
view
(
phylo_periods
.
traverse
.
filtered
(
\
phyloPrd
->
elem
(
phyloPrd
^.
phylo_periodPeriod
)
periods
)
.
phylo_periodLevels
.
traverse
.
filtered
(
\
phyloLvl
->
phyloLvl
^.
phylo_levelLevel
==
lvl
)
.
phylo_levelGroups
)
phylo
getGroupsFromPeriods
::
Level
->
Map
PhyloPeriodId
PhyloPeriod
->
[
PhyloGroup
]
getGroupsFromPeriods
lvl
periods
=
elems
$
view
(
traverse
.
phylo_periodLevels
.
traverse
.
filtered
(
\
phyloLvl
->
phyloLvl
^.
phylo_levelLevel
==
lvl
)
.
phylo_levelGroups
)
periods
updatePhyloGroups
::
Level
->
Map
PhyloGroupId
PhyloGroup
->
Phylo
->
Phylo
updatePhyloGroups
lvl
m
phylo
=
over
(
phylo_periods
...
...
@@ -391,27 +420,7 @@ traceSynchronyStart phylo =
getSensibility
::
Proximity
->
Double
getSensibility
proxi
=
case
proxi
of
WeightedLogJaccard
s
_
_
->
s
Hamming
->
undefined
getThresholdInit
::
Proximity
->
Double
getThresholdInit
proxi
=
case
proxi
of
WeightedLogJaccard
_
t
_
->
t
Hamming
->
undefined
getThresholdStep
::
Proximity
->
Double
getThresholdStep
proxi
=
case
proxi
of
WeightedLogJaccard
_
_
s
->
s
Hamming
->
undefined
traceBranchMatching
::
Proximity
->
Double
->
[
PhyloGroup
]
->
[
PhyloGroup
]
traceBranchMatching
proxi
thr
groups
=
case
proxi
of
WeightedLogJaccard
_
i
s
->
trace
(
roundToStr
2
thr
<>
" "
<>
foldl
(
\
acc
_
->
acc
<>
"."
)
"."
[(
10
*
i
),(
10
*
i
+
10
*
s
)
..
(
10
*
thr
)]
<>
" "
<>
show
(
length
groups
)
<>
" groups"
)
groups
WeightedLogJaccard
s
->
s
Hamming
->
undefined
----------------
...
...
@@ -478,4 +487,9 @@ traceMatchEnd groups =
traceTemporalMatching
::
[
PhyloGroup
]
->
[
PhyloGroup
]
traceTemporalMatching
groups
=
trace
(
"
\n
"
<>
"-- | Start temporal matching for "
<>
show
(
length
groups
)
<>
" groups"
<>
"
\n
"
)
groups
\ No newline at end of file
trace
(
"
\n
"
<>
"-- | Start temporal matching for "
<>
show
(
length
groups
)
<>
" groups"
<>
"
\n
"
)
groups
traceGroupsProxi
::
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
->
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
traceGroupsProxi
m
=
trace
(
"
\n
"
<>
"-- | "
<>
show
(
Map
.
size
m
)
<>
" computed pairs of groups proximity"
<>
"
\n
"
)
m
\ No newline at end of file
src/Gargantext/Viz/Phylo/SynchronicClustering.hs
View file @
70a2339e
...
...
@@ -18,34 +18,77 @@ module Gargantext.Viz.Phylo.SynchronicClustering where
import
Gargantext.Prelude
import
Gargantext.Viz.AdaptativePhylo
import
Gargantext.Viz.Phylo.PhyloTools
import
Gargantext.Viz.Phylo.TemporalMatching
(
weightedLogJaccard
)
import
Gargantext.Viz.Phylo.TemporalMatching
(
weightedLogJaccard'
,
filterDiago
,
reduceDiagos
)
import
Gargantext.Viz.Phylo.PhyloExport
(
processDynamics
)
import
Data.List
((
++
),
null
,
intersect
,
nub
,
concat
,
sort
,
sortOn
)
import
Data.Map
(
Map
,
fromList
,
fromListWith
,
foldlWithKey
,
(
!
),
insert
,
empty
,
restrictKeys
,
elems
,
mapWithKey
,
member
)
import
Data.List
((
++
),
null
,
intersect
,
nub
,
concat
,
sort
,
sortOn
,
all
,
groupBy
,
group
,
maximum
)
import
Data.Map
(
Map
,
fromList
,
fromListWith
,
foldlWithKey
,
(
!
),
insert
,
empty
,
restrictKeys
,
elems
,
mapWithKey
,
member
)
import
Data.Text
(
Text
)
import
Control.Lens
hiding
(
Level
)
import
Control.Parallel.Strategies
(
parList
,
rdeepseq
,
using
)
-- import Debug.Trace (trace)
import
qualified
Data.Map
as
Map
import
qualified
Data.Set
as
Set
-------------------------
-- | New Level Maker | --
-------------------------
toBranchId
::
PhyloGroup
->
PhyloBranchId
toBranchId
child
=
((
child
^.
phylo_groupLevel
)
+
1
,
snd
(
child
^.
phylo_groupBranchId
))
mergeBranchIds
::
[[
Int
]]
->
[
Int
]
mergeBranchIds
ids
=
(
head'
"mergeBranchIds"
.
sort
.
mostFreq'
)
ids
where
-- | 2) find the most Up Left ids in the hierarchy of similarity
-- mostUpLeft :: [[Int]] -> [[Int]]
-- mostUpLeft ids' =
-- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
-- inf = (fst . minimum) groupIds
-- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
-- | 1) find the most frequent ids
mostFreq'
::
[[
Int
]]
->
[[
Int
]]
mostFreq'
ids'
=
let
groupIds
=
(
map
(
\
gIds
->
(
length
gIds
,
head'
"gIds"
gIds
))
.
group
.
sort
)
ids'
sup
=
(
fst
.
maximum
)
groupIds
in
map
snd
$
filter
(
\
gIds
->
fst
gIds
==
sup
)
groupIds
mergeMeta
::
[
Int
]
->
[
PhyloGroup
]
->
Map
Text
[
Double
]
mergeMeta
bId
groups
=
let
ego
=
head'
"mergeMeta"
$
filter
(
\
g
->
(
snd
(
g
^.
phylo_groupBranchId
))
==
bId
)
groups
in
fromList
[(
"breaks"
,(
ego
^.
phylo_groupMeta
)
!
"breaks"
),(
"seaLevels"
,(
ego
^.
phylo_groupMeta
)
!
"seaLevels"
)]
groupsToBranches'
::
Map
PhyloGroupId
PhyloGroup
->
[[
PhyloGroup
]]
groupsToBranches'
groups
=
-- | run the related component algorithm
let
egos
=
map
(
\
g
->
[
getGroupId
g
]
++
(
map
fst
$
g
^.
phylo_groupPeriodParents
)
++
(
map
fst
$
g
^.
phylo_groupPeriodChilds
)
)
$
elems
groups
graph
=
relatedComponents
egos
-- | update each group's branch id
in
map
(
\
ids
->
let
groups'
=
elems
$
restrictKeys
groups
(
Set
.
fromList
ids
)
bId
=
mergeBranchIds
$
map
(
\
g
->
snd
$
g
^.
phylo_groupBranchId
)
groups'
in
map
(
\
g
->
g
&
phylo_groupBranchId
%~
(
\
(
lvl
,
_
)
->
(
lvl
,
bId
)))
groups'
)
graph
mergeGroups
::
[
Cooc
]
->
PhyloGroupId
->
Map
PhyloGroupId
PhyloGroupId
->
[
PhyloGroup
]
->
PhyloGroup
mergeGroups
coocs
id
mapIds
childs
=
let
ngrams
=
(
sort
.
nub
.
concat
)
$
map
_phylo_groupNgrams
childs
in
PhyloGroup
(
fst
$
fst
id
)
(
snd
$
fst
id
)
(
snd
id
)
""
(
sum
$
map
_phylo_groupSupport
childs
)
ngrams
(
ngramsToCooc
ngrams
coocs
)
(
toBranchId
(
head'
"mergeGroups"
childs
))
empty
[]
(
map
(
\
g
->
(
getGroupId
g
,
1
))
childs
)
(
ngramsToCooc
ngrams
coocs
)
((
snd
$
fst
id
),
bId
)
(
mergeMeta
bId
childs
)
[]
(
map
(
\
g
->
(
getGroupId
g
,
1
))
childs
)
(
updatePointers
$
concat
$
map
_phylo_groupPeriodParents
childs
)
(
updatePointers
$
concat
$
map
_phylo_groupPeriodChilds
childs
)
where
where
--------------------
bId
::
[
Int
]
bId
=
mergeBranchIds
$
map
(
\
g
->
snd
$
g
^.
phylo_groupBranchId
)
childs
--------------------
updatePointers
::
[
Pointer
]
->
[
Pointer
]
updatePointers
pointers
=
map
(
\
(
pId
,
w
)
->
(
mapIds
!
pId
,
w
))
pointers
...
...
@@ -57,42 +100,47 @@ addPhyloLevel lvl phylo =
%~
(
insert
(
phyloPrd
^.
phylo_periodPeriod
,
lvl
)
(
PhyloLevel
(
phyloPrd
^.
phylo_periodPeriod
)
lvl
empty
)))
phylo
toNextLevel
::
Phylo
->
[
PhyloGroup
]
->
Phylo
toNextLevel
phylo
groups
=
toNextLevel
'
::
Phylo
->
[
PhyloGroup
]
->
Phylo
toNextLevel
'
phylo
groups
=
let
curLvl
=
getLastLevel
phylo
oldGroups
=
fromList
$
map
(
\
g
->
(
getGroupId
g
,
getLevelParentId
g
))
groups
newGroups
=
fromListWith
(
++
)
-- | 5) group the parents by periods
newGroups
=
concat
$
groupsToBranches'
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
foldlWithKey
(
\
acc
id
groups'
->
-- | 4) create the parent group
let
parent
=
mergeGroups
(
elems
$
restrictKeys
(
phylo
^.
phylo_timeCooc
)
$
periodsToYears
[(
fst
.
fst
)
id
])
id
oldGroups
groups'
in
acc
++
[
(
parent
^.
phylo_groupPeriod
,
[
parent
])
])
[]
in
acc
++
[
parent
])
[]
-- | 3) group the current groups by parentId
$
fromListWith
(
++
)
$
map
(
\
g
->
(
getLevelParentId
g
,
[
g
]))
groups
newPeriods
=
fromListWith
(
++
)
$
map
(
\
g
->
(
g
^.
phylo_groupPeriod
,
[
g
]))
newGroups
in
traceSynchronyEnd
$
over
(
phylo_periods
.
traverse
.
phylo_periodLevels
.
traverse
-- | 6) update each period at curLvl + 1
.
filtered
(
\
phyloLvl
->
phyloLvl
^.
phylo_levelLevel
==
(
curLvl
+
1
)))
-- | 7) by adding the parents
(
\
phyloLvl
->
if
member
(
phyloLvl
^.
phylo_levelPeriod
)
new
Group
s
if
member
(
phyloLvl
^.
phylo_levelPeriod
)
new
Period
s
then
phyloLvl
&
phylo_levelGroups
.~
fromList
(
map
(
\
g
->
(
getGroupId
g
,
g
))
$
new
Group
s
!
(
phyloLvl
^.
phylo_levelPeriod
))
.~
fromList
(
map
(
\
g
->
(
getGroupId
g
,
g
))
$
new
Period
s
!
(
phyloLvl
^.
phylo_levelPeriod
))
else
phyloLvl
)
-- | 2) add the curLvl + 1 phyloLevel to the phylo
$
addPhyloLevel
(
curLvl
+
1
)
-- | 1) update the current groups (with level parent pointers) in the phylo
$
updatePhyloGroups
curLvl
(
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
groups
)
phylo
--------------------
-- | Clustering | --
--------------------
toPairs
::
[
PhyloGroup
]
->
[(
PhyloGroup
,
PhyloGroup
)]
toPairs
groups
=
filter
(
\
(
g
,
g'
)
->
(
not
.
null
)
$
intersect
(
g
^.
phylo_groupNgrams
)
(
g'
^.
phylo_groupNgrams
))
$
listToCombi'
groups
toPairs
::
SynchronyStrategy
->
[
PhyloGroup
]
->
[(
PhyloGroup
,
PhyloGroup
)]
toPairs
strategy
groups
=
case
strategy
of
MergeRegularGroups
->
pairs
$
filter
(
\
g
->
all
(
==
3
)
$
(
g
^.
phylo_groupMeta
)
!
"dynamics"
)
groups
MergeAllGroups
->
pairs
groups
where
pairs
::
[
PhyloGroup
]
->
[(
PhyloGroup
,
PhyloGroup
)]
pairs
gs
=
filter
(
\
(
g
,
g'
)
->
(
not
.
null
)
$
intersect
(
g
^.
phylo_groupNgrams
)
(
g'
^.
phylo_groupNgrams
))
(
listToCombi'
gs
)
toDiamonds
::
[
PhyloGroup
]
->
[[
PhyloGroup
]]
...
...
@@ -108,26 +156,25 @@ toDiamonds groups = foldl' (\acc groups' ->
$
foldl'
(
\
acc
g
->
acc
++
(
map
(
\
(
id
,
_
)
->
(
id
,[
g
])
)
$
g
^.
phylo_groupPeriodParents
)
)
[]
groups
groupsToEdges
::
Proximity
->
Synchrony
->
Double
->
[
PhyloGroup
]
->
[((
PhyloGroup
,
PhyloGroup
),
Double
)]
groupsToEdges
prox
sync
docs
groups
=
groupsToEdges
::
Proximity
->
Synchrony
->
Double
->
Map
Int
Double
->
[
PhyloGroup
]
->
[((
PhyloGroup
,
PhyloGroup
),
Double
)]
groupsToEdges
prox
sync
nbDocs
diago
groups
=
case
sync
of
ByProximityThreshold
t
s
->
filter
(
\
(
_
,
w
)
->
w
>=
t
)
$
toEdges
s
$
toPairs
group
s
ByProximityDistribution
s
->
ByProximityThreshold
t
hr
sens
_
strat
->
filter
(
\
(
_
,
w
)
->
w
>=
thr
)
$
toEdges
sen
s
$
toPairs
strat
groups
ByProximityDistribution
s
ens
strat
->
let
diamonds
=
sortOn
snd
$
toEdges
s
$
concat
$
map
toPairs
$
toDiamonds
groups
$
toEdges
s
ens
$
concat
$
map
(
\
gs
->
toPairs
strat
gs
)
$
toDiamonds
groups
in
take
(
div
(
length
diamonds
)
2
)
diamonds
where
toEdges
::
Double
->
[(
PhyloGroup
,
PhyloGroup
)]
->
[((
PhyloGroup
,
PhyloGroup
),
Double
)]
toEdges
sens
edges
=
case
prox
of
WeightedLogJaccard
_
_
_
->
map
(
\
(
g
,
g'
)
->
((
g
,
g'
),
weightedLogJaccard
sens
docs
(
g
^.
phylo_groupCooc
)
(
g'
^.
phylo_groupCooc
)
(
g
^.
phylo_groupNgrams
)
(
g'
^.
phylo_groupNgrams
)))
edges
WeightedLogJaccard
_
->
map
(
\
(
g
,
g'
)
->
((
g
,
g'
),
weightedLogJaccard'
sens
nbDocs
diago
(
g
^.
phylo_groupNgrams
)
(
g'
^.
phylo_groupNgrams
)))
edges
_
->
undefined
...
...
@@ -142,21 +189,34 @@ toParentId :: PhyloGroup -> PhyloGroupId
toParentId
child
=
((
child
^.
phylo_groupPeriod
,
child
^.
phylo_groupLevel
+
1
),
child
^.
phylo_groupIndex
)
reduce
Branch
::
Proximity
->
Synchrony
->
Map
Date
Double
->
[
PhyloGroup
]
->
[
PhyloGroup
]
reduce
Branch
prox
sync
docs
branch
=
reduce
Groups
::
Proximity
->
Synchrony
->
Map
Date
Double
->
Map
Date
Cooc
->
[
PhyloGroup
]
->
[
PhyloGroup
]
reduce
Groups
prox
sync
docs
diagos
branch
=
-- | 1) reduce a branch as a set of periods & groups
let
periods
=
fromListWith
(
++
)
$
map
(
\
g
->
(
g
^.
phylo_groupPeriod
,[
g
]))
branch
in
(
concat
.
concat
.
elems
)
$
mapWithKey
(
\
prd
groups
->
-- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
let
edges
=
groupsToEdges
prox
sync
((
sum
.
elems
)
$
restrictKeys
docs
$
periodsToYears
[
prd
])
groups
let
diago
=
reduceDiagos
$
filterDiago
diagos
[
prd
]
edges
=
groupsToEdges
prox
sync
((
sum
.
elems
)
$
restrictKeys
docs
$
periodsToYears
[
prd
])
diago
groups
in
map
(
\
comp
->
-- | 4) add to each groups their futur level parent group
let
parentId
=
toParentId
(
head'
"parentId"
comp
)
in
map
(
\
g
->
g
&
phylo_groupLevelParents
%~
(
++
[(
parentId
,
1
)])
)
comp
)
-- |3) reduce the graph a a set of related components
$
toRelatedComponents
groups
edges
)
periods
$
toRelatedComponents
groups
edges
)
periods
adjustClustering
::
Synchrony
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
adjustClustering
sync
branches
=
case
sync
of
ByProximityThreshold
_
_
scope
_
->
case
scope
of
SingleBranch
->
branches
SiblingBranches
->
groupBy
(
\
g
g'
->
(
last'
"adjustClustering"
$
(
g
^.
phylo_groupMeta
)
!
"breaks"
)
==
(
last'
"adjustClustering"
$
(
g'
^.
phylo_groupMeta
)
!
"breaks"
))
$
sortOn
_phylo_groupBranchId
$
concat
branches
AllBranches
->
[
concat
branches
]
ByProximityDistribution
_
_
->
branches
synchronicClustering
::
Phylo
->
Phylo
...
...
@@ -164,11 +224,14 @@ synchronicClustering phylo =
let
prox
=
phyloProximity
$
getConfig
phylo
sync
=
phyloSynchrony
$
getConfig
phylo
docs
=
phylo
^.
phylo_timeDocs
branches
=
map
(
\
branch
->
reduceBranch
prox
sync
docs
branch
)
$
phyloToLastBranches
$
traceSynchronyStart
phylo
branches'
=
branches
`
using
`
parList
rdeepseq
in
toNextLevel
phylo
$
concat
branches'
diagos
=
map
coocToDiago
$
phylo
^.
phylo_timeCooc
newBranches
=
map
(
\
branch
->
reduceGroups
prox
sync
docs
diagos
branch
)
$
map
processDynamics
$
adjustClustering
sync
$
phyloToLastBranches
$
traceSynchronyStart
phylo
newBranches'
=
newBranches
`
using
`
parList
rdeepseq
in
toNextLevel'
phylo
$
concat
newBranches'
----------------
...
...
src/Gargantext/Viz/Phylo/TemporalMatching.hs
View file @
70a2339e
...
...
@@ -15,18 +15,19 @@ Portability : POSIX
module
Gargantext.Viz.Phylo.TemporalMatching
where
import
Data.List
(
concat
,
splitAt
,
tail
,
sortOn
,
(
++
),
intersect
,
null
,
inits
,
groupBy
,
scanl
,
nub
,
union
,
dropWhile
,
partition
,
delete
)
import
Data.Map
(
Map
,
fromList
,
elems
,
restrictKeys
,
unionWith
,
intersectionWith
,
findWithDefault
,
keys
,
(
!
),
filterWithKey
)
import
Data.List
(
concat
,
splitAt
,
tail
,
sortOn
,
(
++
),
intersect
,
null
,
inits
,
groupBy
,
scanl
,
nub
,
union
,
dropWhile
,
partition
,
or
,
sort
,
(
!!
)
)
import
Data.Map
(
Map
,
fromList
,
elems
,
restrictKeys
,
unionWith
,
findWithDefault
,
keys
,
(
!
),
(
!?
),
filterWithKey
,
singleton
,
empty
,
mapKeys
,
adjust
)
import
Gargantext.Prelude
import
Gargantext.Viz.AdaptativePhylo
import
Gargantext.Viz.Phylo.PhyloTools
-- import Prelude (logBase
)
import
Prelude
(
floor
)
import
Control.Lens
hiding
(
Level
)
import
Control.Parallel.Strategies
(
parList
,
rdeepseq
,
using
)
import
Debug.Trace
(
trace
)
import
qualified
Data.Map
as
Map
import
qualified
Data.Set
as
Set
...
...
@@ -35,30 +36,29 @@ import qualified Data.Set as Set
-------------------
-- |
Process the inverse sumLog
sumInvLog
::
Double
->
[
Double
]
->
Double
sumInvLog
s
l
=
foldl
(
\
mem
x
->
mem
+
(
1
/
log
(
s
+
x
)))
0
l
-- |
To compute a jaccard similarity between two lists
jaccard
::
[
Int
]
->
[
Int
]
->
Double
jaccard
inter'
union'
=
((
fromIntegral
.
length
)
$
inter'
)
/
((
fromIntegral
.
length
)
$
union'
)
-- | Process the sumLog
sum
Log
::
Double
->
[
Double
]
->
Double
sum
Log
s
l
=
foldl
(
\
mem
x
->
mem
+
log
(
s
+
x
))
0
l
-- | Process the
inverse
sumLog
sum
InvLog'
::
Double
->
Double
->
[
Double
]
->
Double
sum
InvLog'
s
nb
diago
=
foldl
(
\
mem
occ
->
mem
+
(
1
/
(
log
(
occ
+
s
)
/
log
(
nb
+
s
))))
0
diago
-- |
To compute a jaccard similarity between two lists
jaccard
::
[
Int
]
->
[
Int
]
->
Double
jaccard
inter'
union'
=
((
fromIntegral
.
length
)
$
inter'
)
/
((
fromIntegral
.
length
)
$
union'
)
-- |
Process the sumLog
sumLog'
::
Double
->
Double
->
[
Double
]
->
Double
sumLog'
s
nb
diago
=
foldl
(
\
mem
occ
->
mem
+
(
log
(
occ
+
s
)
/
log
(
nb
+
s
)))
0
diago
-- | To process a WeighedLogJaccard distance between to coocurency matrix
weightedLogJaccard
::
Double
->
Double
->
Cooc
->
Cooc
->
[
Int
]
->
[
Int
]
->
Double
weightedLogJaccard
sens
docs
cooc
cooc'
ngrams
ngrams'
weightedLogJaccard'
::
Double
->
Double
->
Map
Int
Double
->
[
Int
]
->
[
Int
]
->
Double
weightedLogJaccard'
sens
nbDocs
diago
ngrams
ngrams'
|
null
ngramsInter
=
0
|
ngramsInter
==
ngramsUnion
=
1
|
sens
==
0
=
jaccard
ngramsInter
ngramsUnion
|
sens
>
0
=
(
sumInvLog
sens
coocInter
)
/
(
sumInvLog
sens
cooc
Union
)
|
otherwise
=
(
sumLog
sens
coocInter
)
/
(
sumLog
sens
coocUnion
)
where
|
sens
>
0
=
(
sumInvLog
'
sens
nbDocs
diagoInter
)
/
(
sumInvLog'
sens
nbDocs
diago
Union
)
|
otherwise
=
(
sumLog
'
sens
nbDocs
diagoInter
)
/
(
sumLog'
sens
nbDocs
diagoUnion
)
where
--------------------------------------
ngramsInter
::
[
Int
]
ngramsInter
=
intersect
ngrams
ngrams'
...
...
@@ -66,122 +66,125 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
ngramsUnion
::
[
Int
]
ngramsUnion
=
union
ngrams
ngrams'
--------------------------------------
coocInter
::
[
Double
]
coocInter
=
elems
$
map
(
/
docs
)
$
filterWithKey
(
\
(
k
,
k'
)
_
->
k
==
k'
)
$
intersectionWith
(
+
)
cooc
cooc'
-- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
--------------------------------------
coocUnion
::
[
Double
]
coocUnion
=
elems
$
map
(
/
docs
)
$
filterWithKey
(
\
(
k
,
k'
)
_
->
k
==
k'
)
$
unionWith
(
+
)
cooc
cooc'
--------------------------------------
-- | To choose a proximity function
pickProximity
::
Proximity
->
Double
->
Cooc
->
Cooc
->
[
Int
]
->
[
Int
]
->
Double
pickProximity
proximity
docs
cooc
cooc'
ngrams
ngrams'
=
case
proximity
of
WeightedLogJaccard
sens
_
_
->
weightedLogJaccard
sens
docs
cooc
cooc'
ngrams
ngrams'
Hamming
->
undefined
diagoInter
::
[
Double
]
diagoInter
=
elems
$
restrictKeys
diago
(
Set
.
fromList
ngramsInter
)
--------------------------------------
diagoUnion
::
[
Double
]
diagoUnion
=
elems
$
restrictKeys
diago
(
Set
.
fromList
ngramsUnion
)
--------------------------------------
-- | To process the proximity between a current group and a pair of targets group
toProximity
::
Map
Date
Double
->
Proximity
->
PhyloGroup
->
PhyloGroup
->
PhyloGroup
->
Double
toProximity
docs
proximity
ego
target
target'
=
let
docs'
=
sum
$
elems
docs
cooc
=
if
target
==
target'
then
(
target
^.
phylo_groupCooc
)
else
sumCooc
(
target
^.
phylo_groupCooc
)
(
target'
^.
phylo_groupCooc
)
ngrams
=
if
target
==
target'
then
(
target
^.
phylo_groupNgrams
)
else
union
(
target
^.
phylo_groupNgrams
)
(
target'
^.
phylo_groupNgrams
)
in
pickProximity
proximity
docs'
(
ego
^.
phylo_groupCooc
)
cooc
(
ego
^.
phylo_groupNgrams
)
ngrams
toProximity
::
Double
->
Map
Int
Double
->
Proximity
->
[
Int
]
->
[
Int
]
->
[
Int
]
->
Double
toProximity
nbDocs
diago
proximity
egoNgrams
targetNgrams
targetNgrams'
=
case
proximity
of
WeightedLogJaccard
sens
->
let
pairNgrams
=
if
targetNgrams
==
targetNgrams'
then
targetNgrams
else
union
targetNgrams
targetNgrams'
in
weightedLogJaccard'
sens
nbDocs
diago
egoNgrams
pairNgrams
Hamming
->
undefined
------------------------
-- | Local Matching | --
------------------------
toLastPeriod
::
Filiation
->
[
PhyloPeriodId
]
->
PhyloPeriodId
toLastPeriod
fil
periods
=
case
fil
of
ToParents
->
head'
"toLastPeriod"
(
sortOn
fst
periods
)
ToChilds
->
last'
"toLastPeriod"
(
sortOn
fst
periods
)
toLazyPairs
::
[
Pointer
]
->
Filiation
->
Double
->
Proximity
->
PhyloPeriodId
->
[(
PhyloGroup
,
PhyloGroup
)]
->
[(
PhyloGroup
,
PhyloGroup
)]
toLazyPairs
pointers
fil
thr
prox
prd
pairs
=
if
null
pointers
then
pairs
else
let
rest
=
filterPointers
prox
thr
pointers
in
if
null
rest
then
let
prd'
=
toLastPeriod
fil
(
map
(
fst
.
fst
.
fst
)
pointers
)
in
if
prd'
==
prd
then
[]
else
filter
(
\
(
g
,
g'
)
->
case
fil
of
ToParents
->
((
fst
$
g
^.
phylo_groupPeriod
)
<
(
fst
prd'
))
||
((
fst
$
g'
^.
phylo_groupPeriod
)
<
(
fst
prd'
))
ToChilds
->
((
fst
$
g
^.
phylo_groupPeriod
)
>
(
fst
prd'
))
||
((
fst
$
g'
^.
phylo_groupPeriod
)
>
(
fst
prd'
)))
pairs
else
[]
-- | Find pairs of valuable candidates to be matched
makePairs'
::
PhyloGroup
->
[
PhyloGroup
]
->
[
PhyloPeriodId
]
->
[
Pointer
]
->
Filiation
->
Double
->
Proximity
->
Map
Date
Double
->
[(
PhyloGroup
,
PhyloGroup
)]
makePairs'
ego
candidates
periods
pointers
fil
thr
prox
docs
=
case
null
periods
of
True
->
[]
False
->
toLazyPairs
pointers
fil
thr
prox
lastPrd
-- | at least on of the pair candidates should be from the last added period
$
filter
(
\
(
g
,
g'
)
->
((
g
^.
phylo_groupPeriod
)
==
lastPrd
)
||
((
g'
^.
phylo_groupPeriod
)
==
lastPrd
))
$
listToKeys
$
filter
(
\
g
->
(
g
^.
phylo_groupPeriod
==
lastPrd
)
||
((
toProximity
docs
prox
ego
ego
g
)
>=
thr
))
candidates
findLastPeriod
::
Filiation
->
[
PhyloPeriodId
]
->
PhyloPeriodId
findLastPeriod
fil
periods
=
case
fil
of
ToParents
->
head'
"findLastPeriod"
(
sortOn
fst
periods
)
ToChilds
->
last'
"findLastPeriod"
(
sortOn
fst
periods
)
-- | To filter pairs of candidates related to old pointers periods
removeOldPointers
::
[
Pointer
]
->
Filiation
->
Double
->
Proximity
->
PhyloPeriodId
->
[((
PhyloGroupId
,[
Int
]),(
PhyloGroupId
,[
Int
]))]
->
[((
PhyloGroupId
,[
Int
]),(
PhyloGroupId
,[
Int
]))]
removeOldPointers
oldPointers
fil
thr
prox
prd
pairs
|
null
oldPointers
=
pairs
|
null
(
filterPointers
prox
thr
oldPointers
)
=
let
lastMatchedPrd
=
findLastPeriod
fil
(
map
(
fst
.
fst
.
fst
)
oldPointers
)
in
if
lastMatchedPrd
==
prd
then
[]
else
filter
(
\
((
id
,
_
),(
id'
,
_
))
->
case
fil
of
ToParents
->
(((
fst
.
fst
.
fst
)
id
)
<
(
fst
lastMatchedPrd
))
||
(((
fst
.
fst
.
fst
)
id'
)
<
(
fst
lastMatchedPrd
))
ToChilds
->
(((
fst
.
fst
.
fst
)
id
)
>
(
fst
lastMatchedPrd
))
||
(((
fst
.
fst
.
fst
)
id'
)
>
(
fst
lastMatchedPrd
)))
pairs
|
otherwise
=
[]
makePairs'
::
(
PhyloGroupId
,[
Int
])
->
[(
PhyloGroupId
,[
Int
])]
->
[
PhyloPeriodId
]
->
[
Pointer
]
->
Filiation
->
Double
->
Proximity
->
Map
Date
Double
->
Map
Date
Cooc
->
[((
PhyloGroupId
,[
Int
]),(
PhyloGroupId
,[
Int
]))]
makePairs'
(
egoId
,
egoNgrams
)
candidates
periods
oldPointers
fil
thr
prox
docs
diagos
=
if
(
null
periods
)
then
[]
else
removeOldPointers
oldPointers
fil
thr
prox
lastPrd
-- | at least on of the pair candidates should be from the last added period
$
filter
(
\
((
id
,
_
),(
id'
,
_
))
->
((
fst
.
fst
)
id
==
lastPrd
)
||
((
fst
.
fst
)
id'
==
lastPrd
))
$
listToKeys
$
filter
(
\
(
id
,
ngrams
)
->
let
nbDocs
=
(
sum
.
elems
)
$
filterDocs
docs
([(
fst
.
fst
)
egoId
,
(
fst
.
fst
)
id
])
diago
=
reduceDiagos
$
filterDiago
diagos
([(
fst
.
fst
)
egoId
,
(
fst
.
fst
)
id
])
in
(
toProximity
nbDocs
diago
prox
egoNgrams
egoNgrams
ngrams
)
>=
thr
)
candidates
where
lastPrd
::
PhyloPeriodId
lastPrd
=
to
LastPeriod
fil
periods
lastPrd
::
PhyloPeriodId
lastPrd
=
find
LastPeriod
fil
periods
filterPointers
::
Proximity
->
Double
->
[
Pointer
]
->
[
Pointer
]
filterPointers
proxi
thr
pts
=
filter
(
\
(
_
,
w
)
->
filterProximity
proxi
thr
w
)
pts
phyloGroupMatching
::
[[
PhyloGroup
]]
->
Filiation
->
Proximity
->
Map
Date
Double
->
Double
->
PhyloGroup
->
PhyloGroup
phyloGroupMatching
candidates
fil
proxi
docs
thr
ego
=
case
null
nextPointers
of
-- | let's find new pointers
True
->
if
null
$
filterPointers
proxi
thr
$
getPeriodPointers
fil
ego
then
addPointers
ego
fil
TemporalPointer
[]
-- | or keep the old ones
else
addPointers
ego
fil
TemporalPointer
$
filterPointers
proxi
thr
$
getPeriodPointers
fil
ego
False
->
addPointers
ego
fil
TemporalPointer
$
head'
"phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity
$
groupBy
(
\
pt
pt'
->
snd
pt
==
snd
pt'
)
$
reverse
$
sortOn
snd
$
head'
"pointers"
$
nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold
reduceDiagos
::
Map
Date
Cooc
->
Map
Int
Double
reduceDiagos
diagos
=
mapKeys
(
\
(
k
,
_
)
->
k
)
$
foldl
(
\
acc
diago
->
unionWith
(
+
)
acc
diago
)
empty
(
elems
diagos
)
phyloGroupMatching
::
[[(
PhyloGroupId
,[
Int
])]]
->
Filiation
->
Proximity
->
Map
Date
Double
->
Map
Date
Cooc
->
Double
->
[
Pointer
]
->
(
PhyloGroupId
,[
Int
])
->
[
Pointer
]
phyloGroupMatching
candidates
fil
proxi
docs
diagos
thr
oldPointers
(
id
,
ngrams
)
=
if
(
null
$
filterPointers
proxi
thr
oldPointers
)
-- | let's find new pointers
then
if
null
nextPointers
then
[]
else
head'
"phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity
$
groupBy
(
\
pt
pt'
->
snd
pt
==
snd
pt'
)
$
reverse
$
sortOn
snd
$
head'
"pointers"
nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold
else
oldPointers
where
nextPointers
::
[[
Pointer
]]
nextPointers
=
take
1
$
dropWhile
(
null
)
-- | for each time frame, process the proximity on relevant pairs of targeted groups
$
scanl
(
\
acc
groups
->
let
periods
=
nub
$
map
_phylo_groupPeriod
$
concat
groups
docs'
=
(
filterDocs
docs
([
ego
^.
phylo_groupPeriod
]
++
periods
))
pairs
=
makePairs'
ego
(
concat
groups
)
periods
(
getPeriodPointers
fil
ego
)
fil
thr
proxi
docs
let
periods
=
nub
$
map
(
fst
.
fst
.
fst
)
$
concat
groups
nbdocs
=
sum
$
elems
$
(
filterDocs
docs
([(
fst
.
fst
)
id
]
++
periods
))
diago
=
reduceDiagos
$
filterDiago
diagos
([(
fst
.
fst
)
id
]
++
periods
)
-- | important resize nbdocs et diago dans le make pairs
pairs
=
makePairs'
(
id
,
ngrams
)
(
concat
groups
)
periods
oldPointers
fil
thr
proxi
docs
diagos
in
acc
++
(
filterPointers
proxi
thr
$
concat
$
map
(
\
(
c
,
c'
)
->
-- | process the proximity between the current group and a pair of candidates
let
proximity
=
toProximity
docs'
proxi
ego
c
c'
let
proximity
=
toProximity
nbdocs
diago
proxi
ngrams
(
snd
c
)
(
snd
c'
)
in
if
(
c
==
c'
)
then
[(
getGroupId
c
,
proximity
)]
else
[(
getGroupId
c
,
proximity
),(
getGroupId
c'
,
proximity
)]
)
pairs
))
[]
then
[(
fst
c
,
proximity
)]
else
[(
fst
c
,
proximity
),(
fst
c'
,
proximity
)]
)
pairs
))
[]
$
inits
candidates
-- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
filterDocs
::
Map
Date
Double
->
[
PhyloPeriodId
]
->
Map
Date
Double
filterDocs
d
pds
=
restrictKeys
d
$
periodsToYears
pds
filterDiago
::
Map
Date
Cooc
->
[
PhyloPeriodId
]
->
Map
Date
Cooc
filterDiago
diago
pds
=
restrictKeys
diago
$
periodsToYears
pds
-----------------------------
-- | Matching Processing | --
...
...
@@ -195,40 +198,41 @@ getNextPeriods fil max' pId pIds =
ToParents
->
take
max'
$
(
reverse
.
fst
)
$
splitAt
(
elemIndex'
pId
pIds
)
pIds
getCandidates
::
Filiation
->
PhyloGroup
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
getCandidates
fil
ego
targets
=
case
fil
of
ToChilds
->
targets'
ToParents
->
reverse
targets'
where
targets'
::
[[
PhyloGroup
]]
targets'
=
map
(
\
groups'
->
filter
(
\
g'
->
(
not
.
null
)
$
intersect
(
ego
^.
phylo_groupNgrams
)
(
g'
^.
phylo_groupNgrams
)
)
groups'
)
targets
phyloBranchMatching
::
Int
->
[
PhyloPeriodId
]
->
Proximity
->
Double
->
Map
Date
Double
->
[
PhyloGroup
]
->
[
PhyloGroup
]
phyloBranchMatching
frame
periods
proximity
thr
docs
branch
=
traceBranchMatching
proximity
thr
-- $ matchByPeriods ToParents
-- $ groupByField _phylo_groupPeriod
$
matchByPeriods
$
groupByField
_phylo_groupPeriod
branch
where
--------------------------------------
matchByPeriods
::
Map
PhyloPeriodId
[
PhyloGroup
]
->
[
PhyloGroup
]
matchByPeriods
branch'
=
foldl'
(
\
acc
prd
->
let
periodsPar
=
getNextPeriods
ToParents
frame
prd
periods
periodsChi
=
getNextPeriods
ToChilds
frame
prd
periods
candidatesPar
=
map
(
\
prd'
->
findWithDefault
[]
prd'
branch'
)
periodsPar
candidatesChi
=
map
(
\
prd'
->
findWithDefault
[]
prd'
branch'
)
periodsChi
docsPar
=
filterDocs
docs
([
prd
]
++
periodsPar
)
docsChi
=
filterDocs
docs
([
prd
]
++
periodsChi
)
egos
=
map
(
\
ego
->
phyloGroupMatching
(
getCandidates
ToParents
ego
candidatesPar
)
ToParents
proximity
docsPar
thr
$
phyloGroupMatching
(
getCandidates
ToChilds
ego
candidatesChi
)
ToChilds
proximity
docsChi
thr
ego
)
$
findWithDefault
[]
prd
branch'
egos'
=
egos
`
using
`
parList
rdeepseq
in
acc
++
egos'
)
[]
periods
getCandidates
::
PhyloGroup
->
[[(
PhyloGroupId
,[
Int
])]]
->
[[(
PhyloGroupId
,[
Int
])]]
getCandidates
ego
targets
=
map
(
\
groups'
->
filter
(
\
g'
->
(
not
.
null
)
$
intersect
(
ego
^.
phylo_groupNgrams
)
(
snd
g'
)
)
groups'
)
targets
matchGroupsToGroups
::
Int
->
[
PhyloPeriodId
]
->
Proximity
->
Double
->
Map
Date
Double
->
Map
Date
Cooc
->
[
PhyloGroup
]
->
[
PhyloGroup
]
matchGroupsToGroups
frame
periods
proximity
thr
docs
coocs
groups
=
let
groups'
=
groupByField
_phylo_groupPeriod
groups
in
foldl'
(
\
acc
prd
->
let
-- | 1) find the parents/childs matching periods
periodsPar
=
getNextPeriods
ToParents
frame
prd
periods
periodsChi
=
getNextPeriods
ToChilds
frame
prd
periods
-- | 2) find the parents/childs matching candidates
candidatesPar
=
map
(
\
prd'
->
map
(
\
g
->
(
getGroupId
g
,
g
^.
phylo_groupNgrams
))
$
findWithDefault
[]
prd'
groups'
)
periodsPar
candidatesChi
=
map
(
\
prd'
->
map
(
\
g
->
(
getGroupId
g
,
g
^.
phylo_groupNgrams
))
$
findWithDefault
[]
prd'
groups'
)
periodsChi
-- | 3) find the parents/child number of docs by years
docsPar
=
filterDocs
docs
([
prd
]
++
periodsPar
)
docsChi
=
filterDocs
docs
([
prd
]
++
periodsChi
)
-- | 4) find the parents/child diago by years
diagoPar
=
filterDiago
(
map
coocToDiago
coocs
)
([
prd
]
++
periodsPar
)
diagoChi
=
filterDiago
(
map
coocToDiago
coocs
)
([
prd
]
++
periodsPar
)
-- | 5) match in parallel all the groups (egos) to their possible candidates
egos
=
map
(
\
ego
->
let
pointersPar
=
phyloGroupMatching
(
getCandidates
ego
candidatesPar
)
ToParents
proximity
docsPar
diagoPar
thr
(
getPeriodPointers
ToParents
ego
)
(
getGroupId
ego
,
ego
^.
phylo_groupNgrams
)
pointersChi
=
phyloGroupMatching
(
getCandidates
ego
candidatesChi
)
ToChilds
proximity
docsChi
diagoChi
thr
(
getPeriodPointers
ToChilds
ego
)
(
getGroupId
ego
,
ego
^.
phylo_groupNgrams
)
in
addPointers
ToChilds
TemporalPointer
pointersChi
$
addPointers
ToParents
TemporalPointer
pointersPar
ego
)
$
findWithDefault
[]
prd
groups'
egos'
=
egos
`
using
`
parList
rdeepseq
in
acc
++
egos'
)
[]
periods
-----------------------
...
...
@@ -236,85 +240,38 @@ phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatchin
-----------------------
count
::
Eq
a
=>
a
->
[
a
]
->
Int
count
x
=
length
.
filter
(
==
x
)
termFreq'
::
Int
->
[
PhyloGroup
]
->
Double
termFreq'
term
groups
=
let
ngrams
=
concat
$
map
_phylo_groupNgrams
groups
in
log
((
fromIntegral
$
count
term
ngrams
)
/
(
fromIntegral
$
length
ngrams
))
relevantBranches
::
Int
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
relevantBranches
term
branches
=
filter
(
\
groups
->
(
any
(
\
group
->
elem
term
$
group
^.
phylo_groupNgrams
)
groups
))
branches
branchCov'
::
[
PhyloGroup
]
->
[[
PhyloGroup
]]
->
Double
branchCov'
branch
branches
=
(
fromIntegral
$
length
branch
)
/
(
fromIntegral
$
length
$
concat
branches
)
toRecall
::
Double
->
Int
->
Int
->
[[
PhyloGroup
]]
->
Double
toRecall
freq
term
border
branches
=
-- | given a random term in a phylo
freq
-- | for each relevant branches
*
(
sum
$
map
(
\
branch
->
-- | given its local coverage
((
branchCov'
branch
branches'
)
/
(
sum
$
map
(
\
b
->
branchCov'
b
branches'
)
branches'
))
-- | compute the local recall
*
(
(
fromIntegral
$
length
$
filter
(
\
group
->
elem
term
$
group
^.
phylo_groupNgrams
)
branch
)
/
(
(
fromIntegral
$
length
$
filter
(
\
group
->
elem
term
$
group
^.
phylo_groupNgrams
)
$
concat
branches'
)
-- | with a ponderation from border branches
+
(
fromIntegral
border
))
))
branches'
)
where
branches'
::
[[
PhyloGroup
]]
branches'
=
relevantBranches
term
branches
toAccuracy
::
Double
->
Int
->
[[
PhyloGroup
]]
->
Double
toAccuracy
freq
term
branches
=
if
(
null
branches
)
then
0
else
-- | given a random term in a phylo
freq
-- | for each relevant branches
*
(
sum
$
map
(
\
branch
->
-- | given its local coverage
((
branchCov'
branch
branches'
)
/
(
sum
$
map
(
\
b
->
branchCov'
b
branches'
)
branches'
))
-- | compute the local accuracy
*
(
(
fromIntegral
$
length
$
filter
(
\
group
->
elem
term
$
group
^.
phylo_groupNgrams
)
branch
)
/
(
fromIntegral
$
length
branch
)))
branches'
)
where
branches'
::
[[
PhyloGroup
]]
branches'
=
relevantBranches
term
branches
toPhyloQuality
::
Double
->
Map
Int
Double
->
Int
->
Double
->
[[
PhyloGroup
]]
->
Double
toPhyloQuality
beta
frequency
border
oldAcc
branches
=
-- trace (" rec : " <> show(recall)) $
-- trace (" acc : " <> show(accuracy)) $
if
(
null
branches
)
then
0
else
((
1
+
beta
**
2
)
*
accuracy
*
recall
)
/
(((
beta
**
2
)
*
accuracy
+
recall
))
where
-- | for each term compute the global accuracy
accuracy
::
Double
accuracy
=
oldAcc
+
(
sum
$
map
(
\
term
->
toAccuracy
(
frequency
!
term
)
term
branches
)
$
keys
frequency
)
-- | for each term compute the global recall
recall
::
Double
recall
=
sum
$
map
(
\
term
->
toRecall
(
frequency
!
term
)
term
border
branches
)
$
keys
frequency
fScore
::
Double
->
Int
->
[
PhyloGroup
]
->
[[
PhyloGroup
]]
->
Double
fScore
beta
i
bk
bks
=
let
recall
=
(
(
fromIntegral
$
length
$
filter
(
\
g
->
elem
i
$
g
^.
phylo_groupNgrams
)
bk
)
/
(
fromIntegral
$
length
$
filter
(
\
g
->
elem
i
$
g
^.
phylo_groupNgrams
)
$
concat
bks
))
accuracy
=
(
(
fromIntegral
$
length
$
filter
(
\
g
->
elem
i
$
g
^.
phylo_groupNgrams
)
bk
)
/
(
fromIntegral
$
length
bk
))
in
((
1
+
beta
**
2
)
*
accuracy
*
recall
)
/
(((
beta
**
2
)
*
accuracy
+
recall
))
toBorderAccuracy
::
Map
Int
Double
->
[[
PhyloGroup
]
]
->
Double
toBorderAccuracy
freq
branches
=
sum
$
map
(
\
t
->
toAccuracy
(
freq
!
t
)
t
branches
)
$
keys
freq
wk
::
[
PhyloGroup
]
->
Double
wk
bk
=
fromIntegral
$
length
bk
-----------------------------
-- | Adaptative Matching | --
-----------------------------
toPhyloQuality'
::
Double
->
Map
Int
Double
->
[[
PhyloGroup
]]
->
Double
toPhyloQuality'
beta
freq
branches
=
if
(
null
branches
)
then
0
else
sum
$
map
(
\
i
->
let
bks
=
relevantBranches
i
branches
in
(
freq
!
i
)
*
(
sum
$
map
(
\
bk
->
((
wk
bk
)
/
(
sum
$
map
wk
bks
))
*
(
fScore
beta
i
bk
bks
))
bks
))
$
keys
freq
------------------------------------
-- | Constant Temporal Matching | --
------------------------------------
groupsToBranches
::
Map
PhyloGroupId
PhyloGroup
->
[[
PhyloGroup
]]
...
...
@@ -326,6 +283,7 @@ groupsToBranches groups =
++
(
map
fst
$
group
^.
phylo_groupPeriodParents
)
++
(
map
fst
$
group
^.
phylo_groupPeriodChilds
)
)
$
elems
groups
-- | first find the related components by inside each ego's period
-- | a supprimer
graph'
=
map
relatedComponents
egos
-- | then run it for the all the periods
graph
=
zip
[
1
..
]
...
...
@@ -341,81 +299,248 @@ reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
reduceFrequency
frequency
branches
=
restrictKeys
frequency
(
Set
.
fromList
$
(
nub
.
concat
)
$
map
_phylo_groupNgrams
$
concat
branches
)
alterBorder
::
Int
->
[[
PhyloGroup
]]
->
[
PhyloGroup
]
->
Int
alterBorder
border
branches
branch
=
border
+
(
length
$
concat
branches
)
-
(
length
branch
)
recursiveMatching
::
Proximity
->
Double
->
Int
->
Map
Int
Double
->
Double
->
Int
->
[
PhyloPeriodId
]
->
Map
Date
Double
->
Double
->
Int
->
Double
->
[
PhyloGroup
]
->
[
PhyloGroup
]
recursiveMatching
proximity
beta
minBranch
frequency
egoThr
frame
periods
docs
quality
border
oldAcc
groups
=
if
((
egoThr
>=
1
)
||
(
quality
>
quality'
)
||
((
length
$
concat
$
snd
branches'
)
==
(
length
groups
)))
then
trace
(
" ✗ F(β) = "
<>
show
(
quality
)
<>
" (vs) "
<>
show
(
quality'
)
<>
"
\n
"
<>
" |✓ "
<>
show
(
length
$
fst
branches'
)
<>
show
(
map
length
$
fst
branches'
)
<>
" |✗ "
<>
show
(
length
$
snd
branches'
)
<>
"["
<>
show
(
length
$
concat
$
snd
branches'
)
<>
"]"
)
$
groups
else
let
next
=
map
(
\
b
->
recursiveMatching
proximity
beta
minBranch
(
reduceFrequency
frequency
(
fst
branches'
))
(
egoThr
+
(
getThresholdStep
proximity
))
frame
periods
docs
quality'
(
alterBorder
border
(
fst
branches'
)
b
)
(
oldAcc
+
(
toBorderAccuracy
frequency
(
delete
b
((
fst
branches'
)
++
(
snd
branches'
)))))
b
)
(
fst
branches'
)
in
trace
(
" ✓ F(β) = "
<>
show
(
quality
)
<>
" (vs) "
<>
show
(
quality'
)
<>
"
\n
"
<>
" |✓ "
<>
show
(
length
$
fst
branches'
)
<>
show
(
map
length
$
fst
branches'
)
<>
" |✗ "
<>
show
(
length
$
snd
branches'
)
<>
"["
<>
show
(
length
$
concat
$
snd
branches'
)
<>
"]"
)
$
concat
(
next
++
(
snd
branches'
))
updateThr
::
Double
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
updateThr
thr
branches
=
map
(
\
b
->
map
(
\
g
->
g
&
phylo_groupMeta
.~
(
singleton
"seaLevels"
(((
g
^.
phylo_groupMeta
)
!
"seaLevels"
)
++
[
thr
])))
b
)
branches
-- | Sequentially break each branch of a phylo where
-- done = all the allready broken branches
-- ego = the current branch we want to break
-- rest = the branches we still have to break
breakBranches
::
Proximity
->
Double
->
Map
Int
Double
->
Int
->
Double
->
Double
->
Double
->
Int
->
Map
Date
Double
->
Map
Date
Cooc
->
[
PhyloPeriodId
]
->
[([
PhyloGroup
],
Bool
)]
->
([
PhyloGroup
],
Bool
)
->
[([
PhyloGroup
],
Bool
)]
->
[([
PhyloGroup
],
Bool
)]
breakBranches
proximity
beta
frequency
minBranch
thr
depth
elevation
frame
docs
coocs
periods
done
ego
rest
=
-- | 1) keep or not the new division of ego
let
done'
=
done
++
(
if
snd
ego
then
(
if
((
null
(
fst
ego'
))
||
(
quality
>
quality'
))
then
-- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
-- <> " | " <> show(length $ fst ego) <> " groups : "
-- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
-- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
[(
fst
ego
,
False
)]
else
-- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
-- <> " | " <> show(length $ fst ego) <> " groups : "
-- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
-- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
((
map
(
\
e
->
(
e
,
True
))
(
fst
ego'
))
++
(
map
(
\
e
->
(
e
,
False
))
(
snd
ego'
))))
else
[
ego
])
in
-- | 2) if there is no more branches in rest then return else continue
if
null
rest
then
done'
else
breakBranches
proximity
beta
frequency
minBranch
thr
depth
elevation
frame
docs
coocs
periods
done'
(
head'
"breakBranches"
rest
)
(
tail'
"breakBranches"
rest
)
where
--------------------------------------
quality
::
Double
quality
=
toPhyloQuality'
beta
frequency
((
map
fst
done
)
++
[
fst
ego
]
++
(
map
fst
rest
))
--------------------------------------
ego'
::
([[
PhyloGroup
]],[[
PhyloGroup
]])
ego'
=
let
branches
=
groupsToBranches
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
matchGroupsToGroups
frame
periods
proximity
thr
docs
coocs
(
fst
ego
)
branches'
=
branches
`
using
`
parList
rdeepseq
in
partition
(
\
b
->
(
length
$
nub
$
map
_phylo_groupPeriod
b
)
>=
minBranch
)
$
thrToMeta
thr
$
depthToMeta
(
elevation
-
depth
)
branches'
--------------------------------------
quality'
::
Double
quality'
=
toPhyloQuality'
beta
frequency
((
map
fst
done
)
++
(
fst
ego'
)
++
(
snd
ego'
)
++
(
map
fst
rest
))
seaLevelMatching
::
Proximity
->
Double
->
Int
->
Map
Int
Double
->
Double
->
Double
->
Double
->
Double
->
Int
->
[
PhyloPeriodId
]
->
Map
Date
Double
->
Map
Date
Cooc
->
[([
PhyloGroup
],
Bool
)]
->
[([
PhyloGroup
],
Bool
)]
seaLevelMatching
proximity
beta
minBranch
frequency
thr
step
depth
elevation
frame
periods
docs
coocs
branches
=
-- | if there is no branch to break or if seaLvl level > 1 then end
if
(
thr
>=
1
)
||
((
not
.
or
)
$
map
snd
branches
)
then
branches
else
-- | break all the possible branches at the current seaLvl level
let
branches'
=
breakBranches
proximity
beta
frequency
minBranch
thr
depth
elevation
frame
docs
coocs
periods
[]
(
head'
"seaLevelMatching"
branches
)
(
tail'
"seaLevelMatching"
branches
)
frequency'
=
reduceFrequency
frequency
(
map
fst
branches'
)
in
seaLevelMatching
proximity
beta
minBranch
frequency'
(
thr
+
step
)
step
(
depth
-
1
)
elevation
frame
periods
docs
coocs
branches'
constanteTemporalMatching
::
Double
->
Double
->
Phylo
->
Phylo
constanteTemporalMatching
start
step
phylo
=
updatePhyloGroups
1
(
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
traceMatchEnd
$
concat
branches
)
phylo
where
-- | 2) process the temporal matching by elevating seaLvl level
branches
::
[[
PhyloGroup
]]
branches
=
map
fst
$
seaLevelMatching
(
phyloProximity
$
getConfig
phylo
)
(
_qua_granularity
$
phyloQuality
$
getConfig
phylo
)
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
)
(
phylo
^.
phylo_termFreq
)
start
step
((((
1
-
start
)
/
step
)
-
1
))
(((
1
-
start
)
/
step
))
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phylo
^.
phylo_timeDocs
)
(
phylo
^.
phylo_timeCooc
)
groups
-- | 1) for each group process an initial temporal Matching
-- | here we suppose that all the groups of level 1 are part of the same big branch
groups
::
[([
PhyloGroup
],
Bool
)]
groups
=
map
(
\
b
->
(
b
,(
length
$
nub
$
map
_phylo_groupPeriod
b
)
>=
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
)))
$
groupsToBranches
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
matchGroupsToGroups
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phyloProximity
$
getConfig
phylo
)
start
(
phylo
^.
phylo_timeDocs
)
(
phylo
^.
phylo_timeCooc
)
(
traceTemporalMatching
$
getGroupsFromLevel
1
phylo
)
--------------------------------------
-- | Adaptative Temporal Matching | --
--------------------------------------
thrToMeta
::
Double
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
thrToMeta
thr
branches
=
map
(
\
b
->
map
(
\
g
->
g
&
phylo_groupMeta
.~
(
adjust
(
\
lst
->
lst
++
[
thr
])
"seaLevels"
(
g
^.
phylo_groupMeta
)))
b
)
branches
depthToMeta
::
Double
->
[[
PhyloGroup
]]
->
[[
PhyloGroup
]]
depthToMeta
depth
branches
=
let
break
=
length
branches
>
1
in
map
(
\
b
->
map
(
\
g
->
if
break
then
g
&
phylo_groupMeta
.~
(
adjust
(
\
lst
->
lst
++
[
depth
])
"breaks"
(
g
^.
phylo_groupMeta
))
else
g
)
b
)
branches
reduceTupleMapByKeys
::
Eq
a
=>
[
a
]
->
Map
(
a
,
a
)
Double
->
Map
(
a
,
a
)
Double
reduceTupleMapByKeys
ks
m
=
filterWithKey
(
\
(
k
,
k'
)
_
->
(
elem
k
ks
)
&&
(
elem
k'
ks
))
m
getInTupleMap
::
Ord
a
=>
Map
(
a
,
a
)
Double
->
a
->
a
->
Double
getInTupleMap
m
k
k'
|
isJust
(
m
!?
(
k
,
k'
))
=
m
!
(
k
,
k'
)
|
isJust
(
m
!?
(
k'
,
k
))
=
m
!
(
k'
,
k
)
|
otherwise
=
0
toThreshold
::
Double
->
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
->
Double
toThreshold
lvl
proxiGroups
=
let
idx
=
((
Map
.
size
proxiGroups
)
`
div
`
(
floor
lvl
))
-
1
in
if
idx
>=
0
then
(
sort
$
elems
proxiGroups
)
!!
idx
else
1
-- done = all the allready broken branches
-- ego = the current branch we want to break
-- rest = the branches we still have to break
adaptativeBreakBranches
::
Proximity
->
Double
->
Double
->
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
->
Double
->
Map
Int
Double
->
Int
->
Int
->
Map
Date
Double
->
Map
Date
Cooc
->
[
PhyloPeriodId
]
->
[([
PhyloGroup
],(
Bool
,[
Double
]))]
->
([
PhyloGroup
],(
Bool
,[
Double
]))
->
[([
PhyloGroup
],(
Bool
,[
Double
]))]
->
[([
PhyloGroup
],(
Bool
,[
Double
]))]
adaptativeBreakBranches
proxiConf
depth
elevation
groupsProxi
beta
frequency
minBranch
frame
docs
coocs
periods
done
ego
rest
=
-- | 1) keep or not the new division of ego
let
done'
=
done
++
(
if
(
fst
.
snd
)
ego
then
(
if
((
null
(
fst
ego'
))
||
(
quality
>
quality'
))
then
[(
concat
$
thrToMeta
thr
$
[
fst
ego
],(
False
,
((
snd
.
snd
)
ego
)))]
else
(
(
map
(
\
e
->
(
e
,(
True
,
((
snd
.
snd
)
ego
)
++
[
thr
])))
(
fst
ego'
))
++
(
map
(
\
e
->
(
e
,(
False
,
((
snd
.
snd
)
ego
))))
(
snd
ego'
))))
else
[(
concat
$
thrToMeta
thr
$
[
fst
ego
],
snd
ego
)])
in
-- | uncomment let .. in for debugging
-- let part1 = partition (snd) done'
-- part2 = partition (snd) rest
-- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
-- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
-- ) $
-- | 2) if there is no more branches in rest then return else continue
if
null
rest
then
done'
else
adaptativeBreakBranches
proxiConf
depth
elevation
groupsProxi
beta
frequency
minBranch
frame
docs
coocs
periods
done'
(
head'
"breakBranches"
rest
)
(
tail'
"breakBranches"
rest
)
where
-- | 2) for each of the possible next branches process the phyloQuality score
--------------------------------------
thr
::
Double
thr
=
toThreshold
depth
$
Map
.
filter
(
\
v
->
v
>
(
last'
"breakBranches"
$
(
snd
.
snd
)
ego
))
$
reduceTupleMapByKeys
(
map
getGroupId
$
fst
ego
)
groupsProxi
--------------------------------------
quality
::
Double
quality
=
toPhyloQuality'
beta
frequency
((
map
fst
done
)
++
[
fst
ego
]
++
(
map
fst
rest
))
--------------------------------------
ego'
::
([[
PhyloGroup
]],[[
PhyloGroup
]])
ego'
=
let
branches
=
groupsToBranches
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
matchGroupsToGroups
frame
periods
proxiConf
thr
docs
coocs
(
fst
ego
)
branches'
=
branches
`
using
`
parList
rdeepseq
in
partition
(
\
b
->
(
length
$
nub
$
map
_phylo_groupPeriod
b
)
>
minBranch
)
$
thrToMeta
thr
$
depthToMeta
(
elevation
-
depth
)
branches'
--------------------------------------
quality'
::
Double
quality'
=
toPhyloQuality
beta
frequency
border
oldAcc
((
fst
branches'
)
++
(
snd
branches'
))
-- | 1) for each local branch process a temporal matching then find the resulting branches
branches'
::
([[
PhyloGroup
]],[[
PhyloGroup
]])
branches'
=
let
branches
=
groupsToBranches
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
phyloBranchMatching
frame
periods
proximity
egoThr
docs
groups
in
partition
(
\
b
->
length
b
>=
minBranch
)
(
branches
`
using
`
parList
rdeepseq
)
temporalMatching
::
Phylo
->
Phylo
temporalMatching
phylo
=
updatePhyloGroups
1
branches'
phylo
quality'
=
toPhyloQuality'
beta
frequency
((
map
fst
done
)
++
(
fst
ego'
)
++
(
snd
ego'
)
++
(
map
fst
rest
))
adaptativeSeaLevelMatching
::
Proximity
->
Double
->
Double
->
Map
(
PhyloGroupId
,
PhyloGroupId
)
Double
->
Double
->
Int
->
Map
Int
Double
->
Int
->
[
PhyloPeriodId
]
->
Map
Date
Double
->
Map
Date
Cooc
->
[([
PhyloGroup
],(
Bool
,[
Double
]))]
->
[([
PhyloGroup
],(
Bool
,[
Double
]))]
adaptativeSeaLevelMatching
proxiConf
depth
elevation
groupsProxi
beta
minBranch
frequency
frame
periods
docs
coocs
branches
=
-- | if there is no branch to break or if seaLvl level >= depth then end
if
(
Map
.
null
groupsProxi
)
||
(
depth
<=
0
)
||
((
not
.
or
)
$
map
(
fst
.
snd
)
branches
)
then
branches
else
-- | break all the possible branches at the current seaLvl level
let
branches'
=
adaptativeBreakBranches
proxiConf
depth
elevation
groupsProxi
beta
frequency
minBranch
frame
docs
coocs
periods
[]
(
head'
"seaLevelMatching"
branches
)
(
tail'
"seaLevelMatching"
branches
)
frequency'
=
reduceFrequency
frequency
(
map
fst
branches'
)
groupsProxi'
=
reduceTupleMapByKeys
(
map
(
getGroupId
)
$
concat
$
map
(
fst
)
$
filter
(
fst
.
snd
)
branches'
)
groupsProxi
-- thr = toThreshold depth groupsProxi
in
trace
(
"
\n
"
<>
foldl
(
\
acc
_
->
acc
<>
"🌊 "
)
""
[
0
..
(
elevation
-
depth
)]
<>
" [✓ "
<>
show
(
length
$
filter
(
fst
.
snd
)
branches'
)
<>
"("
<>
show
(
length
$
concat
$
map
(
fst
)
$
filter
(
fst
.
snd
)
branches'
)
<>
")|✗ "
<>
show
(
length
$
filter
(
not
.
fst
.
snd
)
branches'
)
<>
"("
<>
show
(
length
$
concat
$
map
(
fst
)
$
filter
(
not
.
fst
.
snd
)
branches'
)
<>
")]"
<>
" thr = "
)
$
adaptativeSeaLevelMatching
proxiConf
(
depth
-
1
)
elevation
groupsProxi'
beta
minBranch
frequency'
frame
periods
docs
coocs
branches'
adaptativeTemporalMatching
::
Double
->
Phylo
->
Phylo
adaptativeTemporalMatching
elevation
phylo
=
updatePhyloGroups
1
(
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
traceMatchEnd
$
concat
branches
)
phylo
where
-- | 5) apply the recursive matching
branches'
::
Map
PhyloGroupId
PhyloGroup
branches'
=
let
next
=
trace
(
" ✓ F(β) = "
<>
show
(
quality
)
<>
" |✓ "
<>
show
(
length
$
fst
branches
)
<>
show
(
map
length
$
fst
branches
)
<>
" |✗ "
<>
show
(
length
$
snd
branches
)
<>
"["
<>
show
(
length
$
concat
$
snd
branches
)
<>
"]"
)
$
map
(
\
branch
->
recursiveMatching
(
phyloProximity
$
getConfig
phylo
)
(
_qua_relevance
$
phyloQuality
$
getConfig
phylo
)
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
)
(
reduceFrequency
frequency
(
fst
branches
))
(
(
getThresholdInit
$
phyloProximity
$
getConfig
phylo
)
+
(
getThresholdStep
$
phyloProximity
$
getConfig
phylo
))
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phylo
^.
phylo_timeDocs
)
quality
(
alterBorder
0
(
fst
branches
)
branch
)
(
toBorderAccuracy
frequency
(
delete
branch
((
fst
branches
)
++
(
snd
branches
))))
branch
)
(
fst
branches
)
in
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
traceMatchEnd
(
concat
(
next
++
(
snd
branches
)))
-- | 4) process the quality score
quality
::
Double
quality
=
toPhyloQuality
(
_qua_relevance
$
phyloQuality
$
getConfig
phylo
)
frequency
0
0
((
fst
branches
)
++
(
snd
branches
))
-- | 3) process the constants of the quality score
frequency
::
Map
Int
Double
frequency
=
let
terms
=
ngramsInBranches
((
fst
branches
)
++
(
snd
branches
))
freqs
=
map
(
\
t
->
termFreq'
t
$
concat
((
fst
branches
)
++
(
snd
branches
)))
terms
in
fromList
$
map
(
\
(
t
,
freq
)
->
(
t
,
freq
/
(
sum
freqs
)))
$
zip
terms
freqs
-- | 2) group into branches
branches
::
([[
PhyloGroup
]],[[
PhyloGroup
]])
branches
=
partition
(
\
b
->
length
b
>=
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
))
$
groupsToBranches
$
fromList
$
map
(
\
group
->
(
getGroupId
group
,
group
))
groups'
-- | 2) process the temporal matching by elevating seaLvl level
branches
::
[[
PhyloGroup
]]
branches
=
map
fst
$
adaptativeSeaLevelMatching
(
phyloProximity
$
getConfig
phylo
)
(
elevation
-
1
)
elevation
(
phylo
^.
phylo_groupsProxi
)
(
_qua_granularity
$
phyloQuality
$
getConfig
phylo
)
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
)
(
phylo
^.
phylo_termFreq
)
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phylo
^.
phylo_timeDocs
)
(
phylo
^.
phylo_timeCooc
)
groups
-- | 1) for each group process an initial temporal Matching
groups'
::
[
PhyloGroup
]
groups'
=
phyloBranchMatching
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phyloProximity
$
getConfig
phylo
)
(
getThresholdInit
$
phyloProximity
$
getConfig
phylo
)
(
phylo
^.
phylo_timeDocs
)
(
traceTemporalMatching
$
getGroupsFromLevel
1
phylo
)
-- | here we suppose that all the groups of level 1 are part of the same big branch
groups
::
[([
PhyloGroup
],(
Bool
,[
Double
]))]
groups
=
map
(
\
b
->
(
b
,((
length
$
nub
$
map
_phylo_groupPeriod
b
)
>=
(
_qua_minBranch
$
phyloQuality
$
getConfig
phylo
),[
thr
])))
$
groupsToBranches
$
fromList
$
map
(
\
g
->
(
getGroupId
g
,
g
))
$
matchGroupsToGroups
(
getTimeFrame
$
timeUnit
$
getConfig
phylo
)
(
getPeriodIds
phylo
)
(
phyloProximity
$
getConfig
phylo
)
thr
(
phylo
^.
phylo_timeDocs
)
(
phylo
^.
phylo_timeCooc
)
(
traceTemporalMatching
$
getGroupsFromLevel
1
phylo
)
--------------------------------------
thr
::
Double
thr
=
toThreshold
elevation
(
phylo
^.
phylo_groupsProxi
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment