Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
15511563
Commit
15511563
authored
Jun 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[WORKFLOW] clean, issue in map2mat: diagonal == 0.
parent
e079af35
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
103 additions
and
67 deletions
+103
-67
Pipeline.hs
src/Gargantext/Pipeline.hs
+72
-44
Metrics.hs
src/Gargantext/Text/Metrics.hs
+28
-18
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+3
-3
Matrice.hs
src/Gargantext/Viz/Graph/Distances/Matrice.hs
+0
-2
No files found.
src/Gargantext/Pipeline.hs
View file @
15511563
...
...
@@ -30,10 +30,10 @@ import Gargantext.Prelude
import
Prelude
(
print
,
seq
)
import
Gargantext.Viz.Graph.Index
(
score
,
createIndices
,
toIndex
,
fromIndex
,
cooc2mat
,
map2mat
,
mat2map
)
import
Gargantext.Viz.Graph.Distances.Matrice
(
conditional'
,
conditional
)
import
Gargantext.Viz.Graph.Distances.Matrice
(
conditional'
,
conditional
,
distributional
)
import
Gargantext.Viz.Graph.Index
(
Index
)
import
Gargantext.Viz.Graph
(
Graph
(
..
),
Node
(
..
),
Edge
(
..
),
Attributes
(
..
),
TypeNode
(
..
))
import
Gargantext.Text.Metrics.Count
(
cooc
,
removeApax
)
import
Gargantext.Text.Metrics.Count
(
cooc
)
import
Gargantext.Text.Metrics
import
Gargantext.Text.Terms
(
TermType
(
Multi
,
Mono
),
extractTerms
)
import
Gargantext.Text.Context
(
splitBy
,
SplitContext
(
Sentences
))
...
...
@@ -51,19 +51,80 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
-}
workflow
lang
path
=
do
-- Text <- IO Text <- FilePath
text
<-
readFile
path
-- context :: Text -> [Text]
let
contexts
=
splitBy
(
Sentences
5
)
text
myterms
<-
extractTerms
Mono
lang
contexts
-- myterms <- extractTerms (Mono lang) contexts # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList)
printDebug
"myterms"
(
sum
$
map
length
myterms
)
-- Bulding the map list
let
myCooc1
=
cooc
myterms
printDebug
"myCooc1"
(
M
.
size
myCooc1
)
-- Remove Apax: appears one time only => lighting the matrix
let
myCooc2
=
M
.
filter
(
>
1
)
myCooc1
printDebug
"myCooc2"
(
M
.
size
myCooc2
)
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
let
myCooc3
=
filterCooc
(
FilterConfig
(
MapListSize
20
)
(
InclusionSize
1000
)
(
SampleBins
10
)
(
Clusters
3
)
(
DefaultValue
(
-
1
))
)
myCooc2
printDebug
"myCooc3"
$
M
.
size
myCooc3
-- Cooc -> Matrix
let
(
ti
,
fi
)
=
createIndices
myCooc3
printDebug
"ti"
$
M
.
size
ti
let
myCooc4
=
toIndex
ti
myCooc3
printDebug
"myCooc4"
$
M
.
size
myCooc4
let
matCooc
=
map2mat
(
-
2
)
(
M
.
size
ti
)
myCooc4
printDebug
"matCooc"
matCooc
pure
matCooc
-- Matrix -> Clustering
--let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
-- printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat
--
-- let distanceMap = mat2map distanceMat
-- printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
-- partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
-- printDebug "partitions" $ length partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph
::
[(
Label
,
Int
)]
->
Map
(
Int
,
Int
)
Int
->
Map
(
Int
,
Int
)
Double
->
[
LouvainNode
]
->
Graph
data2graph
::
[(
Label
,
Int
)]
->
Map
(
Int
,
Int
)
Int
->
Map
(
Int
,
Int
)
Double
->
[
LouvainNode
]
->
Graph
data2graph
labels
coocs
distance
partitions
=
Graph
nodes
edges
where
community_id_by_node_id
=
M
.
fromList
[
(
n
,
c
)
|
LouvainNode
n
c
<-
partitions
]
nodes
=
[
Node
{
n_size
=
coocs
M
.!
(
n
,
n
)
-- TODO lookup with default ?
nodes
=
[
Node
{
n_size
=
maybe
0
identity
(
M
.
lookup
(
n
,
n
)
coocs
)
,
n_type
=
Terms
-- or Unknown
,
n_id
=
cs
(
show
n
)
,
n_label
=
T
.
unwords
l
,
n_attributes
=
-- TODO lookup with default ?
Attributes
{
clust_default
=
community_id_by_node_id
M
.!
n
}
}
,
n_attributes
=
Attributes
{
clust_default
=
maybe
0
identity
(
M
.
lookup
n
community_id_by_node_id
)
}
}
|
(
l
,
n
)
<-
labels
]
edges
=
[
Edge
{
e_source
=
s
,
e_target
=
t
...
...
@@ -72,42 +133,9 @@ data2graph labels coocs distance partitions = Graph nodes edges
|
(
i
,
((
s
,
t
),
w
))
<-
zip
[
0
..
]
(
M
.
toList
distance
)
]
-----------------------------------------------------------
-- printDebug msg x = putStrLn $ msg <> " " <> show x
printDebug
_
_
=
pure
()
printDebug
msg
x
=
putStrLn
$
msg
<>
" "
<>
show
x
--printDebug _ _ = pure ()
workflow
lang
path
=
do
-- Text <- IO Text <- FilePath
text
<-
readFile
path
let
contexts
=
splitBy
(
Sentences
5
)
text
myterms
<-
extractTerms
Mono
lang
contexts
printDebug
"myterms"
$
sum
$
map
length
myterms
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let
myCooc1
=
cooc
myterms
printDebug
"myCooc1"
$
M
.
size
myCooc1
let
myCooc2
=
removeApax
myCooc1
printDebug
"myCooc2"
$
M
.
size
myCooc2
let
myCooc3
=
filterCooc
myCooc2
printDebug
"myCooc3"
$
M
.
size
myCooc3
-- Cooc -> Matrix
let
(
ti
,
fi
)
=
createIndices
myCooc3
printDebug
"ti"
$
M
.
size
ti
let
myCooc4
=
toIndex
ti
myCooc3
printDebug
"myCooc4"
$
M
.
size
myCooc4
let
matCooc
=
map2mat
0
(
M
.
size
ti
)
myCooc4
-- Matrix -> Clustering
let
distanceMat
=
conditional
matCooc
printDebug
"distanceMat"
$
A
.
arrayShape
distanceMat
let
distanceMap
=
mat2map
distanceMat
printDebug
"distanceMap"
$
M
.
size
distanceMap
{-
let distance = fromIndex fi distanceMap
printDebug "distance" $ M.size distance
-}
partitions
<-
cLouvain
distanceMap
---- | Building : -> Graph -> JSON
printDebug
"partitions"
$
length
partitions
pure
$
data2graph
(
M
.
toList
ti
)
myCooc4
distanceMap
partitions
src/Gargantext/Text/Metrics.hs
View file @
15511563
...
...
@@ -58,48 +58,58 @@ import GHC.Real (round)
import
Debug.Trace
import
Prelude
(
seq
)
filterCooc
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc
cc
=
filterCooc'
ts
cc
data
MapListSize
=
MapListSize
Int
data
InclusionSize
=
InclusionSize
Int
data
SampleBins
=
SampleBins
Double
data
Clusters
=
Clusters
Int
data
DefaultValue
=
DefaultValue
Int
data
FilterConfig
=
FilterConfig
{
fc_mapListSize
::
MapListSize
,
fc_inclusionSize
::
InclusionSize
,
fc_sampleBins
::
SampleBins
,
fc_clusters
::
Clusters
,
fc_defaultValue
::
DefaultValue
}
filterCooc
::
Ord
t
=>
FilterConfig
->
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc
fc
cc
=
(
filterCooc'
fc
)
ts
cc
where
ts
=
map
_scored_terms
$
takeSome
350
5
2
$
coocScored
cc
ts
=
map
_scored_terms
$
takeSome
fc
$
coocScored
cc
filterCooc'
::
Ord
t
=>
[
t
]
->
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc'
ts
m
=
-- trace ("coocScored " <> show (length ts)) $
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
errMessage
identity
$
M
.
lookup
k
m
)
m'
)
filterCooc'
::
Ord
t
=>
FilterConfig
->
[
t
]
->
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc'
(
FilterConfig
_
_
_
_
(
DefaultValue
dv
))
ts
m
=
-- trace ("coocScored " <> show (length ts)) $
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
dv
identity
$
M
.
lookup
k
m
)
m'
)
M
.
empty
selection
where
errMessage
=
panic
"Filter cooc: no key"
selection
=
[(
x
,
y
)
|
x
<-
ts
,
y
<-
ts
,
x
>
y
]
type
MapListSize
=
Int
type
SampleBins
=
Double
type
Clusters
=
Int
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome
::
Ord
t
=>
MapListSize
->
SampleBins
->
Clusters
->
[
Scored
t
]
->
[
Scored
t
]
takeSome
l
s
k
scores
=
L
.
take
l
takeSome
::
Ord
t
=>
FilterConfig
->
[
Scored
t
]
->
[
Scored
t
]
takeSome
(
FilterConfig
(
MapListSize
l
)
(
InclusionSize
l'
)
(
SampleBins
s
)
(
Clusters
k
)
_
)
scores
=
L
.
take
l
$
takeSample
n
m
$
splitKmeans
k
scores
$
L
.
take
l'
$
L
.
reverse
$
L
.
sortOn
_scored_incExc
scores
-- $ splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
splitKmeans
x
xs
=
elements
$
V
.
head
splitKmeans
x
xs
=
L
.
concat
$
map
elements
$
V
.
take
(
k
-
1
)
$
kmeans
(
\
i
->
VU
.
fromList
([(
_scored_incExc
i
::
Double
)]))
euclidSq
x
xs
n
=
round
((
fromIntegral
l
)
/
s
)
m
=
round
$
(
fromIntegral
$
length
scores
)
/
(
s
)
takeSample
n
m
xs
=
-- trace ("splitKmeans " <> show (length xs)) $
L
.
concat
$
map
(
L
.
take
n
)
$
L
.
reverse
$
map
(
L
.
sortOn
_scored_incExc
)
$
map
(
reverse
.
(
L
.
sortOn
_scored_incExc
)
)
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$
splitEvery
m
$
splitEvery
m
$
L
.
reverse
$
L
.
sortOn
_scored_speGen
xs
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
15511563
...
...
@@ -75,9 +75,9 @@ type Grouped = Stems
type
Occs
=
Int
type
Coocs
=
Int
removeApax
::
Map
(
Label
,
Label
)
Int
->
Map
(
Label
,
Label
)
Int
removeApax
=
DMS
.
filter
(
>
1
)
type
Threshold
=
Int
removeApax
::
Threshold
->
Map
(
Label
,
Label
)
Int
->
Map
(
Label
,
Label
)
Int
removeApax
t
=
DMS
.
filter
(
>
t
)
cooc
::
[[
Terms
]]
->
Map
(
Label
,
Label
)
Int
cooc
tss
=
coocOnWithLabel
_terms_stem
(
useLabelPolicy
label_policy
)
tss
...
...
src/Gargantext/Viz/Graph/Distances/Matrice.hs
View file @
15511563
...
...
@@ -148,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
crossProduct
m
=
zipWith
(
*
)
(
cross
m
)
(
cross
(
transpose
m
))
cross
mat
=
zipWith
(
-
)
(
mkSum
n
mat
)
(
mat
)
-----------------------------------------------------------------------
-----------------------------------------------------------------------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment