Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
e079af35
Unverified
Commit
e079af35
authored
Jun 11, 2018
by
Nicolas Pouillard
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Some progress on workflow, the data2graph fails because of unknown nodes, see the TODO
parent
b56988d8
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
106 additions
and
36 deletions
+106
-36
package.yaml
package.yaml
+31
-17
Pipeline.hs
src/Gargantext/Pipeline.hs
+51
-15
Prelude.hs
src/Gargantext/Prelude.hs
+2
-1
Metrics.hs
src/Gargantext/Text/Metrics.hs
+9
-2
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+13
-1
No files found.
package.yaml
View file @
e079af35
...
...
@@ -24,6 +24,7 @@ library:
# - -Werror
exposed-modules
:
-
Gargantext
-
Gargantext.Pipeline
-
Gargantext.Prelude
-
Gargantext.Core
-
Gargantext.Core.Types
...
...
@@ -122,25 +123,38 @@ library:
-
zlib
# - utc
executable
:
main
:
Main.hs
source-dirs
:
app
ghc-options
:
-
-threaded
-
-rtsopts
-
-with-rtsopts=-N
-
-O2
-
-Wmissing-signatures
dependencies
:
executables
:
gargantext
:
main
:
Main.hs
source-dirs
:
app
ghc-options
:
-
-threaded
-
-rtsopts
-
-with-rtsopts=-N
-
-O2
-
-Wmissing-signatures
dependencies
:
-
base
-
containers
-
gargantext
-
vector
-
cassava
-
ini
-
optparse-generic
-
unordered-containers
-
full-text-search
gargantext-workflow
:
main
:
Main.hs
source-dirs
:
app-workflow
ghc-options
:
-
-threaded
-
-rtsopts
-
-with-rtsopts=-N
-
-O2
-
-Wmissing-signatures
dependencies
:
-
base
-
containers
-
gargantext
-
vector
-
cassava
-
ini
-
optparse-generic
-
unordered-containers
-
full-text-search
tests
:
garg-test
:
...
...
src/Gargantext/Pipeline.hs
View file @
e079af35
...
...
@@ -14,27 +14,31 @@ Portability : POSIX
module
Gargantext.Pipeline
where
import
qualified
Data.Text
as
T
import
Data.Text.IO
(
readFile
)
import
Control.Arrow
((
***
))
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Array.Accelerate
as
A
import
qualified
Data.Map.Strict
as
M
import
qualified
Data.List
as
L
import
Data.Tuple.Extra
(
both
)
----------------------------------------------
import
Gargantext.Core
(
Lang
(
FR
))
import
Gargantext.Core.Types
(
Label
)
import
Gargantext.Prelude
import
Prelude
(
print
,
seq
)
import
Gargantext.Viz.Graph.Index
(
score
,
createIndices
,
toIndex
,
fromIndex
,
cooc2mat
,
mat2map
)
import
Gargantext.Viz.Graph.Index
(
score
,
createIndices
,
toIndex
,
fromIndex
,
cooc2mat
,
ma
p2mat
,
ma
t2map
)
import
Gargantext.Viz.Graph.Distances.Matrice
(
conditional'
,
conditional
)
import
Gargantext.Viz.Graph.Index
(
Index
)
import
Gargantext.Viz.Graph
(
Graph
)
import
Gargantext.Viz.Graph
(
Graph
(
..
),
Node
(
..
),
Edge
(
..
),
Attributes
(
..
),
TypeNode
(
..
)
)
import
Gargantext.Text.Metrics.Count
(
cooc
,
removeApax
)
import
Gargantext.Text.Metrics
import
Gargantext.Text.Terms
(
TermType
(
Multi
,
Mono
),
extractTerms
)
import
Gargantext.Text.Context
(
splitBy
,
SplitContext
(
Sentences
))
import
Data.Graph.Clustering.Louvain.CplusPlus
(
cLouvain
,
LouvainNode
)
import
Data.Graph.Clustering.Louvain.CplusPlus
(
cLouvain
,
LouvainNode
(
..
)
)
{-
...
...
@@ -48,30 +52,62 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode)
-}
-----------------------------------------------------------
data2graph
::
Map
(
Int
,
Int
)
Int
->
Map
(
Int
,
Int
)
Double
->
[
LouvainNode
]
->
Graph
data2graph
=
undefined
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph
::
[(
Label
,
Int
)]
->
Map
(
Int
,
Int
)
Int
->
Map
(
Int
,
Int
)
Double
->
[
LouvainNode
]
->
Graph
data2graph
labels
coocs
distance
partitions
=
Graph
nodes
edges
where
community_id_by_node_id
=
M
.
fromList
[
(
n
,
c
)
|
LouvainNode
n
c
<-
partitions
]
nodes
=
[
Node
{
n_size
=
coocs
M
.!
(
n
,
n
)
-- TODO lookup with default ?
,
n_type
=
Terms
-- or Unknown
,
n_id
=
cs
(
show
n
)
,
n_label
=
T
.
unwords
l
,
n_attributes
=
-- TODO lookup with default ?
Attributes
{
clust_default
=
community_id_by_node_id
M
.!
n
}
}
|
(
l
,
n
)
<-
labels
]
edges
=
[
Edge
{
e_source
=
s
,
e_target
=
t
,
e_weight
=
w
,
e_id
=
i
}
|
(
i
,
((
s
,
t
),
w
))
<-
zip
[
0
..
]
(
M
.
toList
distance
)
]
-----------------------------------------------------------
-- printDebug msg x = putStrLn $ msg <> " " <> show x
printDebug
_
_
=
pure
()
workflow
lang
path
=
do
-- Text <- IO Text <- FilePath
text
<-
readFile
path
let
contexts
=
splitBy
(
Sentences
5
)
text
myterms
<-
extractTerms
Multi
lang
contexts
myterms
<-
extractTerms
Mono
lang
contexts
printDebug
"myterms"
$
sum
$
map
length
myterms
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
-- @np FIXME optimization issue of filterCooc (too much memory consumed)
let
myCooc
=
filterCooc
$
removeApax
$
cooc
myterms
--pure myCooc
let
myCooc1
=
cooc
myterms
printDebug
"myCooc1"
$
M
.
size
myCooc1
let
myCooc2
=
removeApax
myCooc1
printDebug
"myCooc2"
$
M
.
size
myCooc2
let
myCooc3
=
filterCooc
myCooc2
printDebug
"myCooc3"
$
M
.
size
myCooc3
-- Cooc -> Matrix
let
(
ti
,
_
)
=
createIndices
myCooc
let
(
ti
,
fi
)
=
createIndices
myCooc3
printDebug
"ti"
$
M
.
size
ti
let
myCooc4
=
toIndex
ti
myCooc3
printDebug
"myCooc4"
$
M
.
size
myCooc4
let
matCooc
=
map2mat
0
(
M
.
size
ti
)
myCooc4
-- Matrix -> Clustering
let
distance
=
score
conditional
$
toIndex
ti
myCooc
partitions
<-
cLouvain
distance
--pure partitions
let
distanceMat
=
conditional
matCooc
printDebug
"distanceMat"
$
A
.
arrayShape
distanceMat
let
distanceMap
=
mat2map
distanceMat
printDebug
"distanceMap"
$
M
.
size
distanceMap
{-
let distance = fromIndex fi distanceMap
printDebug "distance" $ M.size distance
-}
partitions
<-
cLouvain
distanceMap
---- | Building : -> Graph -> JSON
p
ure
partitions
--pure $ data2graph myCooc distance
partitions
p
rintDebug
"partitions"
$
length
partitions
pure
$
data2graph
(
M
.
toList
ti
)
myCooc4
distanceMap
partitions
src/Gargantext/Prelude.hs
View file @
e079af35
...
...
@@ -50,6 +50,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
,
undefined
,
IO
()
,
compare
,
on
)
-- TODO import functions optimized in Utils.Count
...
...
@@ -235,5 +236,5 @@ unMaybe :: [Maybe a] -> [a]
unMaybe
=
map
fromJust
.
L
.
filter
isJust
-- maximumWith
maximumWith
f
=
L
.
maximumBy
(
\
x
y
->
compare
(
f
x
)
(
f
y
)
)
maximumWith
f
=
L
.
maximumBy
(
compare
`
on
`
f
)
src/Gargantext/Text/Metrics.hs
View file @
e079af35
...
...
@@ -51,16 +51,22 @@ import Gargantext.Viz.Graph.Index
import
qualified
Data.Array.Accelerate.Interpreter
as
DAA
import
qualified
Data.Array.Accelerate
as
DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))
import
GHC.Real
(
round
)
import
Debug.Trace
import
Prelude
(
seq
)
filterCooc
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc
cc
=
filterCooc'
ts
cc
where
ts
=
map
_scored_terms
$
takeSome
350
5
2
$
coocScored
cc
filterCooc'
::
Ord
t
=>
[
t
]
->
Map
(
t
,
t
)
Int
->
Map
(
t
,
t
)
Int
filterCooc'
ts
m
=
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
errMessage
identity
$
M
.
lookup
k
m
)
m'
)
M
.
empty
selection
filterCooc'
ts
m
=
-- trace ("coocScored " <> show (length ts)) $
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
errMessage
identity
$
M
.
lookup
k
m
)
m'
)
M
.
empty
selection
where
errMessage
=
panic
"Filter cooc: no key"
selection
=
[(
x
,
y
)
|
x
<-
ts
,
y
<-
ts
,
x
>
y
]
...
...
@@ -87,7 +93,8 @@ takeSome l s k scores = L.take l
euclidSq
x
xs
n
=
round
((
fromIntegral
l
)
/
s
)
m
=
round
$
(
fromIntegral
$
length
scores
)
/
(
s
)
takeSample
n
m
xs
=
L
.
concat
$
map
(
L
.
take
n
)
takeSample
n
m
xs
=
-- trace ("splitKmeans " <> show (length xs)) $
L
.
concat
$
map
(
L
.
take
n
)
$
L
.
reverse
$
map
(
L
.
sortOn
_scored_incExc
)
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
e079af35
...
...
@@ -80,9 +80,10 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax
=
DMS
.
filter
(
>
1
)
cooc
::
[[
Terms
]]
->
Map
(
Label
,
Label
)
Int
cooc
tss
=
coocOnWithLabel
_terms_stem
(
labelPolicy
terms_occs
)
tss
cooc
tss
=
coocOnWithLabel
_terms_stem
(
useLabelPolicy
label_policy
)
tss
where
terms_occs
=
occurrencesOn
_terms_stem
(
List
.
concat
tss
)
label_policy
=
mkLabelPolicy
terms_occs
coocOnWithLabel
::
(
Ord
label
,
Ord
b
)
=>
(
a
->
b
)
->
(
b
->
label
)
...
...
@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
delta
f
=
f
***
f
mkLabelPolicy
::
Map
Grouped
(
Map
Terms
Occs
)
->
Map
Grouped
Label
mkLabelPolicy
=
DMS
.
map
f
where
f
=
_terms_label
.
fst
.
maximumWith
snd
.
DMS
.
toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy
::
Map
Grouped
Label
->
Grouped
->
Label
useLabelPolicy
m
g
=
case
DMS
.
lookup
g
m
of
Just
label
->
label
Nothing
->
panic
$
"Label of Grouped not found: "
<>
(
pack
$
show
g
)
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}
coocOn
::
Ord
b
=>
(
a
->
b
)
->
[[
a
]]
->
Map
(
b
,
b
)
Coocs
coocOn
f
as
=
foldl'
(
\
a
b
->
DMS
.
unionWith
(
+
)
a
b
)
empty
$
map
(
coocOn'
f
)
as
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment