Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
0b0ee22a
Commit
0b0ee22a
authored
Apr 12, 2021
by
qlobbe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add weighted csv
parent
3e6c4d4a
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
125 additions
and
48 deletions
+125
-48
Main.hs
bin/gargantext-adaptative-phylo/Main.hs
+42
-38
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+31
-0
FrequentItemSet.hs
src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
+25
-0
AdaptativePhylo.hs
src/Gargantext/Core/Viz/AdaptativePhylo.hs
+6
-3
PhyloExample.hs
src/Gargantext/Core/Viz/Phylo/PhyloExample.hs
+4
-2
PhyloExport.hs
src/Gargantext/Core/Viz/Phylo/PhyloExport.hs
+1
-0
PhyloMaker.hs
src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
+11
-4
SynchronicClustering.hs
src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
+5
-1
No files found.
bin/gargantext-adaptative-phylo/Main.hs
View file @
0b0ee22a
...
...
@@ -25,7 +25,7 @@ import Crypto.Hash.SHA256 (hash)
import
Gargantext.Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
csv_title
,
csv_abstract
,
csv_publication_year
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
csv_title
,
csv_abstract
,
csv_publication_year
,
csv_w_title
,
csv_w_abstract
,
csv_w_publication_year
,
csv_w_weight
)
import
Gargantext.Core.Text.Corpus.Parsers
(
FileFormat
(
..
),
parseFile
)
import
Gargantext.Core.Text.List.Formats.CSV
(
csvMapTermList
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
...
...
@@ -46,10 +46,9 @@ import qualified Data.Vector as Vector
import
qualified
Gargantext.Core.Text.Corpus.Parsers.CSV
as
Csv
import
qualified
Data.Text
as
T
-- import Debug.Trace (trace)
data
PhyloStage
=
PhyloWithCliques
|
PhyloWithLinks
deriving
(
Show
)
data
PhyloStage
=
PhyloWithCliques
|
PhyloWithLinks
deriving
(
Show
)
---------------
-- | Tools | --
...
...
@@ -63,7 +62,6 @@ getFilesFromPath path = do
then
(
listDirectory
path
)
else
return
[
path
]
--------------
-- | Json | --
--------------
...
...
@@ -79,18 +77,13 @@ readJson path = Lazy.readFile path
----------------
-- | To filter the Ngrams of a document based on the termList
filterTerms
::
Patterns
->
(
a
,
Text
)
->
(
a
,
[
Text
])
filterTerms
patterns
(
y
,
d
)
=
(
y
,
termsInText
patterns
d
)
where
--------------------------------------
termsInText
::
Patterns
->
Text
->
[
Text
]
termsInText
pats
txt
=
nub
$
concat
$
map
(
map
unwords
)
$
extractTermsWithList
pats
txt
--------------------------------------
-- | To transform a Wos file (or [file]) into a readable corpus
wosToCorpus
::
Int
->
FilePath
->
IO
([(
Int
,
Text
)])
wosToCorpus
limit
path
=
do
termsInText
::
Patterns
->
Text
->
[
Text
]
termsInText
pats
txt
=
nub
$
concat
$
map
(
map
unwords
)
$
extractTermsWithList
pats
txt
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs
::
Int
->
Patterns
->
FilePath
->
IO
([
Document
])
wosToDocs
limit
patterns
path
=
do
files
<-
getFilesFromPath
path
take
limit
<$>
map
(
\
d
->
let
date'
=
fromJust
$
_hd_publication_year
d
...
...
@@ -98,7 +91,7 @@ wosToCorpus limit path = do
abstr
=
if
(
isJust
$
_hd_abstract
d
)
then
fromJust
$
_hd_abstract
d
else
""
in
(
date'
,
title
<>
" "
<>
abstr
)
)
in
Document
date'
(
termsInText
patterns
$
title
<>
" "
<>
abstr
)
Nothing
)
<$>
concat
<$>
mapConcurrently
(
\
file
->
filter
(
\
d
->
(
isJust
$
_hd_publication_year
d
)
...
...
@@ -106,27 +99,38 @@ wosToCorpus limit path = do
<$>
parseFile
WOS
(
path
<>
file
)
)
files
-- | To transform a Csv file into a readable corpus
csvToCorpus
::
Int
->
FilePath
->
IO
([(
Int
,
Text
)])
csvToCorpus
limit
path
=
Vector
.
toList
<$>
Vector
.
take
limit
<$>
Vector
.
map
(
\
row
->
(
csv_publication_year
row
,
(
csv_title
row
)
<>
" "
<>
(
csv_abstract
row
)))
<$>
snd
<$>
Csv
.
readFile
path
-- | To use the correct parser given a CorpusType
fileToCorpus
::
CorpusParser
->
FilePath
->
IO
([(
Int
,
Text
)])
fileToCorpus
parser
path
=
case
parser
of
Wos
limit
->
wosToCorpus
limit
path
Csv
limit
->
csvToCorpus
limit
path
-- To transform a Csv file into a list of Document
csvToDocs
::
CorpusParser
->
Patterns
->
FilePath
->
IO
([
Document
])
csvToDocs
parser
patterns
path
=
case
parser
of
Wos
_
->
undefined
Csv
limit
->
Vector
.
toList
<$>
Vector
.
take
limit
<$>
Vector
.
map
(
\
row
->
Document
(
csv_publication_year
row
)
(
termsInText
patterns
$
(
csv_title
row
)
<>
" "
<>
(
csv_abstract
row
))
Nothing
)
<$>
snd
<$>
Csv
.
readFile
path
CsvWeighted
limit
->
Vector
.
toList
<$>
Vector
.
take
limit
<$>
Vector
.
map
(
\
row
->
Document
(
csv_w_publication_year
row
)
(
termsInText
patterns
$
(
csv_w_title
row
)
<>
" "
<>
(
csv_w_abstract
row
))
(
Just
$
csv_w_weight
row
)
)
<$>
snd
<$>
Csv
.
readWeightedCsv
path
-- To parse a file into a list of Document
fileToDocs'
::
CorpusParser
->
FilePath
->
TermList
->
IO
[
Document
]
fileToDocs'
parser
path
lst
=
do
let
patterns
=
buildPatterns
lst
case
parser
of
Wos
limit
->
wosToDocs
limit
patterns
path
Csv
_
->
csvToDocs
parser
patterns
path
CsvWeighted
_
->
csvToDocs
parser
patterns
path
-- | To parse a file into a list of Document
fileToDocs
::
CorpusParser
->
FilePath
->
TermList
->
IO
[
Document
]
fileToDocs
parser
path
lst
=
do
corpus
<-
fileToCorpus
parser
path
let
patterns
=
buildPatterns
lst
pure
$
map
(
(
\
(
y
,
t
)
->
Document
y
t
)
.
filterTerms
patterns
)
corpus
---------------
-- | Label | --
---------------
-- Config time parameters to label
...
...
@@ -235,7 +239,7 @@ main = do
printIOMsg
"Parse the corpus"
mapList
<-
csvMapTermList
(
listPath
config
)
corpus
<-
fileToDocs
(
corpusParser
config
)
(
corpusPath
config
)
mapList
corpus
<-
fileToDocs
'
(
corpusParser
config
)
(
corpusPath
config
)
mapList
printIOComment
(
show
(
length
corpus
)
<>
" parsed docs from the corpus"
)
printIOMsg
"Reconstruct the phylo"
...
...
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
0b0ee22a
...
...
@@ -395,3 +395,34 @@ parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
parseCsv'
::
BL
.
ByteString
->
[
HyperdataDocument
]
parseCsv'
bs
=
V
.
toList
$
V
.
map
csv2doc
$
snd
$
readCsvLazyBS
bs
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
data
WeightedCsv
=
WeightedCsv
{
csv_w_title
::
!
Text
,
csv_w_source
::
!
Text
,
csv_w_publication_year
::
!
Int
,
csv_w_publication_month
::
!
Int
,
csv_w_publication_day
::
!
Int
,
csv_w_abstract
::
!
Text
,
csv_w_authors
::
!
Text
,
csv_w_weight
::
!
Double
}
deriving
(
Show
)
instance
FromNamedRecord
WeightedCsv
where
parseNamedRecord
r
=
WeightedCsv
<$>
r
.:
"title"
<*>
r
.:
"source"
<*>
r
.:
"publication_year"
<*>
r
.:
"publication_month"
<*>
r
.:
"publication_day"
<*>
r
.:
"abstract"
<*>
r
.:
"authors"
<*>
r
.:
"weight"
readWeightedCsv
::
FilePath
->
IO
(
Header
,
Vector
WeightedCsv
)
readWeightedCsv
fp
=
fmap
(
\
bs
->
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Right
corpus
->
corpus
)
$
BL
.
readFile
fp
\ No newline at end of file
src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
View file @
0b0ee22a
...
...
@@ -21,6 +21,7 @@ module Gargantext.Core.Text.Metrics.FrequentItemSet
,
fisWithSizePoly
,
fisWithSizePoly2
,
fisWithSizePolyMap
,
fisWithSizePolyMap'
,
module
HLCM
)
where
...
...
@@ -35,6 +36,8 @@ import qualified Data.Map.Strict as Map
import
qualified
Data.Set
as
Set
import
qualified
Data.Vector
as
V
import
Control.Monad
(
sequence
)
data
Size
=
Point
Int
|
Segment
Int
Int
------------------------------------------------------------------------
...
...
@@ -120,6 +123,28 @@ fisWithSizePolyMap n f is =
------------------------------------------------------------------------
------------------------------------------------------------------------
---- Weighted [[Item]]
isSublistOf
::
Ord
a
=>
[
a
]
->
[
a
]
->
Bool
isSublistOf
sub
lst
=
all
(
\
i
->
elem
i
lst
)
sub
reIndexFis
::
Ord
a
=>
[([
a
],
b
)]
->
[
Fis'
a
]
->
[(
Fis'
a
,[
b
])]
reIndexFis
items
fis
=
map
(
\
f
->
let
docs
=
filter
(
\
(
lst
,
_
)
->
isSublistOf
(
_fisItemSet
f
)
lst
)
items
in
(
f
,
map
snd
docs
))
fis
wsum
::
[
Maybe
Double
]
->
Maybe
Double
wsum
lst
=
fmap
sum
$
sequence
lst
fisWithSizePolyMap'
::
Ord
a
=>
Size
->
Frequency
->
[([
a
],
Maybe
Double
)]
->
Map
(
Set
a
)
(
Int
,
Maybe
Double
)
fisWithSizePolyMap'
n
f
is
=
Map
.
fromList
$
map
(
\
(
fis
,
ws
)
->
(
Set
.
fromList
(
_fisItemSet
fis
),(
_fisCount
fis
,(
wsum
ws
))))
$
reIndexFis
is
$
fisWithSizePoly2
n
f
(
map
fst
is
)
------------------------------------------------------------------------
------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
...
...
src/Gargantext/Core/Viz/AdaptativePhylo.hs
View file @
0b0ee22a
...
...
@@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TextLazy
data
CorpusParser
=
Wos
{
_wos_limit
::
Int
}
|
Csv
{
_csv_limit
::
Int
}
|
CsvWeighted
{
_csvw_limit
::
Int
}
deriving
(
Show
,
Generic
,
Eq
)
data
SeaElevation
=
...
...
@@ -231,7 +232,6 @@ defaultPhyloParam =
-- | Document | --
------------------
-- | Date : a simple Integer
type
Date
=
Int
...
...
@@ -240,8 +240,9 @@ type Ngrams = Text
-- | Document : a piece of Text linked to a Date
data
Document
=
Document
{
date
::
Date
,
text
::
[
Ngrams
]
{
date
::
Date
,
text
::
[
Ngrams
]
,
weight
::
Maybe
Double
}
deriving
(
Eq
,
Show
,
Generic
,
NFData
)
...
...
@@ -335,6 +336,7 @@ data PhyloGroup =
,
_phylo_groupIndex
::
Int
,
_phylo_groupLabel
::
Text
,
_phylo_groupSupport
::
Support
,
_phylo_groupWeight
::
Maybe
Double
,
_phylo_groupNgrams
::
[
Int
]
,
_phylo_groupCooc
::
!
(
Cooc
)
,
_phylo_groupBranchId
::
PhyloBranchId
...
...
@@ -368,6 +370,7 @@ data PhyloClique = PhyloClique
{
_phyloClique_nodes
::
[
Int
]
,
_phyloClique_support
::
Support
,
_phyloClique_period
::
(
Date
,
Date
)
,
_phyloClique_weight
::
Maybe
Double
}
deriving
(
Generic
,
NFData
,
Show
,
Eq
)
----------------
...
...
src/Gargantext/Core/Viz/Phylo/PhyloExample.hs
View file @
0b0ee22a
...
...
@@ -109,8 +109,10 @@ config =
docs
::
[
Document
]
docs
=
map
(
\
(
d
,
t
)
->
Document
d
(
filter
(
\
n
->
isRoots
n
(
foundations
^.
foundations_roots
))
$
monoTexts
t
))
corpus
->
Document
d
(
filter
(
\
n
->
isRoots
n
(
foundations
^.
foundations_roots
))
$
monoTexts
t
)
Nothing
)
corpus
foundations
::
PhyloFoundations
...
...
src/Gargantext/Core/Viz/Phylo/PhyloExport.hs
View file @
0b0ee22a
...
...
@@ -139,6 +139,7 @@ groupToDotNode fdt g bId =
,
toAttr
"branchId"
(
pack
$
unwords
(
init
$
map
show
$
snd
$
g
^.
phylo_groupBranchId
))
,
toAttr
"bId"
(
pack
$
show
bId
)
,
toAttr
"support"
(
pack
$
show
(
g
^.
phylo_groupSupport
))
,
toAttr
"weight"
(
pack
$
show
(
g
^.
phylo_groupWeight
))
,
toAttr
"lbl"
(
pack
$
show
(
ngramsToLabel
fdt
(
g
^.
phylo_groupNgrams
)))
,
toAttr
"foundation"
(
pack
$
show
(
idxToLabel
(
g
^.
phylo_groupNgrams
)))
,
toAttr
"role"
(
pack
$
show
(
idxToLabel'
((
g
^.
phylo_groupMeta
)
!
"dynamics"
)))
...
...
src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
View file @
0b0ee22a
...
...
@@ -21,7 +21,7 @@ import Gargantext.Core.Viz.Phylo.PhyloTools
import
Gargantext.Core.Viz.Phylo.TemporalMatching
(
adaptativeTemporalMatching
,
constanteTemporalMatching
,
getNextPeriods
,
filterDocs
,
filterDiago
,
reduceDiagos
,
toProximity
)
import
Gargantext.Core.Viz.Phylo.SynchronicClustering
(
synchronicClustering
)
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Metrics.FrequentItemSet
(
fisWithSizePolyMap
,
Size
(
..
))
import
Gargantext.Core.Text.Metrics.FrequentItemSet
(
fisWithSizePolyMap
,
fisWithSizePolyMap'
,
Size
(
..
))
import
Gargantext.Core.Methods.Graph.MaxClique
(
getMaxCliques
)
import
Gargantext.Core.Methods.Distances
(
Distance
(
Conditional
))
import
Gargantext.Core.Viz.Phylo.PhyloExport
(
toHorizon
)
...
...
@@ -128,6 +128,7 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
cliqueToGroup
::
PhyloClique
->
PhyloPeriodId
->
Level
->
Int
->
[
Cooc
]
->
PhyloGroup
cliqueToGroup
fis
pId
lvl
idx
coocs
=
PhyloGroup
pId
lvl
idx
""
(
fis
^.
phyloClique_support
)
(
fis
^.
phyloClique_weight
)
(
fis
^.
phyloClique_nodes
)
(
ngramsToCooc
(
fis
^.
phyloClique_nodes
)
coocs
)
(
1
,[
0
])
-- branchid (lvl,[path in the branching tree])
...
...
@@ -217,8 +218,14 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
phyloClique
=
case
(
clique
$
getConfig
phylo
)
of
Fis
_
_
->
let
fis
=
map
(
\
(
prd
,
docs
)
->
let
lst
=
toList
$
fisWithSizePolyMap
(
Segment
1
20
)
1
(
map
(
\
d
->
ngramsToIdx
(
text
d
)
(
getRoots
phylo
))
docs
)
in
(
prd
,
map
(
\
f
->
PhyloClique
(
Set
.
toList
$
fst
f
)
(
snd
f
)
prd
)
lst
))
case
(
corpusParser
$
getConfig
phylo
)
of
CsvWeighted
_
->
let
lst
=
toList
$
fisWithSizePolyMap'
(
Segment
1
20
)
1
(
map
(
\
d
->
(
ngramsToIdx
(
text
d
)
(
getRoots
phylo
),
weight
d
))
docs
)
in
(
prd
,
map
(
\
f
->
PhyloClique
(
Set
.
toList
$
fst
f
)
((
fst
.
snd
)
f
)
prd
((
snd
.
snd
)
f
))
lst
)
_
->
let
lst
=
toList
$
fisWithSizePolyMap
(
Segment
1
20
)
1
(
map
(
\
d
->
ngramsToIdx
(
text
d
)
(
getRoots
phylo
))
docs
)
in
(
prd
,
map
(
\
f
->
PhyloClique
(
Set
.
toList
$
fst
f
)
(
snd
f
)
prd
Nothing
)
lst
)
)
$
toList
phyloDocs
fis'
=
fis
`
using
`
parList
rdeepseq
in
fromList
fis'
...
...
@@ -228,7 +235,7 @@ toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
$
foldl
sumCooc
empty
$
map
listToMatrix
$
map
(
\
d
->
ngramsToIdx
(
text
d
)
(
getRoots
phylo
))
docs
in
(
prd
,
map
(
\
cl
->
PhyloClique
cl
0
prd
)
$
getMaxCliques
filterType
Conditional
thr
cooc
))
in
(
prd
,
map
(
\
cl
->
PhyloClique
cl
0
prd
Nothing
)
$
getMaxCliques
filterType
Conditional
thr
cooc
))
$
toList
phyloDocs
mcl'
=
mcl
`
using
`
parList
rdeepseq
in
fromList
mcl'
...
...
src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
View file @
0b0ee22a
...
...
@@ -22,6 +22,7 @@ import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty,
import
Control.Lens
hiding
(
Level
)
import
Control.Parallel.Strategies
(
parList
,
rdeepseq
,
using
)
import
Control.Monad
(
sequence
)
-- import Debug.Trace (trace)
import
qualified
Data.Map
as
Map
...
...
@@ -36,7 +37,10 @@ mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [Phylo
mergeGroups
coocs
id
mapIds
childs
=
let
ngrams
=
(
sort
.
nub
.
concat
)
$
map
_phylo_groupNgrams
childs
in
PhyloGroup
(
fst
$
fst
id
)
(
snd
$
fst
id
)
(
snd
id
)
""
(
sum
$
map
_phylo_groupSupport
childs
)
ngrams
(
sum
$
map
_phylo_groupSupport
childs
)
(
fmap
sum
$
sequence
$
map
_phylo_groupWeight
childs
)
ngrams
(
ngramsToCooc
ngrams
coocs
)
((
snd
$
fst
id
),
bId
)
(
mergeMeta
bId
childs
)
[]
(
map
(
\
g
->
(
getGroupId
g
,
1
))
childs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment