Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
f939fb19
Commit
f939fb19
authored
Jun 13, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXTLINE] adding CSV format parser.
parent
8a838c7f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
50 additions
and
35 deletions
+50
-35
Pipeline.hs
src/Gargantext/Pipeline.hs
+27
-21
Metrics.hs
src/Gargantext/Text/Metrics.hs
+4
-1
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+19
-13
No files found.
src/Gargantext/Pipeline.hs
View file @
f939fb19
...
@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
...
@@ -38,6 +38,8 @@ import Gargantext.Text.Metrics
import
Gargantext.Text.Terms
(
TermType
(
Multi
,
Mono
),
extractTerms
)
import
Gargantext.Text.Terms
(
TermType
(
Multi
,
Mono
),
extractTerms
)
import
Gargantext.Text.Context
(
splitBy
,
SplitContext
(
Sentences
))
import
Gargantext.Text.Context
(
splitBy
,
SplitContext
(
Sentences
))
import
Gargantext.Text.Parsers.CSV
import
Data.Graph.Clustering.Louvain.CplusPlus
(
cLouvain
,
LouvainNode
(
..
))
import
Data.Graph.Clustering.Louvain.CplusPlus
(
cLouvain
,
LouvainNode
(
..
))
...
@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
...
@@ -51,17 +53,22 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
-}
-}
workflow
lang
path
=
do
-- | Kind of input that 'workflow' reads from its path argument: 'CSV' goes through 'readCsvOn', 'FullText' through 'readFile' followed by 'splitBy'.
data
WorkType
=
CSV
|
FullText
-- workflow :: Lang (EN|FR) -> FilePath -> Graph
workflow
termsLang
workType
path
=
do
-- Text <- IO Text <- FilePath
-- Text <- IO Text <- FilePath
text
<-
readFile
path
contexts
<-
case
workType
of
FullText
->
splitBy
(
Sentences
5
)
<$>
readFile
path
CSV
->
readCsvOn
[
csv_title
,
csv_abstract
]
path
let
contexts
=
splitBy
(
Sentences
5
)
text
-- Context :: Text -> [Text]
-- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n
-- Contexts = Paragraphs n | Sentences n | Chars n
myterms
<-
extractTerms
(
Mono
lang
)
contexts
myterms
<-
extractTerms
(
Mono
FR
)
contexts
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList)
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList)
-- # groupBy (Stem|GroupList
|Ontology
)
printDebug
"myterms"
(
sum
$
map
length
myterms
)
printDebug
"myterms"
(
sum
$
map
length
myterms
)
-- Building the map list
-- Building the map list
...
@@ -75,11 +82,11 @@ workflow lang path = do
...
@@ -75,11 +82,11 @@ workflow lang path = do
printDebug
"myCooc2"
(
M
.
size
myCooc2
)
printDebug
"myCooc2"
(
M
.
size
myCooc2
)
-- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
-- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
let
myCooc3
=
filterCooc
(
FilterConfig
(
MapListSize
2
0
)
let
myCooc3
=
filterCooc
(
FilterConfig
(
MapListSize
100
0
)
(
InclusionSize
1
000
)
(
InclusionSize
4
000
)
(
SampleBins
10
)
(
SampleBins
10
)
(
Clusters
3
)
(
Clusters
3
)
(
DefaultValue
(
-
1
)
)
(
DefaultValue
0
)
)
myCooc2
)
myCooc2
printDebug
"myCooc3"
$
M
.
size
myCooc3
printDebug
"myCooc3"
$
M
.
size
myCooc3
...
@@ -90,26 +97,25 @@ workflow lang path = do
...
@@ -90,26 +97,25 @@ workflow lang path = do
let
myCooc4
=
toIndex
ti
myCooc3
let
myCooc4
=
toIndex
ti
myCooc3
printDebug
"myCooc4"
$
M
.
size
myCooc4
printDebug
"myCooc4"
$
M
.
size
myCooc4
let
matCooc
=
map2mat
(
-
2
)
(
M
.
size
ti
)
myCooc4
let
matCooc
=
map2mat
(
0
)
(
M
.
size
ti
)
myCooc4
printDebug
"matCooc"
matCooc
--printDebug "matCooc" matCooc
pure
matCooc
-- Matrix -> Clustering
-- Matrix -> Clustering
--
let distanceMat = conditional matCooc
let
distanceMat
=
conditional
matCooc
-- let distanceMat = distributional matCooc
-- let distanceMat = distributional matCooc
--
printDebug "distanceMat" $ A.arrayShape distanceMat
printDebug
"distanceMat"
$
A
.
arrayShape
distanceMat
--
printDebug "distanceMat" distanceMat
--
printDebug "distanceMat" distanceMat
--
--
--
let distanceMap = mat2map distanceMat
let
distanceMap
=
mat2map
distanceMat
--
printDebug "distanceMap" $ M.size distanceMap
printDebug
"distanceMap"
$
M
.
size
distanceMap
--{-
--{-
-- let distance = fromIndex fi distanceMap
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
-- printDebug "distance" $ M.size distance
---}
---}
--
partitions <- cLouvain distanceMap
partitions
<-
cLouvain
distanceMap
------ | Building : -> Graph -> JSON
------ | Building : -> Graph -> JSON
--
printDebug "partitions" $ length partitions
printDebug
"partitions"
$
length
partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap
partitions
--printDebug "partitions"
partitions
pure
$
data2graph
(
M
.
toList
ti
)
myCooc4
distanceMap
partitions
-----------------------------------------------------------
-----------------------------------------------------------
...
...
src/Gargantext/Text/Metrics.hs
View file @
f939fb19
...
@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
...
@@ -82,7 +82,10 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
dv
identity
$
M
.
lookup
k
m
)
m'
)
foldl'
(
\
m'
k
->
M
.
insert
k
(
maybe
dv
identity
$
M
.
lookup
k
m
)
m'
)
M
.
empty
selection
M
.
empty
selection
where
where
selection
=
[(
x
,
y
)
|
x
<-
ts
,
y
<-
ts
,
x
>
y
]
selection
=
[(
x
,
y
)
|
x
<-
ts
,
y
<-
ts
-- , x >= y
]
-- | Map list creation
-- | Map list creation
...
...
src/Gargantext/Text/Parsers/CSV.hs
View file @
f939fb19
...
@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
...
@@ -12,7 +12,6 @@ CSV parser for Gargantext corpus files.
-}
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveGeneric #-}
module
Gargantext.Text.Parsers.CSV
where
module
Gargantext.Text.Parsers.CSV
where
...
@@ -25,7 +24,7 @@ import Control.Applicative
...
@@ -25,7 +24,7 @@ import Control.Applicative
import
Data.Char
(
ord
)
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
qualified
Data.ByteString.Lazy
as
BL
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Vector
(
Vector
)
import
Data.Vector
(
Vector
)
...
@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
...
@@ -68,9 +67,8 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc
::
Mean
->
SplitContext
->
CsvDoc
->
Vector
CsvDoc
splitDoc
::
Mean
->
SplitContext
->
CsvDoc
->
Vector
CsvDoc
splitDoc
m
splt
doc
=
let
docSize
=
(
length
$
c_abstract
doc
)
in
splitDoc
m
splt
doc
=
let
docSize
=
(
length
$
c
sv
_abstract
doc
)
in
if
docSize
>
1000
if
docSize
>
1000
then
then
if
(
mod
(
round
m
)
docSize
)
>=
10
if
(
mod
(
round
m
)
docSize
)
>=
10
...
@@ -101,18 +99,18 @@ type Mean = Double
...
@@ -101,18 +99,18 @@ type Mean = Double
docsSize
::
Vector
CsvDoc
->
Mean
docsSize
::
Vector
CsvDoc
->
Mean
docsSize
csvDoc
=
mean
ls
docsSize
csvDoc
=
mean
ls
where
where
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
length
.
c_abstract
)
csvDoc
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
length
.
c
sv
_abstract
)
csvDoc
---------------------------------------------------------------
---------------------------------------------------------------
data
CsvDoc
=
CsvDoc
data
CsvDoc
=
CsvDoc
{
c_title
::
!
Text
{
c
sv
_title
::
!
Text
,
c_source
::
!
Text
,
c
sv
_source
::
!
Text
,
c_publication_year
::
!
Int
,
c
sv
_publication_year
::
!
Int
,
c_publication_month
::
!
Int
,
c
sv
_publication_month
::
!
Int
,
c_publication_day
::
!
Int
,
c
sv
_publication_day
::
!
Int
,
c_abstract
::
!
Text
,
c
sv
_abstract
::
!
Text
,
c_authors
::
!
Text
,
c
sv
_authors
::
!
Text
}
}
deriving
(
Show
)
deriving
(
Show
)
...
@@ -148,6 +146,14 @@ csvEncodeOptions = ( defaultEncodeOptions
...
@@ -148,6 +146,14 @@ csvEncodeOptions = ( defaultEncodeOptions
)
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Read the CSV file at @fp@ with 'readCsv' and build one 'Text' per parsed
-- document: every accessor in @fields@ is applied to the document, in list
-- order, and the resulting texts are joined with a single space.
--
-- The header returned by 'readCsv' is discarded.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = do
  parsed <- readCsv fp
  -- Only the document vector (second component) is needed here.
  let docs = snd parsed
  pure (V.toList (V.map joinFields docs))
  where
    -- Concatenate the selected fields of one document, space-separated.
    joinFields doc = intercalate (pack " ") [field doc | field <- fields]
------------------------------------------------------------------------
readCsv
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readCsv
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readCsv
fp
=
do
readCsv
fp
=
do
csvData
<-
BL
.
readFile
fp
csvData
<-
BL
.
readFile
fp
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment