Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
200
Issues
200
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
2ceb742b
Verified
Commit
2ceb742b
authored
Oct 21, 2025
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[tsv] migrate tsvv3 to separate module, separate tsv hal, tsv phylo
parent
c8185ce7
Pipeline
#7985
passed with stages
in 58 minutes and 39 seconds
Changes
13
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
645 additions
and
508 deletions
+645
-508
FilterTermsAndCooc.hs
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
+2
-1
Common.hs
bin/gargantext-cli/CLI/Phylo/Common.hs
+13
-10
gargantext.cabal
gargantext.cabal
+4
-0
Isidore.hs
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
+5
-5
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+5
-3
TSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+7
-467
Diagnostics.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
+257
-5
TSVv3.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
+63
-9
TSVv4.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
+11
-2
TsvHal.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvHal.hs
+141
-0
TsvPhylo.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvPhylo.hs
+42
-0
Utils.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV/Utils.hs
+88
-0
TSV.hs
test/Test/Core/Text/Corpus/TSV.hs
+7
-6
No files found.
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
View file @
2ceb742b
...
@@ -20,7 +20,8 @@ import Data.Tuple.Extra (both)
...
@@ -20,7 +20,8 @@ import Data.Tuple.Extra (both)
import
Data.Vector
qualified
as
DV
import
Data.Vector
qualified
as
DV
import
GHC.Generics
import
GHC.Generics
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
(
tsv_title
,
tsv_abstract
,
tsv_publication_year
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
...
...
bin/gargantext-cli/CLI/Phylo/Common.hs
View file @
2ceb742b
...
@@ -14,7 +14,8 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
...
@@ -14,7 +14,8 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
import
Gargantext.API.Ngrams.Types
import
Gargantext.API.Ngrams.Types
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers
(
FileFormat
(
..
),
FileType
(
..
),
parseFile
)
import
Gargantext.Core.Text.Corpus.Parsers
(
FileFormat
(
..
),
FileType
(
..
),
parseFile
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
tsv_publication_month
,
tsv_publication_day
,
tsv'_source
,
tsv'_title
,
tsv'_abstract
,
tsv'_publication_year
,
tsv'_publication_month
,
tsv'_publication_day
,
tsv'_weight
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
(
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
tsv_publication_month
,
tsv_publication_day
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
(
tsv'_source
,
tsv'_title
,
tsv'_abstract
,
tsv'_publication_year
,
tsv'_publication_month
,
tsv'_publication_day
,
tsv'_weight
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
qualified
as
Tsv
import
Gargantext.Core.Text.Corpus.Parsers.TSV
qualified
as
Tsv
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
...
@@ -95,15 +96,17 @@ tsvToDocs parser patterns time path =
...
@@ -95,15 +96,17 @@ tsvToDocs parser patterns time path =
,
hour
=
Def
.
hour
,
hour
=
Def
.
hour
,
minute
=
Def
.
minute
,
minute
=
Def
.
minute
,
sec
=
Def
.
second
}
,
sec
=
Def
.
second
}
Tsv'
limit
->
Vector
.
toList
Tsv'
limit
->
do
<$>
Vector
.
take
limit
(
Right
contents
)
<-
Tsv
.
readTSVFileDelim
path
Tsv
.
Tab
<$>
Vector
.
map
(
\
row
->
Document
{
date
=
toPhyloDate
(
rowToUTCTimeR
row
)
time
pure
$
Vector
.
toList
,
date'
=
toPhyloDate'
(
rowToUTCTimeR
row
)
time
$
Vector
.
take
limit
,
text
=
termsInText
patterns
$
(
tsv'_title
row
)
<>
" "
<>
(
tsv'_abstract
row
)
(
Vector
.
map
(
\
row
->
Document
{
date
=
toPhyloDate
(
rowToUTCTimeR
row
)
time
,
weight
=
Just
$
tsv'_weight
row
,
date'
=
toPhyloDate'
(
rowToUTCTimeR
row
)
time
,
sources
=
map
(
T
.
strip
.
pack
)
$
splitOn
";"
(
unpack
$
(
tsv'_source
row
))
,
text
=
termsInText
patterns
$
tsv'_title
row
<>
" "
<>
tsv'_abstract
row
,
docTime
=
time
}
,
weight
=
Just
$
tsv'_weight
row
)
<$>
snd
<$>
Tsv
.
readWeightedTsv
path
,
sources
=
map
(
T
.
strip
.
pack
)
$
splitOn
";"
(
unpack
$
tsv'_source
row
)
,
docTime
=
time
}
)
$
snd
contents
)
where
where
rowToUTCTimeR
row
=
UTCTimeR
{
year
=
tsv'_publication_year
row
rowToUTCTimeR
row
=
UTCTimeR
{
year
=
tsv'_publication_year
row
,
month
=
tsv'_publication_month
row
,
month
=
tsv'_publication_month
row
...
...
gargantext.cabal
View file @
2ceb742b
...
@@ -233,6 +233,10 @@ library
...
@@ -233,6 +233,10 @@ library
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Gargantext.Core.Text.Corpus.Parsers.TSV.Types
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.List.Formats.TSV
...
...
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
View file @
2ceb742b
...
@@ -22,11 +22,12 @@ import Data.Text qualified as Text
...
@@ -22,11 +22,12 @@ import Data.Text qualified as Text
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers
(
cleanText
)
import
Gargantext.Core.Text.Corpus.Parsers
(
cleanText
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
writeDocs2Tsv
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
writeDocs2Tsv
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
qualified
as
TSVv3
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Prelude
hiding
(
get
)
import
Gargantext.Prelude
hiding
(
get
)
import
Isidore
qualified
as
Isidore
import
Isidore
qualified
import
Isidore.Client
import
Isidore.Client
import
Servant.Client
(
ClientError
(
DecodeFailure
)
)
import
Servant.Client
(
ClientError
(
DecodeFailure
)
)
...
@@ -47,15 +48,14 @@ get lang l q a = do
...
@@ -47,15 +48,14 @@ get lang l q a = do
iDocs
<-
either
printErr
_content
<$>
Isidore
.
get
l
q
a
iDocs
<-
either
printErr
_content
<$>
Isidore
.
get
l
q
a
hDocs
<-
mapM
(
isidoreToDoc
lang
)
(
toIsidoreDocs
iDocs
)
mapM
(
isidoreToDoc
lang
)
(
toIsidoreDocs
iDocs
)
pure
hDocs
isidore2tsvFile
::
FilePath
->
Lang
->
Maybe
Isidore
.
Limit
isidore2tsvFile
::
FilePath
->
Lang
->
Maybe
Isidore
.
Limit
->
Maybe
Isidore
.
TextQuery
->
Maybe
Isidore
.
AuthorQuery
->
Maybe
Isidore
.
TextQuery
->
Maybe
Isidore
.
AuthorQuery
->
IO
()
->
IO
()
isidore2tsvFile
fp
lang
li
tq
aq
=
do
isidore2tsvFile
fp
lang
li
tq
aq
=
do
hdocs
<-
get
lang
li
tq
aq
hdocs
<-
get
lang
li
tq
aq
writeDocs2Tsv
fp
hdocs
writeDocs2Tsv
TSVv3
.
headerTsvGargV3
TSVv3
.
doc2tsv
fp
hdocs
isidoreToDoc
::
Lang
->
IsidoreDoc
->
IO
HyperdataDocument
isidoreToDoc
::
Lang
->
IsidoreDoc
->
IO
HyperdataDocument
isidoreToDoc
lang
(
IsidoreDoc
t
a
d
u
s
as
)
=
do
isidoreToDoc
lang
(
IsidoreDoc
t
a
d
u
s
as
)
=
do
...
@@ -73,7 +73,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
...
@@ -73,7 +73,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
langText
(
OnlyText
t2
)
=
t2
langText
(
OnlyText
t2
)
=
t2
langText
(
ArrayText
ts
)
=
Text
.
unwords
$
map
langText
ts
langText
(
ArrayText
ts
)
=
Text
.
unwords
$
map
langText
ts
let
mDateS
=
maybe
(
Just
$
Text
.
pack
$
show
Defaults
.
year
)
Just
d
let
mDateS
=
d
<|>
Just
(
Text
.
pack
$
show
Defaults
.
year
)
let
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
=
Date
.
mDateSplit
mDateS
let
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
=
Date
.
mDateSplit
mDateS
pure
HyperdataDocument
pure
HyperdataDocument
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
2ceb742b
...
@@ -54,6 +54,8 @@ import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
...
@@ -54,6 +54,8 @@ import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
import
Gargantext.Core.Text.Corpus.Parsers.RIS
qualified
as
RIS
import
Gargantext.Core.Text.Corpus.Parsers.RIS
qualified
as
RIS
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
parseHal
,
parseTsv
,
parseTsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
parseHal
,
parseTsv
,
parseTsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
qualified
as
TsvHal
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
qualified
as
TSVv3
import
Gargantext.Core.Text.Corpus.Parsers.Types
import
Gargantext.Core.Text.Corpus.Parsers.Types
import
Gargantext.Core.Text.Corpus.Parsers.WOS
qualified
as
WOS
import
Gargantext.Core.Text.Corpus.Parsers.WOS
qualified
as
WOS
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Core.Text.Ngrams
(
NgramsType
(
..
))
...
@@ -111,10 +113,10 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
...
@@ -111,10 +113,10 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
->
DB
.
ByteString
->
DB
.
ByteString
->
m
(
Either
DT
.
Text
(
Integer
,
ConduitT
()
(
ParseCorpusResult
HyperdataDocument
)
IO
()
))
->
m
(
Either
DT
.
Text
(
Integer
,
ConduitT
()
(
ParseCorpusResult
HyperdataDocument
)
IO
()
))
do_parse
TsvGargV3
Plain
bs
=
do
do_parse
TsvGargV3
Plain
bs
=
do
let
eParsedC
=
parseTsvC
$
DBL
.
fromStrict
bs
let
eParsedC
=
parseTsvC
TSVv3
.
tsv2doc
$
DBL
.
fromStrict
bs
pure
(
second
(
transPipe
(
pure
.
runIdentity
))
<$>
eParsedC
)
pure
(
second
(
transPipe
(
pure
.
runIdentity
))
<$>
eParsedC
)
do_parse
TsvHal
Plain
bs
=
do
do_parse
TsvHal
Plain
bs
=
do
let
eParsedC
=
parseTsvC
$
DBL
.
fromStrict
bs
let
eParsedC
=
parseTsvC
TsvHal
.
tsv2doc
$
DBL
.
fromStrict
bs
pure
(
second
(
transPipe
(
pure
.
runIdentity
))
<$>
eParsedC
)
pure
(
second
(
transPipe
(
pure
.
runIdentity
))
<$>
eParsedC
)
do_parse
Istex
Plain
bs
=
do
do_parse
Istex
Plain
bs
=
do
ep
<-
liftBase
$
parseIstex
EN
$
DBL
.
fromStrict
bs
ep
<-
liftBase
$
parseIstex
EN
$
DBL
.
fromStrict
bs
...
@@ -221,7 +223,7 @@ parseFile :: FileType
...
@@ -221,7 +223,7 @@ parseFile :: FileType
->
FileFormat
->
FileFormat
->
FilePath
->
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
->
IO
(
Either
Text
[
HyperdataDocument
])
parseFile
TsvGargV3
Plain
p
=
parseTsv
p
parseFile
TsvGargV3
Plain
p
=
parseTsv
TSVv3
.
tsv2doc
p
parseFile
TsvHal
Plain
p
=
parseHal
p
parseFile
TsvHal
Plain
p
=
parseHal
p
parseFile
RisPresse
Plain
p
=
do
parseFile
RisPresse
Plain
p
=
do
docs
<-
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
docs
<-
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
...
...
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
View file @
2ceb742b
This diff is collapsed.
Click to expand it.
src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
View file @
2ceb742b
This diff is collapsed.
Click to expand it.
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
View file @
2ceb742b
{-# LANGUAGE BangPatterns #-}
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
where
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
where
import
Data.Csv
import
Data.Csv
import
Data.Text
(
pack
)
import
Data.Time.Segment
(
jour
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
import
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Prelude
import
Gargantext.Prelude
...
@@ -45,6 +57,34 @@ data TsvDoc = TsvDoc
...
@@ -45,6 +57,34 @@ data TsvDoc = TsvDoc
deriving
(
Show
)
deriving
(
Show
)
tsv2doc
::
TsvDoc
->
HyperdataDocument
tsv2doc
(
TsvDoc
{
..
})
=
HyperdataDocument
{
_hd_bdd
=
Just
"TsvHal"
,
_hd_doi
=
Nothing
,
_hd_url
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
Just
tsv_title
,
_hd_authors
=
Just
tsv_authors
,
_hd_institutes
=
Nothing
,
_hd_source
=
Just
tsv_source
,
_hd_abstract
=
Just
tsv_abstract
,
_hd_publication_date
=
Just
$
pack
.
show
$
jour
(
fromIntegral
pubYear
)
pubMonth
pubDay
,
_hd_publication_year
=
Just
pubYear
,
_hd_publication_month
=
Just
pubMonth
,
_hd_publication_day
=
Just
pubDay
,
_hd_publication_hour
=
Nothing
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Nothing
,
_hd_institutes_tree
=
Nothing
}
where
pubYear
=
fromMIntOrDec
defaultYear
tsv_publication_year
pubMonth
=
fromMaybe
defaultMonth
tsv_publication_month
pubDay
=
fromMaybe
defaultDay
tsv_publication_day
instance
FromNamedRecord
TsvDoc
where
instance
FromNamedRecord
TsvDoc
where
parseNamedRecord
r
=
do
parseNamedRecord
r
=
do
...
@@ -69,15 +109,29 @@ instance ToNamedRecord TsvDoc where
...
@@ -69,15 +109,29 @@ instance ToNamedRecord TsvDoc where
]
]
hyperdataDocument2tsvDoc
::
HyperdataDocument
->
TsvDoc
doc2tsv
::
HyperdataDocument
->
TsvDoc
hyperdataDocument2tsvDoc
h
=
TsvDoc
{
tsv_title
=
m
$
_hd_title
h
doc2tsv
h
=
TsvDoc
{
tsv_title
=
m
$
_hd_title
h
,
tsv_source
=
m
$
_hd_source
h
,
tsv_source
=
m
$
_hd_source
h
,
tsv_publication_year
=
Just
$
IntOrDec
$
mI
$
_hd_publication_year
h
,
tsv_publication_year
=
Just
$
IntOrDec
$
mI
$
_hd_publication_year
h
,
tsv_publication_month
=
Just
$
mI
$
_hd_publication_month
h
,
tsv_publication_month
=
Just
$
mI
$
_hd_publication_month
h
,
tsv_publication_day
=
Just
$
mI
$
_hd_publication_day
h
,
tsv_publication_day
=
Just
$
mI
$
_hd_publication_day
h
,
tsv_abstract
=
m
$
_hd_abstract
h
,
tsv_abstract
=
m
$
_hd_abstract
h
,
tsv_authors
=
m
$
_hd_authors
h
}
,
tsv_authors
=
m
$
_hd_authors
h
}
where
where
m
=
maybe
""
identity
m
=
maybe
""
identity
mI
=
maybe
0
identity
mI
=
maybe
0
identity
validators
::
[
ColumnValidator
]
validators
=
[
(
"Publication Day"
,
validNumber
)
,
(
"Publication Month"
,
validNumber
)
,
(
"Publication Year"
,
validNumber
)
,
(
"Authors"
,
validTextField
)
,
(
"Title"
,
validTextField
)
,
(
"Source"
,
validTextField
)
,
(
"Abstract"
,
validTextField
)
-- TODO validDateField
-- , ("Publication Date", validTextField)
]
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
View file @
2ceb742b
{-# LANGUAGE BangPatterns #-}
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
where
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
where
import
Data.Csv
import
Data.Csv
import
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
--
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Prelude
import
Gargantext.Prelude
...
...
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvHal.hs
0 → 100644
View file @
2ceb742b
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
where
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Csv
import
Data.Text
(
pack
)
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
Data.Vector
qualified
as
V
import
Gargantext.Core.Text.Corpus.Parsers.TSV.Types
(
Delimiter
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
(
readTsvLazyBS
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Prelude
------------------------------------------------------------------------
-- Hal Format
data
TsvHal
=
TsvHal
{
tsvHal_title
::
!
Text
,
tsvHal_source
::
!
Text
,
tsvHal_publication_year
::
!
Integer
,
tsvHal_publication_month
::
!
Int
,
tsvHal_publication_day
::
!
Int
,
tsvHal_abstract
::
!
Text
,
tsvHal_authors
::
!
Text
,
tsvHal_url
::
!
Text
,
tsvHal_isbn_s
::
!
Text
,
tsvHal_issue_s
::
!
Text
,
tsvHal_journalPublisher_s
::
!
Text
,
tsvHal_language_s
::
!
Text
,
tsvHal_doiId_s
::
!
Text
,
tsvHal_authId_i
::
!
Text
,
tsvHal_instStructId_i
::
!
Text
,
tsvHal_deptStructId_i
::
!
Text
,
tsvHal_labStructId_i
::
!
Text
,
tsvHal_rteamStructId_i
::
!
Text
,
tsvHal_docType_s
::
!
Text
}
deriving
(
Show
)
instance
FromNamedRecord
TsvHal
where
parseNamedRecord
r
=
do
tsvHal_title
<-
r
.:
"title"
tsvHal_source
<-
r
.:
"source"
tsvHal_publication_year
<-
r
.:
"publication_year"
tsvHal_publication_month
<-
r
.:
"publication_month"
tsvHal_publication_day
<-
r
.:
"publication_day"
tsvHal_abstract
<-
r
.:
"abstract"
tsvHal_authors
<-
r
.:
"authors"
tsvHal_url
<-
r
.:
"url"
tsvHal_isbn_s
<-
r
.:
"isbn_s"
tsvHal_issue_s
<-
r
.:
"issue_s"
tsvHal_journalPublisher_s
<-
r
.:
"journalPublisher_s"
tsvHal_language_s
<-
r
.:
"language_s"
tsvHal_doiId_s
<-
r
.:
"doiId_s"
tsvHal_authId_i
<-
r
.:
"authId_i"
tsvHal_instStructId_i
<-
r
.:
"instStructId_i"
tsvHal_deptStructId_i
<-
r
.:
"deptStructId_i"
tsvHal_labStructId_i
<-
r
.:
"labStructId_i"
tsvHal_rteamStructId_i
<-
r
.:
"rteamStructId_i"
tsvHal_docType_s
<-
r
.:
"docType_s"
pure
$
TsvHal
{
..
}
instance
ToNamedRecord
TsvHal
where
--toNamedRecord (TsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
toNamedRecord
(
TsvHal
{
..
})
=
namedRecord
[
"title"
.=
tsvHal_title
,
"source"
.=
tsvHal_source
,
"publication_year"
.=
tsvHal_publication_year
,
"publication_month"
.=
tsvHal_publication_month
,
"publication_day"
.=
tsvHal_publication_day
,
"abstract"
.=
tsvHal_abstract
,
"authors"
.=
tsvHal_authors
,
"url"
.=
tsvHal_url
,
"isbn_s"
.=
tsvHal_isbn_s
,
"issue_s"
.=
tsvHal_issue_s
,
"journalPublisher_s"
.=
tsvHal_journalPublisher_s
,
"language_s"
.=
tsvHal_language_s
,
"doiId_s"
.=
tsvHal_doiId_s
,
"authId_i"
.=
tsvHal_authId_i
,
"instStructId_i"
.=
tsvHal_instStructId_i
,
"deptStructId_i"
.=
tsvHal_deptStructId_i
,
"labStructId_i"
.=
tsvHal_labStructId_i
,
"rteamStructId_i"
.=
tsvHal_rteamStructId_i
,
"docType_s"
.=
tsvHal_docType_s
]
tsv2doc
::
TsvHal
->
HyperdataDocument
tsv2doc
(
TsvHal
{
..
})
=
HyperdataDocument
{
_hd_bdd
=
Just
"TsvHal"
,
_hd_doi
=
Just
tsvHal_doiId_s
,
_hd_url
=
Just
tsvHal_url
,
_hd_page
=
Nothing
,
_hd_title
=
Just
tsvHal_title
,
_hd_authors
=
Just
tsvHal_authors
,
_hd_institutes
=
Just
tsvHal_instStructId_i
,
_hd_source
=
Just
tsvHal_source
,
_hd_abstract
=
Just
tsvHal_abstract
,
_hd_publication_date
=
Just
$
pack
.
show
$
jour
tsvHal_publication_year
tsvHal_publication_month
tsvHal_publication_day
,
_hd_publication_year
=
Just
$
fromIntegral
tsvHal_publication_year
,
_hd_publication_month
=
Just
tsvHal_publication_month
,
_hd_publication_day
=
Just
tsvHal_publication_day
,
_hd_publication_hour
=
Nothing
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Nothing
,
_hd_institutes_tree
=
Nothing
}
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTsvHal
::
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
TsvHal
))
readTsvHal
fp
=
do
c
<-
BL
.
readFile
fp
pure
$
readTsvLazyBS
Tab
c
------------------------------------------------------------------------
parseHal
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseHal
fp
=
do
r
<-
readTsvHal
fp
pure
$
V
.
toList
.
V
.
map
tsv2doc
.
snd
<$>
r
src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvPhylo.hs
0 → 100644
View file @
2ceb742b
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
where
import
Data.Csv
import
Gargantext.Prelude
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
data
Tsv'
=
Tsv'
{
tsv'_title
::
!
Text
,
tsv'_source
::
!
Text
,
tsv'_publication_year
::
!
Int
,
tsv'_publication_month
::
!
Int
,
tsv'_publication_day
::
!
Int
,
tsv'_abstract
::
!
Text
,
tsv'_authors
::
!
Text
,
tsv'_weight
::
!
Double
}
deriving
(
Show
)
instance
FromNamedRecord
Tsv'
where
parseNamedRecord
r
=
do
tsv'_title
<-
r
.:
"title"
tsv'_source
<-
r
.:
"source"
tsv'_publication_year
<-
r
.:
"publication_year"
tsv'_publication_month
<-
r
.:
"publication_month"
tsv'_publication_day
<-
r
.:
"publication_day"
tsv'_abstract
<-
r
.:
"abstract"
tsv'_authors
<-
r
.:
"authors"
tsv'_weight
<-
r
.:
"weight"
pure
$
Tsv'
{
..
}
src/Gargantext/Core/Text/Corpus/Parsers/TSV/Utils.hs
0 → 100644
View file @
2ceb742b
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
where
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.ByteString.Lazy.Char8
qualified
as
B8L
import
Data.Csv
import
Data.Text
(
pack
)
import
Data.Vector
(
Vector
)
import
Data.Vector
qualified
as
V
import
Gargantext.Core.Text.Corpus.Parsers.TSV.Types
import
Gargantext.Prelude
parseTsv
::
FromNamedRecord
tsvDoc
=>
(
tsvDoc
->
result
)
->
FilePath
->
IO
(
Either
Text
[
result
])
parseTsv
tsv2doc
fp
=
fmap
(
V
.
toList
.
V
.
map
tsv2doc
.
snd
)
<$>
readTSVFile
fp
-- | TODO use readByteStringLazy
readTsvLazyBS
::
FromNamedRecord
tsvDoc
=>
Delimiter
->
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
tsvDoc
)
readTsvLazyBS
d
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
d
)
bs
-- | TODO use readFileLazy
readTSVFile
::
FromNamedRecord
tsvDoc
=>
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
tsvDoc
))
readTSVFile
fp
=
do
file
<-
BL
.
readFile
fp
case
detectDelimiter
file
of
Left
err
->
pure
$
Left
err
Right
del
->
pure
$
readTsvLazyBS
del
file
readTSVFileDelim
::
FromNamedRecord
tsvDoc
=>
FilePath
->
Delimiter
->
IO
(
Either
Text
(
Header
,
Vector
tsvDoc
))
readTSVFileDelim
fp
del
=
do
file
<-
BL
.
readFile
fp
pure
$
readTsvLazyBS
del
file
-- Detects delimiter based on the first line
detectDelimiter
::
BL
.
ByteString
->
Either
Text
Delimiter
detectDelimiter
input
=
case
B8L
.
lines
input
of
(
firstLine
:
_
)
->
let
candidates
=
[(
','
,
count
','
firstLine
),
(
'
\t
'
,
count
'
\t
'
firstLine
)]
in
case
fst
$
maximumBy
(
comparing
snd
)
candidates
of
'
\n
'
->
Right
Line
'
\t
'
->
Right
Tab
','
->
Right
Comma
_
->
Left
$
"Invalid delimiter detected for input tsv document."
_
->
Left
"Couldn't detect a valid delimiter for the input document."
-- Count occurrences of a character in a ByteString
count
::
Char
->
BL
.
ByteString
->
Int64
count
c
=
BL
.
count
(
fromIntegral
(
fromEnum
c
))
------------------------------------------------------------------------
writeDocs2Tsv
::
ToNamedRecord
tsvDoc
=>
Header
->
(
doc
->
tsvDoc
)
->
FilePath
->
[
doc
]
->
IO
()
writeDocs2Tsv
hdr
doc2tsv
fp
hs
=
BL
.
writeFile
fp
$
encodeByNameWith
(
tsvEncodeOptions
Tab
)
hdr
(
doc2tsv
<$>
hs
)
test/Test/Core/Text/Corpus/TSV.hs
View file @
2ceb742b
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
module
Test.Core.Text.Corpus.TSV
(
tests
)
where
module
Test.Core.Text.Corpus.TSV
(
tests
)
where
import
Gargantext.Core.Text.Corpus.Parsers.TSV
import
Gargantext.Core.Text.Corpus.Parsers.TSV
import
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
qualified
as
TSVv3
import
Test.QuickCheck
import
Test.QuickCheck
import
Test.QuickCheck.Instances
()
import
Test.QuickCheck.Instances
()
import
Data.ByteString.Lazy.UTF8
as
BLU
import
Data.ByteString.Lazy.UTF8
as
BLU
...
@@ -182,7 +183,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
...
@@ -182,7 +183,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
let
line
=
createLineFromCorpus
tsv
del
let
line
=
createLineFromCorpus
tsv
del
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
line
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
line
case
testErrorPerLine
splitLine
del
headers
1
of
case
testErrorPerLine
TSVv3
.
validators
splitLine
del
headers
1
of
Right
_
->
True
Right
_
->
True
Left
_
->
validRandomCorpus
tsv
del
)
Left
_
->
validRandomCorpus
tsv
del
)
...
@@ -194,13 +195,13 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
...
@@ -194,13 +195,13 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
testTestCorrectFile
::
Property
testTestCorrectFile
::
Property
testTestCorrectFile
=
forAll
generateFile
(
\
file
->
do
testTestCorrectFile
=
forAll
generateFile
(
\
file
->
do
let
tsv
=
createFile
file
let
tsv
=
createFile
file
case
testCorrectFile
tsv
of
case
testCorrectFile
TSVv3
.
validators
tsv
of
Right
del
->
del
==
fDelimiter
file
Right
del
->
del
==
fDelimiter
file
Left
_
->
Prelude
.
all
(
\
x
->
do
Left
_
->
Prelude
.
all
(
\
x
->
do
let
del
=
fDelimiter
file
let
del
=
fDelimiter
file
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
$
createLineFromCorpus
x
del
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
$
createLineFromCorpus
x
del
case
testErrorPerLine
splitLine
del
headers
1
of
case
testErrorPerLine
TSVv3
.
validators
splitLine
del
headers
1
of
Right
_
->
True
Right
_
->
True
Left
_
->
validRandomCorpus
x
del
)
(
allCorpus
file
))
Left
_
->
validRandomCorpus
x
del
)
(
allCorpus
file
))
...
@@ -209,13 +210,13 @@ testTestCorrectFile = forAll generateFile (\file -> do
...
@@ -209,13 +210,13 @@ testTestCorrectFile = forAll generateFile (\file -> do
testTestCorrectFileWithNewLine
::
Property
testTestCorrectFileWithNewLine
::
Property
testTestCorrectFileWithNewLine
=
forAll
generateFile
(
\
file
->
do
testTestCorrectFileWithNewLine
=
forAll
generateFile
(
\
file
->
do
let
tsv
=
createFileWithNewLine
file
let
tsv
=
createFileWithNewLine
file
case
testCorrectFile
tsv
of
case
testCorrectFile
TSVv3
.
validators
tsv
of
Right
_
->
True
Right
_
->
True
Left
_
->
Prelude
.
all
(
\
x
->
do
Left
_
->
Prelude
.
all
(
\
x
->
do
let
del
=
fDelimiter
file
let
del
=
fDelimiter
file
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
headers
=
Prelude
.
map
DT
.
pack
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
$
createLineFromCorpus
x
del
let
splitLine
=
BL
.
splitWith
(
==
delimiter
del
)
$
createLineFromCorpus
x
del
case
testErrorPerLine
splitLine
del
headers
1
of
case
testErrorPerLine
TSVv3
.
validators
splitLine
del
headers
1
of
Right
_
->
True
Right
_
->
True
Left
_
->
validRandomCorpus
x
del
)
(
allCorpus
file
))
Left
_
->
validRandomCorpus
x
del
)
(
allCorpus
file
))
...
@@ -233,7 +234,7 @@ testFindDelimiter = forAll generateFileDelimiter (\file -> do
...
@@ -233,7 +234,7 @@ testFindDelimiter = forAll generateFileDelimiter (\file -> do
testGetHeader
::
Property
testGetHeader
::
Property
testGetHeader
=
forAll
randomHeaderList
(
\
headers
->
do
testGetHeader
=
forAll
randomHeaderList
(
\
headers
->
do
let
headersLines
=
(
BL
.
intercalate
(
delimiterBS
Tab
)
$
Prelude
.
map
BLU
.
fromString
headers
)
:
[]
let
headersLines
=
(
BL
.
intercalate
(
delimiterBS
Tab
)
$
Prelude
.
map
BLU
.
fromString
headers
)
:
[]
case
getHeaders
headersLines
Tab
of
case
getHeaders
TSVv3
.
validators
headersLines
Tab
of
Right
_
->
True
Right
_
->
True
Left
_
|
not
(
"Publication Day"
`
Prelude
.
elem
`
headers
)
->
True
Left
_
|
not
(
"Publication Day"
`
Prelude
.
elem
`
headers
)
->
True
|
not
(
"Publication Month"
`
Prelude
.
elem
`
headers
)
->
True
|
not
(
"Publication Month"
`
Prelude
.
elem
`
headers
)
->
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment