Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
149
Issues
149
List
Board
Labels
Milestones
Merge Requests
5
Merge Requests
5
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
b0d6365f
Commit
b0d6365f
authored
Mar 31, 2025
by
Alfredo Di Napoli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Remove unused code in Parsers.TSV module
parent
622f7062
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
143 deletions
+30
-143
TSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+30
-143
No files found.
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
View file @
b0d6365f
...
...
@@ -15,10 +15,38 @@ TSV parser for Gargantext corpus files.
-}
module
Gargantext.Core.Text.Corpus.Parsers.TSV
where
module
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
Delimiter
(
..
)
,
parseHal
,
parseTsv
,
parseTsvC
,
tsvDecodeOptions
,
writeDocs2Tsv
,
readTsvHal
,
TsvHal
(
..
)
,
AtRow
(
..
)
,
ParseTsvResult
(
..
)
,
parseTsvWithDiagnostics
-- * Used in tests
,
testCorrectFile
,
testErrorPerLine
,
validTextField
,
validNumber
,
Tsv
'
(
..
)
,
TsvDoc
(
..
)
,
getHeaders
,
findDelimiter
,
delimiter
-- * Used in the main executables
,
readWeightedTsv
,
defaultDay
,
defaultYear
,
fromMIntOrDec
,
defaultMonth
)
where
import
Conduit
(
ConduitT
,
(
.|
),
yieldMany
,
mapC
)
import
Data.ByteString
qualified
as
BS
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Csv
import
Data.Text
(
pack
)
...
...
@@ -29,8 +57,6 @@ import Data.Text.Read qualified as DTR
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
Data.Vector
qualified
as
V
import
Gargantext.Core.Text
(
sentences
,
unsentences
)
import
Gargantext.Core.Text.Context
(
splitBy
,
SplitContext
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Prelude
hiding
(
length
,
show
)
import
Protolude
...
...
@@ -59,91 +85,6 @@ data TsvGargV3 = TsvGargV3
,
d_authors
::
!
Text
}
deriving
(
Show
)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc
::
TsvGargV3
->
HyperdataDocument
toDoc
(
TsvGargV3
did
dt
_
dpy
dpm
dpd
dab
dau
)
=
HyperdataDocument
{
_hd_bdd
=
Just
"TSV"
,
_hd_doi
=
Just
.
pack
.
show
$
did
,
_hd_url
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
Just
dt
,
_hd_authors
=
Nothing
,
_hd_institutes
=
Just
dau
,
_hd_source
=
Just
dab
,
_hd_abstract
=
Nothing
,
_hd_publication_date
=
Nothing
,
_hd_publication_year
=
Just
dpy
,
_hd_publication_month
=
Just
dpm
,
_hd_publication_day
=
Just
dpd
,
_hd_publication_hour
=
Nothing
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Nothing
,
_hd_institutes_tree
=
Nothing
}
---------------------------------------------------------------
-- | Types Conversions
toDocs
::
Vector
TsvDoc
->
[
TsvGargV3
]
toDocs
v
=
V
.
toList
$
V
.
zipWith
(
\
nId
(
TsvDoc
{
..
})
-- (TsvDoc t s mPy pm pd abst auth)
->
TsvGargV3
{
d_docId
=
nId
,
d_title
=
tsv_title
,
d_source
=
tsv_source
,
d_publication_year
=
fromMIntOrDec
defaultYear
tsv_publication_year
,
d_publication_month
=
fromMaybe
defaultMonth
tsv_publication_month
,
d_publication_day
=
fromMaybe
defaultDay
tsv_publication_day
,
d_abstract
=
tsv_abstract
,
d_authors
=
tsv_authors
})
(
V
.
enumFromN
1
(
V
.
length
v''
))
v''
where
v''
=
V
.
foldl
(
\
v'
sep
->
V
.
concatMap
(
splitDoc
(
docsSize
v'
)
sep
)
v'
)
v
seps
seps
=
V
.
fromList
[
Paragraphs
1
,
Sentences
3
,
Chars
3
]
---------------------------------------------------------------
fromDocs
::
Vector
TsvGargV3
->
Vector
TsvDoc
fromDocs
=
V
.
map
fromDocs'
where
fromDocs'
(
TsvGargV3
{
..
})
=
TsvDoc
{
tsv_title
=
d_title
,
tsv_source
=
d_source
,
tsv_publication_year
=
Just
$
IntOrDec
d_publication_year
,
tsv_publication_month
=
Just
d_publication_month
,
tsv_publication_day
=
Just
d_publication_day
,
tsv_abstract
=
d_abstract
,
tsv_authors
=
d_authors
}
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc
::
Mean
->
SplitContext
->
TsvDoc
->
Vector
TsvDoc
splitDoc
m
splt
doc
=
let
docSize
=
(
T
.
length
$
tsv_abstract
doc
)
in
if
(
docSize
>
1000
)
&&
(
mod
(
round
m
)
docSize
>=
10
)
then
splitDoc'
splt
doc
else
V
.
fromList
[
doc
]
where
splitDoc'
::
SplitContext
->
TsvDoc
->
Vector
TsvDoc
splitDoc'
contextSize
(
TsvDoc
{
..
})
=
V
.
fromList
$
[
firstDoc
]
<>
nextDocs
where
firstDoc
=
TsvDoc
{
tsv_abstract
=
firstAbstract
,
..
}
firstAbstract
=
head'
"splitDoc'1"
abstracts
nextDocs
=
map
(
\
txt
->
TsvDoc
{
tsv_title
=
head'
"splitDoc'2"
$
sentences
txt
,
tsv_abstract
=
unsentences
$
tail'
"splitDoc'1"
$
sentences
txt
,
..
}
)
(
tail'
"splitDoc'2"
abstracts
)
abstracts
=
(
splitBy
$
contextSize
)
tsv_abstract
---------------------------------------------------------------
---------------------------------------------------------------
type
Mean
=
Double
docsSize
::
Vector
TsvDoc
->
Mean
docsSize
tsvDoc
=
mean
ls
where
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
T
.
length
.
tsv_abstract
)
tsvDoc
---------------------------------------------------------------
newtype
IntOrDec
=
IntOrDec
Int
...
...
@@ -371,36 +312,6 @@ getHeaders bl del = do
Just
headers
->
testAllHeadersPresence
(
map
(
\
x
->
T
.
replace
(
T
.
pack
"
\"
"
)
(
T
.
pack
""
)
(
lBLToText
x
))
headers
)
------------------------------------------------------------------------
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileLazy
d
f
=
fmap
(
readByteStringLazy
d
f
)
.
BL
.
readFile
readFileStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileStrict
d
f
=
fmap
(
readByteStringStrict
d
f
)
.
BS
.
readFile
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
a
)
readByteStringLazy
_f
d
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
d
)
bs
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BS
.
ByteString
->
Either
Text
(
Header
,
Vector
a
)
readByteStringStrict
d
ff
=
readByteStringLazy
d
ff
.
BL
.
fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile
::
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
TsvDoc
))
...
...
@@ -459,14 +370,6 @@ readTsvHal fp = do
readTsvHalLazyBS
::
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
TsvHal
)
readTsvHalLazyBS
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
Tab
)
bs
readTsvHalBSStrict
::
BS
.
ByteString
->
Either
Text
(
Header
,
Vector
TsvHal
)
readTsvHalBSStrict
bs
=
readTsvHalLazyBS
$
BL
.
fromStrict
bs
------------------------------------------------------------------------
writeFile
::
FilePath
->
(
Header
,
Vector
TsvDoc
)
->
IO
()
writeFile
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
(
tsvEncodeOptions
Tab
)
h
(
V
.
toList
vs
)
writeDocs2Tsv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Tsv
fp
hs
=
BL
.
writeFile
fp
$
hyperdataDocument2tsv
hs
...
...
@@ -610,27 +513,11 @@ parseHal fp = do
r
<-
readTsvHal
fp
pure
$
V
.
toList
.
V
.
map
tsvHal2doc
.
snd
<$>
r
parseHal'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseHal'
bs
=
V
.
toList
.
V
.
map
tsvHal2doc
.
snd
<$>
readTsvHalLazyBS
bs
------------------------------------------------------------------------
parseTsv
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseTsv
fp
=
fmap
(
V
.
toList
.
V
.
map
tsv2doc
.
snd
)
<$>
readTSVFile
fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseTsv'
bs
=
do
let
result
=
case
testCorrectFile
bs
of
Left
_err
->
Left
_err
Right
del
->
readTsvLazyBS
del
bs
V
.
toList
.
V
.
map
tsv2doc
.
snd
<$>
result
parseTsvC
::
BL
.
ByteString
->
Either
Text
(
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseTsvC
bs
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment