Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
159
Issues
159
List
Board
Labels
Milestones
Merge Requests
7
Merge Requests
7
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
094b511b
Commit
094b511b
authored
Jan 16, 2025
by
Yoelis Acourt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
chore: remove dead code
parent
82ffefa1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1 addition
and
164 deletions
+1
-164
Isidore.hs
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
+0
-10
TSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+1
-154
No files found.
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
View file @
094b511b
...
@@ -14,14 +14,11 @@ Portability : POSIX
...
@@ -14,14 +14,11 @@ Portability : POSIX
module
Gargantext.Core.Text.Corpus.API.Isidore
(
module
Gargantext.Core.Text.Corpus.API.Isidore
(
get
get
-- * Internals (possibly unused?)
,
isidore2tsvFile
)
where
)
where
import
Data.Text
qualified
as
Text
import
Data.Text
qualified
as
Text
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers
(
cleanText
)
import
Gargantext.Core.Text.Corpus.Parsers
(
cleanText
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
writeDocs2Tsv
)
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Defaults
qualified
as
Defaults
...
@@ -50,13 +47,6 @@ get lang l q a = do
...
@@ -50,13 +47,6 @@ get lang l q a = do
hDocs
<-
mapM
(
isidoreToDoc
lang
)
(
toIsidoreDocs
iDocs
)
hDocs
<-
mapM
(
isidoreToDoc
lang
)
(
toIsidoreDocs
iDocs
)
pure
hDocs
pure
hDocs
isidore2tsvFile
::
FilePath
->
Lang
->
Maybe
Isidore
.
Limit
->
Maybe
Isidore
.
TextQuery
->
Maybe
Isidore
.
AuthorQuery
->
IO
()
isidore2tsvFile
fp
lang
li
tq
aq
=
do
hdocs
<-
get
lang
li
tq
aq
writeDocs2Tsv
fp
hdocs
isidoreToDoc
::
Lang
->
IsidoreDoc
->
IO
HyperdataDocument
isidoreToDoc
::
Lang
->
IsidoreDoc
->
IO
HyperdataDocument
isidoreToDoc
lang
(
IsidoreDoc
t
a
d
u
s
as
)
=
do
isidoreToDoc
lang
(
IsidoreDoc
t
a
d
u
s
as
)
=
do
let
let
...
...
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
View file @
094b511b
...
@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files.
...
@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files.
module
Gargantext.Core.Text.Corpus.Parsers.TSV
where
module
Gargantext.Core.Text.Corpus.Parsers.TSV
where
import
Conduit
(
ConduitT
,
(
.|
),
yieldMany
,
mapC
)
import
Conduit
(
ConduitT
,
(
.|
),
yieldMany
,
mapC
)
import
Data.ByteString
qualified
as
BS
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Csv
import
Data.Csv
import
Data.Text
(
pack
)
import
Data.Text
(
pack
)
...
@@ -26,8 +25,6 @@ import Data.Text.Read qualified as DTR
...
@@ -26,8 +25,6 @@ import Data.Text.Read qualified as DTR
import
Data.Time.Segment
(
jour
)
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
Data.Vector
(
Vector
)
import
Data.Vector
qualified
as
V
import
Data.Vector
qualified
as
V
import
Gargantext.Core.Text
(
sentences
,
unsentences
)
import
Gargantext.Core.Text.Context
(
splitBy
,
SplitContext
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
)
)
import
Gargantext.Prelude
hiding
(
length
,
show
)
import
Gargantext.Prelude
hiding
(
length
,
show
)
import
Protolude
import
Protolude
...
@@ -42,6 +39,7 @@ headerTsvGargV3 =
...
@@ -42,6 +39,7 @@ headerTsvGargV3 =
,
"publication_day"
,
"publication_day"
,
"abstract"
,
"abstract"
,
"authors"
,
"authors"
]
]
---------------------------------------------------------------
---------------------------------------------------------------
data
TsvGargV3
=
TsvGargV3
data
TsvGargV3
=
TsvGargV3
...
@@ -56,91 +54,9 @@ data TsvGargV3 = TsvGargV3
...
@@ -56,91 +54,9 @@ data TsvGargV3 = TsvGargV3
}
}
deriving
(
Show
)
deriving
(
Show
)
---------------------------------------------------------------
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc
::
TsvGargV3
->
HyperdataDocument
toDoc
(
TsvGargV3
did
dt
_
dpy
dpm
dpd
dab
dau
)
=
HyperdataDocument
{
_hd_bdd
=
Just
"TSV"
,
_hd_doi
=
Just
.
pack
.
show
$
did
,
_hd_url
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
Just
dt
,
_hd_authors
=
Nothing
,
_hd_institutes
=
Just
dau
,
_hd_source
=
Just
dab
,
_hd_abstract
=
Nothing
,
_hd_publication_date
=
Nothing
,
_hd_publication_year
=
Just
dpy
,
_hd_publication_month
=
Just
dpm
,
_hd_publication_day
=
Just
dpd
,
_hd_publication_hour
=
Nothing
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Nothing
,
_hd_institutes_tree
=
Nothing
}
---------------------------------------------------------------
-- | Types Conversions
toDocs
::
Vector
TsvDoc
->
[
TsvGargV3
]
toDocs
v
=
V
.
toList
$
V
.
zipWith
(
\
nId
(
TsvDoc
{
..
})
-- (TsvDoc t s mPy pm pd abst auth)
->
TsvGargV3
{
d_docId
=
nId
,
d_title
=
tsv_title
,
d_source
=
tsv_source
,
d_publication_year
=
fromMIntOrDec
defaultYear
tsv_publication_year
,
d_publication_month
=
fromMaybe
defaultMonth
tsv_publication_month
,
d_publication_day
=
fromMaybe
defaultDay
tsv_publication_day
,
d_abstract
=
tsv_abstract
,
d_authors
=
tsv_authors
})
(
V
.
enumFromN
1
(
V
.
length
v''
))
v''
where
v''
=
V
.
foldl
(
\
v'
sep
->
V
.
concatMap
(
splitDoc
(
docsSize
v'
)
sep
)
v'
)
v
seps
seps
=
V
.
fromList
[
Paragraphs
1
,
Sentences
3
,
Chars
3
]
---------------------------------------------------------------
fromDocs
::
Vector
TsvGargV3
->
Vector
TsvDoc
fromDocs
=
V
.
map
fromDocs'
where
fromDocs'
(
TsvGargV3
{
..
})
=
TsvDoc
{
tsv_title
=
d_title
,
tsv_source
=
d_source
,
tsv_publication_year
=
Just
$
IntOrDec
d_publication_year
,
tsv_publication_month
=
Just
d_publication_month
,
tsv_publication_day
=
Just
d_publication_day
,
tsv_abstract
=
d_abstract
,
tsv_authors
=
d_authors
}
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc
::
Mean
->
SplitContext
->
TsvDoc
->
Vector
TsvDoc
splitDoc
m
splt
doc
=
let
docSize
=
(
T
.
length
$
tsv_abstract
doc
)
in
if
(
docSize
>
1000
)
&&
(
mod
(
round
m
)
docSize
>=
10
)
then
splitDoc'
splt
doc
else
V
.
fromList
[
doc
]
where
splitDoc'
::
SplitContext
->
TsvDoc
->
Vector
TsvDoc
splitDoc'
contextSize
(
TsvDoc
{
..
})
=
V
.
fromList
$
[
firstDoc
]
<>
nextDocs
where
firstDoc
=
TsvDoc
{
tsv_abstract
=
firstAbstract
,
..
}
firstAbstract
=
head'
"splitDoc'1"
abstracts
nextDocs
=
map
(
\
txt
->
TsvDoc
{
tsv_title
=
head'
"splitDoc'2"
$
sentences
txt
,
tsv_abstract
=
unsentences
$
tail'
"splitDoc'1"
$
sentences
txt
,
..
}
)
(
tail'
"splitDoc'2"
abstracts
)
abstracts
=
(
splitBy
$
contextSize
)
tsv_abstract
---------------------------------------------------------------
---------------------------------------------------------------
---------------------------------------------------------------
type
Mean
=
Double
type
Mean
=
Double
docsSize
::
Vector
TsvDoc
->
Mean
docsSize
tsvDoc
=
mean
ls
where
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
T
.
length
.
tsv_abstract
)
tsvDoc
---------------------------------------------------------------
---------------------------------------------------------------
newtype
IntOrDec
=
IntOrDec
Int
newtype
IntOrDec
=
IntOrDec
Int
deriving
(
Show
,
Eq
,
Read
)
deriving
(
Show
,
Eq
,
Read
)
...
@@ -195,28 +111,12 @@ instance ToNamedRecord TsvDoc where
...
@@ -195,28 +111,12 @@ instance ToNamedRecord TsvDoc where
,
"authors"
.=
tsv_authors
,
"authors"
.=
tsv_authors
]
]
hyperdataDocument2tsvDoc
::
HyperdataDocument
->
TsvDoc
hyperdataDocument2tsvDoc
h
=
TsvDoc
{
tsv_title
=
m
$
_hd_title
h
,
tsv_source
=
m
$
_hd_source
h
,
tsv_publication_year
=
Just
$
IntOrDec
$
mI
$
_hd_publication_year
h
,
tsv_publication_month
=
Just
$
mI
$
_hd_publication_month
h
,
tsv_publication_day
=
Just
$
mI
$
_hd_publication_day
h
,
tsv_abstract
=
m
$
_hd_abstract
h
,
tsv_authors
=
m
$
_hd_authors
h
}
where
m
=
maybe
""
identity
mI
=
maybe
0
identity
data
Delimiter
=
Tab
|
Comma
|
Line
deriving
(
Eq
,
Show
)
data
Delimiter
=
Tab
|
Comma
|
Line
deriving
(
Eq
,
Show
)
tsvDecodeOptions
::
Delimiter
->
DecodeOptions
tsvDecodeOptions
::
Delimiter
->
DecodeOptions
tsvDecodeOptions
d
=
defaultDecodeOptions
{
decDelimiter
=
delimiter
d
}
tsvDecodeOptions
d
=
defaultDecodeOptions
{
decDelimiter
=
delimiter
d
}
tsvEncodeOptions
::
Delimiter
->
EncodeOptions
tsvEncodeOptions
d
=
defaultEncodeOptions
{
encDelimiter
=
delimiter
d
}
delimiter
::
Delimiter
->
Word8
delimiter
::
Delimiter
->
Word8
delimiter
Tab
=
fromIntegral
$
ord
'
\t
'
delimiter
Tab
=
fromIntegral
$
ord
'
\t
'
delimiter
Comma
=
fromIntegral
$
ord
','
delimiter
Comma
=
fromIntegral
$
ord
','
...
@@ -368,33 +268,6 @@ getHeaders bl del = do
...
@@ -368,33 +268,6 @@ getHeaders bl del = do
------------------------------------------------------------------------
------------------------------------------------------------------------
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileLazy
d
f
=
fmap
(
readByteStringLazy
d
f
)
.
BL
.
readFile
readFileStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileStrict
d
f
=
fmap
(
readByteStringStrict
d
f
)
.
BS
.
readFile
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
a
)
readByteStringLazy
_f
d
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
d
)
bs
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BS
.
ByteString
->
Either
Text
(
Header
,
Vector
a
)
readByteStringStrict
d
ff
=
readByteStringLazy
d
ff
.
BL
.
fromStrict
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | TODO use readFileLazy
-- | TODO use readFileLazy
...
@@ -424,19 +297,8 @@ readTsvHal fp = do
...
@@ -424,19 +297,8 @@ readTsvHal fp = do
readTsvHalLazyBS
::
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
TsvHal
)
readTsvHalLazyBS
::
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
TsvHal
)
readTsvHalLazyBS
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
Tab
)
bs
readTsvHalLazyBS
bs
=
first
pack
$
decodeByNameWith
(
tsvDecodeOptions
Tab
)
bs
readTsvHalBSStrict
::
BS
.
ByteString
->
Either
Text
(
Header
,
Vector
TsvHal
)
readTsvHalBSStrict
bs
=
readTsvHalLazyBS
$
BL
.
fromStrict
bs
------------------------------------------------------------------------
writeFile
::
FilePath
->
(
Header
,
Vector
TsvDoc
)
->
IO
()
writeFile
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
(
tsvEncodeOptions
Tab
)
h
(
V
.
toList
vs
)
writeDocs2Tsv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Tsv
fp
hs
=
BL
.
writeFile
fp
$
hyperdataDocument2tsv
hs
hyperdataDocument2tsv
::
[
HyperdataDocument
]
->
BL
.
ByteString
hyperdataDocument2tsv
hs
=
encodeByNameWith
(
tsvEncodeOptions
Tab
)
headerTsvGargV3
(
map
hyperdataDocument2tsvDoc
hs
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Hal Format
-- Hal Format
...
@@ -577,27 +439,12 @@ parseHal fp = do
...
@@ -577,27 +439,12 @@ parseHal fp = do
r
<-
readTsvHal
fp
r
<-
readTsvHal
fp
pure
$
V
.
toList
.
V
.
map
tsvHal2doc
.
snd
<$>
r
pure
$
V
.
toList
.
V
.
map
tsvHal2doc
.
snd
<$>
r
parseHal'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseHal'
bs
=
V
.
toList
.
V
.
map
tsvHal2doc
.
snd
<$>
readTsvHalLazyBS
bs
------------------------------------------------------------------------
------------------------------------------------------------------------
parseTsv
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseTsv
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseTsv
fp
=
fmap
(
V
.
toList
.
V
.
map
tsv2doc
.
snd
)
<$>
readTSVFile
fp
parseTsv
fp
=
fmap
(
V
.
toList
.
V
.
map
tsv2doc
.
snd
)
<$>
readTSVFile
fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseTsv'
bs
=
do
let
result
=
case
(
testCorrectFile
bs
)
of
Left
_err
->
Left
_err
Right
del
->
readTsvLazyBS
del
bs
V
.
toList
.
V
.
map
tsv2doc
.
snd
<$>
result
parseTsvC
::
BL
.
ByteString
parseTsvC
::
BL
.
ByteString
->
Either
Text
(
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
->
Either
Text
(
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseTsvC
bs
=
parseTsvC
bs
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment