Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
ac7c3653
Commit
ac7c3653
authored
Mar 07, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[conduit] some work towards migrating file parser to conduit (does not compile)
parent
2a3137de
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
68 additions
and
30 deletions
+68
-30
.gitignore
.gitignore
+2
-0
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+31
-29
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+24
-1
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+11
-0
No files found.
.gitignore
View file @
ac7c3653
...
...
@@ -38,3 +38,5 @@ repos
repo.json*
tmp*repo*json
data
devops/docker/js-cache
src/Gargantext/API/Node/Corpus/New.hs
View file @
ac7c3653
...
...
@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.ByteString.Base64
as
BSB64
import
Data.Conduit.Internal
(
zipSources
)
import
Data.Either
import
Data.Maybe
(
fromMaybe
)
import
Data.Swagger
...
...
@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug
"[addToCorpusWithForm] Parsing corpus: "
cid
printDebug
"[addToCorpusWithForm] fileType"
ft
logStatus
jobLog
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
let
parse
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormat
Parser
.
CsvHal
CSV
->
Parser
.
parseFormat
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormat
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormat
Parser
.
RisPresse
ZIP
->
Parser
.
parseFormat
Parser
.
ZIP
parse
C
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormat
C
Parser
.
CsvHal
CSV
->
Parser
.
parseFormat
C
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormat
C
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormat
C
Parser
.
RisPresse
ZIP
->
Parser
.
parseFormat
C
Parser
.
ZIP
-- TODO granularity of the logStatus
let
data
'
=
case
ft
of
...
...
@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Left
err
->
panic
$
T
.
pack
"[addToCorpusWithForm] error decoding base64: "
<>
T
.
pack
err
Right
decoded
->
decoded
_
->
cs
d
eDocs
<-
liftBase
$
parse
data
'
case
eDocs
of
Right
docs
->
do
eDocs
C
<-
liftBase
$
parseC
data
'
case
eDocs
C
of
Right
docs
C
->
do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
if
length
docs
>
limit
then
do
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
$
length
docs
)
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs ("
,
show
$
length
docs
,
") exceeds the MAX_DOCS_PARSERS limit ("
,
show
limit
,
")"
]
let
panicMsg
=
T
.
concat
$
T
.
pack
<$>
panicMsg'
logStatus
$
jobLogFailTotalWithMessage
panicMsg
jobLog
panic
panicMsg
else
pure
()
printDebug
"Parsing corpus finished : "
cid
logStatus
jobLog2
printDebug
"Starting extraction : "
cid
let
docsC'
=
zipSources
(
yieldMany
[
1
..
])
docsC
.|
mapMC
\
(
idx
,
doc
)
->
do
if
idx
>
limit
then
do
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
limit
)
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs "
,
"exceeds the MAX_DOCS_PARSERS limit ("
,
show
limit
,
")"
]
let
panicMsg
=
T
.
concat
$
T
.
pack
<$>
panicMsg'
logStatus
$
jobLogFailTotalWithMessage
panicMsg
jobLog
panic
panicMsg
else
pure
doc
--printDebug "Parsing corpus finished : " cid
--logStatus jobLog2
--printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
_cid'
<-
flowCorpus
user
(
Right
[
cid
])
(
Multi
$
fromMaybe
EN
l
)
Nothing
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
.|
mapC
toHyperdataDocument
)
(
Just
$
fromIntegral
$
length
docs
,
docsC'
.|
mapC
toHyperdataDocument
)
--(map (map toHyperdataDocument) docs)
logStatus
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
ac7c3653
...
...
@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
where
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Conduit
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Control.Monad
(
join
)
...
...
@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
,
parseCsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
...
...
@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
parseFormatC
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
IO
()
))
parseFormatC
CsvGargV3
bs
=
pure
$
transPipe
(
\
d
->
d
)
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
CsvHal
bs
=
pure
$
transPipe
pure
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
RisPresse
bs
=
do
docs
<-
snd
<$>
enrichWith
RisPresse
$
partitionEithers
$
[
runParser'
RisPresse
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
RIS
)
parseFormatC
WOS
bs
=
do
docs
<-
snd
<$>
enrichWith
WOS
$
partitionEithers
$
[
runParser'
WOS
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
WOS
)
parseFormatC
ZIP
bs
=
do
path
<-
emptySystemTempFile
"parsed-zip"
DB
.
writeFile
path
bs
parsedZip
<-
withArchive
path
$
do
DM
.
keys
<$>
getEntries
pure
$
Left
$
"Not implemented for ZIP, parsedZip"
<>
show
parsedZip
parseFormatC
_
_
=
undefined
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
...
...
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
ac7c3653
...
...
@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
module
Gargantext.Core.Text.Corpus.Parsers.CSV
where
import
Conduit
import
Control.Applicative
import
qualified
Data.ByteString
as
BS
import
qualified
Data.ByteString.Lazy
as
BL
...
...
@@ -462,6 +463,16 @@ parseCsv' bs = do
Right
res
->
Right
res
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
result
parseCsvC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
Identity
()
)
parseCsvC
bs
=
do
let
result
=
case
readCsvLazyBS
Comma
bs
of
Left
_err
->
readCsvLazyBS
Tab
bs
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
$
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment