Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
195
Issues
195
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
ac7c3653
Commit
ac7c3653
authored
Mar 07, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[conduit] some work towards migrating file parser to conduit (does not compile)
parent
2a3137de
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
68 additions
and
30 deletions
+68
-30
.gitignore
.gitignore
+2
-0
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+31
-29
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+24
-1
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+11
-0
No files found.
.gitignore
View file @
ac7c3653
...
...
@@ -38,3 +38,5 @@ repos
repo.json*
tmp*repo*json
data
devops/docker/js-cache
src/Gargantext/API/Node/Corpus/New.hs
View file @
ac7c3653
...
...
@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.ByteString.Base64
as
BSB64
import
Data.Conduit.Internal
(
zipSources
)
import
Data.Either
import
Data.Maybe
(
fromMaybe
)
import
Data.Swagger
...
...
@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug
"[addToCorpusWithForm] Parsing corpus: "
cid
printDebug
"[addToCorpusWithForm] fileType"
ft
logStatus
jobLog
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
let
parse
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormat
Parser
.
CsvHal
CSV
->
Parser
.
parseFormat
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormat
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormat
Parser
.
RisPresse
ZIP
->
Parser
.
parseFormat
Parser
.
ZIP
parse
C
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormat
C
Parser
.
CsvHal
CSV
->
Parser
.
parseFormat
C
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormat
C
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormat
C
Parser
.
RisPresse
ZIP
->
Parser
.
parseFormat
C
Parser
.
ZIP
-- TODO granularity of the logStatus
let
data
'
=
case
ft
of
...
...
@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Left
err
->
panic
$
T
.
pack
"[addToCorpusWithForm] error decoding base64: "
<>
T
.
pack
err
Right
decoded
->
decoded
_
->
cs
d
eDocs
<-
liftBase
$
parse
data
'
case
eDocs
of
Right
docs
->
do
eDocs
C
<-
liftBase
$
parseC
data
'
case
eDocs
C
of
Right
docs
C
->
do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
if
length
docs
>
limit
then
do
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
$
length
docs
)
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs ("
,
show
$
length
docs
,
") exceeds the MAX_DOCS_PARSERS limit ("
let
docsC'
=
zipSources
(
yieldMany
[
1
..
])
docsC
.|
mapMC
\
(
idx
,
doc
)
->
do
if
idx
>
limit
then
do
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
limit
)
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs "
,
"exceeds the MAX_DOCS_PARSERS limit ("
,
show
limit
,
")"
]
let
panicMsg
=
T
.
concat
$
T
.
pack
<$>
panicMsg'
logStatus
$
jobLogFailTotalWithMessage
panicMsg
jobLog
panic
panicMsg
else
pure
()
pure
doc
printDebug
"Parsing corpus finished : "
cid
logStatus
jobLog2
--
printDebug "Parsing corpus finished : " cid
--
logStatus jobLog2
printDebug
"Starting extraction : "
cid
--
printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
_cid'
<-
flowCorpus
user
(
Right
[
cid
])
(
Multi
$
fromMaybe
EN
l
)
Nothing
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
.|
mapC
toHyperdataDocument
)
(
Just
$
fromIntegral
$
length
docs
,
docsC'
.|
mapC
toHyperdataDocument
)
--(map (map toHyperdataDocument) docs)
logStatus
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
ac7c3653
...
...
@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
where
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Conduit
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Control.Monad
(
join
)
...
...
@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
,
parseCsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
...
...
@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
parseFormatC
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
IO
()
))
parseFormatC
CsvGargV3
bs
=
pure
$
transPipe
(
\
d
->
d
)
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
CsvHal
bs
=
pure
$
transPipe
pure
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
RisPresse
bs
=
do
docs
<-
snd
<$>
enrichWith
RisPresse
$
partitionEithers
$
[
runParser'
RisPresse
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
RIS
)
parseFormatC
WOS
bs
=
do
docs
<-
snd
<$>
enrichWith
WOS
$
partitionEithers
$
[
runParser'
WOS
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
WOS
)
parseFormatC
ZIP
bs
=
do
path
<-
emptySystemTempFile
"parsed-zip"
DB
.
writeFile
path
bs
parsedZip
<-
withArchive
path
$
do
DM
.
keys
<$>
getEntries
pure
$
Left
$
"Not implemented for ZIP, parsedZip"
<>
show
parsedZip
parseFormatC
_
_
=
undefined
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
...
...
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
ac7c3653
...
...
@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
module
Gargantext.Core.Text.Corpus.Parsers.CSV
where
import
Conduit
import
Control.Applicative
import
qualified
Data.ByteString
as
BS
import
qualified
Data.ByteString.Lazy
as
BL
...
...
@@ -462,6 +463,16 @@ parseCsv' bs = do
Right
res
->
Right
res
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
result
parseCsvC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
Identity
()
)
parseCsvC
bs
=
do
let
result
=
case
readCsvLazyBS
Comma
bs
of
Left
_err
->
readCsvLazyBS
Tab
bs
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
$
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment