Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
ac7c3653
Commit
ac7c3653
authored
Mar 07, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[conduit] some work towards migrating file parser to conduit (does not compile)
parent
2a3137de
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
68 additions
and
30 deletions
+68
-30
.gitignore
.gitignore
+2
-0
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+31
-29
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+24
-1
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+11
-0
No files found.
.gitignore
View file @
ac7c3653
...
@@ -38,3 +38,5 @@ repos
...
@@ -38,3 +38,5 @@ repos
repo.json*
repo.json*
tmp*repo*json
tmp*repo*json
data
data
devops/docker/js-cache
src/Gargantext/API/Node/Corpus/New.hs
View file @
ac7c3653
...
@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
...
@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
import
Data.Aeson
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.ByteString.Base64
as
BSB64
import
qualified
Data.ByteString.Base64
as
BSB64
import
Data.Conduit.Internal
(
zipSources
)
import
Data.Either
import
Data.Either
import
Data.Maybe
(
fromMaybe
)
import
Data.Maybe
(
fromMaybe
)
import
Data.Swagger
import
Data.Swagger
...
@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
...
@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug
"[addToCorpusWithForm] Parsing corpus: "
cid
printDebug
"[addToCorpusWithForm] Parsing corpus: "
cid
printDebug
"[addToCorpusWithForm] fileType"
ft
printDebug
"[addToCorpusWithForm] fileType"
ft
logStatus
jobLog
logStatus
jobLog
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
let
let
parse
=
case
ft
of
parse
C
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormat
Parser
.
CsvHal
CSV_HAL
->
Parser
.
parseFormat
C
Parser
.
CsvHal
CSV
->
Parser
.
parseFormat
Parser
.
CsvGargV3
CSV
->
Parser
.
parseFormat
C
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormat
Parser
.
WOS
WOS
->
Parser
.
parseFormat
C
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormat
Parser
.
RisPresse
PresseRIS
->
Parser
.
parseFormat
C
Parser
.
RisPresse
ZIP
->
Parser
.
parseFormat
Parser
.
ZIP
ZIP
->
Parser
.
parseFormat
C
Parser
.
ZIP
-- TODO granularity of the logStatus
-- TODO granularity of the logStatus
let
data
'
=
case
ft
of
let
data
'
=
case
ft
of
...
@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
...
@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Left
err
->
panic
$
T
.
pack
"[addToCorpusWithForm] error decoding base64: "
<>
T
.
pack
err
Left
err
->
panic
$
T
.
pack
"[addToCorpusWithForm] error decoding base64: "
<>
T
.
pack
err
Right
decoded
->
decoded
Right
decoded
->
decoded
_
->
cs
d
_
->
cs
d
eDocs
<-
liftBase
$
parse
data
'
eDocs
C
<-
liftBase
$
parseC
data
'
case
eDocs
of
case
eDocs
C
of
Right
docs
->
do
Right
docs
C
->
do
-- TODO Add progress (jobStatus) update for docs - this is a
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
-- long action
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
docsC'
=
zipSources
(
yieldMany
[
1
..
])
docsC
let
limit
=
fromIntegral
limit'
.|
mapMC
\
(
idx
,
doc
)
->
do
if
length
docs
>
limit
then
do
if
idx
>
limit
then
do
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
$
length
docs
)
printDebug
"[addToCorpusWithForm] number of docs exceeds the limit"
(
show
limit
)
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs ("
let
panicMsg'
=
[
"[addToCorpusWithForm] number of docs "
,
show
$
length
docs
,
"exceeds the MAX_DOCS_PARSERS limit ("
,
") exceeds the MAX_DOCS_PARSERS limit ("
,
show
limit
,
show
limit
,
")"
]
,
")"
]
let
panicMsg
=
T
.
concat
$
T
.
pack
<$>
panicMsg'
let
panicMsg
=
T
.
concat
$
T
.
pack
<$>
panicMsg'
logStatus
$
jobLogFailTotalWithMessage
panicMsg
jobLog
logStatus
$
jobLogFailTotalWithMessage
panicMsg
jobLog
panic
panicMsg
panic
panicMsg
else
else
pure
()
pure
doc
printDebug
"Parsing corpus finished : "
cid
--
printDebug "Parsing corpus finished : " cid
logStatus
jobLog2
--
logStatus jobLog2
printDebug
"Starting extraction : "
cid
--
printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
-- TODO granularity of the logStatus
_cid'
<-
flowCorpus
user
_cid'
<-
flowCorpus
user
(
Right
[
cid
])
(
Right
[
cid
])
(
Multi
$
fromMaybe
EN
l
)
(
Multi
$
fromMaybe
EN
l
)
Nothing
Nothing
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
.|
mapC
toHyperdataDocument
)
(
Just
$
fromIntegral
$
length
docs
,
docsC'
.|
mapC
toHyperdataDocument
)
--(map (map toHyperdataDocument) docs)
--(map (map toHyperdataDocument) docs)
logStatus
logStatus
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
ac7c3653
...
@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
...
@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
where
where
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Conduit
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Control.Monad
(
join
)
import
Control.Monad
(
join
)
...
@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
...
@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
,
parseCsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
...
@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
...
@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- | XML -- Not Implemented / see :
parseFormatC
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
IO
()
))
parseFormatC
CsvGargV3
bs
=
pure
$
transPipe
(
\
d
->
d
)
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
CsvHal
bs
=
pure
$
transPipe
pure
<$>
parseCsvC
$
DBL
.
fromStrict
bs
parseFormatC
RisPresse
bs
=
do
docs
<-
snd
<$>
enrichWith
RisPresse
$
partitionEithers
$
[
runParser'
RisPresse
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
RIS
)
parseFormatC
WOS
bs
=
do
docs
<-
snd
<$>
enrichWith
WOS
$
partitionEithers
$
[
runParser'
WOS
bs
]
pure
$
Right
$
docs
.|
mapMC
(
toDoc
WOS
)
parseFormatC
ZIP
bs
=
do
path
<-
emptySystemTempFile
"parsed-zip"
DB
.
writeFile
path
bs
parsedZip
<-
withArchive
path
$
do
DM
.
keys
<$>
getEntries
pure
$
Left
$
"Not implemented for ZIP, parsedZip"
<>
show
parsedZip
parseFormatC
_
_
=
undefined
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
...
...
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
ac7c3653
...
@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
...
@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
module
Gargantext.Core.Text.Corpus.Parsers.CSV
where
module
Gargantext.Core.Text.Corpus.Parsers.CSV
where
import
Conduit
import
Control.Applicative
import
Control.Applicative
import
qualified
Data.ByteString
as
BS
import
qualified
Data.ByteString
as
BS
import
qualified
Data.ByteString.Lazy
as
BL
import
qualified
Data.ByteString.Lazy
as
BL
...
@@ -462,6 +463,16 @@ parseCsv' bs = do
...
@@ -462,6 +463,16 @@ parseCsv' bs = do
Right
res
->
Right
res
Right
res
->
Right
res
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
result
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
result
parseCsvC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
ConduitT
()
HyperdataDocument
Identity
()
)
parseCsvC
bs
=
do
let
result
=
case
readCsvLazyBS
Comma
bs
of
Left
_err
->
readCsvLazyBS
Tab
bs
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
$
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
-- Csv v3 weighted for phylo
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment