Commit ac7c3653 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] some work towards migrating file parser to conduit (does not compile)

parent 2a3137de
...@@ -38,3 +38,5 @@ repos ...@@ -38,3 +38,5 @@ repos
repo.json* repo.json*
tmp*repo*json tmp*repo*json
data data
devops/docker/js-cache
...@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty) ...@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
import Data.Aeson import Data.Aeson
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64 import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Either import Data.Either
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Swagger import Data.Swagger
...@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] Parsing corpus: " cid printDebug "[addToCorpusWithForm] Parsing corpus: " cid
printDebug "[addToCorpusWithForm] fileType" ft printDebug "[addToCorpusWithForm] fileType" ft
logStatus jobLog logStatus jobLog
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
let let
parse = case ft of parseC = case ft of
CSV_HAL -> Parser.parseFormat Parser.CsvHal CSV_HAL -> Parser.parseFormatC Parser.CsvHal
CSV -> Parser.parseFormat Parser.CsvGargV3 CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormat Parser.WOS WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormat Parser.RisPresse PresseRIS -> Parser.parseFormatC Parser.RisPresse
ZIP -> Parser.parseFormat Parser.ZIP ZIP -> Parser.parseFormatC Parser.ZIP
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
let data' = case ft of let data' = case ft of
...@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded Right decoded -> decoded
_ -> cs d _ -> cs d
eDocs <- liftBase $ parse data' eDocsC <- liftBase $ parseC data'
case eDocs of case eDocsC of
Right docs -> do Right docsC -> do
-- TODO Add progress (jobStatus) update for docs - this is a -- TODO Add progress (jobStatus) update for docs - this is a
-- long action -- long action
limit' <- view $ hasConfig . gc_max_docs_parsers let docsC' = zipSources (yieldMany [1..]) docsC
let limit = fromIntegral limit' .| mapMC \(idx, doc) -> do
if length docs > limit then do if idx > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs) printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
let panicMsg' = [ "[addToCorpusWithForm] number of docs (" let panicMsg' = [ "[addToCorpusWithForm] number of docs "
, show $ length docs , "exceeds the MAX_DOCS_PARSERS limit ("
, ") exceeds the MAX_DOCS_PARSERS limit ("
, show limit , show limit
, ")" ] , ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg' let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg panic panicMsg
else else
pure () pure doc
printDebug "Parsing corpus finished : " cid --printDebug "Parsing corpus finished : " cid
logStatus jobLog2 --logStatus jobLog2
printDebug "Starting extraction : " cid --printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
_cid' <- flowCorpus user _cid' <- flowCorpus user
(Right [cid]) (Right [cid])
(Multi $ fromMaybe EN l) (Multi $ fromMaybe EN l)
Nothing Nothing
(Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument) (Just $ fromIntegral $ length docs, docsC' .| mapC toHyperdataDocument)
--(map (map toHyperdataDocument) docs) --(map (map toHyperdataDocument) docs)
logStatus logStatus
......
...@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl ...@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
where where
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser) import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join) import Control.Monad (join)
...@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile) ...@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv') import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault) -- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
...@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse ...@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ? -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 bs = pure $ transPipe (\d -> d) <$> parseCsvC $ DBL.fromStrict bs
parseFormatC CsvHal bs = pure $ transPipe pure <$> parseCsvC $ DBL.fromStrict bs
parseFormatC RisPresse bs = do
docs <- snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
pure $ Right $ docs .| mapMC (toDoc RIS)
parseFormatC WOS bs = do
docs <- snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right $ docs .| mapMC (toDoc WOS)
parseFormatC ZIP bs = do
path <- emptySystemTempFile "parsed-zip"
DB.writeFile path bs
parsedZip <- withArchive path $ do
DM.keys <$> getEntries
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormatC _ _ = undefined
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument]) parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
......
...@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files. ...@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.CSV where module Gargantext.Core.Text.Corpus.Parsers.CSV where
import Conduit
import Control.Applicative import Control.Applicative
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
...@@ -462,6 +463,16 @@ parseCsv' bs = do ...@@ -462,6 +463,16 @@ parseCsv' bs = do
Right res -> Right res Right res -> Right res
(V.toList . V.map csv2doc . snd) <$> result (V.toList . V.map csv2doc . snd) <$> result
parseCsvC :: BL.ByteString -> Either Prelude.String (ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do
let
result = case readCsvLazyBS Comma bs of
Left _err -> readCsvLazyBS Tab bs
Right res -> Right res
case result of
Left err -> Left err
Right r -> Right $ (yieldMany $ snd r) .| mapC csv2doc
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment