Commit ac7c3653 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] some work towards migrating file parser to conduit (does not compile)

parent 2a3137de
......@@ -38,3 +38,5 @@ repos
repo.json*
tmp*repo*json
data
devops/docker/js-cache
......@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
......@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] Parsing corpus: " cid
printDebug "[addToCorpusWithForm] fileType" ft
logStatus jobLog
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
let
parse = case ft of
CSV_HAL -> Parser.parseFormat Parser.CsvHal
CSV -> Parser.parseFormat Parser.CsvGargV3
WOS -> Parser.parseFormat Parser.WOS
PresseRIS -> Parser.parseFormat Parser.RisPresse
ZIP -> Parser.parseFormat Parser.ZIP
parseC = case ft of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse
ZIP -> Parser.parseFormatC Parser.ZIP
-- TODO granularity of the logStatus
let data' = case ft of
......@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded
_ -> cs d
eDocs <- liftBase $ parse data'
case eDocs of
Right docs -> do
eDocsC <- liftBase $ parseC data'
case eDocsC of
Right docsC -> do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
if length docs > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs)
let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
, show $ length docs
, ") exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure ()
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
printDebug "Starting extraction : " cid
let docsC' = zipSources (yieldMany [1..]) docsC
.| mapMC \(idx, doc) -> do
if idx > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
let panicMsg' = [ "[addToCorpusWithForm] number of docs "
, "exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure doc
--printDebug "Parsing corpus finished : " cid
--logStatus jobLog2
--printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
_cid' <- flowCorpus user
(Right [cid])
(Multi $ fromMaybe EN l)
Nothing
(Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument)
(Just $ fromIntegral $ length docs, docsC' .| mapC toHyperdataDocument)
--(map (map toHyperdataDocument) docs)
logStatus
......
......@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
where
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join)
......@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
......@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 bs = pure $ transPipe (\d -> d) <$> parseCsvC $ DBL.fromStrict bs
parseFormatC CsvHal bs = pure $ transPipe pure <$> parseCsvC $ DBL.fromStrict bs
parseFormatC RisPresse bs = do
docs <- snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
pure $ Right $ docs .| mapMC (toDoc RIS)
parseFormatC WOS bs = do
docs <- snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right $ docs .| mapMC (toDoc WOS)
parseFormatC ZIP bs = do
path <- emptySystemTempFile "parsed-zip"
DB.writeFile path bs
parsedZip <- withArchive path $ do
DM.keys <$> getEntries
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormatC _ _ = undefined
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
......
......@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.CSV where
import Conduit
import Control.Applicative
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
......@@ -462,6 +463,16 @@ parseCsv' bs = do
Right res -> Right res
(V.toList . V.map csv2doc . snd) <$> result
parseCsvC :: BL.ByteString -> Either Prelude.String (ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do
let
result = case readCsvLazyBS Comma bs of
Left _err -> readCsvLazyBS Tab bs
Right res -> Right res
case result of
Left err -> Left err
Right r -> Right $ (yieldMany $ snd r) .| mapC csv2doc
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment