Commit 8cdbff09 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] attempt to fix length of parsed docs [does not compile]

This will show good progress bar.
parent de7cf704
Pipeline #2566 failed with stage
in 25 minutes and 27 seconds
...@@ -79,22 +79,38 @@ data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal ...@@ -79,22 +79,38 @@ data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
-- | PDF -- Not Implemented / pdftotext and import Pandoc ? -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
parseFormatC :: MonadBaseControl IO m => FileType -> FileFormat -> DB.ByteString -> m (Either Prelude.String (ConduitT () HyperdataDocument IO ())) parseFormatC :: MonadBaseControl IO m
parseFormatC CsvGargV3 Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs) => FileType
parseFormatC CsvHal Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs) -> FileFormat
-> DB.ByteString
-> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = do
eParsedC <- parseCsvC $ DBL.fromStrict bs
case eParsedC of
Left err -> pure $ Left err
Right (mLen, parsedC) -> pure $ (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC CsvHal Plain bs = do
eParsedC <- parseCsvC $ DBL.fromStrict bs
case eParsedC of
Left err -> pure $ Left err
Right (mLen, parsedC) -> pure $ (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC RisPresse Plain bs = do parseFormatC RisPresse Plain bs = do
--docs <- enrichWith RisPresse --docs <- enrichWith RisPresse
let eDocs = runParser' RisPresse bs let eDocs = runParser' RisPresse bs
pure $ (\docs -> yieldMany docs pure $ (\docs ->
.| mapC presseEnrich ( Just $ length docs
.| mapC (map $ both decodeUtf8) , yieldMany docs
.| mapMC (toDoc RIS)) <$> eDocs .| mapC presseEnrich
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc RIS)) ) <$> eDocs
parseFormatC WOS Plain bs = do parseFormatC WOS Plain bs = do
let eDocs = runParser' WOS bs let eDocs = runParser' WOS bs
pure $ (\docs -> yieldMany docs pure $ (\docs ->
.| mapC (map $ first WOS.keys) ( Just $ length docs
.| mapC (map $ both decodeUtf8) , yieldMany docs
.| mapMC (toDoc WOS)) <$> eDocs .| mapC (map $ first WOS.keys)
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc WOS)) ) <$> eDocs
parseFormatC ft ZIP bs = do parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip" path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs liftBase $ DB.writeFile path bs
...@@ -110,7 +126,12 @@ parseFormatC ft ZIP bs = do ...@@ -110,7 +126,12 @@ parseFormatC ft ZIP bs = do
[] -> [] ->
case contents of case contents of
[] -> pure $ Left "No files in zip" [] -> pure $ Left "No files in zip"
_ -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc") _ -> do
let lenghts = fst <$> contents
let contents' = snd <$> contents
let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
pure $ Right $ ( Just totalLength
, sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
_ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
parseFormatC _ _ _ = undefined parseFormatC _ _ _ = undefined
......
...@@ -463,7 +463,8 @@ parseCsv' bs = do ...@@ -463,7 +463,8 @@ parseCsv' bs = do
Right res -> Right res Right res -> Right res
(V.toList . V.map csv2doc . snd) <$> result (V.toList . V.map csv2doc . snd) <$> result
parseCsvC :: BL.ByteString -> Either Prelude.String (ConduitT () HyperdataDocument Identity ()) parseCsvC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do parseCsvC bs = do
let let
result = case readCsvLazyBS Comma bs of result = case readCsvLazyBS Comma bs of
...@@ -471,7 +472,7 @@ parseCsvC bs = do ...@@ -471,7 +472,7 @@ parseCsvC bs = do
Right res -> Right res Right res -> Right res
case result of case result of
Left err -> Left err Left err -> Left err
Right r -> Right $ (yieldMany $ snd r) .| mapC csv2doc Right r -> Right $ (Just $ length snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment