[conduit] some work towards migrating file parser to conduit (does not compile)

ac7c3653 · Przemyslaw Kaminski · 2a3137de · ac7c3653 · ac7c3653 · ac7c3653
Commit ac7c3653 authored Mar 07, 2022 by Przemyslaw Kaminski
Showing with 68 additions and 30 deletions

.gitignore .gitignore +2 -0

New.hs src/Gargantext/API/Node/Corpus/New.hs +31 -29

Parsers.hs src/Gargantext/Core/Text/Corpus/Parsers.hs +24 -1

CSV.hs src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs +11 -0

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,5 @@ repos
 repo.json*
 tmp*repo*json
 data
+devops/docker/js-cache
--- a/src/Gargantext/API/Node/Corpus/New.hs
+++ b/src/Gargantext/API/Node/Corpus/New.hs
@@ -24,6 +24,7 @@ import Control.Lens hiding (elements, Empty)
 import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
 import qualified Data.ByteString.Base64 as BSB64
+import Data.Conduit.Internal (zipSources)
 import Data.Either
 import Data.Maybe (fromMaybe)
 import Data.Swagger
@@ -267,13 +268,15 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  logStatus jobLog
+  limit' <- view $ hasConfig . gc_max_docs_parsers
+  let limit = fromIntegral limit'
  let
-    parse = case ft of
+    parseC = case ft of
-      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
+      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
-      CSV       -> Parser.parseFormat Parser.CsvGargV3
+      CSV       -> Parser.parseFormatC Parser.CsvGargV3
-      WOS       -> Parser.parseFormat Parser.WOS
+      WOS       -> Parser.parseFormatC Parser.WOS
-      PresseRIS -> Parser.parseFormat Parser.RisPresse
+      PresseRIS -> Parser.parseFormatC Parser.RisPresse
-      ZIP       -> Parser.parseFormat Parser.ZIP
+      ZIP       -> Parser.parseFormatC Parser.ZIP
  -- TODO granularity of the logStatus
  let data' = case ft of
@@ -281,37 +284,36 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
          Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
        _   -> cs d
-  eDocs <- liftBase $ parse data'
+  eDocsC <- liftBase $ parseC data'
-  case eDocs of
+  case eDocsC of
-    Right docs -> do
+    Right docsC -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action
-      limit' <- view $ hasConfig . gc_max_docs_parsers
+      let docsC' = zipSources (yieldMany [1..]) docsC
-      let limit = fromIntegral limit'
+          .| mapMC \(idx, doc) -> do
-      if length docs > limit then do
+              if idx > limit then do
-        printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs)
+               printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
-        let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
+               let panicMsg' = [ "[addToCorpusWithForm] number of docs "
-                        , show $ length docs
+                               , "exceeds the MAX_DOCS_PARSERS limit ("
-                        , ") exceeds the MAX_DOCS_PARSERS limit ("
                               , show limit
                               , ")" ]
                let panicMsg = T.concat $ T.pack <$> panicMsg'
                logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
                panic panicMsg
              else
-        pure ()
+                pure doc
-      printDebug "Parsing corpus finished : " cid
+      --printDebug "Parsing corpus finished : " cid
-      logStatus jobLog2
+      --logStatus jobLog2
-      printDebug "Starting extraction     : " cid
+      --printDebug "Starting extraction     : " cid
      -- TODO granularity of the logStatus
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          Nothing
-                          (Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument)
+                          (Just $ fromIntegral $ length docs, docsC' .| mapC toHyperdataDocument)
                          --(map (map toHyperdataDocument) docs)
                          logStatus

--- a/src/Gargantext/Core/Text/Corpus/Parsers.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers.hs
@@ -24,6 +24,7 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
    where
 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
+import Conduit
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Control.Monad (join)
@@ -48,7 +49,7 @@ import System.IO.Temp (emptySystemTempFile)
 import Gargantext.Core (Lang(..))
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
 import Gargantext.Prelude
-import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
+import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
 -- import Gargantext.Core.Text.Learn (detectLangDefault)
 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
@@ -78,6 +79,28 @@ data FileFormat = WOS | RIS | RisPresse
 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
 --                | XML        -- Not Implemented / see :
+parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
+parseFormatC CsvGargV3 bs = pure $ transPipe (\d -> d) <$> parseCsvC $ DBL.fromStrict bs
+parseFormatC CsvHal    bs = pure $ transPipe pure <$> parseCsvC $ DBL.fromStrict bs
+parseFormatC RisPresse bs = do
+  docs <- snd
+          <$> enrichWith RisPresse
+          $ partitionEithers
+          $ [runParser'  RisPresse bs]
+  pure $ Right $ docs .| mapMC (toDoc RIS)
+parseFormatC WOS bs = do
+  docs <- snd
+          <$> enrichWith WOS
+          $ partitionEithers
+          $ [runParser'  WOS bs]
+  pure $ Right $ docs .| mapMC (toDoc WOS)
+parseFormatC ZIP bs = do
+  path <- emptySystemTempFile "parsed-zip"
+  DB.writeFile path bs
+  parsedZip <- withArchive path $ do
+    DM.keys <$> getEntries
+  pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
+parseFormatC _ _ = undefined
 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs

--- a/src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
@@ -14,6 +14,7 @@ CSV parser for Gargantext corpus files.
 module Gargantext.Core.Text.Corpus.Parsers.CSV where
+import Conduit
 import Control.Applicative
 import qualified Data.ByteString      as BS
 import qualified Data.ByteString.Lazy as BL
@@ -462,6 +463,16 @@ parseCsv' bs = do
      Right res -> Right res
  (V.toList . V.map csv2doc . snd) <$> result
+parseCsvC :: BL.ByteString -> Either Prelude.String (ConduitT () HyperdataDocument Identity ())
+parseCsvC bs = do
+  let
+    result = case readCsvLazyBS Comma bs of
+      Left  _err -> readCsvLazyBS Tab bs
+      Right res -> Right res
+  case result of
+    Left err -> Left err
+    Right r -> Right $ (yieldMany $ snd r) .| mapC csv2doc
 ------------------------------------------------------------------------
 -- Csv v3 weighted for phylo