Commit 622f7062 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Add the parseTsvWithDiagnostics function

This will pave the way for a more thorough refactoring to
be able to emit diagnostics after parsing.
parent 9a26e565
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-}
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV
Description :
......@@ -31,6 +34,7 @@ import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Protolude
import qualified Data.ByteString as B
---------------------------------------------------------------
headerTsvGargV3 :: Header
......@@ -406,7 +410,37 @@ readTSVFile fp = do
Left _err -> pure $ Left _err
Right del -> pure $ readTsvLazyBS del file
-- | Allows the parser to report errors happening at a given row in the document.
data AtRow a = AtRow Int a
data ParseTsvResult a
= ParseTsvResult
{ _ptr_parsed_records :: a
, _ptr_skipped_records :: [ AtRow (Text, Vector ByteString) ]
}
parseTsvWithDiagnostics :: Delimiter
-> BL.ByteString
-> Either Text (ParseTsvResult [TsvDoc])
parseTsvWithDiagnostics d bs = case decodeWith @Record (tsvDecodeOptions d) HasHeader bs of
Left err -> Left $ pack err
Right rawRecords -> Right $ uncurry ParseTsvResult . first reverse $
foldl' parse_raw_record (mempty, mempty) (zip (V.toList rawRecords) [ 1 .. ])
where
parse_raw_record :: ([TsvDoc], [AtRow (Text, Vector ByteString)])
-> (Vector ByteString, Int)
-> ([TsvDoc], [AtRow (Text, Vector ByteString)])
parse_raw_record (!succeeded, !failed) (input_record, current_row) =
case decodeByNameWith @TsvDoc (tsvDecodeOptions d) (intercalateRecords d input_record) of
Left err -> (succeeded, AtRow current_row (pack err, input_record) : failed)
Right (_, V.toList -> [tsvDoc]) -> (tsvDoc : succeeded, failed)
-- The one below won't happen, but it's left for completeness.
Right (_, V.toList -> tsvDocs) -> (tsvDocs <> succeeded, failed)
-- | \"Reconstructs\" a line out of a parsed record, so that it can be fed back
-- into the invidual parser.
intercalateRecords :: Delimiter -> Record -> BL.ByteString
intercalateRecords d r = BL.fromStrict $ B.intercalate (B.pack [delimiter d]) (V.toList r)
-- | TODO use readByteStringLazy
readTsvLazyBS :: Delimiter
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment