{-# LANGUAGE LambdaCase          #-}
{-# LANGUAGE ScopedTypeVariables #-}

{-|
Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}


module Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics where

import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Control.Lens (over)
import Data.ByteString (StrictByteString)
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Csv.Incremental qualified as CSVI
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Lazy            qualified as TL
import Data.Text.Lazy.Encoding    qualified as TL
import Data.Text.Read     qualified as DTR
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.Types (AtRow(..), ParseCorpusResult(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
import Gargantext.Core.Text.Corpus.Parsers.TSV.Utils (detectDelimiter)
import Gargantext.Prelude
import Prelude (String)



type ColumnName = Text
type FieldValue = BL.ByteString
type Validator = FieldValue -> ColumnName -> Int -> Either Text Bool
type ColumnValidator = (ColumnName, Validator)

------------------------------------------------------------------------

parseTsvC :: forall tsvDoc result. FromNamedRecord tsvDoc
          => (tsvDoc -> result)
          -> BL.ByteString
          -> Either Text (Integer, ConduitT () (ParseCorpusResult result) Identity ())
parseTsvC tsv2doc bs = convert_result <$> eResult
  where
    eResult :: Either Text [ParseCorpusResult tsvDoc]
    eResult = case detectDelimiter bs of
      Left err  -> Left err
      Right del -> Right $ parseTsvWithDiagnostics del bs

    convert_result :: [ParseCorpusResult tsvDoc]
                   -> (Integer, ConduitT () (ParseCorpusResult result) Identity ())
    convert_result rs =
      (fromIntegral $ length rs, yieldMany rs .| mapC tsvResult2doc)

    tsvResult2doc :: ParseCorpusResult tsvDoc -> ParseCorpusResult result
    tsvResult2doc = \case
      ParseRecordSucceeded r    -> ParseRecordSucceeded (tsv2doc r)
      ParseTsvRecordFailed err  -> ParseTsvRecordFailed err



-- | Parses the input 'ByteString' identifying a TSV document
-- using the provided delimiter. This function is /incremental/, i.e.
-- it will correctly identify rows which can't be parsed and skip them,
-- collecting the error along the way.
-- It returns a list of 'ParseCorpusResult', which will contain either
-- the parsed record or a diagnostic on the error on the given row.
parseTsvWithDiagnostics :: forall tsvDoc. FromNamedRecord tsvDoc
                        => Delimiter
                        -> BL.ByteString
                        -> [ParseCorpusResult tsvDoc]
parseTsvWithDiagnostics d bs =
  case drainParser (BL.foldrChunks go mkHdrParser bs) of
    MkHeaderParsingContext _p ->
      [ ParseTsvRecordFailed (AtRow 0 ("The parsing choked on the header (delimiter was " <> show d <> "). This might be a malformed TSV we can't recover from.", mempty)) ]
    MkRecordParsingContext (RecordParsingContext{..})
      -- As we accumulate records by consing (for efficiency sake),
      -- we need a final 'reverse' at the end.
      -> reverse $ _prs_ctx_parsed_records
  where

    drainParser :: ParsingContext (ParseCorpusResult tsvDoc)
                -> ParsingContext (ParseCorpusResult tsvDoc)
    drainParser = go mempty . go mempty -- step twice, once to apply the empty string, once to drain.

    mkHdrParser :: ParsingContext (ParseCorpusResult tsvDoc)
    mkHdrParser = MkHeaderParsingContext (CSVI.decodeByNameWith (tsvDecodeOptions d))

    go :: StrictByteString
       -> ParsingContext (ParseCorpusResult tsvDoc)
       -> ParsingContext (ParseCorpusResult tsvDoc)
    go strict_chunk ctx = case ctx of
      MkHeaderParsingContext p -> go_hdr strict_chunk p
      MkRecordParsingContext p -> go_rec strict_chunk p

    -- Specialised parser for the header: once it fails
    -- or completes, it morphs into a record parser.
    go_hdr :: StrictByteString
           -> CSVI.HeaderParser (CSVI.Parser (ParseCorpusResult tsvDoc))
           -> ParsingContext (ParseCorpusResult tsvDoc)
    go_hdr chunk p = case p of
      CSVI.FailH unconsumed err    ->
        MkRecordParsingContext $ stepParser chunk $ toRecordParsingCtx (CSVI.Fail unconsumed err)
      CSVI.PartialH continue        ->
        MkHeaderParsingContext (continue chunk)
      CSVI.DoneH _header rec_parser ->
        -- Turn this into a record parser by feeding the unconsumed plus the input.
        MkRecordParsingContext $ stepParser chunk $ toRecordParsingCtx rec_parser

    -- Specialised parser for the individual records.
    go_rec :: StrictByteString
           -> RecordParsingContext (ParseCorpusResult tsvDoc)
           -> ParsingContext (ParseCorpusResult tsvDoc)
    go_rec input_bs ctx = MkRecordParsingContext $ stepParser input_bs ctx

    -- Main workhorse: given an input 'StrictByteString', it steps
    -- the parser. If the parser reached a terminal state, it gets
    -- removed from the input 'RecordParsingContext'.
    stepParser :: StrictByteString
               -> RecordParsingContext (ParseCorpusResult tsvDoc)
               -> RecordParsingContext (ParseCorpusResult tsvDoc)
    stepParser input_bs ctx = case ctx ^. prs_ctx_parser of
      Nothing
        -> ctx
      Just (CSVI.Fail unconsumed err)
        -> ctx & over prs_ctx_parsed_records (mkErr ctx (T.pack err, unconsumed) :)
               & prs_ctx_parser .~ Nothing
      Just (CSVI.Many parsed continue)
        -> (addParsedRecords parsed ctx) & prs_ctx_parser .~ (Just $ continue input_bs)
      Just (CSVI.Done parsed)
        -> (addParsedRecords parsed ctx) & prs_ctx_parser .~ Nothing

    -- Convert a parser inside a 'ParsingContext' from a header parser into a record parser.
    toRecordParsingCtx :: CSVI.Parser (ParseCorpusResult tsvDoc)
                       -> RecordParsingContext (ParseCorpusResult tsvDoc)
    toRecordParsingCtx p =
      RecordParsingContext
        { _prs_ctx_parser         = Just p
        , _prs_ctx_parsed_records = []
        , _prs_ctx_row_cursor     = 1
        }

    mkErr :: RecordParsingContext (ParseCorpusResult tsvDoc) -> (Text, ByteString) -> ParseCorpusResult tsvDoc
    mkErr ctx pair = ParseTsvRecordFailed (AtRow (ctx ^. prs_ctx_row_cursor) pair)

    addParsedRecords :: [Either String (ParseCorpusResult tsvDoc)]
                     -> RecordParsingContext (ParseCorpusResult tsvDoc)
                     -> RecordParsingContext (ParseCorpusResult tsvDoc)
    addParsedRecords recs ctx = foldl' process_record ctx recs

    process_record :: RecordParsingContext (ParseCorpusResult tsvDoc)
                   -> Either String (ParseCorpusResult tsvDoc)
                   -> RecordParsingContext (ParseCorpusResult tsvDoc)
    process_record ctx result =
       let ctx' = ctx & over prs_ctx_row_cursor succ
       in case result of
            Left err  -> ctx' & over prs_ctx_parsed_records (mkErr ctx' (T.pack err, mempty) :)
            Right rec -> ctx' & over prs_ctx_parsed_records (rec :)



testCorrectFile :: [ColumnValidator] -> BL.ByteString -> Either Text Delimiter
testCorrectFile columnValidators bs =
  case findDelimiter bs of
    Left _err -> Left _err
    Right del -> do
      let bl = BL.splitWith (==delimiter Line) bs in
        case getHeaders columnValidators bl del of
          Left _err -> Left _err
          Right headers -> testIfErrorInFile columnValidators bl del headers



----------Test headers added to ggt
-- use a map to remove \r that sometimes appear at the end of a line
testAllHeadersPresence :: [ColumnValidator] -> [Text] -> Either Text [Text]
testAllHeadersPresence columnValidators headers = do
    let expectedHeaders = fst <$> columnValidators
    let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack "")) headers)) expectedHeaders
    if null listHeaders
        then Right headers
        else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)

getHeaders :: [ColumnValidator] -> [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders columnValidators bl del = do
    let vec = V.fromList bl in
        case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of
          Nothing -> Left "Error getHeaders"
          Just headers -> testAllHeadersPresence columnValidators (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)


testValue :: [ColumnValidator] -> FieldValue -> ColumnName -> Int -> Either Text Bool
testValue [] _val _columnHeader _lineno = Right True
testValue ((fieldName, validator):vs) val columnHeader lineno =
  if fieldName == columnHeader then
    validator val columnHeader lineno
  else
    testValue vs val columnHeader lineno


testErrorPerLine :: [ColumnValidator] -> [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine _ [] _ [] _ = Right True
testErrorPerLine _ _ del [] l | del == Comma = Left (pack $ "Too many fields at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
                            | otherwise =  Left (pack $ "Too many fields at line " <> show l)
testErrorPerLine _ [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine columnValidators (v:val) del (h:headers) lineno =
    case testValue columnValidators v h lineno of
        Left _err -> Left _err
        Right _ -> testErrorPerLine columnValidators val del headers lineno


checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
  case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of
    Nothing  -> Right (x, (BL.splitWith (==delimiter del) res))
    Just value -> if length value > 1
      then Right (x, (BL.splitWith (==delimiter del) res))
      else case BL.append res <$> ((V.!?) bl (x+1)) of
        Nothing  -> Left "checkNextLine2"
        Just val -> checkNextLine bl del headers val (x+1)


getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
    let tmp = BL.splitWith (==delimiter del) res in
        if length tmp == length headers
          then checkNextLine bl del headers res x
          else
            if (length tmp > length headers) || (V.length bl == (x + 1))
              then
                Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
              else do
                case BL.append res <$> ((V.!?) bl (x+1)) of
                  Nothing  -> Left "getMultipleLinefile"
                  Just val -> getMultipleLinefile bl del headers val (x+1)

anx :: [ColumnValidator] -> Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx columnValidators bl del headers x
            | length bl == x = Right del
            | otherwise      =
              case (V.!?) bl x of
                Nothing -> Left "anx"
                Just bs ->
                  case getMultipleLinefile bl del headers bs x of
                    Left _err -> Left _err
                    Right (y, val) -> case testErrorPerLine columnValidators val del headers (x + 1) of
                      Left _err -> Left _err
                      Right _   -> anx columnValidators bl del headers (y+1)


testIfErrorInFile :: [ColumnValidator] -> [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile columnValidators bl del headers = anx columnValidators (V.fromList bl) del headers 1


testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
    let x = BL.splitWith (== delimiter Line) bs
        vec = V.fromList x in
          case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
            Nothing -> False
            Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
              Nothing -> False
              Just f -> length e == length f && length e > 2

findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
  | testDelimiter Tab bs = Right Tab
  | testDelimiter Comma bs = Right Comma
  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")

isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
  Right (x,y) -> if y == ""
    then Right x
    else Left False
  Left _ -> Left False

lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b

validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader lineno = do
    let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
    case isNumeric number of
      Right val 
        | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show lineno) <> " is negative")
        |otherwise -> Right True
      Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show lineno) <> " : value is not a number ")


validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader lineno = do
  let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
    if not (T.null xs) 
      then 
        if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2))) 
          then return True 
          else Left $ ("Encapsulation problem at line " <> pack (show lineno) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field ")
      else return True
      -- else Left $ ("The column '" <> columnHeader <> "' at line " <>  pack (show ligne) <> " is empty")
      -- Put a warning for the user to know their is a problem (empty column)
