{-|
Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

module Gargantext.Core.Text.Corpus.Parsers.TSV.Utils where

import Data.ByteString.Lazy qualified as BL
import Data.ByteString.Lazy.Char8 qualified as B8L
import Data.Csv
import Data.Text (pack)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
import Gargantext.Prelude



-- | Read the file at the given path with 'readTSVFile' and convert every
-- decoded record with the supplied function.
--
-- The header returned by 'readTSVFile' is discarded; a 'Left' coming from
-- 'readTSVFile' is passed through unchanged.
parseTsv :: FromNamedRecord tsvDoc
         => (tsvDoc -> result)
         -> FilePath
         -> IO (Either Text [result])
parseTsv tsv2doc fp = do
  decoded <- readTSVFile fp
  pure $ toResults <$> decoded
  where
    -- Drop the header and turn the record vector into a list of results.
    toResults (_hdr, rows) = V.toList (V.map tsv2doc rows)


-- | Decode a lazy 'BL.ByteString' into its header and records, using the
-- decoding options that 'tsvDecodeOptions' builds for the given delimiter.
-- A decoding failure is returned as 'Left', with its message packed into
-- 'Text'.
--
-- TODO use readByteStringLazy
readTsvLazyBS :: FromNamedRecord tsvDoc
              => Delimiter
              -> BL.ByteString
              -> Either Text (Header, Vector tsvDoc)
readTsvLazyBS d bs =
  case decodeByNameWith (tsvDecodeOptions d) bs of
    Left err      -> Left (pack err)
    Right decoded -> Right decoded

-- | Read the file at the given path as a lazy 'BL.ByteString', pick a
-- delimiter with 'detectDelimiter' and decode the contents with
-- 'readTsvLazyBS'.
--
-- A 'Left' from 'detectDelimiter' is returned as-is, without attempting to
-- decode.
--
-- TODO use readFileLazy
readTSVFile :: FromNamedRecord tsvDoc
            => FilePath
            -> IO (Either Text (Header, Vector tsvDoc))
readTSVFile fp = do
  contents <- BL.readFile fp
  -- Sequenced in the 'Either' monad: decoding only runs once a delimiter
  -- has been detected.
  pure $ detectDelimiter contents >>= \del -> readTsvLazyBS del contents

-- | Like 'readTSVFile', but decodes with the delimiter supplied by the
-- caller instead of detecting one from the file contents.
readTSVFileDelim :: FromNamedRecord tsvDoc
                 => FilePath
                 -> Delimiter
                 -> IO (Either Text (Header, Vector tsvDoc))
readTSVFileDelim fp del = readTsvLazyBS del <$> BL.readFile fp



-- | Detect the field delimiter of a document from its first line.
--
-- The commas and tabs on the first line are counted and the delimiter with
-- the higher count is returned. Only 'Comma' and 'Tab' are ever candidates.
-- Ties (including a first line containing neither character) are resolved
-- by 'maximumBy'; with the implementation from @base@ the last candidate,
-- i.e. 'Tab', wins.
--
-- Returns 'Left' when the input has no lines at all (empty input).
detectDelimiter :: BL.ByteString -> Either Text Delimiter
detectDelimiter input =
    case B8L.lines input of
        (firstLine : _) ->
            -- Pair each candidate delimiter directly with its count, so no
            -- intermediate 'Char' has to be mapped back to a 'Delimiter'.
            let candidates = [ (Comma, count ',' firstLine)
                             , (Tab, count '\t' firstLine)
                             ]
            -- 'maximumBy' is safe here: the candidate list is a non-empty literal.
            in Right . fst $ maximumBy (comparing snd) candidates
        _ -> Left "Couldn't detect a valid delimiter for the input document."


-- | Number of occurrences of a character in a lazy 'BL.ByteString'.
--
-- The character is converted to a byte via 'fromEnum' and 'fromIntegral'
-- before being handed to 'BL.count'.
count :: Char -> BL.ByteString -> Int64
count = BL.count . fromIntegral . fromEnum

------------------------------------------------------------------------

-- | Convert each document with the supplied function, encode the results
-- by name under the given header using the options from
-- @tsvEncodeOptions Tab@, and write the encoded bytes to the file path.
writeDocs2Tsv :: ToNamedRecord tsvDoc
              => Header
              -> (doc -> tsvDoc)
              -> FilePath
              -> [doc]
              -> IO ()
writeDocs2Tsv hdr doc2tsv fp hs = BL.writeFile fp encoded
  where
    records = fmap doc2tsv hs
    encoded = encodeByNameWith (tsvEncodeOptions Tab) hdr records


