{-|
Module      : Gargantext.Core.Text.Corpus.Parsers.TSV
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TSV parser for Gargantext corpus files.

-}


module Gargantext.Core.Text.Corpus.Parsers.TSV where

import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Encoding.Error (lenientDecode)
import Data.Text.Lazy            qualified as TL
import Data.Text.Lazy.Encoding    qualified as TL
import Data.Text.Read     qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Protolude

---------------------------------------------------------------
-- | Column names, in output order, passed to the encoder by
-- 'hyperdataDocument2tsv'.
headerTsvGargV3 :: Header
headerTsvGargV3 = header columnNames
  where
    columnNames =
      [ "title"
      , "source"
      , "publication_year"
      , "publication_month"
      , "publication_day"
      , "abstract"
      , "authors"
      ]
---------------------------------------------------------------
-- | A document row carrying a sequential identifier.
--
-- Values are produced by 'toDocs' (which substitutes default dates for
-- missing ones) and turned back into 'TsvDoc' by 'fromDocs'.
data TsvGargV3 = TsvGargV3
    { d_docId             :: !Int   -- ^ 1-based position assigned by 'toDocs'
    , d_title             :: !Text
    , d_source            :: !Text
    , d_publication_year  :: !Int
    , d_publication_month :: !Int
    , d_publication_day   :: !Int
    , d_abstract          :: !Text
    , d_authors           :: !Text
    }
    deriving (Show)
---------------------------------------------------------------
-- | Convert a 'TsvGargV3' row into a 'HyperdataDocument'.
--
-- The sequential document id is stored, rendered as text, in @_hd_doi@.
-- Title, authors, source and abstract are copied into the fields of the
-- same name, matching what 'tsv2doc' does for 'TsvDoc'.  Previously the
-- abstract was written to @_hd_source@, the authors to @_hd_institutes@,
-- and the source column was dropped.
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 { .. }) =
  HyperdataDocument { _hd_bdd = Just "TSV"
                    , _hd_doi = Just . pack . show $ d_docId
                    , _hd_url = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just d_title
                    , _hd_authors = Just d_authors
                    , _hd_institutes = Nothing
                    , _hd_source = Just d_source
                    , _hd_abstract = Just d_abstract
                    , _hd_publication_date = Nothing
                    , _hd_publication_year = Just d_publication_year
                    , _hd_publication_month = Just d_publication_month
                    , _hd_publication_day = Just d_publication_day
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing
                    , _hd_institutes_tree = Nothing }

---------------------------------------------------------------
-- | Split every document into contexts, then number the resulting pieces.
--
-- The input is refined once per 'SplitContext' (paragraphs, then sentences,
-- then characters); each pass uses the mean abstract size of the documents
-- produced by the previous pass.  The resulting pieces are numbered from 1,
-- and missing dates are replaced by 'defaultYear', 'defaultMonth' and
-- 'defaultDay'.
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList (V.imap numbered pieces)
  where
    numbered ix (TsvDoc { .. }) =
      TsvGargV3 { d_docId = ix + 1
                , d_title = tsv_title
                , d_source = tsv_source
                , d_publication_year = fromMIntOrDec defaultYear tsv_publication_year
                , d_publication_month = fromMaybe defaultMonth tsv_publication_month
                , d_publication_day = fromMaybe defaultDay tsv_publication_day
                , d_abstract = tsv_abstract
                , d_authors = tsv_authors }

    pieces = V.foldl refine v contexts

    refine docs ctx = V.concatMap (splitDoc (docsSize docs) ctx) docs

    contexts = V.fromList [Paragraphs 1, Sentences 3, Chars 3]

---------------------------------------------------------------
-- | Drop the document id of every 'TsvGargV3' and wrap its date parts in
-- 'Just', yielding plain 'TsvDoc' rows.
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map $ \doc ->
  TsvDoc { tsv_title = d_title doc
         , tsv_source = d_source doc
         , tsv_publication_year = Just (IntOrDec (d_publication_year doc))
         , tsv_publication_month = Just (d_publication_month doc)
         , tsv_publication_day = Just (d_publication_day doc)
         , tsv_abstract = d_abstract doc
         , tsv_authors = d_authors doc }

---------------------------------------------------------------
-- | Split a document into several documents according to a 'SplitContext'.
--
-- A document is split only when its abstract is longer than 1000 characters
-- and @round m `mod` abstractLength >= 10@; otherwise it is returned
-- unchanged as a singleton vector.
--
-- When split, the first piece keeps the original title.  Every following
-- piece takes its first sentence as title and the remaining sentences as
-- abstract; all other fields are copied from the original document.
--
-- NOTE(review): the modulo test between the corpus mean and the document
-- size looks unusual; confirm it is the intended criterion.
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc
  | docSize > 1000 && (mod (round m) docSize >= 10) = splitDoc' splt doc
  | otherwise                                       = V.fromList [doc]
  where
    docSize = T.length (tsv_abstract doc)

    splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
    splitDoc' contextSize (TsvDoc { .. }) = V.fromList (firstDoc : nextDocs)
        where
          abstracts     = splitBy contextSize tsv_abstract

          firstAbstract = head' "splitDoc'1" abstracts
          firstDoc      = TsvDoc { tsv_abstract = firstAbstract, .. }

          nextDocs      = map pieceToDoc (tail' "splitDoc'2" abstracts)

          -- Title is the first sentence, abstract is everything after it.
          pieceToDoc txt =
            TsvDoc { tsv_title = head' "splitDoc'2" (sentences txt)
                   , tsv_abstract = unsentences (tail' "splitDoc'1" (sentences txt))
                   , .. }

---------------------------------------------------------------
---------------------------------------------------------------
-- | An arithmetic mean, as computed by 'docsSize'.
type Mean = Double

-- | Mean length, in characters, of the abstracts of the given documents.
docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean (map abstractLength (V.toList tsvDoc))
  where
    abstractLength = fromIntegral . T.length . tsv_abstract


---------------------------------------------------------------
-- | An 'Int' that may be written in a file either as an integer or as a
-- decimal number; decimals are rounded down when parsed.
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

-- | Unwrap an 'IntOrDec'.
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i

-- | Tries to read the field as an 'Int' first; if that fails, reads it as a
-- 'Double' and takes its 'floor'.
instance FromField IntOrDec where
  parseField s = asInt <|> asFlooredDouble
    where
      asInt           = IntOrDec <$> (parseField s :: Parser Int)
      asFlooredDouble = IntOrDec . floor <$> (parseField s :: Parser Double)

-- | Encoded as the underlying 'Int'.
instance ToField IntOrDec where
  toField = toField . unIntOrDec

-- | Unwrap an optional 'IntOrDec', falling back to the given default when
-- the value is absent.
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' = maybe default' unIntOrDec
-- | Year substituted when a row has no publication year.
defaultYear :: Int
defaultYear = 1973
-- | Month substituted when a row has no publication month.
defaultMonth :: Int
defaultMonth = 1
-- | Day substituted when a row has no publication day.
defaultDay :: Int
defaultDay = 1

-- | One row of a TSV corpus file as read by the 'FromNamedRecord' instance.
--
-- The three date parts are optional; consumers such as 'toDocs' and
-- 'tsv2doc' replace missing ones with 'defaultYear', 'defaultMonth' and
-- 'defaultDay'.
data TsvDoc = TsvDoc
    { tsv_title             :: !Text
    , tsv_source            :: !Text
    , tsv_publication_year  :: !(Maybe IntOrDec)  -- ^ integer or decimal in the file
    , tsv_publication_month :: !(Maybe Int)
    , tsv_publication_day   :: !(Maybe Int)
    , tsv_abstract          :: !Text
    , tsv_authors           :: !Text
    }
    deriving (Show)

-- | Each column is looked up under its lowercase name first (e.g.
-- @publication_year@) and, failing that, under its capitalised name (e.g.
-- @Publication Year@).
instance FromNamedRecord TsvDoc where
  parseNamedRecord r =
    do
      tsv_title             <- column "title" "Title"
      tsv_source            <- column "source" "Source"
      tsv_publication_year  <- column "publication_year" "Publication Year"
      tsv_publication_month <- column "publication_month" "Publication Month"
      tsv_publication_day   <- column "publication_day" "Publication Day"
      tsv_abstract          <- column "abstract" "Abstract"
      tsv_authors           <- column "authors" "Authors"
      pure $ TsvDoc { .. }
    where
      -- Look a column up under either of its two accepted names.
      column :: FromField a => BS.ByteString -> BS.ByteString -> Parser a
      column lowerName capitalisedName = r .: lowerName <|> r .: capitalisedName

-- | Rows are always written with the lowercase column names.
instance ToNamedRecord TsvDoc where
  toNamedRecord doc =
    namedRecord
      [ "title"             .= tsv_title doc
      , "source"            .= tsv_source doc
      , "publication_year"  .= tsv_publication_year doc
      , "publication_month" .= tsv_publication_month doc
      , "publication_day"   .= tsv_publication_day doc
      , "abstract"          .= tsv_abstract doc
      , "authors"           .= tsv_authors doc
      ]

-- | Flatten a 'HyperdataDocument' into a 'TsvDoc'.
--
-- Missing text fields become the empty string and missing date parts
-- become 0; the date parts are always wrapped in 'Just'.
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h =
  TsvDoc { tsv_title = orEmpty (_hd_title h)
         , tsv_source = orEmpty (_hd_source h)
         , tsv_publication_year = Just (IntOrDec (orZero (_hd_publication_year h)))
         , tsv_publication_month = Just (orZero (_hd_publication_month h))
         , tsv_publication_day = Just (orZero (_hd_publication_day h))
         , tsv_abstract = orEmpty (_hd_abstract h)
         , tsv_authors = orEmpty (_hd_authors h) }
  where
    orEmpty = fromMaybe ""
    orZero  = fromMaybe 0


-- | Separator bytes known to this module (see 'delimiter' for the mapping).
-- 'Tab' and 'Comma' are the candidates tried by 'findDelimiter'; 'Line' is
-- used to cut a file into lines.
data Delimiter = Tab | Comma | Line deriving (Eq, Show)

-- | Default decoding options with the field separator set to the given
-- 'Delimiter'.
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions { decDelimiter = separator }
  where
    separator = delimiter d

-- | Default encoding options with the field separator set to the given
-- 'Delimiter'.
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions { encDelimiter = separator }
  where
    separator = delimiter d

-- | The byte corresponding to a 'Delimiter': tab, comma or newline.
delimiter :: Delimiter -> Word8
delimiter d = fromIntegral . ord $ case d of
  Tab   -> '\t'
  Comma -> ','
  Line  -> '\n'
------------------------------------------------------------------------


-- | Check whether a delimiter is plausible for a file.
--
-- Returns 'True' when the file has at least two lines and its first two
-- lines split into the same number of fields, that number being greater
-- than 2.
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
  case BL.splitWith (== delimiter Line) bs of
    (firstLine : secondLine : _) ->
      let firstCount  = length (fieldsOf firstLine)
          secondCount = length (fieldsOf secondLine)
      in firstCount == secondCount && firstCount > 2
    _ -> False
  where
    fieldsOf = BL.splitWith (== delimiter del)

-- | Pick the first delimiter, among 'Tab' then 'Comma', accepted by
-- 'testDelimiter'; fail with an explanatory message when neither fits.
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs =
  case filter (`testDelimiter` bs) [Tab, Comma] of
    (found : _) -> Right found
    []          -> Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")

-- | Parse a whole 'Text' as a (possibly signed) decimal integer.
--
-- Returns @Right n@ when the entire input is an integer with an optional
-- leading @+@ or @-@ sign, and @Left False@ otherwise (empty input, trailing
-- characters, non-digits).
--
-- The sign is accepted so that callers such as 'validNumber' can tell a
-- negative number apart from a value that is not a number at all; with an
-- unsigned parser a negative value could never be returned.
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.signed DTR.decimal str of
  Right (x, rest) | T.null rest -> Right x
  _                             -> Left False

-- | Decode a lazy UTF-8 'BL.ByteString' into strict 'Text'.
--
-- Decoding is lenient: invalid byte sequences are replaced by U+FFFD
-- instead of raising an exception.  This function is called from the pure
-- validation code on the raw bytes of user-supplied files, where a thrown
-- exception could not be reported as a 'Left' error.
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8With lenientDecode b

-- | Validate that a raw field holds a non-negative integer.
--
-- Double-quote characters are removed from the field before it is handed
-- to 'isNumeric'.  The column name and line number are only used to build
-- the error message.
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne =
  case isNumeric unquoted of
    Left _ -> Left notANumber
    Right val ->
      if val < 0
        then Left isNegative
        else Right True
  where
    unquoted   = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
    lineNo     = pack (show ligne)
    isNegative = "Value of column '" <> columnHeader <> "' at line " <> lineNo <> " is negative"
    notANumber = "Error in column '" <> columnHeader <> "' at line " <> lineNo <> " : value is not a number "


-- | Validate the quoting of a raw text field.
--
-- Doubled quotes (@\"\"@) are removed first.  What remains is accepted when
-- it is empty, contains no double quote at all, or contains exactly two
-- double quotes located at its first and last positions.  Anything else is
-- reported as an encapsulation problem.
--
-- An empty field is currently accepted without any warning.
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne
  | T.null xs       = Right True
  | quoteCount == 0 = Right True
  | wrappedInQuotes = Right True
  | otherwise       = Left ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field ")
  where
    xs              = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x)
    quoteCount      = T.length (T.filter (== '"') xs)
    -- Only evaluated when xs is non-empty, so T.head / T.last are safe.
    wrappedInQuotes = T.head xs == '"' && T.last xs == '"' && quoteCount == 2


-- | Validate one raw field according to the column it belongs to.
--
-- The three date columns go through 'validNumber', the four text columns
-- through 'validTextField'; any other column name is accepted as is.
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne
  | columnHeader `elem` numericColumns = validNumber val columnHeader ligne
  | columnHeader `elem` textColumns    = validTextField val columnHeader ligne
  | otherwise                          = Right True
  where
    numericColumns, textColumns :: [Text]
    numericColumns = ["Publication Day", "Publication Month", "Publication Year"]
    textColumns    = ["Authors", "Title", "Source", "Abstract"]

-- | Validate the fields of one record against the header names, pairwise.
--
-- Fails on the first invalid field, when the record has more fields than
-- there are headers, or when it has fewer.  The 'Int' is the line number
-- reported in error messages.
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine values del headers ligne =
  case (values, headers) of
    ([], []) -> Right True
    (_, [])
      | del == Comma -> Left (pack $ "Too much field at line " <> show ligne <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
      | otherwise    -> Left (pack $ "Too much field at line " <> show ligne)
    ([], _) -> Left (pack $ "Missing one field at line " <> show ligne)
    (v : vs, h : hs) ->
      -- Either short-circuits on the first Left.
      testValue v h ligne *> testErrorPerLine vs del hs ligne


-- | Decide where a record that already has the expected number of fields
-- really ends.
--
-- @res@ is the record accumulated so far and @x@ the index, in @bl@, of the
-- last line it contains.  While the following line exists and splits into
-- at most one field (no delimiter in it), that line is appended to @res@
-- (without re-inserting the line break) and the index advances.  Otherwise
-- the record is complete and is returned, split into fields, together with
-- the index of its last line.
--
-- The @headers@ argument is not inspected; it is only passed along.  This
-- function always returns 'Right'.  The former @Left "checkNextLine2"@
-- branch repeated a lookup that had just succeeded and could not be
-- reached.
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x =
  case (V.!?) bl (x + 1) of
    Just next
      | length (fieldsOf next) <= 1 ->
          checkNextLine bl del headers (BL.append res next) (x + 1)
    _ -> Right (x, fieldsOf res)
  where
    fieldsOf = BL.splitWith (== delimiter del)

-- | Assemble one record that may span several physical lines.
--
-- @res@ is the text accumulated so far and @x@ the index, in @bl@, of the
-- last line it contains.  When @res@ splits into as many fields as there
-- are headers, 'checkNextLine' decides where the record ends.  When it has
-- too many fields, or too few and no line is left, parsing fails.
-- Otherwise the next line is appended and the check is repeated.
--
-- NOTE(review): the failure message prints @x@, a 0-based line index,
-- whereas 'anx' passes @x + 1@ to 'testErrorPerLine'; confirm which
-- numbering users should see.
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x
  | fieldCount == headerCount = checkNextLine bl del headers res x
  | fieldCount > headerCount || V.length bl == (x + 1) =
      Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
  | otherwise =
      case (V.!?) bl (x + 1) of
        Nothing   -> Left "getMultipleLinefile"
        Just next -> getMultipleLinefile bl del headers (BL.append res next) (x + 1)
  where
    fieldCount  = length (BL.splitWith (== delimiter del) res)
    headerCount = length headers

-- | Validate every record of a file, starting at line index @x@.
--
-- Each iteration assembles one record with 'getMultipleLinefile', checks
-- its fields with 'testErrorPerLine', then resumes after the last line the
-- record used.  Succeeds with the delimiter once the index reaches the
-- number of lines; the first error encountered is returned as is.
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx bl del headers x
  | length bl == x = Right del
  | otherwise = do
      bs       <- maybe (Left "anx") Right ((V.!?) bl x)
      (y, val) <- getMultipleLinefile bl del headers bs x
      _        <- testErrorPerLine val del headers (x + 1)
      anx bl del headers (y + 1)


-- | Validate all records of a file given as a list of lines.  Validation
-- starts at index 1, i.e. the line after the header line.
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile bl del headers = anx allLines del headers firstRecordIndex
  where
    allLines         = V.fromList bl
    firstRecordIndex = 1

-- | Validate a whole file and return the delimiter it uses.
--
-- Steps, each of which may fail with a message: detect the delimiter, read
-- and check the header line, then validate every record.
testCorrectFile :: BL.ByteString -> Either Text Delimiter
testCorrectFile bs = do
  del <- findDelimiter bs
  let bl = BL.splitWith (== delimiter Line) bs
  headers <- getHeaders bl del
  testIfErrorInFile bl del headers



----------Test headers added to ggt


-- | Check that all seven required column names are present.
--
-- Carriage returns, which sometimes appear at the end of a line, are
-- stripped from the header names before the comparison.  On success the
-- headers are returned exactly as given (carriage returns included).
--
-- NOTE(review): only the capitalised names are accepted here, while the
-- 'FromNamedRecord' instance of 'TsvDoc' also accepts lowercase names such
-- as @publication_year@; confirm whether lowercase headers should pass.
testAllHeadersPresence :: [Text] -> Either Text [Text]
testAllHeadersPresence headers
  | null missing = Right headers
  | otherwise    = Left ((pack " Missing column : ") <> T.intercalate ", " missing)
  where
    cleaned  = map (T.replace (T.pack "\r") (T.pack "")) headers
    missing  = filter (`notElem` cleaned) required
    required = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]

-- | Read the column names from the first line of a file.
--
-- The first line is split on the delimiter, double quotes are removed from
-- each name, and the result is checked by 'testAllHeadersPresence'.  Fails
-- when the list of lines is empty.
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders bl del =
  case bl of
    []              -> Left "Error getHeaders"
    (firstLine : _) -> testAllHeadersPresence (map unquote (BL.splitWith (== delimiter del) firstLine))
  where
    unquote = T.replace (T.pack "\"") (T.pack "") . lBLToText


------------------------------------------------------------------------

-- | Read a file lazily and decode it by column name with the given
-- delimiter.  The proxy only selects the record type.
readFileLazy :: (FromNamedRecord a)
             => proxy a
             -> Delimiter
             -> FilePath
             -> IO (Either Text (Header, Vector a))
readFileLazy prx del fp = readByteStringLazy prx del <$> BL.readFile fp

-- | Read a file strictly and decode it by column name with the given
-- delimiter.  The proxy only selects the record type.
readFileStrict :: (FromNamedRecord a)
               => proxy a
               -> Delimiter
               -> FilePath
               -> IO (Either Text (Header, Vector a))
readFileStrict prx del fp = readByteStringStrict prx del <$> BS.readFile fp

-- | Decode a lazy 'BL.ByteString' by column name with the given delimiter;
-- the decoder's error message is converted to 'Text'.
readByteStringLazy :: (FromNamedRecord a)
                   => proxy a
                   -> Delimiter
                   -> BL.ByteString
                   -> Either Text (Header, Vector a)
readByteStringLazy _prx del bs =
  either (Left . pack) Right (decodeByNameWith (tsvDecodeOptions del) bs)

-- | Strict-'BS.ByteString' variant of 'readByteStringLazy'.
readByteStringStrict :: (FromNamedRecord a)
                     => proxy a
                     -> Delimiter
                     -> BS.ByteString
                     -> Either Text (Header, Vector a)
readByteStringStrict prx del bs = readByteStringLazy prx del (BL.fromStrict bs)

------------------------------------------------------------------------
-- | Read a TSV file: validate it with 'testCorrectFile', then decode it
-- with the delimiter the validation detected.
--
-- TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
  file <- BL.readFile fp
  pure (testCorrectFile file >>= \del -> readTsvLazyBS del file)



-- | Decode 'TsvDoc' rows by column name, without any prior validation.
--
-- TODO use readByteStringLazy
readTsvLazyBS :: Delimiter
              -> BL.ByteString
              -> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs =
  either (Left . pack) Right (decodeByNameWith (tsvDecodeOptions d) bs)

------------------------------------------------------------------------
-- | Read a HAL export file and decode it as tab-separated 'TsvHal' rows.
--
-- TODO use readFileLazy
readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal))
readTsvHal fp = readTsvHalLazyBS <$> BL.readFile fp

-- | Decode tab-separated 'TsvHal' rows by column name.
--
-- TODO use readByteStringLazy
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs =
  either (Left . pack) Right (decodeByNameWith (tsvDecodeOptions Tab) bs)

-- | Strict-'BS.ByteString' variant of 'readTsvHalLazyBS'.
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict = readTsvHalLazyBS . BL.fromStrict

------------------------------------------------------------------------
-- | Write 'TsvDoc' rows to a file, tab-separated, under the given header.
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)

-- | Encode documents with 'hyperdataDocument2tsv' and write them to a file.
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp (hyperdataDocument2tsv hs)

-- | Encode documents as tab-separated text with the 'headerTsvGargV3'
-- columns.
hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) headerTsvGargV3 rows
  where
    rows = map hyperdataDocument2tsvDoc hs

------------------------------------------------------------------------
-- Hal Format

-- | One row of a HAL export.
--
-- Each field is read from the column whose name is the field name without
-- the @tsvHal_@ prefix (see the 'FromNamedRecord' instance).  All columns
-- are mandatory, and every identifier-like column is kept as raw 'Text'.
data TsvHal = TsvHal
    { tsvHal_title  :: !Text
    , tsvHal_source :: !Text
    , tsvHal_publication_year  :: !Integer
    , tsvHal_publication_month :: !Int
    , tsvHal_publication_day   :: !Int
    , tsvHal_abstract          :: !Text
    , tsvHal_authors           :: !Text

    , tsvHal_url               :: !Text
    , tsvHal_isbn_s            :: !Text
    , tsvHal_issue_s           :: !Text
    , tsvHal_journalPublisher_s:: !Text
    , tsvHal_language_s        :: !Text

    , tsvHal_doiId_s           :: !Text
    , tsvHal_authId_i          :: !Text
    , tsvHal_instStructId_i    :: !Text
    , tsvHal_deptStructId_i    :: !Text
    , tsvHal_labStructId_i     :: !Text

    , tsvHal_rteamStructId_i   :: !Text
    , tsvHal_docType_s         :: !Text
    }
    deriving (Show)

-- | Reads every field from the column of the same name (field name minus
-- the @tsvHal_@ prefix).  Parsing fails as soon as one column is missing
-- or cannot be converted to the field's type.
instance FromNamedRecord TsvHal where
  parseNamedRecord r = do
    tsvHal_title <- r .: "title"
    tsvHal_source <- r .: "source"
    tsvHal_publication_year <- r .: "publication_year"
    tsvHal_publication_month <- r .: "publication_month"
    tsvHal_publication_day <- r .: "publication_day"
    tsvHal_abstract <- r .: "abstract"
    tsvHal_authors <- r .: "authors"
    tsvHal_url <- r .: "url"
    tsvHal_isbn_s <- r .: "isbn_s"
    tsvHal_issue_s <- r .: "issue_s"
    tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
    tsvHal_language_s <- r .: "language_s"
    tsvHal_doiId_s <- r .: "doiId_s"
    tsvHal_authId_i <- r .: "authId_i"
    tsvHal_instStructId_i <- r .: "instStructId_i"
    tsvHal_deptStructId_i <- r .: "deptStructId_i"
    tsvHal_labStructId_i <- r .: "labStructId_i"
    tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
    tsvHal_docType_s <- r .: "docType_s"
    -- RecordWildCards: the constructor picks up the bindings above by name.
    pure $ TsvHal { .. }

-- | Writes every field under the same column name that the
-- 'FromNamedRecord' instance reads it from.
instance ToNamedRecord TsvHal where
  --toNamedRecord (TsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
  toNamedRecord (TsvHal { .. }) =
    namedRecord [ "title"  .= tsvHal_title
                , "source" .= tsvHal_source

                , "publication_year"  .= tsvHal_publication_year
                , "publication_month" .= tsvHal_publication_month
                , "publication_day"   .= tsvHal_publication_day

                , "abstract"          .= tsvHal_abstract
                , "authors"           .= tsvHal_authors

                , "url"                .= tsvHal_url
                , "isbn_s"             .= tsvHal_isbn_s
                , "issue_s"            .= tsvHal_issue_s
                , "journalPublisher_s" .= tsvHal_journalPublisher_s
                , "language_s"         .= tsvHal_language_s

                , "doiId_s"            .= tsvHal_doiId_s
                , "authId_i"           .= tsvHal_authId_i
                , "instStructId_i"     .= tsvHal_instStructId_i
                , "deptStructId_i"     .= tsvHal_deptStructId_i
                , "labStructId_i"      .= tsvHal_labStructId_i

                , "rteamStructId_i"    .= tsvHal_rteamStructId_i
                , "docType_s"          .= tsvHal_docType_s
               ]

-- | Convert a HAL row into a 'HyperdataDocument'.
--
-- The publication date is the textual rendering of the day built by
-- 'jour' from the year, month and day columns.  @_hd_institutes@ receives
-- the raw @instStructId_i@ value.
tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc hal =
  HyperdataDocument { _hd_bdd = Just "TsvHal"
                    , _hd_doi = Just (tsvHal_doiId_s hal)
                    , _hd_url = Just (tsvHal_url hal)
                    , _hd_page = Nothing
                    , _hd_title = Just (tsvHal_title hal)
                    , _hd_authors = Just (tsvHal_authors hal)
                    , _hd_institutes = Just (tsvHal_instStructId_i hal)
                    , _hd_source = Just (tsvHal_source hal)
                    , _hd_abstract = Just (tsvHal_abstract hal)
                    , _hd_publication_date = Just publicationDate
                    , _hd_publication_year = Just (fromIntegral year)
                    , _hd_publication_month = Just month
                    , _hd_publication_day = Just day
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing
                    , _hd_institutes_tree = Nothing }
  where
    year  = tsvHal_publication_year hal
    month = tsvHal_publication_month hal
    day   = tsvHal_publication_day hal
    publicationDate = pack . show $ jour year month day


-- | Convert a 'TsvDoc' row into a 'HyperdataDocument'.
--
-- Missing date parts are replaced by 'defaultYear', 'defaultMonth' and
-- 'defaultDay' before the publication date is rendered with 'jour'.
--
-- NOTE(review): @_hd_bdd@ is set to "TsvHal", the same tag 'tsvHal2doc'
-- uses; confirm this is intended for plain TSV rows.
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc doc
  = HyperdataDocument { _hd_bdd = Just "TsvHal"
                      , _hd_doi = Nothing
                      , _hd_url = Nothing
                      , _hd_page = Nothing
                      , _hd_title = Just (tsv_title doc)
                      , _hd_authors = Just (tsv_authors doc)
                      , _hd_institutes = Nothing
                      , _hd_source = Just (tsv_source doc)
                      , _hd_abstract = Just (tsv_abstract doc)
                      , _hd_publication_date = Just publicationDate
                      , _hd_publication_year = Just pubYear
                      , _hd_publication_month = Just pubMonth
                      , _hd_publication_day = Just pubDay
                      , _hd_publication_hour = Nothing
                      , _hd_publication_minute = Nothing
                      , _hd_publication_second = Nothing
                      , _hd_language_iso2 = Nothing
                      , _hd_institutes_tree = Nothing }
  where
    pubYear  = fromMIntOrDec defaultYear (tsv_publication_year doc)
    pubMonth = fromMaybe defaultMonth (tsv_publication_month doc)
    pubDay   = fromMaybe defaultDay (tsv_publication_day doc)
    publicationDate = pack . show $ jour (fromIntegral pubYear) pubMonth pubDay

------------------------------------------------------------------------
-- | Read a HAL export file and convert every row with 'tsvHal2doc'.
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = fmap rowsToDocs <$> readTsvHal fp
  where
    rowsToDocs = V.toList . V.map tsvHal2doc . snd

-- | In-memory variant of 'parseHal'.
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = fmap rowsToDocs (readTsvHalLazyBS bs)
  where
    rowsToDocs = V.toList . V.map tsvHal2doc . snd

------------------------------------------------------------------------

-- | Read and validate a TSV file ('readTSVFile'), then convert every row
-- with 'tsv2doc'.
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = do
  eRows <- readTSVFile fp
  pure (V.toList . V.map tsv2doc . snd <$> eRows)

{-
parseTsv' ::  BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}

-- | In-memory variant of 'parseTsv': validate the bytes with
-- 'testCorrectFile', decode them with the detected delimiter, then convert
-- every row with 'tsv2doc'.
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = rowsToDocs <$> decoded
  where
    decoded    = testCorrectFile bs >>= \del -> readTsvLazyBS del bs
    rowsToDocs = V.toList . V.map tsv2doc . snd

-- | Conduit variant of 'parseTsv''.
--
-- On success returns the number of rows together with a source that
-- yields each row converted by 'tsv2doc'.
parseTsvC :: BL.ByteString
          -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs = toStream <$> decoded
  where
    decoded = testCorrectFile bs >>= \del -> readTsvLazyBS del bs

    toStream (_h, rs) = (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc)

------------------------------------------------------------------------
-- Tsv v3 weighted for phylo

-- | A TSV row with the same columns as the plain document format plus a
-- @weight@ column.  Unlike 'TsvDoc', the three date parts are mandatory.
data Tsv' = Tsv'
      { tsv'_title             :: !Text
      , tsv'_source            :: !Text
      , tsv'_publication_year  :: !Int
      , tsv'_publication_month :: !Int
      , tsv'_publication_day   :: !Int
      , tsv'_abstract          :: !Text
      , tsv'_authors           :: !Text
      , tsv'_weight            :: !Double } deriving (Show)


-- | Reads the eight lowercase columns; arguments are given in the field
-- order of the 'Tsv'' constructor.
instance FromNamedRecord Tsv' where
  parseNamedRecord r =
    Tsv' <$> r .: "title"
         <*> r .: "source"
         <*> r .: "publication_year"
         <*> r .: "publication_month"
         <*> r .: "publication_day"
         <*> r .: "abstract"
         <*> r .: "authors"
         <*> r .: "weight"

-- | Read a tab-separated weighted TSV file.
--
-- A decoding failure is not returned to the caller: the decoder's message
-- is passed to 'panicTrace'.
readWeightedTsv :: FilePath -> IO (Header, Vector Tsv')
readWeightedTsv fp = do
  bs <- BL.readFile fp
  pure (either (panicTrace . pack) identity (decodeByNameWith (tsvDecodeOptions Tab) bs))
