[tsv] migrate tsvv3 to separate module, separate tsv hal, tsv phylo

2ceb742b · Przemyslaw Kaminski · c8185ce7 · 2ceb742b · 2ceb742b · 2ceb742b
Verified Commit 2ceb742b authored Oct 21, 2025 by Przemyslaw Kaminski
13 changed files
--- a/bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
+++ b/bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
@@ -20,7 +20,8 @@ import Data.Tuple.Extra (both)
 import Data.Vector qualified as DV
 import GHC.Generics
 import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 (tsv_title, tsv_abstract, tsv_publication_year)
 import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
 import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )

--- a/bin/gargantext-cli/CLI/Phylo/Common.hs
+++ b/bin/gargantext-cli/CLI/Phylo/Common.hs
@@ -14,7 +14,8 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
 import Gargantext.API.Ngrams.Types
 import Gargantext.Core.Text.Context (TermList)
 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
-import Gargantext.Core.Text.Corpus.Parsers.TSV (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day, tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo (tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
 import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv
 import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
 import Gargantext.Core.Text.Ngrams (NgramsType(..))
@@ -95,15 +96,17 @@ tsvToDocs parser patterns time path =
                                     , hour = Def.hour
                                     , minute = Def.minute
                                     , sec = Def.second }
-    Tsv' limit -> Vector.toList
+    Tsv' limit -> do
-      <$> Vector.take limit
+      (Right contents) <- Tsv.readTSVFileDelim path Tsv.Tab
-      <$> Vector.map (\row -> Document { date = toPhyloDate  (rowToUTCTimeR row) time
+      pure $ Vector.toList
-                                       , date' = toPhyloDate' (rowToUTCTimeR row) time
+        $ Vector.take limit
-                                       , text = termsInText patterns $ (tsv'_title row) <> " " <> (tsv'_abstract row)
+          (Vector.map (\row -> Document { date = toPhyloDate  (rowToUTCTimeR row) time
-                                       , weight = Just $ tsv'_weight row
+                                        , date' = toPhyloDate' (rowToUTCTimeR row) time
-                                       , sources = map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row))
+                                        , text = termsInText patterns $ tsv'_title row <> " " <> tsv'_abstract row
-                                       , docTime = time }
+                                        , weight = Just $ tsv'_weight row
-                     ) <$> snd <$> Tsv.readWeightedTsv path
+                                        , sources = map (T.strip . pack) $ splitOn ";" (unpack $ tsv'_source row) 
+                                        , docTime = time }
+                        ) $ snd contents)
      where
        rowToUTCTimeR row = UTCTimeR { year = tsv'_publication_year row
                                     , month = tsv'_publication_month row

--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -233,6 +233,10 @@ library
      Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
      Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
      Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
+      Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
+      Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
+      Gargantext.Core.Text.Corpus.Parsers.TSV.Types
+      Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
      Gargantext.Core.Text.Corpus.Query
      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Formats.TSV

--- a/src/Gargantext/Core/Text/Corpus/API/Isidore.hs
+++ b/src/Gargantext/Core/Text/Corpus/API/Isidore.hs
@@ -22,11 +22,12 @@ import Data.Text qualified as Text
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Text.Corpus.Parsers (cleanText)
 import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
 import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
 import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
 import Gargantext.Defaults qualified as Defaults
 import Gargantext.Prelude hiding (get)
-import Isidore qualified as Isidore
+import Isidore qualified
 import Isidore.Client
 import Servant.Client ( ClientError(DecodeFailure) )
@@ -47,15 +48,14 @@ get lang l q a = do
  iDocs <- either printErr _content <$> Isidore.get l q a
-  hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
+  mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
-  pure hDocs
 isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
    -> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
    -> IO ()
 isidore2tsvFile fp lang li tq aq = do
  hdocs <- get lang li tq aq
-  writeDocs2Tsv fp hdocs
+  writeDocs2Tsv TSVv3.headerTsvGargV3 TSVv3.doc2tsv fp hdocs
 isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
 isidoreToDoc lang (IsidoreDoc t a d u s as) = do
@@ -73,7 +73,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
    langText (OnlyText t2   ) = t2
    langText (ArrayText ts  ) = Text.unwords $ map langText ts
-  let mDateS = maybe (Just $ Text.pack $ show Defaults.year) Just d
+  let mDateS = d <|> Just (Text.pack $ show Defaults.year)
  let (utcTime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
  pure HyperdataDocument

--- a/src/Gargantext/Core/Text/Corpus/Parsers.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers.hs
@@ -54,6 +54,8 @@ import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
 import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS
 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
 import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal qualified as TsvHal
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
 import Gargantext.Core.Text.Corpus.Parsers.Types
 import Gargantext.Core.Text.Corpus.Parsers.WOS qualified as WOS
 import Gargantext.Core.Text.Ngrams (NgramsType(..))
@@ -111,10 +113,10 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
             -> DB.ByteString
             -> m (Either DT.Text (Integer, ConduitT () (ParseCorpusResult HyperdataDocument) IO ()))
    do_parse TsvGargV3 Plain bs = do
-      let eParsedC = parseTsvC $ DBL.fromStrict bs
+      let eParsedC = parseTsvC TSVv3.tsv2doc $ DBL.fromStrict bs
      pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
    do_parse TsvHal    Plain bs = do
-      let eParsedC = parseTsvC $ DBL.fromStrict bs
+      let eParsedC = parseTsvC TsvHal.tsv2doc $ DBL.fromStrict bs
      pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
    do_parse Istex Plain bs = do
      ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs
@@ -221,7 +223,7 @@ parseFile :: FileType
          -> FileFormat
          -> FilePath
          -> IO (Either Text [HyperdataDocument])
-parseFile TsvGargV3 Plain p = parseTsv p
+parseFile TsvGargV3 Plain p = parseTsv TSVv3.tsv2doc p
 parseFile TsvHal    Plain p = parseHal p
 parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
-{-# LANGUAGE BangPatterns     #-}
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
 module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 where
 import Data.Csv
+import Data.Text (pack)
+import Data.Time.Segment (jour)
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
 import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
 import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
 import Gargantext.Prelude
@@ -45,6 +57,34 @@ data TsvDoc = TsvDoc
    deriving (Show)
+tsv2doc :: TsvDoc -> HyperdataDocument
+tsv2doc (TsvDoc { .. })
+  = HyperdataDocument { _hd_bdd = Just "TsvHal"
+                      , _hd_doi = Nothing
+                      , _hd_url = Nothing
+                      , _hd_page = Nothing
+                      , _hd_title = Just tsv_title
+                      , _hd_authors = Just tsv_authors
+                      , _hd_institutes = Nothing
+                      , _hd_source = Just tsv_source
+                      , _hd_abstract = Just tsv_abstract
+                      , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
+                                                                         pubMonth
+                                                                         pubDay
+                      , _hd_publication_year = Just pubYear
+                      , _hd_publication_month = Just pubMonth
+                      , _hd_publication_day = Just pubDay
+                      , _hd_publication_hour = Nothing
+                      , _hd_publication_minute = Nothing
+                      , _hd_publication_second = Nothing
+                      , _hd_language_iso2 = Nothing
+                      , _hd_institutes_tree = Nothing }
+  where
+    pubYear = fromMIntOrDec defaultYear tsv_publication_year
+    pubMonth = fromMaybe defaultMonth tsv_publication_month
+    pubDay = fromMaybe defaultDay tsv_publication_day
 instance FromNamedRecord TsvDoc where
  parseNamedRecord r = do
@@ -69,15 +109,29 @@ instance ToNamedRecord TsvDoc where
                ]
-hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
+doc2tsv :: HyperdataDocument -> TsvDoc
-hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
+doc2tsv h = TsvDoc { tsv_title = m $ _hd_title h
-                                    , tsv_source = m $ _hd_source h
+                   , tsv_source = m $ _hd_source h
-                                    , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
+                   , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
-                                    , tsv_publication_month = Just $ mI $ _hd_publication_month h
+                   , tsv_publication_month = Just $ mI $ _hd_publication_month h
-                                    , tsv_publication_day = Just $ mI $ _hd_publication_day   h
+                   , tsv_publication_day = Just $ mI $ _hd_publication_day   h
-                                    , tsv_abstract = m $ _hd_abstract h
+                   , tsv_abstract = m $ _hd_abstract h
-                                    , tsv_authors = m $ _hd_authors h }
+                   , tsv_authors = m $ _hd_authors h }
  where
    m = maybe "" identity
    mI = maybe 0 identity
+validators :: [ColumnValidator]
+validators =
+  [ ("Publication Day", validNumber)
+  , ("Publication Month", validNumber)
+  , ("Publication Year", validNumber)
+  , ("Authors", validTextField)
+  , ("Title", validTextField)
+  , ("Source", validTextField)
+  , ("Abstract", validTextField)
+  -- TODO validDateField
+  -- , ("Publication Date", validTextField)
+  ]
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
-{-# LANGUAGE BangPatterns     #-}
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
 module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 where
 import Data.Csv
-import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
+-- import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
 import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
 import Gargantext.Prelude

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvHal.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvHal.hs
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal where
+import Data.ByteString.Lazy qualified as BL
+import Data.Csv
+import Data.Text (pack)
+import Data.Time.Segment (jour)
+import Data.Vector (Vector)
+import Data.Vector qualified as V
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Types (Delimiter(..))
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Utils (readTsvLazyBS)
+import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
+import Gargantext.Prelude
+------------------------------------------------------------------------
+-- Hal Format
+data TsvHal = TsvHal
+    { tsvHal_title  :: !Text
+    , tsvHal_source :: !Text
+    , tsvHal_publication_year  :: !Integer
+    , tsvHal_publication_month :: !Int
+    , tsvHal_publication_day   :: !Int
+    , tsvHal_abstract          :: !Text
+    , tsvHal_authors           :: !Text
+    , tsvHal_url               :: !Text
+    , tsvHal_isbn_s            :: !Text
+    , tsvHal_issue_s           :: !Text
+    , tsvHal_journalPublisher_s:: !Text
+    , tsvHal_language_s        :: !Text
+    , tsvHal_doiId_s           :: !Text
+    , tsvHal_authId_i          :: !Text
+    , tsvHal_instStructId_i    :: !Text
+    , tsvHal_deptStructId_i    :: !Text
+    , tsvHal_labStructId_i     :: !Text
+    , tsvHal_rteamStructId_i   :: !Text
+    , tsvHal_docType_s         :: !Text
+    }
+    deriving (Show)
+instance FromNamedRecord TsvHal where
+  parseNamedRecord r = do
+    tsvHal_title <- r .: "title"
+    tsvHal_source <- r .: "source"
+    tsvHal_publication_year <- r .: "publication_year"
+    tsvHal_publication_month <- r .: "publication_month"
+    tsvHal_publication_day <- r .: "publication_day"
+    tsvHal_abstract <- r .: "abstract"
+    tsvHal_authors <- r .: "authors"
+    tsvHal_url <- r .: "url"
+    tsvHal_isbn_s <- r .: "isbn_s"
+    tsvHal_issue_s <- r .: "issue_s"
+    tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
+    tsvHal_language_s <- r .: "language_s"
+    tsvHal_doiId_s <- r .: "doiId_s"
+    tsvHal_authId_i <- r .: "authId_i"
+    tsvHal_instStructId_i <- r .: "instStructId_i"
+    tsvHal_deptStructId_i <- r .: "deptStructId_i"
+    tsvHal_labStructId_i <- r .: "labStructId_i"
+    tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
+    tsvHal_docType_s <- r .: "docType_s"
+    pure $ TsvHal { .. }
+instance ToNamedRecord TsvHal where
+  --toNamedRecord (TsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
+  toNamedRecord (TsvHal { .. }) =
+    namedRecord [ "title"  .= tsvHal_title
+                , "source" .= tsvHal_source
+                , "publication_year"  .= tsvHal_publication_year
+                , "publication_month" .= tsvHal_publication_month
+                , "publication_day"   .= tsvHal_publication_day
+                , "abstract"          .= tsvHal_abstract
+                , "authors"           .= tsvHal_authors
+                , "url"                .= tsvHal_url
+                , "isbn_s"             .= tsvHal_isbn_s
+                , "issue_s"            .= tsvHal_issue_s
+                , "journalPublisher_s" .= tsvHal_journalPublisher_s
+                , "language_s"         .= tsvHal_language_s
+                , "doiId_s"            .= tsvHal_doiId_s
+                , "authId_i"           .= tsvHal_authId_i
+                , "instStructId_i"     .= tsvHal_instStructId_i
+                , "deptStructId_i"     .= tsvHal_deptStructId_i
+                , "labStructId_i"      .= tsvHal_labStructId_i
+                , "rteamStructId_i"    .= tsvHal_rteamStructId_i
+                , "docType_s"          .= tsvHal_docType_s
+               ]
+tsv2doc :: TsvHal -> HyperdataDocument
+tsv2doc (TsvHal { .. }) =
+  HyperdataDocument { _hd_bdd = Just "TsvHal"
+                    , _hd_doi = Just tsvHal_doiId_s
+                    , _hd_url = Just tsvHal_url
+                    , _hd_page = Nothing
+                    , _hd_title = Just tsvHal_title
+                    , _hd_authors = Just tsvHal_authors
+                    , _hd_institutes = Just tsvHal_instStructId_i
+                    , _hd_source = Just tsvHal_source
+                    , _hd_abstract = Just tsvHal_abstract
+                    , _hd_publication_date = Just $ pack . show $ jour tsvHal_publication_year
+                                                                      tsvHal_publication_month
+                                                                      tsvHal_publication_day
+                    , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
+                    , _hd_publication_month = Just tsvHal_publication_month
+                    , _hd_publication_day = Just tsvHal_publication_day
+                    , _hd_publication_hour = Nothing
+                    , _hd_publication_minute = Nothing
+                    , _hd_publication_second = Nothing
+                    , _hd_language_iso2 = Nothing
+                    , _hd_institutes_tree = Nothing }
+------------------------------------------------------------------------
+-- | TODO use readFileLazy
+readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal))
+readTsvHal fp = do
+  c <- BL.readFile fp
+  pure $ readTsvLazyBS Tab c
+------------------------------------------------------------------------
+parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
+parseHal fp = do
+  r <- readTsvHal fp
+  pure $ V.toList . V.map tsv2doc . snd <$> r
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvPhylo.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TsvPhylo.hs
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo where
+import Data.Csv
+import Gargantext.Prelude
+------------------------------------------------------------------------
+-- Tsv v3 weighted for phylo
+data Tsv' = Tsv'
+      { tsv'_title             :: !Text
+      , tsv'_source            :: !Text
+      , tsv'_publication_year  :: !Int
+      , tsv'_publication_month :: !Int
+      , tsv'_publication_day   :: !Int
+      , tsv'_abstract          :: !Text
+      , tsv'_authors           :: !Text
+      , tsv'_weight            :: !Double } deriving (Show)
+instance FromNamedRecord Tsv' where
+  parseNamedRecord r = do
+    tsv'_title <- r .: "title"
+    tsv'_source <- r .: "source"
+    tsv'_publication_year <- r .: "publication_year"
+    tsv'_publication_month <- r .: "publication_month"
+    tsv'_publication_day <- r .: "publication_day"
+    tsv'_abstract <- r .: "abstract"
+    tsv'_authors <- r .: "authors"
+    tsv'_weight <- r .: "weight"
+    pure $ Tsv' { .. }
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Utils.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Utils.hs
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.Utils where
+import Data.ByteString.Lazy qualified as BL
+import Data.ByteString.Lazy.Char8 qualified as B8L
+import Data.Csv
+import Data.Text (pack)
+import Data.Vector (Vector)
+import Data.Vector qualified as V
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
+import Gargantext.Prelude
+parseTsv :: FromNamedRecord tsvDoc
+         => (tsvDoc -> result)
+         -> FilePath
+         -> IO (Either Text [result])
+parseTsv tsv2doc fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
+-- | TODO use readByteStringLazy
+readTsvLazyBS :: FromNamedRecord tsvDoc
+              => Delimiter
+              -> BL.ByteString
+              -> Either Text (Header, Vector tsvDoc)
+readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
+-- | TODO use readFileLazy
+readTSVFile :: FromNamedRecord tsvDoc
+            => FilePath
+            -> IO (Either Text (Header, Vector tsvDoc))
+readTSVFile fp = do
+  file <- BL.readFile fp
+  case detectDelimiter file of
+    Left err  -> pure $ Left err
+    Right del -> pure $ readTsvLazyBS del file
+readTSVFileDelim :: FromNamedRecord tsvDoc
+                 => FilePath
+                 -> Delimiter
+                 -> IO (Either Text (Header, Vector tsvDoc))
+readTSVFileDelim fp del = do
+  file <- BL.readFile fp
+  pure $ readTsvLazyBS del file
+-- Detects delimiter based on the first line
+detectDelimiter :: BL.ByteString -> Either Text Delimiter
+detectDelimiter input =
+    case B8L.lines input of
+        (firstLine : _) ->
+            let candidates = [(',', count ',' firstLine), ('\t', count '\t' firstLine)]
+            in case fst $ maximumBy (comparing snd) candidates of
+                 '\n' -> Right Line
+                 '\t' -> Right Tab
+                 ','  -> Right Comma
+                 _    -> Left $ "Invalid delimiter detected for input tsv document."
+        _ -> Left "Couldn't detect a valid delimiter for the input document."
+-- Count occurrences of a character in a ByteString
+count :: Char -> BL.ByteString -> Int64
+count c = BL.count (fromIntegral (fromEnum c))
+------------------------------------------------------------------------
+writeDocs2Tsv :: ToNamedRecord tsvDoc
+              => Header
+              -> (doc -> tsvDoc)
+              -> FilePath
+              -> [doc]
+              -> IO ()
+writeDocs2Tsv hdr doc2tsv fp hs =
+  BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> hs)
--- a/test/Test/Core/Text/Corpus/TSV.hs
+++ b/test/Test/Core/Text/Corpus/TSV.hs
@@ -3,6 +3,7 @@
 module Test.Core.Text.Corpus.TSV (tests) where
 import Gargantext.Core.Text.Corpus.Parsers.TSV
+import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
 import Test.QuickCheck
 import Test.QuickCheck.Instances ()
 import Data.ByteString.Lazy.UTF8 as BLU
@@ -182,7 +183,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
    let line = createLineFromCorpus tsv del
    let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
    let splitLine = BL.splitWith (==delimiter del) line
-    case testErrorPerLine splitLine del headers 1 of
+    case testErrorPerLine TSVv3.validators splitLine del headers 1 of
        Right _ -> True
        Left  _ -> validRandomCorpus tsv del)
@@ -194,13 +195,13 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
 testTestCorrectFile :: Property
 testTestCorrectFile = forAll generateFile (\file -> do
    let tsv = createFile file
-    case testCorrectFile tsv of
+    case testCorrectFile TSVv3.validators tsv of
        Right del -> del == fDelimiter file
        Left _ -> Prelude.all (\x -> do
            let del = fDelimiter file
            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
-            case testErrorPerLine splitLine del headers 1 of
+            case testErrorPerLine TSVv3.validators splitLine del headers 1 of
                Right _ -> True
                Left  _ -> validRandomCorpus x del) (allCorpus file))
@@ -209,13 +210,13 @@ testTestCorrectFile = forAll generateFile (\file -> do
 testTestCorrectFileWithNewLine :: Property
 testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
    let tsv = createFileWithNewLine file
-    case testCorrectFile tsv of
+    case testCorrectFile TSVv3.validators tsv of
        Right _ -> True
        Left _ -> Prelude.all (\x -> do
            let del = fDelimiter file
            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
-            case testErrorPerLine splitLine del headers 1 of
+            case testErrorPerLine TSVv3.validators splitLine del headers 1 of
                Right _ -> True
                Left  _ -> validRandomCorpus x del) (allCorpus file))
@@ -233,7 +234,7 @@ testFindDelimiter = forAll generateFileDelimiter (\file -> do
 testGetHeader :: Property
 testGetHeader = forAll randomHeaderList (\headers -> do
    let headersLines = (BL.intercalate (delimiterBS Tab) $ Prelude.map BLU.fromString headers):[]
-    case getHeaders headersLines Tab of
+    case getHeaders TSVv3.validators headersLines Tab of
        Right _                                                     -> True
        Left  _ | not ("Publication Day" `Prelude.elem` headers)    -> True
                | not ("Publication Month" `Prelude.elem` headers)  -> True