Make tests play nice with the changes

Not mergeable: the tests fail for now. I suspect the problem comes from the tests for `testCorrectFile`, where the function `checkFromLine` tries to look up a wrong line number. Some more investigation would be needed to figure out exactly what happens (probably should show the TSV file being tested). Anyway, here are the changes in this commit: - Encapsulation of the TSV module in an `Internal` module, so we can export just the interface, but still expose the internals to the tests - Correction of type errors in the tests to have something that typechecks with the changes introduced before

Make tests play nice with the changes
Not mergeable: the tests fail for now. I suspect the problem comes from the tests for `testCorrectFile`, where the function `checkFromLine` tries to look up a wrong line number. Some more investigation would be needed to figure out exactly what happens (probably should show the TSV file being tested). Anyway, here are the changes in this commit: - Encapsulation of the TSV module in an `Internal` module, so we can export just the interface, but still expose the internals to the tests - Correction of type errors in the tests to have something that typechecks with the changes introduced before
ec9fe2d7 · Grégoire Locqueville · 51c8a407 · ec9fe2d7 · ec9fe2d7 · ec9fe2d7
Commit ec9fe2d7 authored Oct 14, 2024 by Grégoire Locqueville
4 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -197,6 +197,7 @@ library
      Gargantext.Core.Text.Corpus.Parsers
      Gargantext.Core.Text.Corpus.Parsers.Date
      Gargantext.Core.Text.Corpus.Parsers.TSV
+      Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
      Gargantext.Core.Text.Corpus.Query
      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Formats.TSV

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
@@ -33,631 +33,4 @@ module Gargantext.Core.Text.Corpus.Parsers.TSV
  )
 where
-import Conduit (ConduitT, yieldMany, mapC, (.|))
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
-import Data.ByteString.Lazy    qualified as BL
-import Data.Csv                qualified as CSV
-import Data.Csv                ((.=), (.:))
-import Data.Text               qualified as T
-import Data.Text.Lazy          qualified as TL
-import Data.Text.Lazy.Encoding qualified as TLE
-import Data.Text.Read          qualified as DTR
-import Data.Time.Segment (jour)
-import Data.Vector qualified as V
-import Data.Vector ((!?))
-import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
-import Gargantext.Prelude
-import Gargantext.Utils.Jobs.Error as Warn
---------------------------------
-- Utility types and functions --
---------------------------------
-- | Day of the UNIX reference time (UNIX epoch)
-defaultYear :: Int
-defaultYear = 1973
-- | UNIX epoch month
-defaultMonth :: Int
-defaultMonth = 1
-- | UNIX epoch year
-defaultDay :: Int
-defaultDay = 1
-- | The possible delimiters for a CSV file
-data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
-- | Concrete representation of the above type
-toWord8 :: ColumnDelimiter -> Word8
-toWord8 Tab       = fromIntegral $ ord '\t'
-toWord8 Comma     = fromIntegral $ ord ','
-toWord8 Semicolon = fromIntegral $ ord ';'
-- | Just the newline character in `Word8` form; handy to have around
-newline :: Word8
-newline = fromIntegral $ ord '\n'
-- | Split a text depending on the delimiter type
-split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
-split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
-- | Remove all occurences of `needle` in `haystack`
-removeText :: T.Text -- ^ The text to remove (the `needle`)
-           -> T.Text -- ^ The original text (the `haystack`)
-           -> T.Text
-removeText needle haystack = T.replace needle "" haystack
-- | Convert from raw byte string into `Text`
-lBLToText :: BL.ByteString -> Text
-lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
-- | Fail in the Either monad iff a predicate is False
-assertE :: Bool -- ^ The predicate to be tested
-        -> e    -- ^ The error if the predicate fails
-        -> Either e ()
-assertE predicate err = if predicate then Right () else Left err
-- | Trying getting a value out of a `Maybe`; if it does not work,
--   return an error message
-tryE :: Maybe a -- ^ The value to extract
-     -> e       -- ^ The error message
-     -> Either e a -- ^ The extracted value, in the `Either` monad
-tryE  Nothing     err = Left err
-tryE (Just value) _   = Right value
-- | Tests whether the input text can be parsed as a number
-parseNumber :: Text -> Maybe Int
-parseNumber str = case DTR.decimal str of
-  Right (x, "") -> Just x
-  _ -> Nothing
-- | An integer that may be the result of parsing either an integer value
-- or a floating-point value. If the parsed value was a floating-point number,
-- it is cropped to take only its integer part.
-newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
-  deriving (Show, Eq, Read)
-instance CSV.FromField IntOrDec where
-  -- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
-  parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
-    Left _  -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
-    Right n -> pure $ IntOrDec n
-instance CSV.ToField IntOrDec where
-  toField = CSV.toField . unIntOrDec
-- | Default options for TSV encoding, except for the delimiter,
-- which is passed as argument
-defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
-defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
-- | Default options for TSV decoding, except for the delimiter,
-- which is passed as argument
-defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
-defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
-data TsvDoc = TsvDoc
-  { tsv_title             :: !Text
-  , tsv_source            :: !Text
-  , tsv_publication_year  :: !(Maybe IntOrDec)
-  , tsv_publication_month :: !(Maybe Int)
-  , tsv_publication_day   :: !(Maybe Int)
-  , tsv_abstract          :: !Text
-  , tsv_authors           :: !Text
-  }
-  deriving (Show)
-instance CSV.FromNamedRecord TsvDoc where
-  parseNamedRecord r = do
-    tsv_title             <- r .: "title"             <|> r .: "Title"
-    tsv_source            <- r .: "source"            <|> r .: "Source"
-    tsv_publication_year  <- r .: "publication_year"  <|> r .: "Publication Year"
-    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
-    tsv_publication_day   <- r .: "publication_day"   <|> r .: "Publication Day"
-    tsv_abstract          <- r .: "abstract"          <|> r .: "Abstract"
-    tsv_authors           <- r .: "authors"           <|> r .: "Authors"
-    pure $ TsvDoc { .. }
-instance CSV.ToNamedRecord TsvDoc where
-  toNamedRecord (TsvDoc{ .. }) =
-    CSV.namedRecord [ "title"             .= tsv_title
-                , "source"            .= tsv_source
-                , "publication_year"  .= tsv_publication_year
-                , "publication_month" .= tsv_publication_month
-                , "publication_day"   .= tsv_publication_day
-                , "abstract"          .= tsv_abstract
-                , "authors"           .= tsv_authors
-                ]
-tsv2doc :: TsvDoc -> HyperdataDocument
-tsv2doc (TsvDoc { .. })
-  = HyperdataDocument { _hd_bdd              = Just "TsvHal"
-                      , _hd_doi              = Nothing
-                      , _hd_url              = Nothing
-                      , _hd_page             = Nothing
-                      , _hd_title            = Just tsv_title
-                      , _hd_authors          = Just tsv_authors
-                      , _hd_institutes       = Nothing
-                      , _hd_source           = Just tsv_source
-                      , _hd_abstract         = Just tsv_abstract
-                      , _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
-                                                                         pubMonth
-                                                                         pubDay
-                      , _hd_publication_year   = Just pubYear
-                      , _hd_publication_month  = Just pubMonth
-                      , _hd_publication_day    = Just pubDay
-                      , _hd_publication_hour   = Nothing
-                      , _hd_publication_minute = Nothing
-                      , _hd_publication_second = Nothing
-                      , _hd_language_iso2      = Nothing
-                    , _hd_institutes_tree      = Nothing }
-  where
-    pubYear  = maybe defaultYear unIntOrDec tsv_publication_year
-    pubMonth = fromMaybe defaultMonth tsv_publication_month
-    pubDay   = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
-minimalTsvImportHeaders :: [Text]
-minimalTsvImportHeaders = [ "Publication Day"
-                          , "Publication Month"
-                          , "Publication Year"
-                          , "Authors"
-                          , "Title"
-                          , "Source"
-                          , "Abstract"
-                          ]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
-minimalTsvExportHeader :: CSV.Header
-minimalTsvExportHeader =
-  CSV.header [ "title"
-         , "source"
-         , "publication_year"
-         , "publication_month"
-         , "publication_day"
-         , "abstract"
-         , "authors"
-         ]
-- | Parses a TSV file into the native HyperdataDocument datatype..
--   Returns `Left <error message>` upon failure
-parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
-parseTsv filepath = do
-  eitherTsvFile <- readTSVFile filepath
-  return $ do
-    (_header, tsvDocs) <- eitherTsvFile
-    let hyperdataDocs = tsv2doc <$> tsvDocs
-    return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
-parseTsvC :: BL.ByteString
-          -> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
-parseTsvC tsvFile =
-  (\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
-  where
-    (eResult, warning) = case testCorrectFile tsvFile of
-      Left err -> (Left err, Warn.MalformedCorpus "")
-      Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
-- | TODO use readFileLazy
-readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
-readTSVFile fp = do
-  file <- BL.readFile fp
-  case testCorrectFile file of
-    Left _err -> pure $ Left _err
-    Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
-writeDocs2Tsv :: FilePath             -- ^ The output filepath
-              -> [HyperdataDocument]  -- ^ The hyperdata documents to write
-              -> IO ()
-writeDocs2Tsv filepath hyperdataDocs = do
-  let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
-  let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
-                                     minimalTsvExportHeader
-                                     tsvDocs
-  BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
--   and the transition TsvDoc datatype
-hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
-hyperdataDocument2tsvDoc h =
-  TsvDoc { tsv_title             = m $ _hd_title h
-         , tsv_source            = m $ _hd_source h
-         , tsv_publication_year  = Just $ IntOrDec $ mI $ _hd_publication_year h
-         , tsv_publication_month = Just $ mI $ _hd_publication_month h
-         , tsv_publication_day   = Just $ mI $ _hd_publication_day   h
-         , tsv_abstract          = m $ _hd_abstract h
-         , tsv_authors           = m $ _hd_authors h }
-  where
-    m = maybe "" identity
-    mI = maybe 0 identity
-------------------------------------
-readTsvLazyBS :: ColumnDelimiter
-              -> BL.ByteString
-              -> Either Text (CSV.Header, V.Vector TsvDoc)
-readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
-testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text]) 
-testCorrectFile file = do
-  delimiter <- findColumnDelimiter file
-  let rows = BL.splitWith (== newline) file
-  headers <- getHeaders rows delimiter
-  (\content -> (delimiter, content)) <$> checkRows rows delimiter headers
-- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
--   Note that the first line (the header line) must be included,
-- but will not be checked here.
-checkRows :: [BL.ByteString] -- ^ The file, split in rows
-          -> ColumnDelimiter -- ^ The purported delimiter type
-          -> [Text]          -- ^ The headers
-          -> Either Text [Text]
-checkRows fileLines delimiter headers =
-  checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
-findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
-findColumnDelimiter bs
-  | testDelimiter Tab       bs = Right Tab
-  | testDelimiter Comma     bs = Right Comma
-  | testDelimiter Semicolon bs = Right Semicolon
-  | otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
-- | Get a TSV file's header fields
-getHeaders :: [BL.ByteString]    -- ^ The input file, split into lines
-           -> ColumnDelimiter    -- ^ This CSV/TSV's delimiter
-           -> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
-getHeaders rows delimiter =
-  case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of 
-    Nothing -> Left "Error in function `getHeaders`: Empty file"
-    Just headers -> case testAllHeadersPresence $ removeText  "\"" <$> headers of
-      Left missing -> Left $ "Error in function `getHeaders: Missing headers: " <> T.intercalate ", " missing
-      Right ()     -> Right headers
-- | Check that all required headers are present for the basic TSV type
-testAllHeadersPresence :: [Text]           -- ^ List of headers
-                       -> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
-testAllHeadersPresence headers = do
-  -- Remove the \r character that sometimes appears at the end of a line
-  let cleanHeaders = removeText "\r" <$> headers
-  let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
-  if null missingHeaders
-    then Right ()
-    else Left missingHeaders
-- | Test whether the given file has the given character as a delimiter
-testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
-testDelimiter delimiter file = fromMaybe False $ do
-  -- do-notation in the Maybe monad for nice pattern matching
-  let rows = V.fromList $ BL.splitWith (== newline) file
-  headers  <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
-  firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
-  return $ length headers == length firstRow && length headers > 2
-- | Check a TSV file's rows starting at the given line number
-checkFromLine :: Int                    -- ^ The line number from which to check
-              -> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
-              -> ColumnDelimiter        -- ^ The delimiter used in this file (comma, tab, ...)
-              -> [Text]                 -- ^ Expected TSV headers
-              -> Either Text [Text]
-              -- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
-checkFromLine lineNumber tsvLines delim headers = do
-  assertE (length tsvLines == lineNumber)
-    "Error in function checkIntegrity: queried line number is greater than actual number of lines"
-  currentLine <- tryE (tsvLines !? lineNumber)
-    "Error in function checkIntegrity: no line at queried number"
-  (y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
-  currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
-  restOfWarnings      <- checkFromLine (y+1) tsvLines delim headers
-  return $ currentLineWarnings <> restOfWarnings
-- | Parse the row given as input. If the input is not a whole row, only part of a row,
-- fetch the next line and interpret it as the rest of the row
-checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-                -> ColumnDelimiter        -- ^ The type of delimiter
-                -> [Text]                 -- ^ Headers
-                -> BL.ByteString          -- ^ Input row, or part of a row
-                -> Int                    -- ^ Current line number
-                -> Either Text (Int, [BL.ByteString])
-                -- ^ Left: Error message; Right: (Line number after the row, list of cells)
-checkPartialRow fileLines delimiter headers partialRow lineNum = do
-  let partialCells = split delimiter partialRow
-  if length partialCells == length headers
-    -- We've got a full row, we can check its contents:
-    then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
-    else do
-      assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
-        (T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
-      val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
-        "checkPartialRow"
-      checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Looks up the next line...?
-checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-              -> ColumnDelimiter        -- ^ Type of delimiter
-              -> [Text]                 -- ^ Headers
-              -> BL.ByteString          -- ^ ???
-              -> Int                    -- ^ Line number
-              -> (Int, [BL.ByteString])
-checkNextLine allLines delimiter headers res lineNumber =
-  case allLines !? (lineNumber + 1) of
-    Nothing -> (lineNumber, split delimiter res)
-    Just nextLine -> let nextRow = split delimiter nextLine in
-      if length nextRow > 1
-        then (lineNumber, split delimiter res)
-        else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
-checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-         -> ColumnDelimiter -- ^ The type of delimiter
-         -> [Text]          -- ^ The list of headers
-         -> Int             -- ^ Current line; used for logging purposes
-         -> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
-checkRow [] _ [] _ = Right []
-- More cells than headers: error
-checkRow _ delimiter [] line = Left $
-  "Too many fields at line " <> (T.pack . show) line <> "." <>
-  if delimiter /= Comma
-    then ""
-    else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
-checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
-checkRow (v:val) delimiter (h:headers) line = do
-    warning  <- checkValue v h line
-    warnings <- checkRow val delimiter headers line
-    return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
-checkValue :: BL.ByteString -- ^ The field to check
-          -> Text          -- ^ Current column header; passed as input for logging purposes
-          -> Int           -- ^ Current line number; passed as input for logging purposes
-          -> Either Text [Text]
-checkValue field columnHeader lineNumber = case columnHeader of
-    "Publication Day"   -> checkNumber    field columnHeader lineNumber
-    "Publication Month" -> checkNumber    field columnHeader lineNumber
-    "Publication Year"  -> checkNumber    field columnHeader lineNumber
-    "Authors"           -> checkTextField field columnHeader lineNumber
-    "Title"             -> checkTextField field columnHeader lineNumber
-    "Source"            -> checkTextField field columnHeader lineNumber
-    "Abstract"          -> checkTextField field columnHeader lineNumber
-    _ -> Right []
-- | Checks whether a given chunk of text is a valid number
-checkNumber :: BL.ByteString -- ^ The text to parse as a number
-            -> Text          -- ^ Current header; passed as input for logging purposes
-            -> Int           -- ^ Current line; passed as input for logging purposes
-            -> Either Text [Text] -- ^ List of warnings
-checkNumber field columnHeader line = do
-  let cleanField = removeText "\"" $ lBLToText field
-  value <- tryE (parseNumber cleanField) $
-    "Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
-  assertE (value >= 0) $
-    "Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
-  return []
-- | Checks whether a given chunk of text is a valid text field
--   (i.e. does not have parasite quotes)
-checkTextField :: BL.ByteString -- ^ The text to parse as a number
-               -> Text          -- ^ Current header; passed as input for logging purposes
-               -> Int           -- ^ Current line; passed as input for logging purposes
-               -> Either Text [Text] -- ^ List of warnings
-checkTextField field columnHeader ligne = do
-  let cleanField = removeText "\"\"" $ lBLToText field
-  if T.null cleanField
-    then Right ["The column '" <> columnHeader <> "' at line " <>  T.pack (show ligne) <> " is empty"]
-    else do
-      assertE (
-        -- Field is not empty...
-        (T.length cleanField > 0) &&
-          -- ... and either there are no quotes at all...
-        ( T.length (T.filter (== '\"') cleanField) == 0 ||
-          -- ... or there is only one quote at the beginning and one at the end
-          ((T.head cleanField == '"') &&
-           (T.last cleanField == '"') &&
-           (T.length (T.filter (== '\"') cleanField) == 2)
-          )
-        )) $
-        "Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field "
-      return []
---------------------------------------
-- | Tsv v3 weighted for phylo
-data Tsv' = Tsv'
-  { tsv'_title             :: !Text
-  , tsv'_source            :: !Text
-  , tsv'_publication_year  :: !Int
-  , tsv'_publication_month :: !Int
-  , tsv'_publication_day   :: !Int
-  , tsv'_abstract          :: !Text
-  , tsv'_authors           :: !Text
-  , tsv'_weight            :: !Double } deriving (Show)
-instance CSV.FromNamedRecord Tsv' where
-  parseNamedRecord r = do
-    tsv'_title <- r .: "title"
-    tsv'_source <- r .: "source"
-    tsv'_publication_year <- r .: "publication_year"
-    tsv'_publication_month <- r .: "publication_month"
-    tsv'_publication_day <- r .: "publication_day"
-    tsv'_abstract <- r .: "abstract"
-    tsv'_authors <- r .: "authors"
-    tsv'_weight <- r .: "weight"
-    pure $ Tsv' { .. }
-readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
-readWeightedTsv fp =
-  (\bs ->
-    case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
-      Left e       -> panicTrace (T.pack e)
-      Right corpus -> corpus
-  ) <$> BL.readFile fp
-----------------------
-- | The data of a HAL document
-data TsvHal = TsvHal
-  { tsvHal_title  :: !Text
-  , tsvHal_source :: !Text
-  , tsvHal_publication_year  :: !Integer
-  , tsvHal_publication_month :: !Int
-  , tsvHal_publication_day   :: !Int
-  , tsvHal_abstract          :: !Text
-  , tsvHal_authors           :: !Text
-  , tsvHal_url               :: !Text
-  , tsvHal_isbn_s            :: !Text
-  , tsvHal_issue_s           :: !Text
-  , tsvHal_journalPublisher_s:: !Text
-  , tsvHal_language_s        :: !Text
-  , tsvHal_doiId_s           :: !Text
-  , tsvHal_authId_i          :: !Text
-  , tsvHal_instStructId_i    :: !Text
-  , tsvHal_deptStructId_i    :: !Text
-  , tsvHal_labStructId_i     :: !Text
-  , tsvHal_rteamStructId_i   :: !Text
-  , tsvHal_docType_s         :: !Text
-  }
-  deriving (Show)
-instance CSV.FromNamedRecord TsvHal where
-  parseNamedRecord r = do
-    tsvHal_title <- r .: "title"
-    tsvHal_source <- r .: "source"
-    tsvHal_publication_year <- r .: "publication_year"
-    tsvHal_publication_month <- r .: "publication_month"
-    tsvHal_publication_day <- r .: "publication_day"
-    tsvHal_abstract <- r .: "abstract"
-    tsvHal_authors <- r .: "authors"
-    tsvHal_url <- r .: "url"
-    tsvHal_isbn_s <- r .: "isbn_s"
-    tsvHal_issue_s <- r .: "issue_s"
-    tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
-    tsvHal_language_s <- r .: "language_s"
-    tsvHal_doiId_s <- r .: "doiId_s"
-    tsvHal_authId_i <- r .: "authId_i"
-    tsvHal_instStructId_i <- r .: "instStructId_i"
-    tsvHal_deptStructId_i <- r .: "deptStructId_i"
-    tsvHal_labStructId_i <- r .: "labStructId_i"
-    tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
-    tsvHal_docType_s <- r .: "docType_s"
-    pure $ TsvHal { .. }
-instance CSV.ToNamedRecord TsvHal where
-  toNamedRecord (TsvHal { .. }) =
-    CSV.namedRecord [ "title"  .= tsvHal_title
-                , "source" .= tsvHal_source
-                , "publication_year"  .= tsvHal_publication_year
-                , "publication_month" .= tsvHal_publication_month
-                , "publication_day"   .= tsvHal_publication_day
-                , "abstract"          .= tsvHal_abstract
-                , "authors"           .= tsvHal_authors
-                , "url"                .= tsvHal_url
-                , "isbn_s"             .= tsvHal_isbn_s
-                , "issue_s"            .= tsvHal_issue_s
-                , "journalPublisher_s" .= tsvHal_journalPublisher_s
-                , "language_s"         .= tsvHal_language_s
-                , "doiId_s"            .= tsvHal_doiId_s
-                , "authId_i"           .= tsvHal_authId_i
-                , "instStructId_i"     .= tsvHal_instStructId_i
-                , "deptStructId_i"     .= tsvHal_deptStructId_i
-                , "labStructId_i"      .= tsvHal_labStructId_i
-                , "rteamStructId_i"    .= tsvHal_rteamStructId_i
-                , "docType_s"          .= tsvHal_docType_s
-               ]
-readTsvHal :: FilePath
-           -> IO (Either Text (CSV.Header, V.Vector TsvHal))
-readTsvHal path = do
-  file <- BL.readFile path
-  return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
-    CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
-parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
-parseHal fp = do
-  r <- readTsvHal fp
-  pure $ V.toList . V.map tsvHal2doc . snd <$> r
-tsvHal2doc :: TsvHal -> HyperdataDocument
-tsvHal2doc (TsvHal { .. }) =
-  HyperdataDocument { _hd_bdd = Just "TsvHal"
-                    , _hd_doi = Just tsvHal_doiId_s
-                    , _hd_url = Just tsvHal_url
-                    , _hd_page = Nothing
-                    , _hd_title = Just tsvHal_title
-                    , _hd_authors = Just tsvHal_authors
-                    , _hd_institutes = Just tsvHal_instStructId_i
-                    , _hd_source = Just tsvHal_source
-                    , _hd_abstract = Just tsvHal_abstract
-                    , _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
-                                                                      tsvHal_publication_month
-                                                                      tsvHal_publication_day
-                    , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
-                    , _hd_publication_month = Just tsvHal_publication_month
-                    , _hd_publication_day = Just tsvHal_publication_day
-                    , _hd_publication_hour = Nothing
-                    , _hd_publication_minute = Nothing
-                    , _hd_publication_second = Nothing
-                    , _hd_language_iso2 = Nothing
-                    , _hd_institutes_tree = Nothing }
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Internal.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Internal.hs
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.TSV
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+TSV parser for Gargantext corpus files.
+-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
+  ( ColumnDelimiter (..)
+  , IntOrDec (..)
+  , TsvDoc (..)
+  , TsvHal (..)
+  , Tsv' (..)
+  , defaultDay
+  , defaultDecodingOptionsWithDelimiter
+  , defaultEncodingOptionsWithDelimiter
+  , defaultMonth
+  , defaultYear
+  , parseHal
+  , parseTsv
+  , parseTsvC
+  , readTSVFile
+  , readTsvHal
+  , readWeightedTsv
+  , writeDocs2Tsv
+  , checkNumber
+  , checkRow
+  , checkTextField
+  , findColumnDelimiter
+  , getHeaders
+  , testCorrectFile
+  , toWord8
+  )
+where
+import Conduit (ConduitT, yieldMany, mapC, (.|))
+import Data.ByteString.Lazy    qualified as BL
+import Data.Csv                qualified as CSV
+import Data.Csv                ((.=), (.:))
+import Data.Text               qualified as T
+import Data.Text.Lazy          qualified as TL
+import Data.Text.Lazy.Encoding qualified as TLE
+import Data.Text.Read          qualified as DTR
+import Data.Time.Segment (jour)
+import Data.Vector qualified as V
+import Data.Vector ((!?))
+import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
+import Gargantext.Prelude
+import Gargantext.Utils.Jobs.Error as Warn
+---------------------------------
+-- Utility types and functions --
+---------------------------------
+-- | Day of the UNIX reference time (UNIX epoch)
+defaultYear :: Int
+defaultYear = 1973
+-- | UNIX epoch month
+defaultMonth :: Int
+defaultMonth = 1
+-- | UNIX epoch year
+defaultDay :: Int
+defaultDay = 1
+-- | The possible delimiters for a CSV file
+data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
+-- | Concrete representation of the above type
+toWord8 :: ColumnDelimiter -> Word8
+toWord8 Tab       = fromIntegral $ ord '\t'
+toWord8 Comma     = fromIntegral $ ord ','
+toWord8 Semicolon = fromIntegral $ ord ';'
+-- | Just the newline character in `Word8` form; handy to have around
+newline :: Word8
+newline = fromIntegral $ ord '\n'
+-- | Split a text depending on the delimiter type
+split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
+split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
+-- | Remove all occurences of `needle` in `haystack`
+removeText :: T.Text -- ^ The text to remove (the `needle`)
+           -> T.Text -- ^ The original text (the `haystack`)
+           -> T.Text
+removeText needle haystack = T.replace needle "" haystack
+-- | Convert from raw byte string into `Text`
+lBLToText :: BL.ByteString -> Text
+lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
+-- | Fail in the Either monad iff a predicate is False
+assertE :: Bool -- ^ The predicate to be tested
+        -> e    -- ^ The error if the predicate fails
+        -> Either e ()
+assertE predicate err = if predicate then Right () else Left err
+-- | Trying getting a value out of a `Maybe`; if it does not work,
+--   return an error message
+tryE :: Maybe a -- ^ The value to extract
+     -> e       -- ^ The error message
+     -> Either e a -- ^ The extracted value, in the `Either` monad
+tryE  Nothing     err = Left err
+tryE (Just value) _   = Right value
+-- | Tests whether the input text can be parsed as a number
+parseNumber :: Text -> Maybe Int
+parseNumber str = case DTR.decimal str of
+  Right (x, "") -> Just x
+  _ -> Nothing
+-- | An integer that may be the result of parsing either an integer value
+-- or a floating-point value. If the parsed value was a floating-point number,
+-- it is cropped to take only its integer part.
+newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
+  deriving (Show, Eq, Read)
+instance CSV.FromField IntOrDec where
+  -- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
+  parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
+    Left _  -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
+    Right n -> pure $ IntOrDec n
+instance CSV.ToField IntOrDec where
+  toField = CSV.toField . unIntOrDec
+-- | Default options for TSV encoding, except for the delimiter,
+-- which is passed as argument
+defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
+defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
+-- | Default options for TSV decoding, except for the delimiter,
+-- which is passed as argument
+defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
+defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
+--------------------
+-- Basic TSV type --
+--------------------
+data TsvDoc = TsvDoc
+  { tsv_title             :: !Text
+  , tsv_source            :: !Text
+  , tsv_publication_year  :: !(Maybe IntOrDec)
+  , tsv_publication_month :: !(Maybe Int)
+  , tsv_publication_day   :: !(Maybe Int)
+  , tsv_abstract          :: !Text
+  , tsv_authors           :: !Text
+  }
+  deriving (Show)
+instance CSV.FromNamedRecord TsvDoc where
+  parseNamedRecord r = do
+    tsv_title             <- r .: "title"             <|> r .: "Title"
+    tsv_source            <- r .: "source"            <|> r .: "Source"
+    tsv_publication_year  <- r .: "publication_year"  <|> r .: "Publication Year"
+    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
+    tsv_publication_day   <- r .: "publication_day"   <|> r .: "Publication Day"
+    tsv_abstract          <- r .: "abstract"          <|> r .: "Abstract"
+    tsv_authors           <- r .: "authors"           <|> r .: "Authors"
+    pure $ TsvDoc { .. }
+instance CSV.ToNamedRecord TsvDoc where
+  toNamedRecord (TsvDoc{ .. }) =
+    CSV.namedRecord [ "title"             .= tsv_title
+                , "source"            .= tsv_source
+                , "publication_year"  .= tsv_publication_year
+                , "publication_month" .= tsv_publication_month
+                , "publication_day"   .= tsv_publication_day
+                , "abstract"          .= tsv_abstract
+                , "authors"           .= tsv_authors
+                ]
+tsv2doc :: TsvDoc -> HyperdataDocument
+tsv2doc (TsvDoc { .. })
+  = HyperdataDocument { _hd_bdd              = Just "TsvHal"
+                      , _hd_doi              = Nothing
+                      , _hd_url              = Nothing
+                      , _hd_page             = Nothing
+                      , _hd_title            = Just tsv_title
+                      , _hd_authors          = Just tsv_authors
+                      , _hd_institutes       = Nothing
+                      , _hd_source           = Just tsv_source
+                      , _hd_abstract         = Just tsv_abstract
+                      , _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
+                                                                         pubMonth
+                                                                         pubDay
+                      , _hd_publication_year   = Just pubYear
+                      , _hd_publication_month  = Just pubMonth
+                      , _hd_publication_day    = Just pubDay
+                      , _hd_publication_hour   = Nothing
+                      , _hd_publication_minute = Nothing
+                      , _hd_publication_second = Nothing
+                      , _hd_language_iso2      = Nothing
+                    , _hd_institutes_tree      = Nothing }
+  where
+    pubYear  = maybe defaultYear unIntOrDec tsv_publication_year
+    pubMonth = fromMaybe defaultMonth tsv_publication_month
+    pubDay   = fromMaybe defaultDay tsv_publication_day
+-- | Minimal header for a working TSV import
+-- TODO why is this so much different from `minimalExportHeader`?
+minimalTsvImportHeaders :: [Text]
+minimalTsvImportHeaders = [ "Publication Day"
+                          , "Publication Month"
+                          , "Publication Year"
+                          , "Authors"
+                          , "Title"
+                          , "Source"
+                          , "Abstract"
+                          ]
+-- | Minimal header for a working TSV export
+-- TODO why is this so much different from `minimalImportHeader`?
+minimalTsvExportHeader :: CSV.Header
+minimalTsvExportHeader =
+  CSV.header [ "title"
+         , "source"
+         , "publication_year"
+         , "publication_month"
+         , "publication_day"
+         , "abstract"
+         , "authors"
+         ]
+-- | Parses a TSV file into the native HyperdataDocument datatype..
+--   Returns `Left <error message>` upon failure
+parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
+parseTsv filepath = do
+  eitherTsvFile <- readTSVFile filepath
+  return $ do
+    (_header, tsvDocs) <- eitherTsvFile
+    let hyperdataDocs = tsv2doc <$> tsvDocs
+    return $ V.toList hyperdataDocs
+-- | Conduit version of `parseTsv`
+parseTsvC :: BL.ByteString
+          -> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
+parseTsvC tsvFile =
+  (\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
+  where
+    (eResult, warning) = case testCorrectFile tsvFile of
+      Left err -> (Left err, Warn.MalformedCorpus "")
+      Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
+-- | TODO use readFileLazy
+readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
+readTSVFile fp = do
+  file <- BL.readFile fp
+  case testCorrectFile file of
+    Left _err -> pure $ Left _err
+    Right (del,_) -> pure $ readTsvLazyBS del file
+-- | Write a TSV file containing a collection of hyperdata documents
+writeDocs2Tsv :: FilePath             -- ^ The output filepath
+              -> [HyperdataDocument]  -- ^ The hyperdata documents to write
+              -> IO ()
+writeDocs2Tsv filepath hyperdataDocs = do
+  let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
+  let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
+                                     minimalTsvExportHeader
+                                     tsvDocs
+  BL.writeFile filepath fileToWrite
+-- | Convert between the native hyperdata datatype
+--   and the transition TsvDoc datatype
+hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
+hyperdataDocument2tsvDoc h =
+  TsvDoc { tsv_title             = m $ _hd_title h
+         , tsv_source            = m $ _hd_source h
+         , tsv_publication_year  = Just $ IntOrDec $ mI $ _hd_publication_year h
+         , tsv_publication_month = Just $ mI $ _hd_publication_month h
+         , tsv_publication_day   = Just $ mI $ _hd_publication_day   h
+         , tsv_abstract          = m $ _hd_abstract h
+         , tsv_authors           = m $ _hd_authors h }
+  where
+    m = maybe "" identity
+    mI = maybe 0 identity
+-------------------------------------
+readTsvLazyBS :: ColumnDelimiter
+              -> BL.ByteString
+              -> Either Text (CSV.Header, V.Vector TsvDoc)
+readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
+testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text]) 
+testCorrectFile file = do
+  delimiter <- findColumnDelimiter file
+  let rows = BL.splitWith (== newline) file
+  headers <- getHeaders rows delimiter
+  (\content -> (delimiter, content)) <$> checkRows rows delimiter headers
+-- | Given a delimiter type and a list of headers, checks that the rows
+-- of the input CSV/TSV file, split into lines, are valid. If they are, return
+-- the list of warnings encountered during checking.
+--   Note that the first line (the header line) must be included,
+-- but will not be checked here.
+checkRows :: [BL.ByteString] -- ^ The file, split in rows
+          -> ColumnDelimiter -- ^ The purported delimiter type
+          -> [Text]          -- ^ The headers
+          -> Either Text [Text]
+checkRows fileLines delimiter headers =
+  checkFromLine 2 (V.fromList fileLines) delimiter headers
+-- | Detect which delimiter a file is using; if detection fails, an error message
+-- is output inside the `Left` constructor
+findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
+findColumnDelimiter bs
+  | testDelimiter Tab       bs = Right Tab
+  | testDelimiter Comma     bs = Right Comma
+  | testDelimiter Semicolon bs = Right Semicolon
+  | otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
+-- | Get a TSV file's header fields
+getHeaders :: [BL.ByteString]    -- ^ The input file, split into lines
+           -> ColumnDelimiter    -- ^ This CSV/TSV's delimiter
+           -> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
+getHeaders rows delimiter =
+  case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of 
+    Nothing -> Left "Error in function `getHeaders`: Empty file"
+    Just headers -> case testAllHeadersPresence $ removeText  "\"" <$> headers of
+      Left missing -> Left $ "Error in function `getHeaders: Missing headers: " <> T.intercalate ", " missing
+      Right ()     -> Right headers
+-- | Check that all required headers are present for the basic TSV type
+testAllHeadersPresence :: [Text]           -- ^ List of headers
+                       -> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
+testAllHeadersPresence headers = do
+  -- Remove the \r character that sometimes appears at the end of a line
+  let cleanHeaders = removeText "\r" <$> headers
+  let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
+  if null missingHeaders
+    then Right ()
+    else Left missingHeaders
+-- | Test whether the given file has the given character as a delimiter
+testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
+testDelimiter delimiter file = fromMaybe False $ do
+  -- do-notation in the Maybe monad for nice pattern matching
+  let rows = V.fromList $ BL.splitWith (== newline) file
+  headers  <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
+  firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
+  return $ length headers == length firstRow && length headers > 2
+-- | Check a TSV file's rows starting at the given line number
+checkFromLine :: Int                    -- ^ The line number from which to check
+              -> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
+              -> ColumnDelimiter        -- ^ The delimiter used in this file (comma, tab, ...)
+              -> [Text]                 -- ^ Expected TSV headers
+              -> Either Text [Text]
+              -- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
+checkFromLine lineNumber tsvLines delim headers = do
+  assertE (length tsvLines == lineNumber)
+    "Error in function checkIntegrity: queried line number is greater than actual number of lines"
+  currentLine <- tryE (tsvLines !? lineNumber)
+    "Error in function checkIntegrity: no line at queried number"
+  (y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
+  currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
+  restOfWarnings      <- checkFromLine (y+1) tsvLines delim headers
+  return $ currentLineWarnings <> restOfWarnings
+-- | Parse the row given as input. If the input is not a whole row, only part of a row,
+-- fetch the next line and interpret it as the rest of the row
+checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
+                -> ColumnDelimiter        -- ^ The type of delimiter
+                -> [Text]                 -- ^ Headers
+                -> BL.ByteString          -- ^ Input row, or part of a row
+                -> Int                    -- ^ Current line number
+                -> Either Text (Int, [BL.ByteString])
+                -- ^ Left: Error message; Right: (Line number after the row, list of cells)
+checkPartialRow fileLines delimiter headers partialRow lineNum = do
+  let partialCells = split delimiter partialRow
+  if length partialCells == length headers
+    -- We've got a full row, we can check its contents:
+    then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
+    else do
+      assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
+        (T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
+      val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
+        "checkPartialRow"
+      checkPartialRow fileLines delimiter headers val (lineNum+1)
+-- | Looks up the next line...?
+checkNextLine :: V.Vector BL.ByteString -- ^ All rows
+              -> ColumnDelimiter        -- ^ Type of delimiter
+              -> [Text]                 -- ^ Headers
+              -> BL.ByteString          -- ^ ???
+              -> Int                    -- ^ Line number
+              -> (Int, [BL.ByteString])
+checkNextLine allLines delimiter headers res lineNumber =
+  case allLines !? (lineNumber + 1) of
+    Nothing -> (lineNumber, split delimiter res)
+    Just nextLine -> let nextRow = split delimiter nextLine in
+      if length nextRow > 1
+        then (lineNumber, split delimiter res)
+        else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
+-- | Check for any errors or warnings in a given row
+checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
+         -> ColumnDelimiter -- ^ The type of delimiter
+         -> [Text]          -- ^ The list of headers
+         -> Int             -- ^ Current line; used for logging purposes
+         -> Either Text [Text] -- ^ List of warnings
+-- Empty row & no headers: nothing to check:
+checkRow [] _ [] _ = Right []
+-- More cells than headers: error
+checkRow _ delimiter [] line = Left $
+  "Too many fields at line " <> (T.pack . show) line <> "." <>
+  if delimiter /= Comma
+    then ""
+    else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
+-- More headers than cells: error
+checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
+-- Still some fields and headers to consume
+checkRow (v:val) delimiter (h:headers) line = do
+    warning  <- checkValue v h line
+    warnings <- checkRow val delimiter headers line
+    return $ warning <> warnings
+-- | Tests whether a given chunk of text is a valid field
+checkValue :: BL.ByteString -- ^ The field to check
+          -> Text          -- ^ Current column header; passed as input for logging purposes
+          -> Int           -- ^ Current line number; passed as input for logging purposes
+          -> Either Text [Text]
+checkValue field columnHeader lineNumber = case columnHeader of
+    "Publication Day"   -> checkNumber    field columnHeader lineNumber
+    "Publication Month" -> checkNumber    field columnHeader lineNumber
+    "Publication Year"  -> checkNumber    field columnHeader lineNumber
+    "Authors"           -> checkTextField field columnHeader lineNumber
+    "Title"             -> checkTextField field columnHeader lineNumber
+    "Source"            -> checkTextField field columnHeader lineNumber
+    "Abstract"          -> checkTextField field columnHeader lineNumber
+    _ -> Right []
+-- | Checks whether a given chunk of text is a valid number
+checkNumber :: BL.ByteString -- ^ The text to parse as a number
+            -> Text          -- ^ Current header; passed as input for logging purposes
+            -> Int           -- ^ Current line; passed as input for logging purposes
+            -> Either Text [Text] -- ^ List of warnings
+checkNumber field columnHeader line = do
+  let cleanField = removeText "\"" $ lBLToText field
+  value <- tryE (parseNumber cleanField) $
+    "Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
+  assertE (value >= 0) $
+    "Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
+  return []
+-- | Checks whether a given chunk of text is a valid text field
+--   (i.e. does not have parasite quotes)
+checkTextField :: BL.ByteString -- ^ The text to parse as a number
+               -> Text          -- ^ Current header; passed as input for logging purposes
+               -> Int           -- ^ Current line; passed as input for logging purposes
+               -> Either Text [Text] -- ^ List of warnings
+checkTextField field columnHeader ligne = do
+  let cleanField = removeText "\"\"" $ lBLToText field
+  if T.null cleanField
+    then Right ["The column '" <> columnHeader <> "' at line " <>  T.pack (show ligne) <> " is empty"]
+    else do
+      assertE (
+        -- Field is not empty...
+        (T.length cleanField > 0) &&
+          -- ... and either there are no quotes at all...
+        ( T.length (T.filter (== '\"') cleanField) == 0 ||
+          -- ... or there is only one quote at the beginning and one at the end
+          ((T.head cleanField == '"') &&
+           (T.last cleanField == '"') &&
+           (T.length (T.filter (== '\"') cleanField) == 2)
+          )
+        )) $
+        "Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field "
+      return []
+---------------------------------------
+-- | Tsv v3 weighted for phylo
+data Tsv' = Tsv'
+  { tsv'_title             :: !Text
+  , tsv'_source            :: !Text
+  , tsv'_publication_year  :: !Int
+  , tsv'_publication_month :: !Int
+  , tsv'_publication_day   :: !Int
+  , tsv'_abstract          :: !Text
+  , tsv'_authors           :: !Text
+  , tsv'_weight            :: !Double } deriving (Show)
+instance CSV.FromNamedRecord Tsv' where
+  parseNamedRecord r = do
+    tsv'_title <- r .: "title"
+    tsv'_source <- r .: "source"
+    tsv'_publication_year <- r .: "publication_year"
+    tsv'_publication_month <- r .: "publication_month"
+    tsv'_publication_day <- r .: "publication_day"
+    tsv'_abstract <- r .: "abstract"
+    tsv'_authors <- r .: "authors"
+    tsv'_weight <- r .: "weight"
+    pure $ Tsv' { .. }
+readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
+readWeightedTsv fp =
+  (\bs ->
+    case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
+      Left e       -> panicTrace (T.pack e)
+      Right corpus -> corpus
+  ) <$> BL.readFile fp
+-----------------------
+-- | The data of a HAL document
+data TsvHal = TsvHal
+  { tsvHal_title  :: !Text
+  , tsvHal_source :: !Text
+  , tsvHal_publication_year  :: !Integer
+  , tsvHal_publication_month :: !Int
+  , tsvHal_publication_day   :: !Int
+  , tsvHal_abstract          :: !Text
+  , tsvHal_authors           :: !Text
+  , tsvHal_url               :: !Text
+  , tsvHal_isbn_s            :: !Text
+  , tsvHal_issue_s           :: !Text
+  , tsvHal_journalPublisher_s:: !Text
+  , tsvHal_language_s        :: !Text
+  , tsvHal_doiId_s           :: !Text
+  , tsvHal_authId_i          :: !Text
+  , tsvHal_instStructId_i    :: !Text
+  , tsvHal_deptStructId_i    :: !Text
+  , tsvHal_labStructId_i     :: !Text
+  , tsvHal_rteamStructId_i   :: !Text
+  , tsvHal_docType_s         :: !Text
+  }
+  deriving (Show)
+instance CSV.FromNamedRecord TsvHal where
+  parseNamedRecord r = do
+    tsvHal_title <- r .: "title"
+    tsvHal_source <- r .: "source"
+    tsvHal_publication_year <- r .: "publication_year"
+    tsvHal_publication_month <- r .: "publication_month"
+    tsvHal_publication_day <- r .: "publication_day"
+    tsvHal_abstract <- r .: "abstract"
+    tsvHal_authors <- r .: "authors"
+    tsvHal_url <- r .: "url"
+    tsvHal_isbn_s <- r .: "isbn_s"
+    tsvHal_issue_s <- r .: "issue_s"
+    tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
+    tsvHal_language_s <- r .: "language_s"
+    tsvHal_doiId_s <- r .: "doiId_s"
+    tsvHal_authId_i <- r .: "authId_i"
+    tsvHal_instStructId_i <- r .: "instStructId_i"
+    tsvHal_deptStructId_i <- r .: "deptStructId_i"
+    tsvHal_labStructId_i <- r .: "labStructId_i"
+    tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
+    tsvHal_docType_s <- r .: "docType_s"
+    pure $ TsvHal { .. }
+instance CSV.ToNamedRecord TsvHal where
+  toNamedRecord (TsvHal { .. }) =
+    CSV.namedRecord [ "title"  .= tsvHal_title
+                , "source" .= tsvHal_source
+                , "publication_year"  .= tsvHal_publication_year
+                , "publication_month" .= tsvHal_publication_month
+                , "publication_day"   .= tsvHal_publication_day
+                , "abstract"          .= tsvHal_abstract
+                , "authors"           .= tsvHal_authors
+                , "url"                .= tsvHal_url
+                , "isbn_s"             .= tsvHal_isbn_s
+                , "issue_s"            .= tsvHal_issue_s
+                , "journalPublisher_s" .= tsvHal_journalPublisher_s
+                , "language_s"         .= tsvHal_language_s
+                , "doiId_s"            .= tsvHal_doiId_s
+                , "authId_i"           .= tsvHal_authId_i
+                , "instStructId_i"     .= tsvHal_instStructId_i
+                , "deptStructId_i"     .= tsvHal_deptStructId_i
+                , "labStructId_i"      .= tsvHal_labStructId_i
+                , "rteamStructId_i"    .= tsvHal_rteamStructId_i
+                , "docType_s"          .= tsvHal_docType_s
+               ]
+readTsvHal :: FilePath
+           -> IO (Either Text (CSV.Header, V.Vector TsvHal))
+readTsvHal path = do
+  file <- BL.readFile path
+  return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
+    CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
+parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
+parseHal fp = do
+  r <- readTsvHal fp
+  pure $ V.toList . V.map tsvHal2doc . snd <$> r
+tsvHal2doc :: TsvHal -> HyperdataDocument
+tsvHal2doc (TsvHal { .. }) =
+  HyperdataDocument { _hd_bdd = Just "TsvHal"
+                    , _hd_doi = Just tsvHal_doiId_s
+                    , _hd_url = Just tsvHal_url
+                    , _hd_page = Nothing
+                    , _hd_title = Just tsvHal_title
+                    , _hd_authors = Just tsvHal_authors
+                    , _hd_institutes = Just tsvHal_instStructId_i
+                    , _hd_source = Just tsvHal_source
+                    , _hd_abstract = Just tsvHal_abstract
+                    , _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
+                                                                      tsvHal_publication_month
+                                                                      tsvHal_publication_day
+                    , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
+                    , _hd_publication_month = Just tsvHal_publication_month
+                    , _hd_publication_day = Just tsvHal_publication_day
+                    , _hd_publication_hour = Nothing
+                    , _hd_publication_minute = Nothing
+                    , _hd_publication_second = Nothing
+                    , _hd_language_iso2 = Nothing
+                    , _hd_institutes_tree = Nothing }
--- a/test/Test/Core/Text/Corpus/TSV.hs
+++ b/test/Test/Core/Text/Corpus/TSV.hs
@@ -2,19 +2,16 @@
 module Test.Core.Text.Corpus.TSV (tests) where
-import Gargantext.Core.Text.Corpus.Parsers.TSV
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
-import Test.QuickCheck
-import Test.QuickCheck.Instances ()
-import Data.ByteString.Lazy.UTF8 as BLU
 import Data.ByteString.Lazy as BL
-import Data.Char ( ord )
+import Data.ByteString.Lazy.UTF8 as BLU
+import Data.Char (ord)
 import Data.Text as DT (Text, pack, null, elem)
 import Data.Text.Encoding as DT
 import Prelude
+import Test.QuickCheck
+import Test.QuickCheck.Instances ()
 import Test.Tasty
-import Test.Tasty.HUnit
 import Test.Tasty.QuickCheck hiding (Positive, Negative)
 tests :: TestTree
@@ -30,12 +27,12 @@ tests = testGroup "TSV Parser" [
-delimiterBS :: Delimiter -> BL.ByteString
+delimiterBS :: ColumnDelimiter -> BL.ByteString
-delimiterBS Tab   = BLU.fromString "\t"
+delimiterBS Tab       = BLU.fromString "\t"
-delimiterBS Comma = BLU.fromString ","
+delimiterBS Comma     = BLU.fromString ","
-delimiterBS Line = BLU.fromString "\n"
+delimiterBS Semicolon = BLU.fromString ";"
-data File = File { fDelimiter :: Delimiter
+data File = File { fDelimiter :: ColumnDelimiter
                 , allCorpus  :: [RandomCorpus]
                 } 
                 deriving (Show)
@@ -56,19 +53,19 @@ instance Arbitrary File where
 arbitrarySizedFile :: Int -> Gen File
 arbitrarySizedFile m = do
-  del <- elements [Tab, Comma]
+  del  <- elements [Tab, Comma, Semicolon]
  corp <- vectorOf m (generateRandomCorpus)
  return (File del corp)
-delimiterToText :: Delimiter -> Text
+delimiterToText :: ColumnDelimiter -> Text
-delimiterToText Tab   = DT.pack "\t"
+delimiterToText Tab       = DT.pack "\t"
-delimiterToText Comma = DT.pack ","
+delimiterToText Comma     = DT.pack ","
-delimiterToText Line  = DT.pack "\n"
+delimiterToText Semicolon = DT.pack ";"
-delimiterToString :: Delimiter -> Char
+delimiterToString :: ColumnDelimiter -> Char
-delimiterToString Tab   = '\t'
+delimiterToString Tab       = '\t'
-delimiterToString Comma = ','
+delimiterToString Comma     = ','
-delimiterToString Line  = '\n'  
+delimiterToString Semicolon = ';'  
 textToBL :: Text -> BL.ByteString
 textToBL b = BL.fromChunks . return . DT.encodeUtf8 $ b
@@ -85,7 +82,7 @@ generateRandomCorpus = RandomCorpus
 generateFileDelimiter :: Gen File
 generateFileDelimiter = do
-  del <- elements [Tab, Comma]
+  del <- elements [Tab, Comma, Semicolon]
  m <- choose (1,5)
  corp <- vectorOf m (generateRandomCorpus)
  return (File del corp)
@@ -106,7 +103,7 @@ randomHeaderList = frequency [
  ]
 --TODO add delimiter
-createLineFromCorpus :: RandomCorpus -> Delimiter -> BL.ByteString
+createLineFromCorpus :: RandomCorpus -> ColumnDelimiter -> BL.ByteString
 createLineFromCorpus corpus delD = do
    let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
    let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
@@ -118,7 +115,7 @@ createLineFromCorpus corpus delD = do
    let del = delimiterToText delD
    textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
-createLineFromCorpusWithNewLine :: RandomCorpus -> Delimiter -> BL.ByteString
+createLineFromCorpusWithNewLine :: RandomCorpus -> ColumnDelimiter -> BL.ByteString
 createLineFromCorpusWithNewLine corpus delD = do
    let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
    let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
@@ -143,9 +140,9 @@ createFileWithNewLine file = do
    let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpus x (fDelimiter file)) (allCorpus file)
    headers <> (BLU.fromString "\n") <> allLines
-validRandomCorpus :: RandomCorpus -> Delimiter -> Bool
+validRandomCorpus :: RandomCorpus -> ColumnDelimiter -> Bool
 validRandomCorpus tsv del
-    | BL.length (BL.filter (==delimiter del) (createLineFromCorpus tsv del)) > 3= True
+    | BL.length (BL.filter (== toWord8 del) (createLineFromCorpus tsv del)) > 3= True
    | DT.null $ abstract tsv                                                    = True
    | DT.null $ title tsv                                                       = True
    | DT.null $ authors tsv                                                     = True
@@ -161,7 +158,7 @@ testValidNumber :: Property
 testValidNumber = forAll generateNumber (\s -> do
    let nbText = DT.pack $ show s
    let bl = textToBL nbText
-    case validNumber bl nbText 1 [] of
+    case checkNumber bl nbText 1 of
        Right _                 -> True
        Left  _ | BL.empty == bl -> True
                | s < 1 -> True
@@ -171,7 +168,7 @@ testValidNumber = forAll generateNumber (\s -> do
 testValidText :: Property
 testValidText = forAll generateString (\s -> 
    let bl = textToBL s in
-        case validTextField bl s 1 [] of
+        case checkTextField bl s 1 of
            Right _                 -> True
            Left  _ | BL.empty == bl -> True
                    | (fromIntegral $ ord '\"') `BL.elem` bl -> True
@@ -184,15 +181,11 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
    let del = Tab
    let line = createLineFromCorpus tsv del
    let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
-    let splitLine = BL.splitWith (==delimiter del) line
+    let splitLine = BL.splitWith (== toWord8 del) line
-    case testErrorPerLine splitLine del headers 1 [] of
+    case checkRow splitLine del headers 1 of
        Right _ -> True
        Left  _ -> validRandomCorpus tsv del)
--check : 
-- True Del
-- False Error
 -- Test if a file is OK
 testTestCorrectFile :: Property
 testTestCorrectFile = forAll generateFile (\file -> do
@@ -202,8 +195,8 @@ testTestCorrectFile = forAll generateFile (\file -> do
        Left _ -> Prelude.all (\x -> do 
            let del = fDelimiter file
            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
-            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
+            let splitLine = BL.splitWith (== toWord8 del) $ createLineFromCorpus x del
-            case testErrorPerLine splitLine del headers 1 [] of
+            case checkRow splitLine del headers 1 of
                Right _ -> True
                Left  _ -> validRandomCorpus x del) (allCorpus file))
@@ -217,20 +210,20 @@ testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
        Left _ -> Prelude.all (\x -> do 
            let del = fDelimiter file
            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
-            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
+            let splitLine = BL.splitWith (== toWord8 del) $ createLineFromCorpus x del
-            case testErrorPerLine splitLine del headers 1 [] of
+            case checkRow splitLine del headers 1 of
                Right _ -> True
                Left  _ -> validRandomCorpus x del) (allCorpus file))
 testFindDelimiter :: Property
 testFindDelimiter = forAll generateFileDelimiter (\file -> do
    let tsv = createFile file
-    case findDelimiter tsv of
+    case findColumnDelimiter tsv of
        Right _ -> True
        Left _ -> do
            let line = Prelude.head $ allCorpus file
            let del = delimiterToString $ fDelimiter file
-            let delLine = delimiterToString Line
+            let delLine = '\n'
            del `DT.elem` (abstract line) || del `DT.elem` (authors line) || del `DT.elem` (title line) || del `DT.elem` (source line) || delLine `DT.elem` (abstract line) || delLine `DT.elem` (authors line) || delLine `DT.elem` (title line) || delLine `DT.elem` (source line))
 testGetHeader :: Property