Commit 51c8a407 authored by Grégoire Locqueville

Reorganize TSV parsing code

parent e4dfb4bd
Pipeline #6835 canceled with stages
@@ -20,7 +20,7 @@ import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, unIntOrDec, tsv_title, tsv_abstract, tsv_publication_year, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
@@ -52,7 +52,7 @@ filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (Outpu
      Right cf -> do
        let corpus = DM.fromListWith (<>)
                   . DV.toList
-                  . DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
+                  . DV.map (\n -> (maybe defaultYear unIntOrDec $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
                   . snd $ cf
        -- termListMap :: [Text]
...
@@ -82,8 +82,8 @@ tsvToDocs parser patterns time path =
      Wos _ -> Prelude.error "tsvToDocs: unimplemented"
      Tsv limit -> Vector.toList
        <$> Vector.take limit
-       <$> Vector.map (\row -> Document (toPhyloDate (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
-                                        (toPhyloDate' (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
+       <$> Vector.map (\row -> Document (toPhyloDate (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
+                                        (toPhyloDate' (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
                                         (termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row))
                                         Nothing
                                         []
...
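For reference, the old helper `fromMIntOrDec defaultYear` and the new spelling `maybe defaultYear unIntOrDec` used at the call sites above agree on both present and missing years; a quick GHCi check (illustrative only, not part of the commit):

>>> maybe defaultYear unIntOrDec (Just (IntOrDec 2005))
2005
>>> maybe defaultYear unIntOrDec Nothing
1973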
@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T
import Data.Vector (Vector)
import Data.Vector qualified as Vector
-import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, ColumnDelimiter(Tab) )
+import Gargantext.Core.Text.Corpus.Parsers.TSV (defaultDecodingOptionsWithDelimiter, ColumnDelimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude
import System.FilePath.Posix (takeExtension)
@@ -119,7 +119,7 @@ readTSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readTSVFile_Annuaire' = fmap readTsvHalLazyBS' . BL.readFile
  where
    readTsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
-    readTsvHalLazyBS' bs = case decodeByNameWith (tsvDecodeOptions Tab) bs of
+    readTsvHalLazyBS' bs = case decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
      Left e -> panicTrace (cs e)
      Right rows -> rows
...
@@ -12,135 +12,167 @@ TSV parser for Gargantext corpus files.
-}
-module Gargantext.Core.Text.Corpus.Parsers.TSV where
+module Gargantext.Core.Text.Corpus.Parsers.TSV
+  ( ColumnDelimiter (..)
+  , IntOrDec (..)
+  , TsvDoc (..)
+  , TsvHal (..)
+  , Tsv' (..)
+  , defaultDay
+  , defaultDecodingOptionsWithDelimiter
+  , defaultEncodingOptionsWithDelimiter
+  , defaultMonth
+  , defaultYear
+  , parseHal
+  , parseTsv
+  , parseTsvC
+  , readTSVFile
+  , readTsvHal
+  , readWeightedTsv
+  , writeDocs2Tsv
+  )
+where
-import Conduit ( ConduitT, (.|), yieldMany, mapC )
-import Data.ByteString qualified as BS
-import Data.ByteString.Lazy qualified as BL
-import Data.Csv
-import Data.Text (pack)
-import Data.Text qualified as T
-import Data.Text.Lazy qualified as TL
-import Data.Text.Lazy.Encoding qualified as TL
-import Data.Text.Read qualified as DTR
+import Conduit (ConduitT, yieldMany, mapC, (.|))
+import Data.ByteString.Lazy qualified as BL
+import Data.Csv qualified as CSV
+import Data.Csv ((.=), (.:))
+import Data.Text qualified as T
+import Data.Text.Lazy qualified as TL
+import Data.Text.Lazy.Encoding qualified as TLE
+import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
-import Data.Vector (Vector)
import Data.Vector qualified as V
-import Gargantext.Core.Text ( sentences, unsentences )
-import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
+import Data.Vector ((!?))
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn
---------------------------------
-- Utility types and functions --
---------------------------------
-- | Default publication year, used when a document provides none
defaultYear :: Int
defaultYear = 1973
-- | Default publication month (January, as in the UNIX epoch)
defaultMonth :: Int
defaultMonth = 1
-- | Default publication day (the first of the month, as in the UNIX epoch)
defaultDay :: Int
defaultDay = 1
-- | The possible delimiters for a CSV file
-data LineDelimiter = Newline deriving (Eq, Show)
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
-class ToWord8 a where
-  -- | Convert abstract representation into a Word8 character
-  toWord8 :: a -> Word8
-instance ToWord8 LineDelimiter where
-  toWord8 Newline = fromIntegral $ ord '\n'
-instance ToWord8 ColumnDelimiter where
-  toWord8 Tab = fromIntegral $ ord '\t'
-  toWord8 Comma = fromIntegral $ ord ','
-  toWord8 Semicolon = fromIntegral $ ord ';'
----------------------------------------------------------------
--- | Minimal header for a working TSV import
-minimalTsvHeader :: Header
-minimalTsvHeader =
-  header [ "title"
-         , "source"
-         , "publication_year"
-         , "publication_month"
-         , "publication_day"
-         , "abstract"
-         , "authors"
-         ]
+-- | Concrete representation of the above type
+toWord8 :: ColumnDelimiter -> Word8
+toWord8 Tab = fromIntegral $ ord '\t'
+toWord8 Comma = fromIntegral $ ord ','
+toWord8 Semicolon = fromIntegral $ ord ';'
+-- | Just the newline character in `Word8` form; handy to have around
+newline :: Word8
+newline = fromIntegral $ ord '\n'
+-- | Split a text depending on the delimiter type
+split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
+split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
+-- | Remove all occurrences of `needle` in `haystack`
+removeText :: T.Text -- ^ The text to remove (the `needle`)
+           -> T.Text -- ^ The original text (the `haystack`)
+           -> T.Text
+removeText needle haystack = T.replace needle "" haystack
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
-        nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
-                                       , tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
-                                       , .. }
-                       ) (tail' "splitDoc'2" abstracts)
-        abstracts = (splitBy $ contextSize) tsv_abstract
+-- | Convert from raw byte string into `Text`
+lBLToText :: BL.ByteString -> Text
+lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
+-- | Fail in the Either monad iff a predicate is False
+assertE :: Bool -- ^ The predicate to be tested
+        -> e    -- ^ The error if the predicate fails
+        -> Either e ()
+assertE predicate err = if predicate then Right () else Left err
+-- | Try getting a value out of a `Maybe`; if it does not work,
+-- return an error message
+tryE :: Maybe a    -- ^ The value to extract
+     -> e          -- ^ The error message
+     -> Either e a -- ^ The extracted value, in the `Either` monad
+tryE Nothing err = Left err
+tryE (Just value) _ = Right value
+-- | Tests whether the input text can be parsed as a number
+parseNumber :: Text -> Maybe Int
+parseNumber str = case DTR.decimal str of
+  Right (x, "") -> Just x
+  _             -> Nothing
+-- | An integer that may be the result of parsing either an integer value
+-- or a floating-point value. If the parsed value was a floating-point number,
+-- it is cropped to take only its integer part.
+newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
+  deriving (Show, Eq, Read)
+instance CSV.FromField IntOrDec where
-- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
Left _ -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
Right n -> pure $ IntOrDec n
instance CSV.ToField IntOrDec where
toField = CSV.toField . unIntOrDec
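The `FromField` instance above first tries to parse the field as an `Int` and falls back to `Double`, cropping to the integer part. Illustratively (GHCi, with OverloadedStrings for the field literals; not part of the commit):

>>> CSV.runParser (CSV.parseField "2005" :: CSV.Parser IntOrDec)
Right (IntOrDec {unIntOrDec = 2005})
>>> CSV.runParser (CSV.parseField "2005.7" :: CSV.Parser IntOrDec)
Right (IntOrDec {unIntOrDec = 2005})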
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
-docsSize :: Vector TsvDoc -> Mean
-docsSize tsvDoc = mean ls
-  where
-    ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
+-- | Default options for TSV encoding, except for the delimiter,
+-- which is passed as argument
+defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
+defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
+-- | Default options for TSV decoding, except for the delimiter,
+-- which is passed as argument
+defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
+defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
defaultYear :: Int
defaultYear = 1973
defaultMonth :: Int
defaultMonth = 1
defaultDay :: Int
defaultDay = 1
data TsvDoc = TsvDoc
  { tsv_title :: !Text
  , tsv_source :: !Text
  , tsv_publication_year :: !(Maybe IntOrDec)
  , tsv_publication_month :: !(Maybe Int)
  , tsv_publication_day :: !(Maybe Int)
  , tsv_abstract :: !Text
  , tsv_authors :: !Text
  }
  deriving (Show)
-instance FromNamedRecord TsvDoc where
+instance CSV.FromNamedRecord TsvDoc where
  parseNamedRecord r = do
    tsv_title <- r .: "title" <|> r .: "Title"
    tsv_source <- r .: "source" <|> r .: "Source"
    tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
    tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
    tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
    tsv_authors <- r .: "authors" <|> r .: "Authors"
    pure $ TsvDoc { .. }
-instance ToNamedRecord TsvDoc where
+instance CSV.ToNamedRecord TsvDoc where
  toNamedRecord (TsvDoc{ .. }) =
-    namedRecord [ "title" .= tsv_title
+    CSV.namedRecord [ "title" .= tsv_title
                , "source" .= tsv_source
                , "publication_year" .= tsv_publication_year
                , "publication_month" .= tsv_publication_month
@@ -149,278 +181,398 @@ instance ToNamedRecord TsvDoc where
                , "authors" .= tsv_authors
                ]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = maybe defaultYear unIntOrDec tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
minimalTsvImportHeaders :: [Text]
minimalTsvImportHeaders = [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"
]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
minimalTsvExportHeader :: CSV.Header
minimalTsvExportHeader =
CSV.header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
-- | Parse a TSV file into the native HyperdataDocument datatype.
-- Returns `Left <error message>` upon failure
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv filepath = do
eitherTsvFile <- readTSVFile filepath
return $ do
(_header, tsvDocs) <- eitherTsvFile
let hyperdataDocs = tsv2doc <$> tsvDocs
return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC tsvFile =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case testCorrectFile tsvFile of
Left err -> (Left err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
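One possible way to consume the conduit returned by `parseTsvC` (a sketch only; `collectTsvDocs` is a hypothetical helper, not part of this module):

import Conduit (runConduitPure, sinkList, (.|))

collectTsvDocs :: BL.ByteString -> Either Text [HyperdataDocument]
collectTsvDocs bs =
  (\(_count, docsC, _warn) -> runConduitPure (docsC .| sinkList)) <$> parseTsvC bs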
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case testCorrectFile file of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
writeDocs2Tsv :: FilePath -- ^ The output filepath
-> [HyperdataDocument] -- ^ The hyperdata documents to write
-> IO ()
writeDocs2Tsv filepath hyperdataDocs = do
let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
minimalTsvExportHeader
tsvDocs
BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
-- and the transition TsvDoc datatype
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h =
TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
  where
    m = maybe "" identity
    mI = maybe 0 identity
-tsvDecodeOptions :: ColumnDelimiter -> DecodeOptions
-tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = toWord8 d}
-tsvEncodeOptions :: ColumnDelimiter -> EncodeOptions
-tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = toWord8 d}
+-------------------------------------
------------------------------------------------------------------------
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (CSV.Header, V.Vector TsvDoc)
readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== toWord8 Newline) bs
vec = V.fromList x in
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile file = do
delimiter <- findColumnDelimiter file
let rows = BL.splitWith (== newline) file
headers <- getHeaders rows delimiter
(\content -> (delimiter, content)) <$> checkRows rows delimiter headers
-- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
-- Note that the first line (the header line) must be included,
-- but will not be checked here.
checkRows :: [BL.ByteString] -- ^ The file, split in rows
-> ColumnDelimiter -- ^ The purported delimiter type
-> [Text] -- ^ The headers
-> Either Text [Text]
checkRows fileLines delimiter headers =
checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs
  | testDelimiter Tab bs = Right Tab
  | testDelimiter Comma bs = Right Comma
-  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
+  | testDelimiter Semicolon bs = Right Semicolon
+  | otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
-lBLToText :: BL.ByteString -> Text
-lBLToText b = TL.toStrict $ TL.decodeUtf8 b
-validNumber :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
-validNumber x columnHeader ligne warn = do
-  let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
-  case isNumeric number of
-    Right val
-      | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
-      | otherwise -> Right warn
-    Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
+-- | Get a TSV file's header fields
+getHeaders :: [BL.ByteString]    -- ^ The input file, split into lines
+           -> ColumnDelimiter    -- ^ This CSV/TSV's delimiter
+           -> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
+getHeaders rows delimiter =
+  case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of
+    Nothing -> Left "Error in function `getHeaders`: Empty file"
+    Just headers -> case testAllHeadersPresence $ removeText "\"" <$> headers of
+      Left missing -> Left $ "Error in function `getHeaders`: Missing headers: " <> T.intercalate ", " missing
+      Right () -> Right headers
validTextField :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
validTextField x columnHeader ligne warn = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then Right warn
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else Right ( ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty") : warn)
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
testValue val columnHeader ligne warn = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne warn
"Publication Month" -> validNumber val columnHeader ligne warn
"Publication Year" -> validNumber val columnHeader ligne warn
"Authors" -> validTextField val columnHeader ligne warn
"Title" -> validTextField val columnHeader ligne warn
"Source" -> validTextField val columnHeader ligne warn
"Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn
testErrorPerLine :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l _ = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne warn =
case testValue v h ligne warn of
Left _err -> Left _err
Right warning -> testErrorPerLine val del headers ligne warning
checkNextLine :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (== toWord8 del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (== toWord8 del) res))
Just value -> if length value > 1
then Right (x, (BL.splitWith (== toWord8 del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (== toWord8 del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
if (length tmp > length headers) || (V.length bl == (x + 1))
then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
else do
case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
-- | Check that the file is well-formed and throw warnings/errors accordingly
checkIntegrity :: Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Int -- ^ Current line number
-> [Text] -- ^ Warning accumulator
-> Either Text [Text] -- ^ Left error if an error has occured, otherwise Right (list of warnings)
checkIntegrity tsvLines delim headers lineNumber warnings
| length tsvLines == lineNumber = Right warnings
| otherwise =
case (V.!?) tsvLines lineNumber of
Nothing -> Left "Gargantext.Core.Text.Corpus.Parsers.Tsv: error in function checkTsv"
Just currentLine ->
case getMultipleLinefile tsvLines delim headers currentLine lineNumber of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val delim headers (lineNumber + 1) warnings of
Left _err -> Left _err
Right warning -> checkIntegrity tsvLines delim headers (y+1) warning
testIfErrorInFile :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Either Text [Text]
testIfErrorInFile bl del headers = checkIntegrity (V.fromList bl) del headers 2 []
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile bs =
case findColumnDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (== toWord8 Newline) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> (\content -> (del, content)) <$> testIfErrorInFile bl del headers
-- | Check that all required headers are present for the basic TSV type
testAllHeadersPresence :: [Text] -- ^ List of headers
-> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
testAllHeadersPresence headers = do
-- Remove the \r character that sometimes appears at the end of a line
let cleanHeaders = removeText "\r" <$> headers
let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
if null missingHeaders
then Right ()
else Left missingHeaders
----------Test headers added to ggt
-- | Test whether the given file has the given character as a delimiter
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter delimiter file = fromMaybe False $ do
-- do-notation in the Maybe monad for nice pattern matching
let rows = V.fromList $ BL.splitWith (== newline) file
headers <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
return $ length headers == length firstRow && length headers > 2
-- | Check a TSV file's rows starting at the given line number
checkFromLine :: Int -- ^ The line number from which to check
-> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Either Text [Text]
-- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
checkFromLine lineNumber tsvLines delim headers = do
assertE (length tsvLines == lineNumber)
"Error in function checkIntegrity: queried line number is greater than actual number of lines"
currentLine <- tryE (tsvLines !? lineNumber)
"Error in function checkIntegrity: no line at queried number"
(y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
restOfWarnings <- checkFromLine (y+1) tsvLines delim headers
return $ currentLineWarnings <> restOfWarnings
-- | Parse the row given as input. If the input is not a whole row, only part of a row,
-- fetch the next line and interpret it as the rest of the row
checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ Input row, or part of a row
-> Int -- ^ Current line number
-> Either Text (Int, [BL.ByteString])
-- ^ Left: Error message; Right: (Line number after the row, list of cells)
checkPartialRow fileLines delimiter headers partialRow lineNum = do
let partialCells = split delimiter partialRow
if length partialCells == length headers
-- We've got a full row, we can check its contents:
then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
else do
assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
(T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
"checkPartialRow"
checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Look at the line after the current row: if it starts a new row, the current
-- row is complete; otherwise append it to the current row and keep looking
checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-> ColumnDelimiter -- ^ Type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ The row accumulated so far
-> Int -- ^ Line number
-> (Int, [BL.ByteString])
checkNextLine allLines delimiter headers res lineNumber =
case allLines !? (lineNumber + 1) of
Nothing -> (lineNumber, split delimiter res)
Just nextLine -> let nextRow = split delimiter nextLine in
if length nextRow > 1
then (lineNumber, split delimiter res)
else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ The list of headers
-> Int -- ^ Current line; used for logging purposes
-> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
checkRow [] _ [] _ = Right []
-- More cells than headers: error
checkRow _ delimiter [] line = Left $
"Too many fields at line " <> (T.pack . show) line <> "." <>
if delimiter /= Comma
then ""
else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
checkRow (v:val) delimiter (h:headers) line = do
warning <- checkValue v h line
warnings <- checkRow val delimiter headers line
return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
checkValue :: BL.ByteString -- ^ The field to check
-> Text -- ^ Current column header; passed as input for logging purposes
-> Int -- ^ Current line number; passed as input for logging purposes
-> Either Text [Text]
checkValue field columnHeader lineNumber = case columnHeader of
"Publication Day" -> checkNumber field columnHeader lineNumber
"Publication Month" -> checkNumber field columnHeader lineNumber
"Publication Year" -> checkNumber field columnHeader lineNumber
"Authors" -> checkTextField field columnHeader lineNumber
"Title" -> checkTextField field columnHeader lineNumber
"Source" -> checkTextField field columnHeader lineNumber
"Abstract" -> checkTextField field columnHeader lineNumber
_ -> Right []
-- | Checks whether a given chunk of text is a valid number
checkNumber :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkNumber field columnHeader line = do
let cleanField = removeText "\"" $ lBLToText field
value <- tryE (parseNumber cleanField) $
"Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
assertE (value >= 0) $
"Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
return []
-- | Checks whether a given chunk of text is a valid text field
-- (i.e. does not have parasite quotes)
checkTextField :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkTextField field columnHeader ligne = do
let cleanField = removeText "\"\"" $ lBLToText field
if T.null cleanField
then Right ["The column '" <> columnHeader <> "' at line " <> T.pack (show ligne) <> " is empty"]
else do
assertE (
-- Field is not empty...
(T.length cleanField > 0) &&
-- ... and either there are no quotes at all...
( T.length (T.filter (== '\"') cleanField) == 0 ||
-- ... or there is only one quote at the beginning and one at the end
((T.head cleanField == '"') &&
(T.last cleanField == '"') &&
(T.length (T.filter (== '\"') cleanField) == 2)
)
)) $
"Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field "
return []
---------------------------------------
-- | Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
-- use a map to remove \r that sometimes appear at the end of a line
-testAllHeadersPresence :: [Text] -> Either Text [Text]
-testAllHeadersPresence headers = do
-  let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
-  if null listHeaders
-    then Right headers
-    else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
-getHeaders :: [BL.ByteString] -> ColumnDelimiter -> Either Text [Text]
-getHeaders bl del = do
-  let vec = V.fromList bl in
-    case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
-      Nothing -> Left "Error getHeaders"
-      Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
+instance CSV.FromNamedRecord Tsv' where
+  parseNamedRecord r = do
+    tsv'_title <- r .: "title"
+    tsv'_source <- r .: "source"
+    tsv'_publication_year <- r .: "publication_year"
+    tsv'_publication_month <- r .: "publication_month"
+    tsv'_publication_day <- r .: "publication_day"
+    tsv'_abstract <- r .: "abstract"
+    tsv'_authors <- r .: "authors"
+    tsv'_weight <- r .: "weight"
+    pure $ Tsv' { .. }
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case (testCorrectFile file) of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | TODO use readByteStringLazy
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
-------------------------------------------------------------------------
--- | TODO use readFileLazy
-readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal))
-readTsvHal fp = do
-  c <- BL.readFile fp
-  pure $ readTsvHalLazyBS c
+readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
+readWeightedTsv fp =
+  (\bs ->
+    case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
+      Left e -> panicTrace (T.pack e)
+      Right corpus -> corpus
+  ) <$> BL.readFile fp
-- | TODO use readByteStringLazy
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
-hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
-hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) minimalTsvHeader (map hyperdataDocument2tsvDoc hs)
-------------------------------------------------------------------------
--- Hal Format
+-----------------------
+-- | The data of a HAL document
data TsvHal = TsvHal
  { tsvHal_title :: !Text
  , tsvHal_source :: !Text
  , tsvHal_publication_year :: !Integer
  , tsvHal_publication_month :: !Int
  , tsvHal_publication_day :: !Int
  , tsvHal_abstract :: !Text
  , tsvHal_authors :: !Text
  , tsvHal_url :: !Text
  , tsvHal_isbn_s :: !Text
  , tsvHal_issue_s :: !Text
  , tsvHal_journalPublisher_s:: !Text
  , tsvHal_language_s :: !Text
  , tsvHal_doiId_s :: !Text
  , tsvHal_authId_i :: !Text
  , tsvHal_instStructId_i :: !Text
  , tsvHal_deptStructId_i :: !Text
  , tsvHal_labStructId_i :: !Text
  , tsvHal_rteamStructId_i :: !Text
  , tsvHal_docType_s :: !Text
  }
  deriving (Show)
-instance FromNamedRecord TsvHal where
+instance CSV.FromNamedRecord TsvHal where
  parseNamedRecord r = do
    tsvHal_title <- r .: "title"
    tsvHal_source <- r .: "source"
@@ -443,10 +595,9 @@ instance FromNamedRecord TsvHal where
    tsvHal_docType_s <- r .: "docType_s"
    pure $ TsvHal { .. }
-instance ToNamedRecord TsvHal where
---toNamedRecord (TsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
+instance CSV.ToNamedRecord TsvHal where
  toNamedRecord (TsvHal { .. }) =
-    namedRecord [ "title" .= tsvHal_title
+    CSV.namedRecord [ "title" .= tsvHal_title
                , "source" .= tsvHal_source
                , "publication_year" .= tsvHal_publication_year
@@ -472,6 +623,21 @@ instance ToNamedRecord TsvHal where
                , "docType_s" .= tsvHal_docType_s
                ]
readTsvHal :: FilePath
-> IO (Either Text (CSV.Header, V.Vector TsvHal))
readTsvHal path = do
file <- BL.readFile path
return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc (TsvHal { .. }) =
  HyperdataDocument { _hd_bdd = Just "TsvHal"
@@ -483,7 +649,7 @@ tsvHal2doc (TsvHal { .. }) =
                    , _hd_institutes = Just tsvHal_instStructId_i
                    , _hd_source = Just tsvHal_source
                    , _hd_abstract = Just tsvHal_abstract
-                    , _hd_publication_date = Just $ pack . show $ jour tsvHal_publication_year
+                    , _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
                                                     tsvHal_publication_month
                                                     tsvHal_publication_day
                    , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
@@ -495,100 +661,3 @@ tsvHal2doc (TsvHal { .. }) =
                    , _hd_language_iso2 = Nothing
                    , _hd_institutes_tree = Nothing }
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = fromMIntOrDec defaultYear tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right (del,_) -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC bs =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case (testCorrectFile bs) of
Left _err -> (Left _err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del bs, Warn.MalformedCorpus (T.intercalate (pack "\n") warn))
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
readWeightedTsv :: FilePath -> IO (Header, Vector Tsv')
readWeightedTsv fp =
fmap (\bs ->
case decodeByNameWith (tsvDecodeOptions Tab) bs of
Left e -> panicTrace (pack e)
Right corpus -> corpus
) $ BL.readFile fp
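Taken together, a typical use of the reorganized module might look like the following sketch (illustrative only; the file paths and the `roundTrip` helper are placeholders, not part of the commit):

import qualified Data.Text as T
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseTsv, writeDocs2Tsv)

-- Parse a TSV corpus and write it back out with the default (tab) encoding.
roundTrip :: IO ()
roundTrip = do
  eDocs <- parseTsv "corpus.tsv"            -- placeholder input path
  case eDocs of
    Left err   -> putStrLn ("TSV parsing failed: " <> T.unpack err)
    Right docs -> writeDocs2Tsv "corpus-roundtrip.tsv" docs  -- placeholder output path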