[tsv] draft for new tsv v4 parser with publication_date

bba8091a · Przemyslaw Kaminski · d36a9126 · bba8091a · bba8091a · bba8091a
Verified Commit bba8091a authored Sep 29, 2025 by Przemyslaw Kaminski
7 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -229,6 +229,10 @@ library
      Gargantext.Core.Text.Corpus.Parsers.Types
      Gargantext.Core.Text.Corpus.Parsers.Date
      Gargantext.Core.Text.Corpus.Parsers.TSV
+      Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
+      Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
+      Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
+      Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
      Gargantext.Core.Text.Corpus.Query
      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Formats.TSV

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
 {-# LANGUAGE BangPatterns     #-}
 {-# LANGUAGE LambdaCase       #-}
-{-# LANGUAGE TemplateHaskell  #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ViewPatterns     #-}
 {-|
@@ -67,184 +66,9 @@ import Prelude (String)
 import Protolude
-- | Little helper data structure to make working with the incremental
-- TSV parsing a bit nicer.
-data ParsingContext a
-  = MkHeaderParsingContext (CSVI.HeaderParser (CSVI.Parser a))
-  | MkRecordParsingContext (RecordParsingContext a)
-  deriving Show
-data RecordParsingContext a = RecordParsingContext
-  { -- Either the header parser, if we just started parsing the document, or actual
-    -- incremental parser for the records.
-    -- This field will contain 'Nothing' if the parser has been drained and we need
-    -- to stop recursion.
-    _prs_ctx_parser         :: Maybe (CSVI.Parser a)
-  , _prs_ctx_parsed_records :: [a]
-  -- | Keeps track of the current row we are in, so that we can insert proper error
-  -- diagnostics.
-  , _prs_ctx_row_cursor     :: !Int
-  } deriving Show
-makeLenses ''RecordParsingContext
---------------------------------------------------------------
-headerTsvGargV3 :: Header
-headerTsvGargV3 =
-  header [ "title"
-         , "source"
-         , "publication_year"
-         , "publication_month"
-         , "publication_day"
-         , "abstract"
-         , "authors"
-         ]
---------------------------------------------------------------
-data TsvGargV3 = TsvGargV3
-    { d_docId             :: !Int
-    , d_title             :: !Text
-    , d_source            :: !Text
-    , d_publication_year  :: !Int
-    , d_publication_month :: !Int
-    , d_publication_day   :: !Int
-    , d_abstract          :: !Text
-    , d_authors           :: !Text
-    }
-    deriving (Show)
---------------------------------------------------------------
-newtype IntOrDec = IntOrDec Int
-  deriving (Show, Eq, Read)
-unIntOrDec :: IntOrDec -> Int
-unIntOrDec (IntOrDec i) = i
-instance FromField IntOrDec where
-  parseField s = case runParser (parseField s :: Parser Int) of
-    Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
-    Right n   -> pure $ IntOrDec n
-instance ToField IntOrDec where
-  toField (IntOrDec i) = toField i
-fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
-fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
-defaultYear :: Int
-defaultYear = 1973
-defaultMonth :: Int
-defaultMonth = 1
-defaultDay :: Int
-defaultDay = 1
-data TsvDoc = TsvDoc
-    { tsv_title             :: !Text
-    , tsv_source            :: !Text
-    , tsv_publication_year  :: !(Maybe IntOrDec)
-    , tsv_publication_month :: !(Maybe Int)
-    , tsv_publication_day   :: !(Maybe Int)
-    , tsv_abstract          :: !Text
-    , tsv_authors           :: !Text
-    }
-    deriving (Show)
-instance FromNamedRecord TsvDoc where
-  parseNamedRecord r = do
-    tsv_title <- r .: "title" <|> r .: "Title"
-    tsv_source <- r .: "source" <|> r .: "Source"
-    tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
-    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
-    tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
-    tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
-    tsv_authors <- r .: "authors" <|> r .: "Authors"
-    pure $ TsvDoc { .. }
-instance ToNamedRecord TsvDoc where
-  toNamedRecord (TsvDoc{ .. }) =
-    namedRecord [ "title"             .= tsv_title
-                , "source"            .= tsv_source
-                , "publication_year"  .= tsv_publication_year
-                , "publication_month" .= tsv_publication_month
-                , "publication_day"   .= tsv_publication_day
-                , "abstract"          .= tsv_abstract
-                , "authors"           .= tsv_authors
-                ]
-hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
-hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
-                                    , tsv_source = m $ _hd_source h
-                                    , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
-                                    , tsv_publication_month = Just $ mI $ _hd_publication_month h
-                                    , tsv_publication_day = Just $ mI $ _hd_publication_day   h
-                                    , tsv_abstract = m $ _hd_abstract h
-                                    , tsv_authors = m $ _hd_authors h }
-  where
-    m = maybe "" identity
-    mI = maybe 0 identity
-data Delimiter = Tab | Comma | Line deriving (Eq, Show)
-tsvDecodeOptions :: Delimiter -> DecodeOptions
-tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
-tsvEncodeOptions :: Delimiter -> EncodeOptions
-tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
-delimiter :: Delimiter -> Word8
-delimiter Tab   = fromIntegral $ ord '\t'
-delimiter Comma = fromIntegral $ ord ','
-delimiter Line  = fromIntegral $ ord '\n'
 ------------------------------------------------------------------------
-testDelimiter :: Delimiter -> BL.ByteString -> Bool
-testDelimiter del bs =
-    let x = BL.splitWith (== delimiter Line) bs
-        vec = V.fromList x in
-          case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
-            Nothing -> False
-            Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
-              Nothing -> False
-              Just f -> length e == length f && length e > 2
-findDelimiter :: BL.ByteString -> Either Text Delimiter
-findDelimiter bs
-  | testDelimiter Tab bs = Right Tab
-  | testDelimiter Comma bs = Right Comma
-  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
-isNumeric :: Text -> Either Bool Int
-isNumeric str = case DTR.decimal str of
-  Right (x,y) -> if y == ""
-    then Right x
-    else Left False
-  Left _ -> Left False
-lBLToText :: BL.ByteString -> Text
-lBLToText b = TL.toStrict $ TL.decodeUtf8 b
-validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
-validNumber x columnHeader ligne = do
-    let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
-    case isNumeric number of
-      Right val 
-        | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
-        |otherwise -> Right True
-      Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
-validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
-validTextField x columnHeader ligne = do
-  let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
-    if not (T.null xs) 
-      then 
-        if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2))) 
-          then return True 
-          else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field ")
-      else return True
-      -- else Left $ ("The column '" <> columnHeader <> "' at line " <>  pack (show ligne) <> " is empty")
-      -- Put a warning for the user to know their is a problem (empty column)
 testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
 testValue val columnHeader ligne = case columnHeader of
    "Publication Day" -> validNumber val columnHeader ligne
@@ -254,12 +78,13 @@ testValue val columnHeader ligne = case columnHeader of
    "Title" -> validTextField val columnHeader ligne
    "Source" -> validTextField val columnHeader ligne
    "Abstract" -> validTextField val columnHeader ligne
+    "Publication Date" -> validTextField val columnHeaer ligne
    _ -> Right True
 testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
 testErrorPerLine [] _ [] _ = Right True
-testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
+testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too many fields at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
-                            | otherwise =  Left (pack $ "Too much field at line " <> show l)
+                            | otherwise =  Left (pack $ "Too many fields at line " <> show l)
 testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
 testErrorPerLine (v:val) del (h:headers) ligne =
    case testValue v h ligne of
@@ -342,7 +167,7 @@ count c = BL.count (fromIntegral (fromEnum c))
 -- use a map to remove \r that sometimes appear at the end of a line
 testAllHeadersPresence :: [Text] -> Either Text [Text]
 testAllHeadersPresence headers = do
-    let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+    let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack "")) headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
    if null listHeaders
        then Right headers
        else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Diagnostics.hs
+module Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics where
+import Data.Csv
+import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
+import Gargantext.Prelude
+testDelimiter :: Delimiter -> BL.ByteString -> Bool
+testDelimiter del bs =
+    let x = BL.splitWith (== delimiter Line) bs
+        vec = V.fromList x in
+          case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
+            Nothing -> False
+            Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
+              Nothing -> False
+              Just f -> length e == length f && length e > 2
+findDelimiter :: BL.ByteString -> Either Text Delimiter
+findDelimiter bs
+  | testDelimiter Tab bs = Right Tab
+  | testDelimiter Comma bs = Right Comma
+  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
+isNumeric :: Text -> Either Bool Int
+isNumeric str = case DTR.decimal str of
+  Right (x,y) -> if y == ""
+    then Right x
+    else Left False
+  Left _ -> Left False
+lBLToText :: BL.ByteString -> Text
+lBLToText b = TL.toStrict $ TL.decodeUtf8 b
+validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
+validNumber x columnHeader ligne = do
+    let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
+    case isNumeric number of
+      Right val 
+        | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
+        |otherwise -> Right True
+      Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
+validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
+validTextField x columnHeader ligne = do
+  let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
+    if not (T.null xs) 
+      then 
+        if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2))) 
+          then return True 
+          else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field ")
+      else return True
+      -- else Left $ ("The column '" <> columnHeader <> "' at line " <>  pack (show ligne) <> " is empty")
+      -- Put a warning for the user to know their is a problem (empty column)
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/IntOrDec.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/IntOrDec.hs
+module Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec where
+import Data.Csv (FromField(..), ToField(..), Parser, runParser)
+import Gargantext.Prelude
+newtype IntOrDec = IntOrDec Int
+  deriving (Show, Eq, Read)
+unIntOrDec :: IntOrDec -> Int
+unIntOrDec (IntOrDec i) = i
+instance FromField IntOrDec where
+  parseField s = case runParser (parseField s :: Parser Int) of
+    Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
+    Right n   -> pure $ IntOrDec n
+instance ToField IntOrDec where
+  toField (IntOrDec i) = toField i
+fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
+fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
+defaultYear :: Int
+defaultYear = 1973
+defaultMonth :: Int
+defaultMonth = 1
+defaultDay :: Int
+defaultDay = 1
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv3.hs
+{-# LANGUAGE BangPatterns     #-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 where
+import Data.Csv
+import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
+import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
+import Gargantext.Prelude
+headerTsvGargV3 :: Header
+headerTsvGargV3 =
+  header [ "title"
+         , "source"
+         , "publication_year"
+         , "publication_month"
+         , "publication_day"
+         , "abstract"
+         , "authors"
+         ]
+data TsvGargV3 = TsvGargV3
+    { d_docId             :: !Int
+    , d_title             :: !Text
+    , d_source            :: !Text
+    , d_publication_year  :: !Int
+    , d_publication_month :: !Int
+    , d_publication_day   :: !Int
+    , d_abstract          :: !Text
+    , d_authors           :: !Text
+    }
+    deriving (Show)
+data TsvDoc = TsvDoc
+    { tsv_title             :: !Text
+    , tsv_source            :: !Text
+    , tsv_publication_year  :: !(Maybe IntOrDec)
+    , tsv_publication_month :: !(Maybe Int)
+    , tsv_publication_day   :: !(Maybe Int)
+    , tsv_abstract          :: !Text
+    , tsv_authors           :: !Text
+    }
+    deriving (Show)
+instance FromNamedRecord TsvDoc where
+  parseNamedRecord r = do
+    tsv_title <- r .: "title" <|> r .: "Title"
+    tsv_source <- r .: "source" <|> r .: "Source"
+    tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
+    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
+    tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
+    tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
+    tsv_authors <- r .: "authors" <|> r .: "Authors"
+    pure $ TsvDoc { .. }
+instance ToNamedRecord TsvDoc where
+  toNamedRecord (TsvDoc{ .. }) =
+    namedRecord [ "title"             .= tsv_title
+                , "source"            .= tsv_source
+                , "publication_year"  .= tsv_publication_year
+                , "publication_month" .= tsv_publication_month
+                , "publication_day"   .= tsv_publication_day
+                , "abstract"          .= tsv_abstract
+                , "authors"           .= tsv_authors
+                ]
+hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
+hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
+                                    , tsv_source = m $ _hd_source h
+                                    , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
+                                    , tsv_publication_month = Just $ mI $ _hd_publication_month h
+                                    , tsv_publication_day = Just $ mI $ _hd_publication_day   h
+                                    , tsv_abstract = m $ _hd_abstract h
+                                    , tsv_authors = m $ _hd_authors h }
+  where
+    m = maybe "" identity
+    mI = maybe 0 identity
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/TSVv4.hs
+{-# LANGUAGE BangPatterns     #-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 where
+import Data.Csv
+import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
+import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
+import Gargantext.Prelude
+headerTsvGargV4 :: Header
+headerTsvGargV4 =
+  header [ "title"
+         , "source"
+         , "publication_year"
+         , "publication_month"
+         , "publication_day"
+         , "abstract"
+         , "authors"
+         ]
+data TsvGargV4 = TsvGargV4
+    { d_docId             :: !Int
+    , d_title             :: !Text
+    , d_source            :: !Text
+    , d_publication_date  :: !Text
+    , d_abstract          :: !Text
+    , d_authors           :: !Text
+    }
+    deriving (Show)
+data TsvDoc = TsvDoc
+    { tsv_title             :: !Text
+    , tsv_source            :: !Text
+    , tsv_publication_date  :: !Text
+    , tsv_abstract          :: !Text
+    , tsv_authors           :: !Text
+    }
+    deriving (Show)
+instance FromNamedRecord TsvDoc where
+  parseNamedRecord r = do
+    tsv_title <- r .: "title" <|> r .: "Title"
+    tsv_source <- r .: "source" <|> r .: "Source"
+    tsv_publication_date <- r .: "publication_date" <|> r .: "Publication Date"
+    tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
+    tsv_authors <- r .: "authors" <|> r .: "Authors"
+    pure $ TsvDoc { .. }
+instance ToNamedRecord TsvDoc where
+  toNamedRecord (TsvDoc{ .. }) =
+    namedRecord [ "title"             .= tsv_title
+                , "source"            .= tsv_source
+                , "publication_date"  .= tsv_publication_date
+                , "abstract"          .= tsv_abstract
+                , "authors"           .= tsv_authors
+                ]
+hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
+hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
+                                    , tsv_source = m $ _hd_source h
+                                    , tsv_publication_date = m $ _hd_publication_date h
+                                    , tsv_abstract = m $ _hd_abstract h
+                                    , tsv_authors = m $ _hd_authors h }
+  where
+    m = maybe "" identity
--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Types.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV/Types.hs
+{-# LANGUAGE TemplateHaskell  #-}
+module Gargantext.Core.Text.Corpus.Parsers.TSV.Types where
+import Data.Csv (DecodeOptions(..), EncodeOptions(..), defaultDecodeOptions, defaultEncodeOptions)
+import Data.Csv.Incremental qualified as CSVI
+import Gargantext.Prelude
+-- | Little helper data structure to make working with the incremental
+-- TSV parsing a bit nicer.
+data ParsingContext a
+  = MkHeaderParsingContext (CSVI.HeaderParser (CSVI.Parser a))
+  | MkRecordParsingContext (RecordParsingContext a)
+  deriving Show
+data RecordParsingContext a = RecordParsingContext
+  { -- Either the header parser, if we just started parsing the document, or actual
+    -- incremental parser for the records.
+    -- This field will contain 'Nothing' if the parser has been drained and we need
+    -- to stop recursion.
+    _prs_ctx_parser         :: Maybe (CSVI.Parser a)
+  , _prs_ctx_parsed_records :: [a]
+  -- | Keeps track of the current row we are in, so that we can insert proper error
+  -- diagnostics.
+  , _prs_ctx_row_cursor     :: !Int
+  } deriving Show
+makeLenses ''RecordParsingContext
+data Delimiter = Tab | Comma | Line deriving (Eq, Show)
+tsvDecodeOptions :: Delimiter -> DecodeOptions
+tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
+tsvEncodeOptions :: Delimiter -> EncodeOptions
+tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
+delimiter :: Delimiter -> Word8
+delimiter Tab   = fromIntegral $ ord '\t'
+delimiter Comma = fromIntegral $ ord ','
+delimiter Line  = fromIntegral $ ord '\n'