[tsv] draft for new tsv v4 parser with publication_date

parent d36a9126
Pipeline #7933 failed with stages
in 25 minutes and 12 seconds
...@@ -229,6 +229,10 @@ library ...@@ -229,6 +229,10 @@ library
Gargantext.Core.Text.Corpus.Parsers.Types Gargantext.Core.Text.Corpus.Parsers.Types
Gargantext.Core.Text.Corpus.Parsers.Date Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV Gargantext.Core.Text.List.Formats.TSV
......
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE ViewPatterns #-}
{-| {-|
...@@ -67,184 +66,9 @@ import Prelude (String) ...@@ -67,184 +66,9 @@ import Prelude (String)
import Protolude import Protolude
-- | Little helper data structure to make working with the incremental
-- TSV parsing a bit nicer.
data ParsingContext a
= MkHeaderParsingContext (CSVI.HeaderParser (CSVI.Parser a))
| MkRecordParsingContext (RecordParsingContext a)
deriving Show
data RecordParsingContext a = RecordParsingContext
{ -- Either the header parser, if we just started parsing the document, or actual
-- incremental parser for the records.
-- This field will contain 'Nothing' if the parser has been drained and we need
-- to stop recursion.
_prs_ctx_parser :: Maybe (CSVI.Parser a)
, _prs_ctx_parsed_records :: [a]
-- | Keeps track of the current row we are in, so that we can insert proper error
-- diagnostics.
, _prs_ctx_row_cursor :: !Int
} deriving Show
makeLenses ''RecordParsingContext
---------------------------------------------------------------
headerTsvGargV3 :: Header
headerTsvGargV3 =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
---------------------------------------------------------------
data TsvGargV3 = TsvGargV3
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_year :: !Int
, d_publication_month :: !Int
, d_publication_day :: !Int
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
defaultYear :: Int
defaultYear = 1973
defaultMonth :: Int
defaultMonth = 1
defaultDay :: Int
defaultDay = 1
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance FromNamedRecord TsvDoc where
parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source"
tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
delimiter Line = fromIntegral $ ord '\n'
------------------------------------------------------------------------ ------------------------------------------------------------------------
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs
vec = V.fromList x in
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne = do
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
case isNumeric number of
Right val
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
|otherwise -> Right True
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then return True
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else return True
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne = case columnHeader of testValue val columnHeader ligne = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne "Publication Day" -> validNumber val columnHeader ligne
...@@ -254,12 +78,13 @@ testValue val columnHeader ligne = case columnHeader of ...@@ -254,12 +78,13 @@ testValue val columnHeader ligne = case columnHeader of
"Title" -> validTextField val columnHeader ligne "Title" -> validTextField val columnHeader ligne
"Source" -> validTextField val columnHeader ligne "Source" -> validTextField val columnHeader ligne
"Abstract" -> validTextField val columnHeader ligne "Abstract" -> validTextField val columnHeader ligne
"Publication Date" -> validTextField val columnHeaer ligne
_ -> Right True _ -> Right True
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine [] _ [] _ = Right True testErrorPerLine [] _ [] _ = Right True
testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.") testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too many fields at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l) | otherwise = Left (pack $ "Too many fields at line " <> show l)
testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l) testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne = testErrorPerLine (v:val) del (h:headers) ligne =
case testValue v h ligne of case testValue v h ligne of
...@@ -342,7 +167,7 @@ count c = BL.count (fromIntegral (fromEnum c)) ...@@ -342,7 +167,7 @@ count c = BL.count (fromIntegral (fromEnum c))
-- use a map to remove \r that sometimes appear at the end of a line -- use a map to remove \r that sometimes appear at the end of a line
testAllHeadersPresence :: [Text] -> Either Text [Text] testAllHeadersPresence :: [Text] -> Either Text [Text]
testAllHeadersPresence headers = do testAllHeadersPresence headers = do
let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack "")) headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
if null listHeaders if null listHeaders
then Right headers then Right headers
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders) else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
......
module Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics where
import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
import Gargantext.Prelude
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs
vec = V.fromList x in
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne = do
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
case isNumeric number of
Right val
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
|otherwise -> Right True
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then return True
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else return True
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
module Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec where
import Data.Csv (FromField(..), ToField(..), Parser, runParser)
import Gargantext.Prelude
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
defaultYear :: Int
defaultYear = 1973
defaultMonth :: Int
defaultMonth = 1
defaultDay :: Int
defaultDay = 1
{-# LANGUAGE BangPatterns #-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 where
import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
headerTsvGargV3 :: Header
headerTsvGargV3 =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvGargV3 = TsvGargV3
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_year :: !Int
, d_publication_month :: !Int
, d_publication_day :: !Int
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance FromNamedRecord TsvDoc where
parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source"
tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
{-# LANGUAGE BangPatterns #-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 where
import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
headerTsvGargV4 :: Header
headerTsvGargV4 =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvGargV4 = TsvGargV4
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_date :: !Text
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_date :: !Text
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance FromNamedRecord TsvDoc where
parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source"
tsv_publication_date <- r .: "publication_date" <|> r .: "Publication Date"
tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_date" .= tsv_publication_date
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_date = m $ _hd_publication_date h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.Types where
import Data.Csv (DecodeOptions(..), EncodeOptions(..), defaultDecodeOptions, defaultEncodeOptions)
import Data.Csv.Incremental qualified as CSVI
import Gargantext.Prelude
-- | Little helper data structure to make working with the incremental
-- TSV parsing a bit nicer.
data ParsingContext a
= MkHeaderParsingContext (CSVI.HeaderParser (CSVI.Parser a))
| MkRecordParsingContext (RecordParsingContext a)
deriving Show
data RecordParsingContext a = RecordParsingContext
{ -- Either the header parser, if we just started parsing the document, or actual
-- incremental parser for the records.
-- This field will contain 'Nothing' if the parser has been drained and we need
-- to stop recursion.
_prs_ctx_parser :: Maybe (CSVI.Parser a)
, _prs_ctx_parsed_records :: [a]
-- | Keeps track of the current row we are in, so that we can insert proper error
-- diagnostics.
, _prs_ctx_row_cursor :: !Int
} deriving Show
makeLenses ''RecordParsingContext
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
delimiter Line = fromIntegral $ ord '\n'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment