......@@ -20,6 +20,9 @@ import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TL
import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
......@@ -205,7 +208,7 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
mI = maybe 0 identity
-- | Separator bytes understood by this module.
--
-- 'Tab' and 'Comma' are the candidate field separators; 'Line' is the newline
-- byte used to split a file into rows. The type is declared exactly once:
-- a second, two-constructor declaration would clash with this one.
data Delimiter = Tab | Comma | Line
  deriving (Eq, Show)
-- | Decoding options identical to 'defaultDecodeOptions' except for the field
-- separator, which is set to the byte 'delimiter' yields for the given value.
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions sep = defaultDecodeOptions { decDelimiter = separatorByte }
  where
    separatorByte = delimiter sep
......@@ -216,6 +219,150 @@ tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
-- | Byte value of the character each 'Delimiter' constructor stands for.
delimiter :: Delimiter -> Word8
delimiter d = fromIntegral . ord $ case d of
  Tab   -> '\t'
  Comma -> ','
  Line  -> '\n'
-- | Heuristic check that @del@ is the field separator used by the file.
--
-- Only the first two rows (the content split on the 'Line' byte) are
-- inspected. The candidate is accepted when both rows split into the same
-- number of fields and that number is greater than two. Content with fewer
-- than two rows is rejected.
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
  -- Match on the head of the row list rather than copying every row of the
  -- file into a 'Vector': nothing past the second row is ever looked at.
  case BL.splitWith (== delimiter Line) bs of
    (firstRow : secondRow : _) ->
      let firstWidth  = length (fields firstRow)
          secondWidth = length (fields secondRow)
      in firstWidth == secondWidth && firstWidth > 2
    _ -> False
  where
    fields = BL.splitWith (== delimiter del)
-- | Pick the first candidate separator accepted by 'testDelimiter', trying
-- 'Tab' before 'Comma'. Returns an error message when neither is accepted.
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs =
  case filter (`testDelimiter` bs) [Tab, Comma] of
    (found : _) -> Right found
    []          -> Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
-- | Run 'DTR.decimal' on @str@ and succeed only when the whole input was
-- consumed. Any parse failure or leftover text yields @Left False@.
isNumeric :: Text -> Either Bool Int
isNumeric str =
  case DTR.decimal str of
    Right (n, leftover) | T.null leftover -> Right n
    _                                     -> Left False
-- | Decode a lazy 'BL.ByteString' as UTF-8 and return it as strict 'Text'.
--
-- NOTE(review): 'TL.decodeUtf8' is documented to throw on invalid UTF-8;
-- confirm that callers are fine with an exception on malformed input.
lBLToText :: BL.ByteString -> Text
lBLToText = TL.toStrict . TL.decodeUtf8
-- | Check that a raw field holds a non-negative integer.
--
-- Every double-quote character is removed from the field before parsing.
--
-- * @x@            – raw bytes of the field
-- * @columnHeader@ – name of the column, used in error messages
-- * @ligne@        – line number, used in error messages
--
-- Returns @Right True@ when the field is valid, otherwise @Left@ with a
-- message saying whether the value is negative or not a number at all.
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne
  | isNegativeLiteral = Left negativeMsg
  | otherwise =
      case isNumeric number of
        Right val
          | val < 0   -> Left negativeMsg
          | otherwise -> Right True
        Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
  where
    number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)

    -- 'isNumeric' relies on 'DTR.decimal', which does not accept a sign, so
    -- "-5" would otherwise be reported as "not a number". A minus sign
    -- followed by a valid unsigned number is reported as negative instead.
    isNegativeLiteral =
      case T.stripPrefix (T.pack "-") number of
        Just digits -> either (const False) (const True) (isNumeric digits)
        Nothing     -> False

    negativeMsg = "Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative"
-- | Check the quoting of a raw text field.
--
-- Doubled quotes (@""@) are removed first. What remains is valid when it is
-- empty, contains no double quote at all, or contains exactly two double
-- quotes located at its first and last position.
--
-- * @x@            – raw bytes of the field
-- * @columnHeader@ – name of the column, used in the error message
-- * @ligne@        – line number, used in the error message
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne
  -- An empty field is accepted for now.
  -- TODO: warn the user that there is a problem (empty column), e.g.
  -- Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
  | T.null xs       = Right True
  | quoteCount == 0 = Right True
  | quoteCount == 2 && quote `T.isPrefixOf` xs && quote `T.isSuffixOf` xs = Right True
  | otherwise =
      Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
  where
    xs         = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x)
    quote      = T.pack "\""
    -- Counted once and shared by the guards above.
    quoteCount = T.length (T.filter (== '"') xs)
-- | Validate one field according to the column it belongs to: the three
-- publication date columns go through 'validNumber', the four textual
-- columns through 'validTextField', and any other column is accepted as is.
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne
  | columnHeader `elem` numericColumns = validNumber val columnHeader ligne
  | columnHeader `elem` textColumns    = validTextField val columnHeader ligne
  | otherwise                          = Right True
  where
    numericColumns = ["Publication Day", "Publication Month", "Publication Year"]
    textColumns    = ["Authors", "Title", "Source", "Abstract"]
-- | Walk the fields of one record alongside the header names, validating each
-- field with 'testValue' and stopping at the first error.
--
-- A record with more fields than headers, or fewer, is reported as an error
-- mentioning the given line number. When the delimiter is 'Comma', the
-- "too many fields" message also suggests switching to tabulation.
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine fields del headers ligne =
  case (fields, headers) of
    ([], [])         -> Right True
    (_, [])          -> Left . pack $ "Too much field at line " <> show ligne <> commaHint
    ([], _)          -> Left (pack $ "Missing one field at line " <> show ligne)
    (v : vs, h : hs) -> testValue v h ligne >> testErrorPerLine vs del hs ligne
  where
    commaHint
      | del == Comma = ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text."
      | otherwise    = ""
-- | Extend a complete record with any continuation rows that follow it.
--
-- @res@ is the record accumulated so far and @x@ the index in @bl@ of the
-- last row it contains. While the row at @x + 1@ exists and splits into at
-- most one field, it is appended to @res@ (no separator byte is re-inserted)
-- and the scan moves on. Otherwise the result is the index of the last row
-- consumed together with the fields of the accumulated record.
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x =
  case bl V.!? next of
    Just line
      | length (splitFields line) <= 1 ->
          checkNextLine bl del headers (BL.append res line) next
    _ -> Right (x, splitFields res)
  where
    next        = x + 1
    splitFields = BL.splitWith (== delimiter del)
-- | Assemble one logical record that may span several physical rows.
--
-- @res@ is the text accumulated so far and @x@ the 0-based index in @bl@ of
-- the last row it contains.
--
-- * When @res@ splits into as many fields as there are headers, the record is
--   handed to 'checkNextLine' to pick up trailing continuation rows.
-- * When it has more fields than headers, or no row is left to append, the
--   file cannot be parsed and an error is returned.
-- * Otherwise the next row is appended and the process repeats.
--
-- The error message reports @x + 1@ so that it uses the same 1-based line
-- numbering as the messages produced through 'anx' and 'testErrorPerLine'.
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x
  | fieldCount == headerCount = checkNextLine bl del headers res x
  | fieldCount > headerCount || V.length bl == x + 1 =
      Left (pack $ "Cannot parse the file at line " <> show (x + 1) <> ". Maybe because of a delimiter")
  | otherwise =
      case bl V.!? (x + 1) of
        Nothing       -> Left "getMultipleLinefile"
        Just nextLine -> getMultipleLinefile bl del headers (BL.append res nextLine) (x + 1)
  where
    fieldCount  = length (BL.splitWith (== delimiter del) res)
    headerCount = length headers
-- | Validate every record of the file starting at row index @x@.
--
-- Each iteration assembles one record with 'getMultipleLinefile', validates
-- its fields with 'testErrorPerLine' (reporting line @x + 1@), then resumes
-- after the last row the record consumed. Reaching the end of @bl@ yields the
-- delimiter unchanged; the first error encountered is returned as is.
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx bl del headers x
  | length bl == x = Right del
  | otherwise = do
      row <- maybe (Left "anx") Right (bl V.!? x)
      (lastRow, fields) <- getMultipleLinefile bl del headers row x
      _ <- testErrorPerLine fields del headers (x + 1)
      anx bl del headers (lastRow + 1)
-- | Validate all rows of the file with 'anx', starting at index 1 so that the
-- first row (index 0) is not checked as data.
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile bl del headers = anx rows del headers firstDataRow
  where
    rows         = V.fromList bl
    firstDataRow = 1
-- | Validate a whole file: detect its delimiter, check its headers, then
-- check every record. Returns the detected delimiter, or the first error.
testCorrectFile :: BL.ByteString -> Either Text Delimiter
testCorrectFile bs = do
  del <- findDelimiter bs
  let rows = BL.splitWith (== delimiter Line) bs
  headers <- getHeaders rows del
  testIfErrorInFile rows del headers
----------Test headers added to ggt
-- | Ensure that every required column name occurs in @headers@.
-- Returns the headers unchanged on success, otherwise an error message
-- listing the missing column names separated by commas.
testAllHeadersPresence :: [Text] -> Either Text [Text]
testAllHeadersPresence headers
  | null missing = Right headers
  | otherwise    = Left (pack " Missing column : " <> T.intercalate ", " missing)
  where
    required = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
    missing  = [ name | name <- required, name `notElem` headers ]
-- | Extract the column names from the first row of the file.
--
-- The row is split on the given delimiter, every double-quote character is
-- removed from each name, and the result is checked with
-- 'testAllHeadersPresence'. An empty row list is an error.
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders bl del =
  case bl of
    []             -> Left "Error getHeaders"
    (firstRow : _) -> testAllHeadersPresence (cleanHeader <$> BL.splitWith (== delimiter del) firstRow)
  where
    cleanHeader = T.replace (T.pack "\"") (T.pack "") . lBLToText
......@@ -251,10 +398,10 @@ readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
-- | Read a file from disk, validate it with 'testCorrectFile', and parse it
-- with the delimiter the validation detected.
--
-- The file is read once. A validation error is returned as @Left@ without
-- attempting to parse; there is no blind "try comma, then tab" pass.
--
-- TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
  file <- BL.readFile fp
  pure $ testCorrectFile file >>= \del -> readTsvLazyBS del file
