Commit e4dfb4bd authored by Grégoire Locqueville's avatar Grégoire Locqueville Committed by Grégoire Locqueville

Some refactoring

parent 9fc00811
...@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header ) ...@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T import Data.Text qualified as T
import Data.Vector (Vector) import Data.Vector (Vector)
import Data.Vector qualified as Vector import Data.Vector qualified as Vector
import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, Delimiter(Tab) ) import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, ColumnDelimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude import Gargantext.Prelude
import System.FilePath.Posix (takeExtension) import System.FilePath.Posix (takeExtension)
......
...@@ -29,9 +29,26 @@ import Data.Vector qualified as V ...@@ -29,9 +29,26 @@ import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences ) import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) ) import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show) import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn import Gargantext.Utils.Jobs.Error as Warn
import Protolude
-- | The possible delimiters for a CSV file
data LineDelimiter = Newline deriving (Eq, Show)
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
class ToWord8 a where
-- | Convert abstract representation into a Word8 character
toWord8 :: a -> Word8
instance ToWord8 LineDelimiter where
toWord8 Newline = fromIntegral $ ord '\n'
instance ToWord8 ColumnDelimiter where
toWord8 Tab = fromIntegral $ ord '\t'
toWord8 Comma = fromIntegral $ ord ','
toWord8 Semicolon = fromIntegral $ ord ';'
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Minimal header for a working TSV import -- | Minimal header for a working TSV import
...@@ -145,35 +162,28 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h ...@@ -145,35 +162,28 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
m = maybe "" identity m = maybe "" identity
mI = maybe 0 identity mI = maybe 0 identity
tsvDecodeOptions :: ColumnDelimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = toWord8 d}
data Delimiter = Tab | Comma | Line deriving (Eq, Show) tsvEncodeOptions :: ColumnDelimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = toWord8 d}
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
delimiter Line = fromIntegral $ ord '\n'
------------------------------------------------------------------------ ------------------------------------------------------------------------
testDelimiter :: Delimiter -> BL.ByteString -> Bool testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter del bs = testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs let x = BL.splitWith (== toWord8 Newline) bs
vec = V.fromList x in vec = V.fromList x in
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> False Nothing -> False
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of Just e -> case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 1) of
Nothing -> False Nothing -> False
Just f -> length e == length f && length e > 2 Just f -> length e == length f && length e > 2
findDelimiter :: BL.ByteString -> Either Text Delimiter findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findDelimiter bs findColumnDelimiter bs
| testDelimiter Tab bs = Right Tab | testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma | testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line") | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
...@@ -221,7 +231,7 @@ testValue val columnHeader ligne warn = case columnHeader of ...@@ -221,7 +231,7 @@ testValue val columnHeader ligne warn = case columnHeader of
"Abstract" -> validTextField val columnHeader ligne warn "Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn _ -> Right warn
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> [Text] -> Either Text [Text] testErrorPerLine :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.") testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l) | otherwise = Left (pack $ "Too much field at line " <> show l)
...@@ -232,19 +242,19 @@ testErrorPerLine (v:val) del (h:headers) ligne warn = ...@@ -232,19 +242,19 @@ testErrorPerLine (v:val) del (h:headers) ligne warn =
Right warning -> testErrorPerLine val del headers ligne warning Right warning -> testErrorPerLine val del headers ligne warning
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString]) checkNextLine :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do checkNextLine bl del headers res x = do
case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of case BL.splitWith (== toWord8 del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (==delimiter del) res)) Nothing -> Right (x, (BL.splitWith (== toWord8 del) res))
Just value -> if length value > 1 Just value -> if length value > 1
then Right (x, (BL.splitWith (==delimiter del) res)) then Right (x, (BL.splitWith (== toWord8 del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2" Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1) Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString]) getMultipleLinefile :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (==delimiter del) res in let tmp = BL.splitWith (== toWord8 del) res in
if length tmp == length headers if length tmp == length headers
then checkNextLine bl del headers res x then checkNextLine bl del headers res x
else else
...@@ -255,32 +265,39 @@ getMultipleLinefile bl del headers res x = do ...@@ -255,32 +265,39 @@ getMultipleLinefile bl del headers res x = do
Nothing -> Left "getMultipleLinefile" Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1) Just val -> getMultipleLinefile bl del headers val (x+1)
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> [Text] -> Either Text (Delimiter, [Text])
anx bl del headers x warn -- | Check that the file is well-formed and throw warnings/errors accordingly
| length bl == x = Right (del, warn) checkIntegrity :: Vector BL.ByteString -- ^ Individual lines of the TSV file
| otherwise = -> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
case (V.!?) bl x of -> [Text] -- ^ Expected TSV headers
Nothing -> Left "anx" -> Int -- ^ Current line number
Just bs -> -> [Text] -- ^ Warning accumulator
case getMultipleLinefile bl del headers bs x of -> Either Text [Text] -- ^ Left error if an error has occured, otherwise Right (list of warnings)
Left _err -> Left _err checkIntegrity tsvLines delim headers lineNumber warnings
Right (y, val) -> case testErrorPerLine val del headers (x + 1) warn of | length tsvLines == lineNumber = Right warnings
Left _err -> Left _err | otherwise =
Right warning -> anx bl del headers (y+1) warning case (V.!?) tsvLines lineNumber of
Nothing -> Left "Gargantext.Core.Text.Corpus.Parsers.Tsv: error in function checkTsv"
Just currentLine ->
case getMultipleLinefile tsvLines delim headers currentLine lineNumber of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val delim headers (lineNumber + 1) warnings of
Left _err -> Left _err
Right warning -> checkIntegrity tsvLines delim headers (y+1) warning
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text (Delimiter, [Text]) testIfErrorInFile :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Either Text [Text]
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1 [] testIfErrorInFile bl del headers = checkIntegrity (V.fromList bl) del headers 2 []
testCorrectFile :: BL.ByteString -> Either Text (Delimiter, [Text]) testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile bs = testCorrectFile bs =
case findDelimiter bs of case findColumnDelimiter bs of
Left _err -> Left _err Left _err -> Left _err
Right del -> do Right del -> do
let bl = BL.splitWith (==delimiter Line) bs in let bl = BL.splitWith (== toWord8 Newline) bs in
case getHeaders bl del of case getHeaders bl del of
Left _err -> Left _err Left _err -> Left _err
Right headers -> testIfErrorInFile bl del headers Right headers -> (\content -> (del, content)) <$> testIfErrorInFile bl del headers
...@@ -295,10 +312,10 @@ testAllHeadersPresence headers = do ...@@ -295,10 +312,10 @@ testAllHeadersPresence headers = do
then Right headers then Right headers
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders) else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text] getHeaders :: [BL.ByteString] -> ColumnDelimiter -> Either Text [Text]
getHeaders bl del = do getHeaders bl del = do
let vec = V.fromList bl in let vec = V.fromList bl in
case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> Left "Error getHeaders" Nothing -> Left "Error getHeaders"
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers) Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
...@@ -307,28 +324,28 @@ getHeaders bl del = do ...@@ -307,28 +324,28 @@ getHeaders bl del = do
readFileLazy :: (FromNamedRecord a) readFileLazy :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> ColumnDelimiter
-> FilePath -> FilePath
-> IO (Either Text (Header, Vector a)) -> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a) readFileStrict :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> ColumnDelimiter
-> FilePath -> FilePath
-> IO (Either Text (Header, Vector a)) -> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) readByteStringLazy :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> ColumnDelimiter
-> BL.ByteString -> BL.ByteString
-> Either Text (Header, Vector a) -> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a) readByteStringStrict :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> ColumnDelimiter
-> BS.ByteString -> BS.ByteString
-> Either Text (Header, Vector a) -> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
...@@ -345,7 +362,7 @@ readTSVFile fp = do ...@@ -345,7 +362,7 @@ readTSVFile fp = do
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readTsvLazyBS :: Delimiter readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString -> BL.ByteString
-> Either Text (Header, Vector TsvDoc) -> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment