Commit e4dfb4bd, authored and committed by Grégoire Locqueville

Some refactoring

parent 9fc00811
......@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T
import Data.Vector (Vector)
import Data.Vector qualified as Vector
import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, Delimiter(Tab) )
import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, ColumnDelimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude
import System.FilePath.Posix (takeExtension)
......
......@@ -29,9 +29,26 @@ import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn
import Protolude
-- | Line separators that may terminate a record of a CSV/TSV file.
data LineDelimiter = Newline deriving (Eq, Show)

-- | Column (field) separators that may be used inside a record.
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)

-- | Delimiters that can be rendered as the single byte they stand for.
class ToWord8 a where
  -- | Convert abstract representation into a Word8 character
  toWord8 :: a -> Word8

instance ToWord8 LineDelimiter where
  toWord8 Newline = fromIntegral $ ord '\n'

instance ToWord8 ColumnDelimiter where
  toWord8 Tab       = fromIntegral $ ord '\t'
  toWord8 Comma     = fromIntegral $ ord ','
  toWord8 Semicolon = fromIntegral $ ord ';'
---------------------------------------------------------------
-- | Minimal header for a working TSV import
......@@ -145,34 +162,27 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
m = maybe "" identity
mI = maybe 0 identity
-- | Decoding options that differ from 'defaultDecodeOptions' only in
-- 'decDelimiter', which is set to the byte of the given delimiter.
-- NOTE(review): two variants are listed back to back. The first is written
-- against 'ColumnDelimiter' / 'toWord8', the second against 'Delimiter' /
-- 'delimiter'; they appear to be the two sides of the refactoring diff.
tsvDecodeOptions :: ColumnDelimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = toWord8 d}
-- | Delimiter type used by the 'Delimiter'-based variants; it carries the
-- line separator ('Line') alongside the two column separators.
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
-- | Encoding options that differ from 'defaultEncodeOptions' only in
-- 'encDelimiter', which is set to the byte of the given delimiter.
-- NOTE(review): two variants are listed back to back, one over
-- 'ColumnDelimiter' via 'toWord8' and one over 'Delimiter' via 'delimiter';
-- they appear to be the two sides of the refactoring diff.
tsvEncodeOptions :: ColumnDelimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = toWord8 d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
-- | Byte value of the character that a 'Delimiter' stands for
-- (tab, comma or newline).
delimiter :: Delimiter -> Word8
delimiter d = fromIntegral . ord $ case d of
  Tab   -> '\t'
  Comma -> ','
  Line  -> '\n'
------------------------------------------------------------------------
-- | Heuristic check that @del@ is the column delimiter of @bs@.
-- The input is split into lines on the newline byte, then the first two
-- lines are split on the candidate delimiter. The result is 'True' only
-- when both lines exist, yield the same number of fields, and that number
-- is greater than 2.
-- NOTE(review): lines come in pairs here ('delimiter' / 'Delimiter' next to
-- 'toWord8' / 'ColumnDelimiter'); they appear to be the two sides of the
-- refactoring diff and perform the same steps.
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs
let x = BL.splitWith (== toWord8 Newline) bs
vec = V.fromList x in
-- Fields of the first line (index 0), if any.
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> False
-- Fields of the second line (index 1), if any.
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
Just e -> case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
-- | Guess the column delimiter of a file by probing with 'testDelimiter':
-- tab is tried first, then comma. When neither passes, a 'Left' with a
-- user-facing message is returned.
-- NOTE(review): the two signature/name pairs ('findDelimiter' and
-- 'findColumnDelimiter') appear to be the two sides of the refactoring
-- diff and share the guards below.
-- NOTE(review): 'ColumnDelimiter' also has a 'Semicolon' constructor, but
-- it is never probed here - confirm whether that is intended.
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
......@@ -221,7 +231,7 @@ testValue val columnHeader ligne warn = case columnHeader of
"Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
......@@ -232,19 +242,19 @@ testErrorPerLine (v:val) del (h:headers) ligne warn =
Right warning -> testErrorPerLine val del headers ligne warning
-- | Decide whether the record accumulated in @res@ ends at line index @x@
-- or continues on the following physical line(s).
-- Returns the index of the last line consumed together with the fields of
-- the record (i.e. @res@ split on the column delimiter).
-- @headers@ is only threaded through the recursive call.
-- NOTE(review): lines come in pairs ('delimiter' next to 'toWord8'); they
-- appear to be the two sides of the refactoring diff.
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of
-- No following line: the record ends here.
Nothing -> Right (x, (BL.splitWith (==delimiter del) res))
case BL.splitWith (== toWord8 del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (== toWord8 del) res))
-- A following line that splits into several fields is treated as the
-- start of the next record, so the current one is complete.
Just value -> if length value > 1
then Right (x, (BL.splitWith (==delimiter del) res))
then Right (x, (BL.splitWith (== toWord8 del) res))
-- Otherwise the following line is appended to the record (no separator
-- byte is re-inserted) and the check moves one line further.
else case BL.append res <$> ((V.!?) bl (x+1)) of
-- The same lookup already succeeded in the enclosing case.
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (==delimiter del) res in
let tmp = BL.splitWith (== toWord8 del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
......@@ -255,32 +265,39 @@ getMultipleLinefile bl del headers res x = do
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
-- NOTE(review): this span pairs the lines of 'anx' with those of
-- 'checkIntegrity'; they appear to be the two sides of the refactoring
-- diff. Both walk the line vector in the same way; 'anx' additionally
-- returns the delimiter it was given alongside the warnings.
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> [Text] -> Either Text (Delimiter, [Text])
anx bl del headers x warn
| length bl == x = Right (del, warn)
-- | Check that the file is well-formed, returning the first error as 'Left'
-- or the accumulated warnings as 'Right'.
checkIntegrity :: Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Int -- ^ Index (in the line vector) of the next line to check; reported to 'testErrorPerLine' as index + 1
-> [Text] -- ^ Warning accumulator
-> Either Text [Text] -- ^ Left error if an error has occurred, otherwise Right (list of warnings)
checkIntegrity tsvLines delim headers lineNumber warnings
-- Stop once the index reaches the number of lines.
| length tsvLines == lineNumber = Right warnings
| otherwise =
case (V.!?) bl x of
Nothing -> Left "anx"
Just bs ->
case getMultipleLinefile bl del headers bs x of
case (V.!?) tsvLines lineNumber of
-- NOTE(review): the message names `checkTsv`, but this function is
-- called `checkIntegrity` - confirm which name is meant.
Nothing -> Left "Gargantext.Core.Text.Corpus.Parsers.Tsv: error in function checkTsv"
Just currentLine ->
-- 'getMultipleLinefile' returns the index of the last line it consumed
-- (@y@) and the fields of the record (@val@).
case getMultipleLinefile tsvLines delim headers currentLine lineNumber of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val del headers (x + 1) warn of
Right (y, val) -> case testErrorPerLine val delim headers (lineNumber + 1) warnings of
Left _err -> Left _err
-- Continue with the line after the last one consumed.
Right warning -> anx bl del headers (y+1) warning
Right warning -> checkIntegrity tsvLines delim headers (y+1) warning
-- | Run the line-by-line check over the already split lines of a file,
-- starting with an empty warning list.
-- NOTE(review): the two variants (apparently the two sides of the
-- refactoring diff) start at different indices: 'anx' at 1,
-- 'checkIntegrity' at 2. The vector is indexed from 0 and 'getHeaders'
-- reads index 0, so starting at 2 means the line at index 1 is never
-- looked up by 'checkIntegrity' - confirm this is intended.
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text (Delimiter, [Text])
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1 []
testIfErrorInFile :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Either Text [Text]
testIfErrorInFile bl del headers = checkIntegrity (V.fromList bl) del headers 2 []
-- | Validate a raw file: detect its column delimiter, split it into lines
-- on the newline byte, read the headers with 'getHeaders', then run
-- 'testIfErrorInFile'. The first failure is returned as 'Left'; on success
-- the detected delimiter is returned together with the warnings.
-- NOTE(review): lines come in pairs ('Delimiter' / 'findDelimiter' /
-- 'delimiter' next to 'ColumnDelimiter' / 'findColumnDelimiter' /
-- 'toWord8'); they appear to be the two sides of the refactoring diff.
testCorrectFile :: BL.ByteString -> Either Text (Delimiter, [Text])
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile bs =
case findDelimiter bs of
case findColumnDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (==delimiter Line) bs in
let bl = BL.splitWith (== toWord8 Newline) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> testIfErrorInFile bl del headers
-- In the 'ColumnDelimiter' variant 'testIfErrorInFile' returns only the
-- warnings, so the delimiter is paired with them here.
Right headers -> (\content -> (del, content)) <$> testIfErrorInFile bl del headers
......@@ -295,10 +312,10 @@ testAllHeadersPresence headers = do
then Right headers
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
-- | Extract the column names from the first line of a file.
-- The first line is split on the column delimiter, each field is converted
-- with 'lBLToText' and stripped of every double-quote character, and the
-- resulting list is passed to 'testAllHeadersPresence'.
-- Returns @Left "Error getHeaders"@ when the list of lines is empty.
-- NOTE(review): the paired lines ('Delimiter' / 'delimiter' next to
-- 'ColumnDelimiter' / 'toWord8') appear to be the two sides of the
-- refactoring diff.
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders :: [BL.ByteString] -> ColumnDelimiter -> Either Text [Text]
getHeaders bl del = do
let vec = V.fromList bl in
case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> Left "Error getHeaders"
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
......@@ -307,28 +324,28 @@ getHeaders bl del = do
-- | Read a file with 'BL.readFile' and decode its contents with
-- 'readByteStringLazy'. Arguments, in order: a proxy fixing the record
-- type, the column delimiter, and the path of the file.
-- NOTE(review): in the equation below @d@ binds the proxy and @f@ the
-- delimiter, despite what the letters suggest.
-- NOTE(review): the two delimiter lines in the signature ('Delimiter' and
-- 'ColumnDelimiter') appear to be the two sides of the refactoring diff.
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
-- | Read a file with 'BS.readFile' and decode its contents with
-- 'readByteStringStrict'. Arguments, in order: a proxy fixing the record
-- type, the column delimiter, and the path of the file.
-- NOTE(review): in the equation below @d@ binds the proxy and @f@ the
-- delimiter, despite what the letters suggest.
-- NOTE(review): the two delimiter lines in the signature ('Delimiter' and
-- 'ColumnDelimiter') appear to be the two sides of the refactoring diff.
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
-- | Decode a lazy 'BL.ByteString' as named records using
-- 'decodeByNameWith' and the options from 'tsvDecodeOptions' for the given
-- delimiter. The proxy argument is ignored at the value level (it only
-- fixes the record type); a decoding error is converted with 'pack'.
-- NOTE(review): the two delimiter lines in the signature ('Delimiter' and
-- 'ColumnDelimiter') appear to be the two sides of the refactoring diff.
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
-- | Strict counterpart of 'readByteStringLazy': the strict 'BS.ByteString'
-- is converted with 'BL.fromStrict' and then decoded by
-- 'readByteStringLazy'.
-- NOTE(review): in the equation below @d@ binds the proxy and @ff@ the
-- delimiter.
-- NOTE(review): the two delimiter lines in the signature ('Delimiter' and
-- 'ColumnDelimiter') appear to be the two sides of the refactoring diff.
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> ColumnDelimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
......@@ -345,7 +362,7 @@ readTSVFile fp = do
-- | Decode a lazy 'BL.ByteString' into 'TsvDoc' records using
-- 'decodeByNameWith' and the options from 'tsvDecodeOptions' for the given
-- delimiter; a decoding error is converted with 'pack'.
-- The right-hand side is the same expression as in 'readByteStringLazy'.
-- TODO use readByteStringLazy
-- NOTE(review): the two first lines of the signature ('Delimiter' and
-- 'ColumnDelimiter') appear to be the two sides of the refactoring diff.
readTsvLazyBS :: Delimiter
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment