Commit f3c4b508 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE]

parents bc89eed7 f15d2fee
......@@ -790,6 +790,7 @@ test-suite garg-test-tasty
Test.Core.Similarity
Test.Core.Text
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Utils
......@@ -864,6 +865,7 @@ test-suite garg-test-tasty
, postgresql-simple >= 0.6.4 && < 0.7
, pretty
, process ^>= 1.6.13.2
, protolude ^>= 0.3.3
, quickcheck-instances ^>= 0.3.25.2
, raw-strings-qq
, recover-rtti >= 0.4 && < 0.5
......@@ -893,6 +895,7 @@ test-suite garg-test-tasty
, unordered-containers ^>= 0.2.16.0
, unicode-collation >= 0.1.3.6
, unliftio
, utf8-string ^>= 1.0.2
, validity ^>= 0.11.0.1
, vector ^>= 0.12.3.0
, wai
......
......@@ -20,6 +20,9 @@ import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TL
import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
......@@ -205,7 +208,7 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
mI = maybe 0 identity
data Delimiter = Tab | Comma
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
......@@ -216,6 +219,151 @@ tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
delimiter Line = fromIntegral $ ord '\n'
------------------------------------------------------------------------
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs
vec = V.fromList x in
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne = do
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
case isNumeric number of
Right val
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
|otherwise -> Right True
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then return True
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else return True
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne
"Publication Month" -> validNumber val columnHeader ligne
"Publication Year" -> validNumber val columnHeader ligne
"Authors" -> validTextField val columnHeader ligne
"Title" -> validTextField val columnHeader ligne
"Source" -> validTextField val columnHeader ligne
"Abstract" -> validTextField val columnHeader ligne
_ -> Right True
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine [] _ [] _ = Right True
testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne =
case testValue v h ligne of
Left _err -> Left _err
Right _ -> testErrorPerLine val del headers ligne
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (==delimiter del) res))
Just value -> if length value > 1
then Right (x, (BL.splitWith (==delimiter del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (==delimiter del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
if (length tmp > length headers) || (V.length bl == (x + 1))
then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
else do
case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx bl del headers x
| length bl == x = Right del
| otherwise =
case (V.!?) bl x of
Nothing -> Left "anx"
Just bs ->
case getMultipleLinefile bl del headers bs x of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val del headers (x + 1) of
Left _err -> Left _err
Right _ -> anx bl del headers (y+1)
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1
testCorrectFile :: BL.ByteString -> Either Text Delimiter
testCorrectFile bs =
case findDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (==delimiter Line) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> testIfErrorInFile bl del headers
----------Test headers added to ggt
-- use a map to remove \r that sometimes appear at the end of a line
testAllHeadersPresence :: [Text] -> Either Text [Text]
testAllHeadersPresence headers = do
let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
if null listHeaders
then Right headers
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders bl del = do
let vec = V.fromList bl in
case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of
Nothing -> Left "Error getHeaders"
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
------------------------------------------------------------------------
......@@ -251,10 +399,10 @@ readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
result <- readTsvLazyBS Comma <$> BL.readFile fp
case result of
Left _err -> readTsvLazyBS Tab <$> BL.readFile fp
Right res -> pure $ Right res
file <- BL.readFile fp
case (testCorrectFile file) of
Left _err -> pure $ Left _err
Right del -> pure $ readTsvLazyBS del file
......
This diff is collapsed.
......@@ -13,6 +13,7 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
......@@ -48,6 +49,7 @@ main = do
, jobsSpec
, NgramsQuery.tests
, CorpusQuery.tests
, TSVParser.tests
, JSON.tests
, Errors.tests
, similaritySpec
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment