Commit f3c4b508 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE]

parents bc89eed7 f15d2fee
......@@ -790,6 +790,7 @@ test-suite garg-test-tasty
Test.Core.Similarity
Test.Core.Text
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Utils
......@@ -864,6 +865,7 @@ test-suite garg-test-tasty
, postgresql-simple >= 0.6.4 && < 0.7
, pretty
, process ^>= 1.6.13.2
, protolude ^>= 0.3.3
, quickcheck-instances ^>= 0.3.25.2
, raw-strings-qq
, recover-rtti >= 0.4 && < 0.5
......@@ -893,6 +895,7 @@ test-suite garg-test-tasty
, unordered-containers ^>= 0.2.16.0
, unicode-collation >= 0.1.3.6
, unliftio
, utf8-string ^>= 1.0.2
, validity ^>= 0.11.0.1
, vector ^>= 0.12.3.0
, wai
......
......@@ -20,6 +20,9 @@ import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TL
import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
......@@ -205,7 +208,7 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
mI = maybe 0 identity
data Delimiter = Tab | Comma
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
......@@ -216,6 +219,151 @@ tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
delimiter Line = fromIntegral $ ord '\n'
------------------------------------------------------------------------
testDelimiter :: Delimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== delimiter Line) bs
vec = V.fromList x in
case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
findDelimiter :: BL.ByteString -> Either Text Delimiter
findDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne = do
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
case isNumeric number of
Right val
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
|otherwise -> Right True
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then return True
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else return True
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne
"Publication Month" -> validNumber val columnHeader ligne
"Publication Year" -> validNumber val columnHeader ligne
"Authors" -> validTextField val columnHeader ligne
"Title" -> validTextField val columnHeader ligne
"Source" -> validTextField val columnHeader ligne
"Abstract" -> validTextField val columnHeader ligne
_ -> Right True
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine [] _ [] _ = Right True
testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne =
case testValue v h ligne of
Left _err -> Left _err
Right _ -> testErrorPerLine val del headers ligne
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (==delimiter del) res))
Just value -> if length value > 1
then Right (x, (BL.splitWith (==delimiter del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (==delimiter del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
if (length tmp > length headers) || (V.length bl == (x + 1))
then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
else do
case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx bl del headers x
| length bl == x = Right del
| otherwise =
case (V.!?) bl x of
Nothing -> Left "anx"
Just bs ->
case getMultipleLinefile bl del headers bs x of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val del headers (x + 1) of
Left _err -> Left _err
Right _ -> anx bl del headers (y+1)
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1
testCorrectFile :: BL.ByteString -> Either Text Delimiter
testCorrectFile bs =
case findDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (==delimiter Line) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> testIfErrorInFile bl del headers
----------Test headers added to ggt
-- use a map to remove \r that sometimes appear at the end of a line
testAllHeadersPresence :: [Text] -> Either Text [Text]
testAllHeadersPresence headers = do
let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
if null listHeaders
then Right headers
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
getHeaders bl del = do
let vec = V.fromList bl in
case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of
Nothing -> Left "Error getHeaders"
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
------------------------------------------------------------------------
......@@ -251,10 +399,10 @@ readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
result <- readTsvLazyBS Comma <$> BL.readFile fp
case result of
Left _err -> readTsvLazyBS Tab <$> BL.readFile fp
Right res -> pure $ Right res
file <- BL.readFile fp
case (testCorrectFile file) of
Left _err -> pure $ Left _err
Right del -> pure $ readTsvLazyBS del file
......
module Test.Core.Text.Corpus.TSV (tests) where
import Gargantext.Core.Text.Corpus.Parsers.TSV
import Test.QuickCheck
import Test.QuickCheck.Instances ()
import Data.ByteString.Lazy.UTF8 as BLU
import Data.ByteString.Lazy as BL
import Data.Char ( ord )
import Data.Text as DT (Text, pack, null, elem)
import Data.Text.Encoding as DT
import Prelude
import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck hiding (Positive, Negative)
tests :: TestTree
tests = testGroup "TSV Parser" [
testProperty "Parses 'Valid Text'" testValidText
, testProperty "Parses 'Valid Number'" testValidNumber
, testProperty "Parses 'Error Per Line On A File'" testTestErrorPerLine
, testProperty "Parses 'Correct File'" testTestCorrectFile
, testProperty "Parses 'Correct File With New Line In Last Header'" testTestCorrectFileWithNewLine
, testProperty "Parses 'Find Delimiter'" testFindDelimiter
, testProperty "Parses 'Get Headers'" testGetHeader]
delimiterBS :: Delimiter -> BL.ByteString
delimiterBS Tab = BLU.fromString "\t"
delimiterBS Comma = BLU.fromString ","
delimiterBS Line = BLU.fromString "\n"
data File = File { fDelimiter :: Delimiter
, allCorpus :: [RandomCorpus]
}
deriving (Show)
data RandomCorpus =
RandomCorpus { abstract :: Text
, title :: Text
, authors :: Text
, source :: Text
, day :: Int
, month :: Int
, years :: Int
}
deriving (Show)
instance Arbitrary File where
arbitrary = sized arbitrarySizedFile
arbitrarySizedFile :: Int -> Gen File
arbitrarySizedFile m = do
del <- elements [Tab, Comma]
corp <- vectorOf m (generateRandomCorpus)
return (File del corp)
delimiterToText :: Delimiter -> Text
delimiterToText Tab = DT.pack "\t"
delimiterToText Comma = DT.pack ","
delimiterToText Line = DT.pack "\n"
delimiterToString :: Delimiter -> Char
delimiterToString Tab = '\t'
delimiterToString Comma = ','
delimiterToString Line = '\n'
textToBL :: Text -> BL.ByteString
textToBL b = BL.fromChunks . return . DT.encodeUtf8 $ b
generateRandomCorpus :: Gen RandomCorpus
generateRandomCorpus = RandomCorpus
<$> generateString
<*> generateString
<*> generateString
<*> generateString
<*> generateNumber
<*> generateNumber
<*> generateNumber
generateFileDelimiter :: Gen File
generateFileDelimiter = do
del <- elements [Tab, Comma]
m <- choose (1,5)
corp <- vectorOf m (generateRandomCorpus)
return (File del corp)
generateFile :: Gen File
generateFile = arbitrary :: Gen File
generateString :: Gen Text
generateString = arbitrary :: Gen Text
generateNumber :: Gen Int
generateNumber = arbitrary :: Gen Int
randomHeaderList :: Gen [String]
randomHeaderList = frequency [
(1, return [])
, (7, (:) <$> (elements ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]) <*> randomHeaderList)
]
--TODO add delimiter
createLineFromCorpus :: RandomCorpus -> Delimiter -> BL.ByteString
createLineFromCorpus corpus delD = do
let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
let sou = (DT.pack "\"") <> (source corpus) <> (DT.pack "\"")
let abt = (DT.pack "\"") <> (abstract corpus) <> (DT.pack "\"")
let pDay = (DT.pack "\"") <> (DT.pack $ show $ day corpus) <> (DT.pack "\"")
let pMonth = (DT.pack "\"") <> (DT.pack $ show $ month corpus) <> (DT.pack "\"")
let pYears = (DT.pack "\"") <> (DT.pack $ show $ years corpus) <> (DT.pack "\"")
let del = delimiterToText delD
textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
createLineFromCorpusWithNewLine :: RandomCorpus -> Delimiter -> BL.ByteString
createLineFromCorpusWithNewLine corpus delD = do
let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
let sou = (DT.pack "\"") <> (source corpus) <> (DT.pack "\"")
let abt = (DT.pack "\"") <> (abstract corpus) <> (DT.pack "\n") <> (abstract corpus) <> (DT.pack "\"")
let pDay = (DT.pack "\"") <> (DT.pack $ show $ day corpus) <> (DT.pack "\"")
let pMonth = (DT.pack "\"") <> (DT.pack $ show $ month corpus) <> (DT.pack "\"")
let pYears = (DT.pack "\"") <> (DT.pack $ show $ years corpus) <> (DT.pack "\"")
let del = delimiterToText delD
textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
createFile :: File -> BL.ByteString
createFile file = do
let headers = BL.intercalate (delimiterBS (fDelimiter file)) $ Prelude.map BLU.fromString ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpusWithNewLine x (fDelimiter file)) (allCorpus file)
headers <> (BLU.fromString "\n") <> allLines
createFileWithNewLine :: File -> BL.ByteString
createFileWithNewLine file = do
let headers = BL.intercalate (delimiterBS (fDelimiter file)) $ Prelude.map BLU.fromString ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpus x (fDelimiter file)) (allCorpus file)
headers <> (BLU.fromString "\n") <> allLines
validRandomCorpus :: RandomCorpus -> Delimiter -> Bool
validRandomCorpus tsv del
| BL.length (BL.filter (==delimiter del) (createLineFromCorpus tsv del)) > 3= True
| DT.null $ abstract tsv = True
| DT.null $ title tsv = True
| DT.null $ authors tsv = True
| DT.null $ source tsv = True
| DT.elem '\"' (abstract tsv ) = True
| DT.elem '\"' (title tsv) = True
| DT.elem '\"' (authors tsv) = True
| DT.elem '\"' (source tsv) = True
| otherwise = False
-- Test the 'validTextField' function (test if a field is good on garganText)
testValidNumber :: Property
testValidNumber = forAll generateNumber (\s -> do
let nbText = DT.pack $ show s
let bl = textToBL nbText
case validNumber bl nbText 1 of
Right _ -> True
Left _ | BL.empty == bl -> True
| s < 1 -> True
| otherwise -> False)
-- Test the 'validTextField' function (test if a field is good on garganText)
testValidText :: Property
testValidText = forAll generateString (\s ->
let bl = textToBL s in
case validTextField bl s 1 of
Right _ -> True
Left _ | BL.empty == bl -> True
| (fromIntegral $ ord '\"') `BL.elem` bl -> True
| otherwise -> False)
-- Test if a single line id OK
testTestErrorPerLine :: Property
testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
let del = Tab
let line = createLineFromCorpus tsv del
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) line
case testErrorPerLine splitLine del headers 1 of
Right _ -> True
Left _ -> validRandomCorpus tsv del)
--check :
-- True Del
-- False Error
-- Test if a file is OK
testTestCorrectFile :: Property
testTestCorrectFile = forAll generateFile (\file -> do
let tsv = createFile file
case testCorrectFile tsv of
Right del -> del == fDelimiter file
Left _ -> Prelude.all (\x -> do
let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of
Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file))
-- almost the same as the one above but also test if a corpus with abstract of multiple line is OK
testTestCorrectFileWithNewLine :: Property
testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
let tsv = createFileWithNewLine file
case testCorrectFile tsv of
Right _ -> True
Left _ -> Prelude.all (\x -> do
let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of
Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file))
testFindDelimiter :: Property
testFindDelimiter = forAll generateFileDelimiter (\file -> do
let tsv = createFile file
case findDelimiter tsv of
Right _ -> True
Left _ -> do
let line = Prelude.head $ allCorpus file
let del = delimiterToString $ fDelimiter file
let delLine = delimiterToString Line
del `DT.elem` (abstract line) || del `DT.elem` (authors line) || del `DT.elem` (title line) || del `DT.elem` (source line) || delLine `DT.elem` (abstract line) || delLine `DT.elem` (authors line) || delLine `DT.elem` (title line) || delLine `DT.elem` (source line))
testGetHeader :: Property
testGetHeader = forAll randomHeaderList (\headers -> do
let headersLines = (BL.intercalate (delimiterBS Tab) $ Prelude.map BLU.fromString headers):[]
case getHeaders headersLines Tab of
Right _ -> True
Left _ | not ("Publication Day" `Prelude.elem` headers) -> True
| not ("Publication Month" `Prelude.elem` headers) -> True
| not ("Publication Year" `Prelude.elem` headers) -> True
| not ("Authors" `Prelude.elem` headers) -> True
| not ("Source" `Prelude.elem` headers) -> True
| not ("Title" `Prelude.elem` headers) -> True
| not ("Abstract" `Prelude.elem` headers) -> True
| otherwise -> False
)
\ No newline at end of file
......@@ -13,6 +13,7 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
......@@ -48,6 +49,7 @@ main = do
, jobsSpec
, NgramsQuery.tests
, CorpusQuery.tests
, TSVParser.tests
, JSON.tests
, Errors.tests
, similaritySpec
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment