[MERGE]

f3c4b508 · Alexandre Delanoë · bc89eed7 · f15d2fee · f3c4b508 · f3c4b508
Commit f3c4b508 authored Jul 15, 2024 by Alexandre Delanoë
Showing with 407 additions and 5 deletions

gargantext.cabal gargantext.cabal +3 -0

TSV.hs src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs +153 -5

TSV.hs test/Test/Core/Text/Corpus/TSV.hs +249 -0

Main.hs test/drivers/tasty/Main.hs +2 -0

No files found.
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -790,6 +790,7 @@ test-suite garg-test-tasty
      Test.Core.Similarity
      Test.Core.Text
      Test.Core.Text.Corpus.Query
+      Test.Core.Text.Corpus.TSV
      Test.Core.Text.Examples
      Test.Core.Text.Flow
      Test.Core.Utils
@@ -864,6 +865,7 @@ test-suite garg-test-tasty
    , postgresql-simple >= 0.6.4 && < 0.7
    , pretty
    , process ^>= 1.6.13.2
+    , protolude ^>= 0.3.3
    , quickcheck-instances ^>= 0.3.25.2
    , raw-strings-qq
    , recover-rtti >= 0.4 && < 0.5
@@ -893,6 +895,7 @@ test-suite garg-test-tasty
    , unordered-containers ^>= 0.2.16.0
    , unicode-collation >= 0.1.3.6
    , unliftio
+    , utf8-string ^>= 1.0.2
    , validity ^>= 0.11.0.1
    , vector ^>= 0.12.3.0
    , wai

--- a/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs
@@ -20,6 +20,9 @@ import Data.ByteString.Lazy qualified as BL
 import Data.Csv
 import Data.Text (pack)
 import Data.Text qualified as T
+import Data.Text.Lazy            qualified as TL
+import Data.Text.Lazy.Encoding    qualified as TL
+import Data.Text.Read     qualified as DTR
 import Data.Time.Segment (jour)
 import Data.Vector (Vector)
 import Data.Vector qualified as V
@@ -205,7 +208,7 @@ hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
    mI = maybe 0 identity
-data Delimiter = Tab | Comma
+data Delimiter = Tab | Comma | Line deriving (Eq, Show)
 tsvDecodeOptions :: Delimiter -> DecodeOptions
 tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
@@ -216,6 +219,151 @@ tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
 delimiter :: Delimiter -> Word8
 delimiter Tab   = fromIntegral $ ord '\t'
 delimiter Comma = fromIntegral $ ord ','
+delimiter Line  = fromIntegral $ ord '\n'
+------------------------------------------------------------------------
+testDelimiter :: Delimiter -> BL.ByteString -> Bool
+testDelimiter del bs =
+    let x = BL.splitWith (== delimiter Line) bs
+        vec = V.fromList x in
+          case BL.splitWith (== delimiter del) <$> ((V.!?) vec 0) of
+            Nothing -> False
+            Just e -> case BL.splitWith (== delimiter del) <$> ((V.!?) vec 1) of
+              Nothing -> False
+              Just f -> length e == length f && length e > 2
+findDelimiter :: BL.ByteString -> Either Text Delimiter
+findDelimiter bs
+  | testDelimiter Tab bs = Right Tab
+  | testDelimiter Comma bs = Right Comma
+  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
+isNumeric :: Text -> Either Bool Int
+isNumeric str = case DTR.decimal str of
+  Right (x,y) -> if y == ""
+    then Right x
+    else Left False
+  Left _ -> Left False
+lBLToText :: BL.ByteString -> Text
+lBLToText b = TL.toStrict $ TL.decodeUtf8 b
+validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
+validNumber x columnHeader ligne = do
+    let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
+    case isNumeric number of
+      Right val 
+        | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
+        |otherwise -> Right True
+      Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
+validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
+validTextField x columnHeader ligne = do
+  let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
+    if not (T.null xs) 
+      then 
+        if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2))) 
+          then return True 
+          else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter  \"  must only appear at the beginning and the end of a field ")
+      else return True
+      -- else Left $ ("The column '" <> columnHeader <> "' at line " <>  pack (show ligne) <> " is empty")
+      -- Put a warning for the user to know their is a problem (empty column)
+testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
+testValue val columnHeader ligne = case columnHeader of
+    "Publication Day" -> validNumber val columnHeader ligne
+    "Publication Month" -> validNumber val columnHeader ligne
+    "Publication Year" -> validNumber val columnHeader ligne
+    "Authors" -> validTextField val columnHeader ligne
+    "Title" -> validTextField val columnHeader ligne
+    "Source" -> validTextField val columnHeader ligne
+    "Abstract" -> validTextField val columnHeader ligne
+    _ -> Right True
+testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
+testErrorPerLine [] _ [] _ = Right True
+testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
+                            | otherwise =  Left (pack $ "Too much field at line " <> show l)
+testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
+testErrorPerLine (v:val) del (h:headers) ligne =
+    case testValue v h ligne of
+        Left _err -> Left _err
+        Right _ -> testErrorPerLine val del headers ligne
+checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
+checkNextLine bl del headers res x = do
+  case BL.splitWith (==delimiter del) <$> ((V.!?) bl (x+1)) of
+    Nothing  -> Right (x, (BL.splitWith (==delimiter del) res))
+    Just value -> if length value > 1 
+      then Right (x, (BL.splitWith (==delimiter del) res))
+      else case BL.append res <$> ((V.!?) bl (x+1)) of
+        Nothing  -> Left "checkNextLine2"
+        Just val -> checkNextLine bl del headers val (x+1)
+getMultipleLinefile :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
+getMultipleLinefile bl del headers res x = do
+    let tmp = BL.splitWith (==delimiter del) res in
+        if length tmp == length headers 
+          then checkNextLine bl del headers res x
+          else
+            if (length tmp > length headers) || (V.length bl == (x + 1)) 
+              then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter") 
+              else do
+                case BL.append res <$> ((V.!?) bl (x+1)) of
+                  Nothing  -> Left "getMultipleLinefile"
+                  Just val -> getMultipleLinefile bl del headers val (x+1)
+anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
+anx bl del headers x
+            | length bl == x = Right del
+            | otherwise      = 
+              case (V.!?) bl x of 
+                Nothing -> Left "anx"
+                Just bs ->
+                  case getMultipleLinefile bl del headers bs x of 
+                    Left _err -> Left _err
+                    Right (y, val) -> case testErrorPerLine val del headers (x + 1) of
+                      Left _err -> Left _err
+                      Right _   -> anx bl del headers (y+1)
+testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
+testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1
+testCorrectFile :: BL.ByteString -> Either Text Delimiter 
+testCorrectFile bs = 
+  case findDelimiter bs of
+    Left _err -> Left _err
+    Right del -> do
+      let bl = BL.splitWith (==delimiter Line) bs in
+        case getHeaders bl del of
+          Left _err -> Left _err
+          Right headers -> testIfErrorInFile bl del headers        
+----------Test headers added to ggt
+-- use a map to remove \r that sometimes appear at the end of a line
+testAllHeadersPresence :: [Text] -> Either Text [Text]
+testAllHeadersPresence headers = do
+    let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+    if null listHeaders
+        then Right headers
+        else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
+getHeaders :: [BL.ByteString] -> Delimiter -> Either Text [Text]
+getHeaders bl del = do
+    let vec = V.fromList bl in
+        case BL.splitWith (==delimiter del) <$> ((V.!?) vec 0) of 
+          Nothing -> Left "Error getHeaders"
+          Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
 ------------------------------------------------------------------------
@@ -251,10 +399,10 @@ readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
 -- | TODO use readFileLazy
 readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
 readTSVFile fp = do
-  result <- readTsvLazyBS Comma <$> BL.readFile fp
+  file <- BL.readFile fp
-  case result of
+  case (testCorrectFile file) of
-    Left _err -> readTsvLazyBS Tab <$> BL.readFile fp
+    Left _err -> pure $ Left _err
-    Right res -> pure $ Right res
+    Right del -> pure $ readTsvLazyBS del file

--- a/test/Test/Core/Text/Corpus/TSV.hs
+++ b/test/Test/Core/Text/Corpus/TSV.hs
+module Test.Core.Text.Corpus.TSV (tests) where
+import Gargantext.Core.Text.Corpus.Parsers.TSV
+import Test.QuickCheck
+import Test.QuickCheck.Instances ()
+import Data.ByteString.Lazy.UTF8 as BLU
+import Data.ByteString.Lazy as BL
+import Data.Char ( ord )
+import Data.Text as DT (Text, pack, null, elem)
+import Data.Text.Encoding as DT
+import Prelude
+import Test.Tasty
+import Test.Tasty.HUnit
+import Test.Tasty.QuickCheck hiding (Positive, Negative)
+tests :: TestTree
+tests = testGroup "TSV Parser" [
+    testProperty "Parses 'Valid Text'" testValidText
+  , testProperty "Parses 'Valid Number'" testValidNumber
+  , testProperty "Parses 'Error Per Line On A File'" testTestErrorPerLine
+  , testProperty "Parses 'Correct File'" testTestCorrectFile
+  , testProperty "Parses 'Correct File With New Line In Last Header'" testTestCorrectFileWithNewLine
+  , testProperty "Parses 'Find Delimiter'" testFindDelimiter
+  , testProperty "Parses 'Get Headers'" testGetHeader]
+delimiterBS :: Delimiter -> BL.ByteString
+delimiterBS Tab   = BLU.fromString "\t"
+delimiterBS Comma = BLU.fromString ","
+delimiterBS Line = BLU.fromString "\n"
+data File = File { fDelimiter :: Delimiter
+                 , allCorpus  :: [RandomCorpus]
+                 } 
+                 deriving (Show)
+data RandomCorpus = 
+    RandomCorpus { abstract :: Text
+                 , title    :: Text
+                 , authors  :: Text
+                 , source   :: Text
+                 , day      :: Int
+                 , month    :: Int
+                 , years    :: Int
+                 } 
+                 deriving (Show)
+instance Arbitrary File where
+  arbitrary = sized arbitrarySizedFile
+arbitrarySizedFile :: Int -> Gen File
+arbitrarySizedFile m = do
+  del <- elements [Tab, Comma]
+  corp <- vectorOf m (generateRandomCorpus)
+  return (File del corp)
+delimiterToText :: Delimiter -> Text
+delimiterToText Tab   = DT.pack "\t"
+delimiterToText Comma = DT.pack ","
+delimiterToText Line  = DT.pack "\n"
+delimiterToString :: Delimiter -> Char
+delimiterToString Tab   = '\t'
+delimiterToString Comma = ','
+delimiterToString Line  = '\n'  
+textToBL :: Text -> BL.ByteString
+textToBL b = BL.fromChunks . return . DT.encodeUtf8 $ b
+generateRandomCorpus :: Gen RandomCorpus
+generateRandomCorpus = RandomCorpus 
+                    <$> generateString
+                    <*> generateString 
+                    <*> generateString
+                    <*> generateString 
+                    <*> generateNumber 
+                    <*> generateNumber 
+                    <*> generateNumber
+generateFileDelimiter :: Gen File
+generateFileDelimiter = do
+  del <- elements [Tab, Comma]
+  m <- choose (1,5)
+  corp <- vectorOf m (generateRandomCorpus)
+  return (File del corp)
+generateFile :: Gen File
+generateFile = arbitrary :: Gen File
+generateString :: Gen Text
+generateString = arbitrary :: Gen Text
+generateNumber :: Gen Int
+generateNumber = arbitrary :: Gen Int
+randomHeaderList :: Gen [String]
+randomHeaderList = frequency [ 
+    (1, return [])
+  , (7, (:) <$> (elements ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]) <*> randomHeaderList)
+  ]
+--TODO add delimiter
+createLineFromCorpus :: RandomCorpus -> Delimiter -> BL.ByteString
+createLineFromCorpus corpus delD = do
+    let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
+    let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
+    let sou = (DT.pack "\"") <> (source corpus) <> (DT.pack "\"")
+    let abt = (DT.pack "\"") <> (abstract corpus) <> (DT.pack "\"")
+    let pDay = (DT.pack "\"") <> (DT.pack $ show $ day corpus) <> (DT.pack "\"")
+    let pMonth = (DT.pack "\"") <> (DT.pack $ show $ month corpus) <> (DT.pack "\"")
+    let pYears = (DT.pack "\"") <> (DT.pack $ show $ years corpus) <> (DT.pack "\"")
+    let del = delimiterToText delD
+    textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
+createLineFromCorpusWithNewLine :: RandomCorpus -> Delimiter -> BL.ByteString
+createLineFromCorpusWithNewLine corpus delD = do
+    let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
+    let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
+    let sou = (DT.pack "\"") <> (source corpus) <> (DT.pack "\"")
+    let abt = (DT.pack "\"") <> (abstract corpus) <> (DT.pack "\n") <> (abstract corpus) <> (DT.pack "\"")
+    let pDay = (DT.pack "\"") <> (DT.pack $ show $ day corpus) <> (DT.pack "\"")
+    let pMonth = (DT.pack "\"") <> (DT.pack $ show $ month corpus) <> (DT.pack "\"")
+    let pYears = (DT.pack "\"") <> (DT.pack $ show $ years corpus) <> (DT.pack "\"")
+    let del = delimiterToText delD
+    textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
+createFile :: File -> BL.ByteString
+createFile file = do
+    let headers = BL.intercalate (delimiterBS (fDelimiter file)) $ Prelude.map BLU.fromString ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+    let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpusWithNewLine x (fDelimiter file)) (allCorpus file)
+    headers <> (BLU.fromString "\n") <> allLines
+createFileWithNewLine :: File -> BL.ByteString
+createFileWithNewLine file = do
+    let headers = BL.intercalate (delimiterBS (fDelimiter file)) $ Prelude.map BLU.fromString ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+    let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpus x (fDelimiter file)) (allCorpus file)
+    headers <> (BLU.fromString "\n") <> allLines
+validRandomCorpus :: RandomCorpus -> Delimiter -> Bool
+validRandomCorpus tsv del
+    | BL.length (BL.filter (==delimiter del) (createLineFromCorpus tsv del)) > 3= True
+    | DT.null $ abstract tsv                                                    = True
+    | DT.null $ title tsv                                                       = True
+    | DT.null $ authors tsv                                                     = True
+    | DT.null $ source tsv                                                      = True
+    | DT.elem '\"' (abstract tsv )                                              = True
+    | DT.elem '\"' (title tsv)                                                  = True
+    | DT.elem '\"' (authors tsv)                                                = True
+    | DT.elem '\"' (source tsv)                                                 = True
+    | otherwise                                                                 = False
+-- Test the 'validTextField' function (test if a field is good on garganText)
+testValidNumber :: Property
+testValidNumber = forAll generateNumber (\s -> do
+    let nbText = DT.pack $ show s
+    let bl = textToBL nbText
+    case validNumber bl nbText 1 of
+        Right _                 -> True
+        Left  _ | BL.empty == bl -> True
+                | s < 1 -> True
+                | otherwise     -> False)
+-- Test the 'validTextField' function (test if a field is good on garganText)
+testValidText :: Property
+testValidText = forAll generateString (\s -> 
+    let bl = textToBL s in
+        case validTextField bl s 1 of
+            Right _                 -> True
+            Left  _ | BL.empty == bl -> True
+                    | (fromIntegral $ ord '\"') `BL.elem` bl -> True
+                    | otherwise     -> False)
+-- Test if a single line id OK
+testTestErrorPerLine :: Property
+testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
+    let del = Tab
+    let line = createLineFromCorpus tsv del
+    let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+    let splitLine = BL.splitWith (==delimiter del) line
+    case testErrorPerLine splitLine del headers 1 of
+        Right _ -> True
+        Left  _ -> validRandomCorpus tsv del)
+--check : 
+-- True Del
+-- False Error
+-- Test if a file is OK
+testTestCorrectFile :: Property
+testTestCorrectFile = forAll generateFile (\file -> do
+    let tsv = createFile file
+    case testCorrectFile tsv of
+        Right del -> del == fDelimiter file
+        Left _ -> Prelude.all (\x -> do 
+            let del = fDelimiter file
+            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
+            case testErrorPerLine splitLine del headers 1 of
+                Right _ -> True
+                Left  _ -> validRandomCorpus x del) (allCorpus file))
+-- almost the same as the one above but also test if a corpus with abstract of multiple line is OK
+testTestCorrectFileWithNewLine :: Property
+testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
+    let tsv = createFileWithNewLine file
+    case testCorrectFile tsv of
+        Right _ -> True
+        Left _ -> Prelude.all (\x -> do 
+            let del = fDelimiter file
+            let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
+            let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
+            case testErrorPerLine splitLine del headers 1 of
+                Right _ -> True
+                Left  _ -> validRandomCorpus x del) (allCorpus file))
+testFindDelimiter :: Property
+testFindDelimiter = forAll generateFileDelimiter (\file -> do
+    let tsv = createFile file
+    case findDelimiter tsv of
+        Right _ -> True
+        Left _ -> do
+            let line = Prelude.head $ allCorpus file
+            let del = delimiterToString $ fDelimiter file
+            let delLine = delimiterToString Line
+            del `DT.elem` (abstract line) || del `DT.elem` (authors line) || del `DT.elem` (title line) || del `DT.elem` (source line) || delLine `DT.elem` (abstract line) || delLine `DT.elem` (authors line) || delLine `DT.elem` (title line) || delLine `DT.elem` (source line))
+testGetHeader :: Property
+testGetHeader = forAll randomHeaderList (\headers -> do
+    let headersLines = (BL.intercalate (delimiterBS Tab) $ Prelude.map BLU.fromString headers):[]
+    case getHeaders headersLines Tab of
+        Right _                                                     -> True
+        Left  _ | not ("Publication Day" `Prelude.elem` headers)    -> True
+                | not ("Publication Month" `Prelude.elem` headers)  -> True
+                | not ("Publication Year" `Prelude.elem` headers)   -> True
+                | not ("Authors" `Prelude.elem` headers)            -> True
+                | not ("Source" `Prelude.elem` headers)             -> True
+                | not ("Title" `Prelude.elem` headers)              -> True
+                | not ("Abstract" `Prelude.elem` headers)           -> True
+                | otherwise                                         -> False
+    )
\ No newline at end of file
--- a/test/drivers/tasty/Main.hs
+++ b/test/drivers/tasty/Main.hs
@@ -13,6 +13,7 @@ module Main where
 import Gargantext.Prelude
 import qualified Test.Core.Text.Corpus.Query     as CorpusQuery
+import qualified Test.Core.Text.Corpus.TSV       as TSVParser
 import qualified Test.Core.Utils                 as Utils
 import qualified Test.Graph.Clustering           as Graph
 import qualified Test.Ngrams.NLP                 as NLP
@@ -48,6 +49,7 @@ main = do
    , jobsSpec
    , NgramsQuery.tests
    , CorpusQuery.tests
+    , TSVParser.tests
    , JSON.tests
    , Errors.tests
    , similaritySpec