Commit 0ad09936 authored by Loïc Chapron's avatar Loïc Chapron Committed by Grégoire Locqueville

send warning/small parsing change

parent e3de4cdb
......@@ -60,7 +60,6 @@ import Servant.Job.Async (HasJobEnv(..), Job)
import Servant.Job.Async qualified as SJ
import Servant.Job.Core qualified
import System.Log.FastLogger qualified as FL
import Debug.Trace as DT
data Mode = Dev | Mock | Prod
deriving (Show, Read, Generic)
......@@ -245,7 +244,7 @@ instance Jobs.MonadJobStatus (GargM Env err) where
Jobs.markComplete h
emitWarning jh warn = DT.trace ("Test") $ updateJobProgress jh (addWarningEvent warn)
emitWarning jh warn = updateJobProgress jh (addWarningEvent warn)
markFailed mb_msg jh =
updateJobProgress jh (\latest -> case mb_msg of
......
......@@ -255,7 +255,7 @@ addToCorpusWithForm user cid nwf jobHandle = do
Right decoded -> decoded
eDocsC <- liftBase $ parseC (nwf ^. wf_fileformat) data'
case eDocsC of
Right (count, docsC) -> do
Right (count, docsC, warn) -> do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
......@@ -295,13 +295,12 @@ addToCorpusWithForm user cid nwf jobHandle = do
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
-- TODO uncomment this
--sendMail user
emitWarning jobHandle (Warn.MissingAbstractFromCorpus "Test")
$(logLocM) WARNING $ T.pack $ "Warning in parsing"
--markFailed (Just $ Parser.ParseFormatError "Test") jobHandle
markComplete jobHandle
if (Warn.renderWarningDiagnostic warn == "") then
markComplete jobHandle
else
markCompleteWithWarning jobHandle warn
Left parseErr -> do
$(logLocM) ERROR $ "parse error: " <> (Parser._ParseFormatError parseErr)
markFailed (Just parseErr) jobHandle
......
......@@ -44,7 +44,7 @@ import Data.ByteString.Lazy qualified as DBL
import Data.List (lookup)
import Data.Map qualified as DM
import Data.Text qualified as DT
import Data.Tuple.Extra (both) -- , first, second)
import Data.Tuple.Extra (both, second3, fst3, snd3) -- , first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
......@@ -101,23 +101,23 @@ parseFormatC :: forall m. MonadBaseControl IO m
=> FileType
-> FileFormat
-> DB.ByteString
-> m (Either ParseFormatError (Integer, ConduitT () HyperdataDocument IO ()))
-> m (Either ParseFormatError (Integer, ConduitT () HyperdataDocument IO (), WarningDiagnostic))
parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
where
do_parse :: MonadBaseControl IO m
=> FileType
-> FileFormat
-> DB.ByteString
-> m (Either DT.Text (Integer, ConduitT () HyperdataDocument IO ()))
-> m (Either DT.Text (Integer, ConduitT () HyperdataDocument IO (), WarningDiagnostic))
do_parse TsvGargV3 Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
pure (second3 (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse TsvHal Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
pure (second3 (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse Istex Plain bs = do
ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs
pure $ (\p -> (1, yieldMany [p])) <$> ep
pure $ (\p -> (1, yieldMany [p], MalformedCorpus "")) <$> ep
do_parse RisPresse Plain bs = do
--docs <- enrichWith RisPresse
let eDocs = runParser' RisPresse bs
......@@ -126,7 +126,8 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
, yieldMany docs
.| mapC presseEnrich
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc RIS)) ) <$> eDocs
.| mapMC (toDoc RIS)
, MalformedCorpus "") ) <$> eDocs
do_parse WOS Plain bs = do
let eDocs = runParser' WOS bs
pure $ (\docs ->
......@@ -134,7 +135,8 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
, yieldMany docs
.| mapC (map $ first WOS.keys)
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc WOS)) ) <$> eDocs
.| mapMC (toDoc WOS)
, MalformedCorpus "") ) <$> eDocs
do_parse Iramuteq Plain bs = do
let eDocs = runParser' Iramuteq bs
pure $ (\docs ->
......@@ -143,12 +145,12 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
.| mapC (map $ first Iramuteq.keys)
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc Iramuteq . map (second (DT.replace "_" " ")))
)
, MalformedCorpus "")
)
<$> eDocs
do_parse JSON Plain bs = do
let eParsedC = parseJSONC $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
pure (second3 (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse fty ZIP bs = liftBase $ UZip.withZipFileBS bs $ do
fileNames <- filter (filterZIPFileNameP ft) . DM.keys <$> getEntries
printDebug "[do_parse] fileNames" fileNames
......@@ -163,11 +165,12 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
case contents of
[] -> pure $ Left "No files in zip"
_ -> do
let lenghts = fst <$> contents
let contents' = snd <$> contents
let lenghts = fst3 <$> contents
let contents' = snd3 <$> contents
let totalLength = sum lenghts
pure $ Right ( totalLength
, void (sequenceConduits contents') ) -- .| mapM_C (printDebug "[do_parse] doc")
, void (sequenceConduits contents')
, MalformedCorpus "" ) -- .| mapM_C (printDebug "[do_parse] doc")
_ -> pure $ Left $ DT.intercalate "\n" errs
do_parse _ _ _ = pure $ Left "Not implemented"
......
......@@ -25,6 +25,7 @@ import Gargantext.Core (Lang)
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex qualified as Istex
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Utils.Jobs.Error as Warn
import Protolude
......@@ -66,12 +67,13 @@ readJSONLazyBS bs = first T.pack $ eitherDecode bs
parseJSONC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseJSONC bs = f <$> readJSONLazyBS bs
where
f (JSONStruct { documents }) =
( fromIntegral $ length documents
, yieldMany documents .| mapC doc2hyperdoc )
, yieldMany documents .| mapC doc2hyperdoc
, Warn.MalformedCorpus "" )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
......
......@@ -30,6 +30,7 @@ import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Gargantext.Utils.Jobs.Error as Warn
import Protolude
---------------------------------------------------------------
......@@ -250,49 +251,49 @@ isNumeric str = case DTR.decimal str of
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TL.decodeUtf8 b
validNumber :: BL.ByteString -> Text -> Int -> Either Text Bool
validNumber x columnHeader ligne = do
validNumber :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
validNumber x columnHeader ligne warn = do
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
case isNumeric number of
Right val
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
|otherwise -> Right True
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
| otherwise -> Right warn
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> Either Text Bool
validTextField x columnHeader ligne = do
validTextField :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
validTextField x columnHeader ligne warn = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then return True
then Right warn
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else return True
else Right ( ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty") : warn)
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> Either Text Bool
testValue val columnHeader ligne = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne
"Publication Month" -> validNumber val columnHeader ligne
"Publication Year" -> validNumber val columnHeader ligne
"Authors" -> validTextField val columnHeader ligne
"Title" -> validTextField val columnHeader ligne
"Source" -> validTextField val columnHeader ligne
"Abstract" -> validTextField val columnHeader ligne
_ -> Right True
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> Either Text Bool
testErrorPerLine [] _ [] _ = Right True
testErrorPerLine _ del [] l | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne =
case testValue v h ligne of
testValue :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
testValue val columnHeader ligne warn = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne warn
"Publication Month" -> validNumber val columnHeader ligne warn
"Publication Year" -> validNumber val columnHeader ligne warn
"Authors" -> validTextField val columnHeader ligne warn
"Title" -> validTextField val columnHeader ligne warn
"Source" -> validTextField val columnHeader ligne warn
"Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn
testErrorPerLine :: [BL.ByteString] -> Delimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l _ = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne warn =
case testValue v h ligne warn of
Left _err -> Left _err
Right _ -> testErrorPerLine val del headers ligne
Right warning -> testErrorPerLine val del headers ligne warning
checkNextLine :: Vector BL.ByteString -> Delimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
......@@ -318,24 +319,24 @@ getMultipleLinefile bl del headers res x = do
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> Either Text Delimiter
anx bl del headers x
| length bl == x = Right del
anx :: Vector BL.ByteString -> Delimiter -> [Text] -> Int -> [Text] -> Either Text (Delimiter, [Text])
anx bl del headers x warn
| length bl == x = Right (del, warn)
| otherwise =
case (V.!?) bl x of
Nothing -> Left "anx"
Just bs ->
case getMultipleLinefile bl del headers bs x of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val del headers (x + 1) of
Right (y, val) -> case testErrorPerLine val del headers (x + 1) warn of
Left _err -> Left _err
Right _ -> anx bl del headers (y+1)
Right warning -> anx bl del headers (y+1) warning
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text Delimiter
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1
testIfErrorInFile :: [BL.ByteString] -> Delimiter -> [Text] -> Either Text (Delimiter, [Text])
testIfErrorInFile bl del headers = anx (V.fromList bl) del headers 1 []
testCorrectFile :: BL.ByteString -> Either Text Delimiter
testCorrectFile :: BL.ByteString -> Either Text (Delimiter, [Text])
testCorrectFile bs =
case findDelimiter bs of
Left _err -> Left _err
......@@ -403,7 +404,7 @@ readTSVFile fp = do
file <- BL.readFile fp
case (testCorrectFile file) of
Left _err -> pure $ Left _err
Right del -> pure $ readTsvLazyBS del file
Right (del,_) -> pure $ readTsvLazyBS del file
......@@ -593,17 +594,17 @@ parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
Right (del,_) -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC bs =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc)) <$> eResult
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
eResult = case (testCorrectFile bs) of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
(eResult, warning) = case (testCorrectFile bs) of
Left _err -> (Left _err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del bs, Warn.MalformedCorpus (T.intercalate (pack "\n") warn))
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
......
......@@ -161,7 +161,7 @@ testValidNumber :: Property
testValidNumber = forAll generateNumber (\s -> do
let nbText = DT.pack $ show s
let bl = textToBL nbText
case validNumber bl nbText 1 of
case validNumber bl nbText 1 [] of
Right _ -> True
Left _ | BL.empty == bl -> True
| s < 1 -> True
......@@ -171,7 +171,7 @@ testValidNumber = forAll generateNumber (\s -> do
testValidText :: Property
testValidText = forAll generateString (\s ->
let bl = textToBL s in
case validTextField bl s 1 of
case validTextField bl s 1 [] of
Right _ -> True
Left _ | BL.empty == bl -> True
| (fromIntegral $ ord '\"') `BL.elem` bl -> True
......@@ -185,7 +185,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
let line = createLineFromCorpus tsv del
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) line
case testErrorPerLine splitLine del headers 1 of
case testErrorPerLine splitLine del headers 1 [] of
Right _ -> True
Left _ -> validRandomCorpus tsv del)
......@@ -198,12 +198,12 @@ testTestCorrectFile :: Property
testTestCorrectFile = forAll generateFile (\file -> do
let tsv = createFile file
case testCorrectFile tsv of
Right del -> del == fDelimiter file
Right (del, _) -> del == fDelimiter file
Left _ -> Prelude.all (\x -> do
let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of
case testErrorPerLine splitLine del headers 1 [] of
Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file))
......@@ -218,7 +218,7 @@ testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of
case testErrorPerLine splitLine del headers 1 [] of
Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment