Commit 946448be authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/433-dev-ignore-tsv-errors' into dev

parents 3bf63d30 a5f299c5
...@@ -75,6 +75,7 @@ import_p = fmap CCMD_import $ ImportArgs ...@@ -75,6 +75,7 @@ import_p = fmap CCMD_import $ ImportArgs
<*> ( option str ( long "user") ) <*> ( option str ( long "user") )
<*> ( option str ( long "name") ) <*> ( option str ( long "name") )
<*> settings_p <*> settings_p
-- <*> (fmap Limit ( option auto ( long "limit" <> metavar "INT" <> help "The limit for the query") ))
<*> ( option str ( long "corpus-path" <> help "Path to corpus file") ) <*> ( option str ( long "corpus-path" <> help "Path to corpus file") )
function_p :: String -> Either String ImportFunction function_p :: String -> Either String ImportFunction
......
...@@ -18,8 +18,8 @@ fi ...@@ -18,8 +18,8 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and # with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI # `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in. # cache can kick in.
expected_cabal_project_hash="37352ca36ca5e69d9945da11439be4c3909297b338242855fa588dffdf1ba02b" expected_cabal_project_hash="37352ca36ca5e69d9945da11439be4c3909297b338242855fa588dffdf1ba02b"
expected_cabal_project_freeze_hash="cd52143d3a9d285360b59c6371d3e258552c1bc115bd612024db3de1f7593ff7" expected_cabal_project_freeze_hash="cd52143d3a9d285360b59c6371d3e258552c1bc115bd612024db3de1f7593ff7"
cabal --store-dir=$STORE_DIR v2-build --dry-run cabal --store-dir=$STORE_DIR v2-build --dry-run
......
...@@ -32,6 +32,7 @@ data-files: ...@@ -32,6 +32,7 @@ data-files:
ekg-assets/bootstrap-1.4.0.min.css ekg-assets/bootstrap-1.4.0.min.css
ekg-assets/chart_line_add.png ekg-assets/chart_line_add.png
ekg-assets/cross.png ekg-assets/cross.png
test-data/ngrams/433-utf-encoding-issue.tsv
test-data/ngrams/GarganText_DocsList-nodeId-177.json test-data/ngrams/GarganText_DocsList-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json
......
...@@ -178,35 +178,36 @@ ngramsListFromTSVData tsvData = case decodeTsv of ...@@ -178,35 +178,36 @@ ngramsListFromTSVData tsvData = case decodeTsv of
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap) decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap decodeTsv = Vec.catMaybes <$>
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') }) Tsv.decodeWithP tsvToNgramsTableMap
Tsv.HasHeader (Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
binaryData Tsv.HasHeader
binaryData
-- | Converts a plain TSV 'Record' into an NgramsTableMap -- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser (Maybe NgramsTableMap)
tsvToNgramsTableMap record = case Vec.toList record of tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms]) (map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms -> pure $ Just $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed" -- WARNING: This silently ignores errors (#433)
_ -> pure Nothing
where where
conv :: Text -> Text -> Text -> NgramsTableMap conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label) conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1 $ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of , _nre_list = case status of
True -> MapTerm "map" -> MapTerm
False -> case status == "main" of "main" -> CandidateTerm
True -> CandidateTerm _ -> StopTerm
False -> StopTerm , _nre_root = Nothing
, _nre_root = Nothing , _nre_parent = Nothing
, _nre_parent = Nothing , _nre_children = MSet
, _nre_children = MSet $ Map.fromList
$ Map.fromList $ map (\form -> (NgramsTerm form, ()))
$ map (\form -> (NgramsTerm form, ())) $ filter (\w -> w /= "" && w /= label)
$ filter (\w -> w /= "" && w /= label) $ splitOn "|&|" forms
$ splitOn "|&|" forms }
}
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -418,6 +418,7 @@ insertMasterDocs ncs c lang hs = do ...@@ -418,6 +418,7 @@ insertMasterDocs ncs c lang hs = do
-- add documents to the corpus (create node_node link) -- add documents to the corpus (create node_node link)
-- this will enable global database monitoring -- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount))) mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
<- mapNodeIdNgrams <- mapNodeIdNgrams
<$> documentIdWithNgrams <$> documentIdWithNgrams
......
...@@ -23,8 +23,8 @@ import Control.Concurrent.STM.TChan ...@@ -23,8 +23,8 @@ import Control.Concurrent.STM.TChan
import Control.Concurrent.STM.TSem (newTSem, signalTSem, TSem) import Control.Concurrent.STM.TSem (newTSem, signalTSem, TSem)
import Control.Concurrent (threadDelay) import Control.Concurrent (threadDelay)
import Control.Lens ((^.)) import Control.Lens ((^.))
import Control.Monad.STM (atomically)
import Control.Monad (void) import Control.Monad (void)
import Control.Monad.STM (atomically)
import Data.Aeson qualified as Aeson import Data.Aeson qualified as Aeson
import Data.ByteString qualified as BS import Data.ByteString qualified as BS
import Data.Text.Encoding qualified as TE import Data.Text.Encoding qualified as TE
...@@ -47,12 +47,14 @@ import Test.Database.Types (test_config) ...@@ -47,12 +47,14 @@ import Test.Database.Types (test_config)
import Test.Hspec import Test.Hspec
import Test.Hspec.Wai.Internal (withApplication) import Test.Hspec.Wai.Internal (withApplication)
import Test.Instances () import Test.Instances ()
import Test.Utils (waitForTChanValue, waitForTSem)
import Test.Utils.Notifications (withAsyncWSConnection) import Test.Utils.Notifications (withAsyncWSConnection)
import Test.Utils (protected, waitForTChanValue, waitForTSem, withValidLoginA) import Test.Utils (protected, withValidLoginA)
import Text.RawString.QQ (r) import Text.RawString.QQ (r)
tests :: Spec tests :: Spec
tests = sequential $ around withTestDBAndPort $ do tests = sequential $ around withTestDBAndPort $ do
describe "Notifications" $ do describe "Notifications" $ do
......
...@@ -190,7 +190,7 @@ dbEnvSetup ctx = do ...@@ -190,7 +190,7 @@ dbEnvSetup ctx = do
_ <- createAliceAndBob testEnv _ <- createAliceAndBob testEnv
pure ctx pure ctx
-- show the full exceptions during testing, rather than shallowing them under a generic -- show the full exceptions during testing, rather than shallowing them under a generic
-- "Something went wrong". -- "Something went wrong".
showDebugExceptions :: SomeException -> Wai.Response showDebugExceptions :: SomeException -> Wai.Response
......
...@@ -213,6 +213,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do ...@@ -213,6 +213,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty)) , (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
])]) ])])
it "parses TSV with UTF-8 issues" $ \(SpecContext _testEnv _port _app _) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/433-utf-encoding-issue.tsv")
-- we don't care about the output, only that the file was parsed without errors (this file is garbage)
ngramsListFromTSVData simpleNgrams `shouldSatisfy` isRight
it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do
cId <- newCorpusForUser testEnv "alice" cId <- newCorpusForUser testEnv "alice"
let log_cfg = (test_config testEnv) ^. gc_logging let log_cfg = (test_config testEnv) ^. gc_logging
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment