Commit 946448be authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/433-dev-ignore-tsv-errors' into dev

parents 3bf63d30 a5f299c5
......@@ -75,6 +75,7 @@ import_p = fmap CCMD_import $ ImportArgs
<*> ( option str ( long "user") )
<*> ( option str ( long "name") )
<*> settings_p
-- <*> (fmap Limit ( option auto ( long "limit" <> metavar "INT" <> help "The limit for the query") ))
<*> ( option str ( long "corpus-path" <> help "Path to corpus file") )
function_p :: String -> Either String ImportFunction
......
......@@ -18,8 +18,8 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash="37352ca36ca5e69d9945da11439be4c3909297b338242855fa588dffdf1ba02b"
expected_cabal_project_freeze_hash="cd52143d3a9d285360b59c6371d3e258552c1bc115bd612024db3de1f7593ff7"
expected_cabal_project_hash="37352ca36ca5e69d9945da11439be4c3909297b338242855fa588dffdf1ba02b"
expected_cabal_project_freeze_hash="cd52143d3a9d285360b59c6371d3e258552c1bc115bd612024db3de1f7593ff7"
cabal --store-dir=$STORE_DIR v2-build --dry-run
......
......@@ -32,6 +32,7 @@ data-files:
ekg-assets/bootstrap-1.4.0.min.css
ekg-assets/chart_line_add.png
ekg-assets/cross.png
test-data/ngrams/433-utf-encoding-issue.tsv
test-data/ngrams/GarganText_DocsList-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json
......
......@@ -178,35 +178,36 @@ ngramsListFromTSVData tsvData = case decodeTsv of
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
Tsv.HasHeader
binaryData
decodeTsv = Vec.catMaybes <$>
Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
Tsv.HasHeader
binaryData
-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser (Maybe NgramsTableMap)
tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed"
-> pure $ Just $ conv status label forms
-- WARNING: This silently ignores errors (#433)
_ -> pure Nothing
where
conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of
True -> MapTerm
False -> case status == "main" of
True -> CandidateTerm
False -> StopTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = MSet
$ Map.fromList
$ map (\form -> (NgramsTerm form, ()))
$ filter (\w -> w /= "" && w /= label)
$ splitOn "|&|" forms
}
, _nre_list = case status of
"map" -> MapTerm
"main" -> CandidateTerm
_ -> StopTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = MSet
$ Map.fromList
$ map (\form -> (NgramsTerm form, ()))
$ filter (\w -> w /= "" && w /= label)
$ splitOn "|&|" forms
}
------------------------------------------------------------------------
......
......@@ -418,6 +418,7 @@ insertMasterDocs ncs c lang hs = do
-- add documents to the corpus (create node_node link)
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
......
......@@ -23,8 +23,8 @@ import Control.Concurrent.STM.TChan
import Control.Concurrent.STM.TSem (newTSem, signalTSem, TSem)
import Control.Concurrent (threadDelay)
import Control.Lens ((^.))
import Control.Monad.STM (atomically)
import Control.Monad (void)
import Control.Monad.STM (atomically)
import Data.Aeson qualified as Aeson
import Data.ByteString qualified as BS
import Data.Text.Encoding qualified as TE
......@@ -47,12 +47,14 @@ import Test.Database.Types (test_config)
import Test.Hspec
import Test.Hspec.Wai.Internal (withApplication)
import Test.Instances ()
import Test.Utils (waitForTChanValue, waitForTSem)
import Test.Utils.Notifications (withAsyncWSConnection)
import Test.Utils (protected, waitForTChanValue, waitForTSem, withValidLoginA)
import Test.Utils (protected, withValidLoginA)
import Text.RawString.QQ (r)
tests :: Spec
tests = sequential $ around withTestDBAndPort $ do
describe "Notifications" $ do
......
......@@ -190,7 +190,7 @@ dbEnvSetup ctx = do
_ <- createAliceAndBob testEnv
pure ctx
-- show the full exceptions during testing, rather than shallowing them under a generic
-- "Something went wrong".
showDebugExceptions :: SomeException -> Wai.Response
......
......@@ -213,6 +213,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
])])
it "parses TSV with UTF-8 issues" $ \(SpecContext _testEnv _port _app _) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/433-utf-encoding-issue.tsv")
-- we don't care about the output, only that the file was parsed without errors (this file is garbage)
ngramsListFromTSVData simpleNgrams `shouldSatisfy` isRight
it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do
cId <- newCorpusForUser testEnv "alice"
let log_cfg = (test_config testEnv) ^. gc_logging
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment