[ngrams] ignore errors when parsing TSV files

Textual formats such as TSV often contain encoding errors or rows with the wrong number of columns.

This is a common issue; see e.g. issue #433.

We now silently skip rows that cannot be converted instead of failing the whole parse.
......@@ -32,6 +32,7 @@ data-files:
......@@ -171,27 +171,28 @@ ngramsListFromTSVData tsvData = case decodeTsv of
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
decodeTsv = Vec.catMaybes <$>
Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser (Maybe NgramsTableMap)
tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed"
-> pure $ Just $ conv status label forms
-- WARNING: This silently ignores errors (#433)
_ -> pure Nothing
conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of
True -> MapTerm
False -> case status == "main" of
True -> CandidateTerm
False -> StopTerm
, _nre_list = case status of
"map" -> MapTerm
"main" -> CandidateTerm
_ -> StopTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = MSet
......@@ -209,6 +209,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
it "parses TSV with UTF-8 issues" $ \(SpecContext _testEnv _port _app _) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/433-utf-encoding-issue.tsv")
-- we don't care about the output, only that the file was parsed without errors (this file is garbage)
ngramsListFromTSVData simpleNgrams `shouldSatisfy` isRight
it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do
cId <- newCorpusForUser testEnv "alice"
withApplication app $ do
