[ngrams] ignore errors when parsing TSV files

Related to
#433

Files in textual formats such as TSV often contain encoding errors or rows with the wrong number of columns.

This is a common issue, see e.g.
https://duckdb.org/docs/data/csv/reading_faulty_csv_files.html

We now skip the rows that cannot be parsed instead of failing the whole import.
parent bf89561b
Pipeline #7261 passed with stages
in 87 minutes and 15 seconds
......@@ -32,6 +32,7 @@ data-files:
ekg-assets/bootstrap-1.4.0.min.css
ekg-assets/chart_line_add.png
ekg-assets/cross.png
test-data/ngrams/433-utf-encoding-issue.tsv
test-data/ngrams/GarganText_DocsList-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json
......
......@@ -171,27 +171,28 @@ ngramsListFromTSVData tsvData = case decodeTsv of
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
decodeTsv = Vec.catMaybes <$>
Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
Tsv.HasHeader
binaryData
-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser (Maybe NgramsTableMap)
tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed"
-> pure $ Just $ conv status label forms
-- WARNING: This silently ignores errors (#433)
_ -> pure Nothing
where
conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of
True -> MapTerm
False -> case status == "main" of
True -> CandidateTerm
False -> StopTerm
, _nre_list = case status of
"map" -> MapTerm
"main" -> CandidateTerm
_ -> StopTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = MSet
......
......@@ -209,6 +209,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
])])
it "parses TSV with UTF-8 issues" $ \(SpecContext _testEnv _port _app _) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/433-utf-encoding-issue.tsv")
-- we don't care about the output, only that the file was parsed without errors (this file is garbage)
ngramsListFromTSVData simpleNgrams `shouldSatisfy` isRight
it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do
cId <- newCorpusForUser testEnv "alice"
withApplication app $ do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment