[ngrams] ignore errors when parsing TSV files

Related to #433.

Textual formats often have either encoding or column errors.

This is a common issue, see e.g.
https://duckdb.org/docs/data/csv/reading_faulty_csv_files.html

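We just ignore them: a row that does not match the expected three columns (status, label, forms) is now skipped instead of failing the whole parse.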
parent bf89561b
Pipeline #7261 passed with stages
in 87 minutes and 15 seconds
...@@ -32,6 +32,7 @@ data-files: ...@@ -32,6 +32,7 @@ data-files:
ekg-assets/bootstrap-1.4.0.min.css ekg-assets/bootstrap-1.4.0.min.css
ekg-assets/chart_line_add.png ekg-assets/chart_line_add.png
ekg-assets/cross.png ekg-assets/cross.png
test-data/ngrams/433-utf-encoding-issue.tsv
test-data/ngrams/GarganText_DocsList-nodeId-177.json test-data/ngrams/GarganText_DocsList-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json test-data/ngrams/GarganText_NgramsTerms-nodeId-177.json
test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json test-data/ngrams/GarganText_NgramsTerms-QuantumComputing.json
......
...@@ -171,35 +171,36 @@ ngramsListFromTSVData tsvData = case decodeTsv of ...@@ -171,35 +171,36 @@ ngramsListFromTSVData tsvData = case decodeTsv of
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap) decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap decodeTsv = Vec.catMaybes <$>
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') }) Tsv.decodeWithP tsvToNgramsTableMap
Tsv.HasHeader (Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
binaryData Tsv.HasHeader
binaryData
-- | Converts a plain TSV 'Record' into an NgramsTableMap -- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser (Maybe NgramsTableMap)
tsvToNgramsTableMap record = case Vec.toList record of tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms]) (map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms -> pure $ Just $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed" -- WARNING: This silently ignores errors (#433)
_ -> pure Nothing
where where
conv :: Text -> Text -> Text -> NgramsTableMap conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label) conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1 $ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of , _nre_list = case status of
True -> MapTerm "map" -> MapTerm
False -> case status == "main" of "main" -> CandidateTerm
True -> CandidateTerm _ -> StopTerm
False -> StopTerm , _nre_root = Nothing
, _nre_root = Nothing , _nre_parent = Nothing
, _nre_parent = Nothing , _nre_children = MSet
, _nre_children = MSet $ Map.fromList
$ Map.fromList $ map (\form -> (NgramsTerm form, ()))
$ map (\form -> (NgramsTerm form, ())) $ filter (\w -> w /= "" && w /= label)
$ filter (\w -> w /= "" && w /= label) $ splitOn "|&|" forms
$ splitOn "|&|" forms }
}
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -209,6 +209,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do ...@@ -209,6 +209,11 @@ tests = sequential $ aroundAll withTestDBAndPort $ beforeAllWith dbEnvSetup $ do
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty)) , (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
])]) ])])
it "parses TSV with UTF-8 issues" $ \(SpecContext _testEnv _port _app _) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/433-utf-encoding-issue.tsv")
-- we don't care about the output, only that the file was parsed without errors (this file is garbage)
ngramsListFromTSVData simpleNgrams `shouldSatisfy` isRight
it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do it "allows uploading a CSV ngrams file" $ \(SpecContext testEnv port app _) -> do
cId <- newCorpusForUser testEnv "alice" cId <- newCorpusForUser testEnv "alice"
withApplication app $ do withApplication app $ do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment