From 9a26e5652d0f616fc744188a089f56cca7c8c5e8 Mon Sep 17 00:00:00 2001 From: Alfredo Di Napoli <alfredo@well-typed.com> Date: Mon, 31 Mar 2025 09:27:10 +0200 Subject: [PATCH] Reproduce TSV parsing issue for #380 --- gargantext.cabal | 1 + hie.yaml | 6 ++++ test-data/issue-380/corpus.tsv | 9 ++++++ test/Test/API/Private/List.hs | 59 +++++++++++++++++++++++++++------- test/Test/API/Routes.hs | 20 ++++++++++++ test/Test/API/UpdateList.hs | 1 + 6 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 test-data/issue-380/corpus.tsv diff --git a/gargantext.cabal b/gargantext.cabal index 4928cfcf..132aabf3 100644 --- a/gargantext.cabal +++ b/gargantext.cabal @@ -56,6 +56,7 @@ data-files: test-data/test_config.toml test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir12.csv + test-data/issue-380/corpus.tsv .clippy.dhall -- common options diff --git a/hie.yaml b/hie.yaml index 6e4b1019..c42ffd29 100644 --- a/hie.yaml +++ b/hie.yaml @@ -68,3 +68,9 @@ cradle: - path: "./test" component: "gargantext:test:garg-test-hspec" + + - path: "./bench/Main.hs" + component: "gargantext:bench:garg-bench" + + - path: "./bench/Paths_gargantext.hs" + component: "gargantext:bench:garg-bench" diff --git a/test-data/issue-380/corpus.tsv b/test-data/issue-380/corpus.tsv new file mode 100644 index 00000000..f4365e9c --- /dev/null +++ b/test-data/issue-380/corpus.tsv @@ -0,0 +1,9 @@ +Title Authors Source Abstract Publication Year Publication Month Publication Day +#Ecuador | 🚨Se insta antisananews mastodon.social #Ecuador | 🚨Se instaló la audiencia de juicio contra el exalcalde de #Quito, Jorge Yunda, y 13 personas más, procesadas por #peculado en la compra de 100.000 pruebas para detectar #COVID19 y que presuntamente abrÃan causado un perjuicio al Estado por 2’235.491,16 dólares.#Comparta #SuscrÃbase↩ðŸ‘SÃgame enRadio Antisana Media Online: https://antisananews.blogspot.com/TikTok: https://www.tiktok.com/@antisanamediaonline?lang=esTelegram: https://t.me/AntisanaMediaOnlineVK: https://vk.com/antisanamultimediosX: https://twitter.com/AntisanaNews 2024 02 06 +#NorthCarolina bann MatthewChat@mstdn.social toot.io #NorthCarolina banned immunocompromized people from wearing masks in public. This does NOT apply to the #KKK, as their is a specific exemption for them. #covid #masking 2024 05 16 +'The coronation of t BenHigbie@mastodon.social fosstodon.org 'The coronation of the Serbian Tsar Stefan DuÅ¡an as East Roman Emperor' from 'The Slav Epic' by Alphonse Mucha (1926) #art #arts #artist #artists #artlover #artlovers #arthistory #illustration #painting #paintings #inspiration #artmuseum #museum #artmuseums #museums #artnet 2024 05 05 +3/5 Krankenstand 202 ToveHarris mastodon.social 3/5 Krankenstand 2022 und 2023 komme Verlust von 350.000 Beschäftigten gleich. Arbeitsausfall werde derzeit durch Überstunden + erhöhte Produktivität aufgefangen. Ohne diese Leistungen der Beschäftigten gäbe es eine Lücke von 700.000 Beschäftigten.Und #CovidIsNotOver #LongCOVID #COVID #COVID19 #Coronahttps://www.vfa.de/de/wirtschaft-politik/macroscope/macroscope-hoher-krankenstand-drueckt-deutschland-in-die-rezession 2024 01 27 +@ABScientist @Hidde justafrog@mstdn.social mastodon.social @ABScientist @Hidde @roelgrif How about this one?https://nos.nl/artikel/2457983-viroloog-koopmans-coronagolf-in-china-nu-niet-heel-zorgelijk-voor-nederland 2023 12 28 +@cassandra17lina In gemswinc counter.social @cassandra17lina In a way, Covid was a gift to introverts 2024 01 24 +@erictopol This is t SpookieRobieTheCat@mastodon.social toot.io @erictopol This is the #CovidBrain that Trump and MAGA suffer from. All those #Antivaxx proponents will suffer the consequences too. And I'm ok with it. Be anti-science, be wilfully ignorant and live a life in agony, that's their choice. I shouldn't have to pay a dime for their stupidity. 2024 01 09 +A bunch of maskless crowgirl@hachyderm.io toot.io "A bunch of maskless #Covid "experts" like Gregg Gonsalves are now trying to sound credible by warning about H5N1.I think H5N1 is a serious problem that requires airborne mitigations and decontaminating the food supply.And I also don't dine in restaurants. Gregg on the other hand is proud of his masklessness.Be warned. Don't let these professional Covid minimizers get away with this crap with #H5N1." 2024 06 03 diff --git a/test/Test/API/Private/List.hs b/test/Test/API/Private/List.hs index 27ef09ba..f9d3d9e7 100644 --- a/test/Test/API/Private/List.hs +++ b/test/Test/API/Private/List.hs @@ -10,24 +10,59 @@ module Test.API.Private.List ( import Data.Aeson.QQ import Data.Text.IO qualified as TIO +import Fmt import Gargantext.API.Ngrams.List.Types import Gargantext.API.Ngrams.Types qualified as APINgrams import Gargantext.API.Node.Corpus.New.Types qualified as FType import Gargantext.Core.Config import Gargantext.Core.Types import Gargantext.Core.Types.Individu +import Gargantext.Core.Worker.Types import Gargantext.Prelude import Paths_gargantext +import Prelude import Servant.Client.Streaming import Test.API.Prelude (newCorpusForUser, checkEither) import Test.API.Routes import Test.API.Setup +import Test.API.UpdateList qualified as UpdateList import Test.Database.Types import Test.Hspec (Spec, it, aroundAll, describe, sequential) import Test.Hspec.Expectations import Test.Hspec.Wai.Internal (withApplication) import Test.Utils -import Fmt + +importTermsTSV :: SpecContext () -> String -> IO (JobInfo, CorpusId, ListId) +importTermsTSV (SpecContext testEnv port app _) name = do + cId <- liftIO $ newCorpusForUser testEnv "alice" + let log_cfg = test_config testEnv ^. gc_logging + withApplication app $ do + withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do + ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|] + -- Upload the CSV doc + simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName name) + let params = WithTextFile { _wtf_filetype = FType.TSV + , _wtf_data = simpleNgrams + , _wtf_name = "simple.tsv" } + pendingJob <- checkEither $ liftIO $ runClientM (add_tsv_to_list token listId params) clientEnv + jobInfo <- pollUntilWorkFinished log_cfg port pendingJob + + pure (jobInfo, cId, listId) + +importCorpusTSV :: SpecContext () -> String -> IO (JobInfo, CorpusId, ListId) +importCorpusTSV (SpecContext testEnv port app _) name = do + cId <- liftIO $ newCorpusForUser testEnv "alice" + let log_cfg = test_config testEnv ^. gc_logging + withApplication app $ do + withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do + ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|] + -- Upload the CSV doc + simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName name) + let params = UpdateList.mkNewWithForm simpleNgrams "simple.tsv" + pendingJob <- checkEither $ liftIO $ runClientM (importCorpus token listId params) clientEnv + jobInfo <- pollUntilWorkFinished log_cfg port pendingJob + + pure (jobInfo, cId, listId) tests :: Spec tests = sequential $ aroundAll withTestDBAndPort $ do @@ -39,20 +74,20 @@ tests = sequential $ aroundAll withTestDBAndPort $ do describe "Importing terms as TSV" $ do - it "should work for TSV with a missing 'forms' column" $ \(SpecContext testEnv port app _) -> do - cId <- newCorpusForUser testEnv "alice" - let log_cfg = test_config testEnv ^. gc_logging + it "should work for TSV with a missing 'forms' column" $ \ctx@(SpecContext _ port app _) -> do + (_, cId, listId) <- importTermsTSV ctx "test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv" withApplication app $ do withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do - ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|] - -- Upload the CSV doc - simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv") - let params = WithTextFile { _wtf_filetype = FType.TSV - , _wtf_data = simpleNgrams - , _wtf_name = "simple.tsv" } - pendingJob <- checkEither $ liftIO $ runClientM (add_tsv_to_list token listId params) clientEnv - _ <- pollUntilWorkFinished log_cfg port pendingJob + -- Now check that we can retrieve the ngrams, and the ngrams list is not empty! + liftIO $ do + eRes <- checkEither $ runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv + let (APINgrams.NgramsTable terms) = APINgrams._vc_data eRes + length terms `shouldSatisfy` (>= 1) + it "should handle dirty TSV as per issue #380" $ \ctx@(SpecContext _testEnv port app _) -> do + (_, cId, listId) <- importCorpusTSV ctx "test-data/issue-380/corpus.tsv" + withApplication app $ do + withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do -- Now check that we can retrieve the ngrams, and the ngrams list is not empty! liftIO $ do eRes <- checkEither $ runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv diff --git a/test/Test/API/Routes.hs b/test/Test/API/Routes.hs index 7567eb6c..76ba191f 100644 --- a/test/Test/API/Routes.hs +++ b/test/Test/API/Routes.hs @@ -30,6 +30,7 @@ module Test.API.Routes ( , add_tsv_to_list , get_corpus_sqlite_export , addTeamMember + , importCorpus ) where import Data.Text.Encoding qualified as TE @@ -66,6 +67,8 @@ import Servant (Headers, Header) import Servant.Auth.Client qualified as S import Servant.Client.Streaming import Servant.Conduit () +import Gargantext.API.Routes.Named.Corpus (addWithTempFileEp) +import Gargantext.API.Node.Types (NewWithForm) -- This is for requests made by http.client directly to hand-crafted URLs. @@ -381,3 +384,20 @@ addTeamMember (toServantToken -> token) nodeId params = fmap UnsafeMkNodeId $ & shareAPI & shareNodeEp & ($ params) + +importCorpus :: Token -> CorpusId -> NewWithForm -> ClientM JobInfo +importCorpus (toServantToken -> token) corpusId params = + clientRoutes & apiWithCustomErrorScheme + & ($ GES_new) + & backendAPI + & backendAPI' + & mkBackEndAPI + & gargAPIVersion + & gargPrivateAPI + & mkPrivateAPI + & ($ token) + & addWithTempFile + & addWithTempFileEp + & ($ corpusId) + & workerAPIPost + & (\submitForm -> submitForm params) diff --git a/test/Test/API/UpdateList.hs b/test/Test/API/UpdateList.hs index 01822ee6..4a3597b2 100644 --- a/test/Test/API/UpdateList.hs +++ b/test/Test/API/UpdateList.hs @@ -24,6 +24,7 @@ module Test.API.UpdateList ( , updateNode , createDocsList , createFortranDocsList + , mkNewWithForm ) where import Control.Lens (mapped, over) -- 2.21.0