From 9a26e5652d0f616fc744188a089f56cca7c8c5e8 Mon Sep 17 00:00:00 2001
From: Alfredo Di Napoli <alfredo@well-typed.com>
Date: Mon, 31 Mar 2025 09:27:10 +0200
Subject: [PATCH] Reproduce TSV parsing issue for #380

---
 gargantext.cabal               |  1 +
 hie.yaml                       |  6 ++++
 test-data/issue-380/corpus.tsv |  9 ++++++
 test/Test/API/Private/List.hs  | 59 +++++++++++++++++++++++++++-------
 test/Test/API/Routes.hs        | 20 ++++++++++++
 test/Test/API/UpdateList.hs    |  1 +
 6 files changed, 84 insertions(+), 12 deletions(-)
 create mode 100644 test-data/issue-380/corpus.tsv

diff --git a/gargantext.cabal b/gargantext.cabal
index 4928cfcf..132aabf3 100644
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -56,6 +56,7 @@ data-files:
     test-data/test_config.toml
     test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv
     test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir12.csv
+    test-data/issue-380/corpus.tsv
     .clippy.dhall
 
 -- common options
diff --git a/hie.yaml b/hie.yaml
index 6e4b1019..c42ffd29 100644
--- a/hie.yaml
+++ b/hie.yaml
@@ -68,3 +68,9 @@ cradle:
 
     - path: "./test"
       component: "gargantext:test:garg-test-hspec"
+
+    - path: "./bench/Main.hs"
+      component: "gargantext:bench:garg-bench"
+
+    - path: "./bench/Paths_gargantext.hs"
+      component: "gargantext:bench:garg-bench"
diff --git a/test-data/issue-380/corpus.tsv b/test-data/issue-380/corpus.tsv
new file mode 100644
index 00000000..f4365e9c
--- /dev/null
+++ b/test-data/issue-380/corpus.tsv
@@ -0,0 +1,9 @@
+Title	Authors	Source	Abstract	Publication Year	Publication Month	Publication Day
+#Ecuador | 🚨Se insta	antisananews	mastodon.social	#Ecuador | 🚨Se instaló la audiencia de juicio contra el exalcalde de #Quito, Jorge Yunda, y 13 personas más, procesadas por #peculado en la compra de 100.000 pruebas para detectar #COVID19 y que presuntamente abrían causado un perjuicio al Estado por 2’235.491,16 dólares.#Comparta #Suscríbase↩👍Sígame enRadio Antisana Media Online: https://antisananews.blogspot.com/TikTok: https://www.tiktok.com/@antisanamediaonline?lang=esTelegram: https://t.me/AntisanaMediaOnlineVK: https://vk.com/antisanamultimediosX: https://twitter.com/AntisanaNews	2024	02	06
+#NorthCarolina  bann	MatthewChat@mstdn.social	toot.io	#NorthCarolina  banned immunocompromized people from wearing masks in public. This does NOT apply to the #KKK, as their is a specific exemption for them. #covid #masking	2024	05	16
+'The coronation of t	BenHigbie@mastodon.social	fosstodon.org	'The coronation of the Serbian Tsar Stefan Dušan as East Roman Emperor' from 'The Slav Epic' by Alphonse Mucha (1926) #art #arts #artist #artists #artlover #artlovers #arthistory #illustration #painting #paintings #inspiration #artmuseum #museum #artmuseums #museums #artnet	2024	05	05
+3/5 Krankenstand 202	ToveHarris	mastodon.social	3/5 Krankenstand 2022 und 2023 komme Verlust von 350.000 Beschäftigten gleich. Arbeitsausfall werde derzeit durch Überstunden + erhöhte Produktivität aufgefangen. Ohne diese Leistungen der Beschäftigten gäbe es eine Lücke von 700.000 Beschäftigten.Und #CovidIsNotOver #LongCOVID #COVID #COVID19 #Coronahttps://www.vfa.de/de/wirtschaft-politik/macroscope/macroscope-hoher-krankenstand-drueckt-deutschland-in-die-rezession	2024	01	27
+@ABScientist @Hidde	justafrog@mstdn.social	mastodon.social	@ABScientist @Hidde @roelgrif How about this one?https://nos.nl/artikel/2457983-viroloog-koopmans-coronagolf-in-china-nu-niet-heel-zorgelijk-voor-nederland	2023	12	28
+@cassandra17lina In	gemswinc	counter.social	@cassandra17lina In a way, Covid was a gift to introverts	2024	01	24
+@erictopol This is t	SpookieRobieTheCat@mastodon.social	toot.io	@erictopol This is the #CovidBrain that Trump and MAGA suffer from. All those #Antivaxx proponents will suffer the consequences too. And I'm ok with it. Be anti-science, be wilfully ignorant and live a life in agony, that's their choice. I shouldn't have to pay a dime for their stupidity.	2024	01	09
+A bunch of maskless	crowgirl@hachyderm.io	toot.io	"A bunch of maskless #Covid "experts" like Gregg Gonsalves are now trying to sound credible by warning about H5N1.I think H5N1 is a serious problem that requires airborne mitigations and decontaminating the food supply.And I also don't dine in restaurants. Gregg on the other hand is proud of his masklessness.Be warned. Don't let these professional Covid minimizers get away with this crap with #H5N1."	2024	06	03
diff --git a/test/Test/API/Private/List.hs b/test/Test/API/Private/List.hs
index 27ef09ba..f9d3d9e7 100644
--- a/test/Test/API/Private/List.hs
+++ b/test/Test/API/Private/List.hs
@@ -10,24 +10,59 @@ module Test.API.Private.List (
 
 import Data.Aeson.QQ
 import Data.Text.IO qualified as TIO
+import Fmt
 import Gargantext.API.Ngrams.List.Types
 import Gargantext.API.Ngrams.Types qualified as APINgrams
 import Gargantext.API.Node.Corpus.New.Types qualified as FType
 import Gargantext.Core.Config
 import Gargantext.Core.Types
 import Gargantext.Core.Types.Individu
+import Gargantext.Core.Worker.Types
 import Gargantext.Prelude
 import Paths_gargantext
+import Prelude
 import Servant.Client.Streaming
 import Test.API.Prelude (newCorpusForUser, checkEither)
 import Test.API.Routes
 import Test.API.Setup
+import Test.API.UpdateList qualified as UpdateList
 import Test.Database.Types
 import Test.Hspec (Spec, it, aroundAll, describe, sequential)
 import Test.Hspec.Expectations
 import Test.Hspec.Wai.Internal (withApplication)
 import Test.Utils
-import Fmt
+
+importTermsTSV :: SpecContext () -> String -> IO (JobInfo, CorpusId, ListId)
+importTermsTSV (SpecContext testEnv port app _) name = do
+  cId <- liftIO $ newCorpusForUser testEnv "alice"
+  let log_cfg = test_config testEnv ^. gc_logging
+  withApplication app $ do
+    withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
+      ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|]
+      -- Upload the CSV doc
+      simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName name)
+      let params = WithTextFile { _wtf_filetype = FType.TSV
+                                , _wtf_data = simpleNgrams
+                                , _wtf_name = "simple.tsv" }
+      pendingJob <- checkEither $ liftIO $ runClientM (add_tsv_to_list token listId params) clientEnv
+      jobInfo <- pollUntilWorkFinished log_cfg port pendingJob
+
+      pure (jobInfo, cId, listId)
+
+importCorpusTSV :: SpecContext () -> String -> IO (JobInfo, CorpusId, ListId)
+importCorpusTSV (SpecContext testEnv port app _) name = do
+  cId <- liftIO $ newCorpusForUser testEnv "alice"
+  let log_cfg = test_config testEnv ^. gc_logging
+  withApplication app $ do
+    withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
+      ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|]
+      -- Upload the CSV doc
+      simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName name)
+      let params = UpdateList.mkNewWithForm simpleNgrams "simple.tsv"
+      pendingJob <- checkEither $ liftIO $ runClientM (importCorpus token listId params) clientEnv
+      jobInfo <- pollUntilWorkFinished log_cfg port pendingJob
+
+      pure (jobInfo, cId, listId)
 
 tests :: Spec
 tests = sequential $ aroundAll withTestDBAndPort $ do
@@ -39,20 +74,20 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
 
     describe "Importing terms as TSV" $ do
 
-      it "should work for TSV with a missing 'forms' column" $ \(SpecContext testEnv port app _) -> do
-        cId <- newCorpusForUser testEnv "alice"
-        let log_cfg = test_config testEnv ^. gc_logging
+      it "should work for TSV with a missing 'forms' column" $ \ctx@(SpecContext _ port app _) -> do
+        (_, cId, listId) <- importTermsTSV ctx "test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv"
         withApplication app $ do
           withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
-            ([listId] :: [NodeId]) <- protectedJSON token "POST" (mkUrl port ("/node/" <> build cId)) [aesonQQ|{"pn_typename":"NodeList","pn_name":"Testing"}|]
-            -- Upload the CSV doc
-            simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/issue-381/Termes_A_Ajouter_T4SC_Intellixir.tsv")
-            let params = WithTextFile { _wtf_filetype = FType.TSV
-                                      , _wtf_data = simpleNgrams
-                                      , _wtf_name = "simple.tsv" }
-            pendingJob <- checkEither $ liftIO $ runClientM (add_tsv_to_list token listId params) clientEnv
-            _ <- pollUntilWorkFinished log_cfg port pendingJob
+            -- Now check that we can retrieve the ngrams, and the ngrams list is not empty!
+            liftIO $ do
+              eRes <- checkEither $ runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv
+              let (APINgrams.NgramsTable terms) = APINgrams._vc_data eRes
+              length terms `shouldSatisfy` (>= 1)
 
+      it "should handle dirty TSV as per issue #380" $ \ctx@(SpecContext _testEnv port app _) -> do
+        (_, cId, listId) <- importCorpusTSV ctx "test-data/issue-380/corpus.tsv"
+        withApplication app $ do
+          withValidLogin port "alice" (GargPassword "alice") $ \clientEnv token -> do
             -- Now check that we can retrieve the ngrams, and the ngrams list is not empty!
             liftIO $ do
               eRes <- checkEither $ runClientM (get_table_ngrams token cId APINgrams.Terms listId 50 Nothing (Just MapTerm) Nothing Nothing Nothing Nothing) clientEnv
diff --git a/test/Test/API/Routes.hs b/test/Test/API/Routes.hs
index 7567eb6c..76ba191f 100644
--- a/test/Test/API/Routes.hs
+++ b/test/Test/API/Routes.hs
@@ -30,6 +30,7 @@ module Test.API.Routes (
   , add_tsv_to_list
   , get_corpus_sqlite_export
   , addTeamMember
+  , importCorpus
   ) where
 
 import Data.Text.Encoding qualified as TE
@@ -66,6 +67,8 @@ import Servant (Headers, Header)
 import Servant.Auth.Client qualified as S
 import Servant.Client.Streaming
 import Servant.Conduit ()
+import Gargantext.API.Routes.Named.Corpus (addWithTempFileEp)
+import Gargantext.API.Node.Types (NewWithForm)
 
 
 -- This is for requests made by http.client directly to hand-crafted URLs.
@@ -381,3 +384,20 @@ addTeamMember (toServantToken -> token) nodeId params = fmap UnsafeMkNodeId $
                & shareAPI
                & shareNodeEp
                & ($ params)
+
+importCorpus :: Token -> CorpusId -> NewWithForm -> ClientM JobInfo
+importCorpus (toServantToken -> token) corpusId params =
+  clientRoutes & apiWithCustomErrorScheme
+               & ($ GES_new)
+               & backendAPI
+               & backendAPI'
+               & mkBackEndAPI
+               & gargAPIVersion
+               & gargPrivateAPI
+               & mkPrivateAPI
+               & ($ token)
+               & addWithTempFile
+               & addWithTempFileEp
+               & ($ corpusId)
+               & workerAPIPost
+               & (\submitForm -> submitForm params)
diff --git a/test/Test/API/UpdateList.hs b/test/Test/API/UpdateList.hs
index 01822ee6..4a3597b2 100644
--- a/test/Test/API/UpdateList.hs
+++ b/test/Test/API/UpdateList.hs
@@ -24,6 +24,7 @@ module Test.API.UpdateList (
   , updateNode
   , createDocsList
   , createFortranDocsList
+  , mkNewWithForm
   ) where
 
 import Control.Lens (mapped, over)
-- 
2.21.0