Merge branch '471-dev-node-multiterms' into 'dev'

Resolve "[Node Terms] On multiple Map terms, if the first term is already a map term, the count of the multiple map term is to zero" Closes #471 See merge request !414

Merge branch '471-dev-node-multiterms' into 'dev'
Resolve "[Node Terms] On multiple Map terms, if the first term is already a map term, the count of the multiple map term is to zero" Closes #471 See merge request !414
27192b88 · Przemyslaw Kaminski · 61aac410 · 5ca033e9 · 27192b88 · 27192b88
Commit 27192b88 authored Jul 31, 2025 by Przemyslaw Kaminski
17 changed files
--- a/.ghci
+++ b/.ghci
--- a/README.md
+++ b/README.md
@@ -196,7 +196,7 @@ Here are some tips:
 * Lock the hackage-index state in the `cabal.project`, so that the
  solver won't try to pull newer dependencies;
 * Specify constraints you want directly when building like `cabal
-  v2-build --constraint tasty==x.y.z.w`
+  v2-build --constraint hspec==x.y.z.w`
 * Generate another `.freeze` with `cabal v2-freeze` once you got the
  new build to compile (this is good for small, incremental upgrades)
 * Bounds in `.cabal` are definitely respected, but ofc the `.freeze`
@@ -250,16 +250,9 @@ Or, from "outside":
 $ nix-shell --run "cabal v2-test --test-show-details=streaming"
 ```

-If you want to run particular tests, use (for Tasty):
-
-```shell
-cabal v2-test garg-test-tasty --test-show-details=streaming --test-option=--pattern='/job status update and tracking/
-```
-
-or (for Hspec):
-
+If you want to run particular tests, use:
 ```shell
-cabal v2-test garg-test-hspec --test-show-details=streaming --test-option=--match='/Dispatcher, Central Exchange, WebSockets/'
+cabal v2-test garg-test --test-show-details=streaming --test-option=--match='/job status update and tracking/
 ```

 ### CI
@@ -427,15 +420,26 @@ Maybe you need to change the port to 5433 for database connection in your gargan
 If you want to use `haskell-language-server` for GHC 9.4.7, install it
 with `ghcup`:
 ```shell
-ghcup compile hls --version 2.7.0.0 --ghc 9.4.7
+ghcup compile hls --version 2.11.0.0 --ghc 9.6.6 --cabal-update
 ```
 https://haskell-language-server.readthedocs.io/en/latest/installation.html

+*NOTE* Sometimes it helps to remove `$HOME/.cache/hie-bios` if you
+experience strange errors like
+```
+fatal: Could not parse object '316d48b6a89593faaf1f2102e9714cea7e416e56'.
+```
+
+If you're using emacs, you could fire up an interactive REPL directly
+via e.g. `haskell-interactive-bring`.
+
+In repl, the `.ghci` file from our repo would be loaded which contains
+a useful `:ggload` macro that imports all exposed modules from the
+`gargantext` library.

 ## Running the tests <a name="running-tests"></a>

 Running the tests can be done via the following command:
-
 ```shell
 cabal v2-test --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs'
 ```
@@ -446,28 +450,22 @@ The flags have the following meaning:
  faster;
 * `no-phylo-debug-logs`: Suppresses the debugging logs which would normally be present in phylo pure (!) code.

-In order for some tests to run (like the phylo ones) is **required** to install the `gargantext-cli` via:
-
-```shell
-cabal v2-install gargantext:exe:gargantext
-```
-
-For tasty, if you want to run specific test (via patterns), use:
-
+**NOTE**: Using the above flags will cause rebuild, if you just build with `cabal v2-build`. Run with:
 ```shell
-cabal v2-run garg-test-tasty -- -p '/Ngrams/
+cabal v2-test --test-show-details=streaming
 ```
+so you don't rebuild.

-For integration tests, do:
+If you want to run specific test (via patterns), use:

 ```shell
-cabal v2-test garg-test-hspec --test-show-details=streaming --test-option=--match='/some pattern/'
+cabal v2-run garg-test --test-show-details=streaming --test-option=--match='/Ngrams/'
 ```

 You could also use [ghciwatch](https://mercurytechnologies.github.io/ghciwatch/) for testsing:

 ```shell
-ghciwatch --command "cabal v2-repl garg-test-tasty" --after-startup-ghci ':set args "--pattern" "/Ngrams/"' --after-startup-ghci "Main.main" --after-reload-ghci "Main.main" --watch src --watch test
+ghciwatch --command "cabal v2-repl garg-test" --after-startup-ghci ':set args "--match" "/Ngrams/"' --after-startup-ghci "Main.main" --after-reload-ghci "Main.main" --watch src --watch test
 ```

 ### Modifying a golden test to accept a new (expected) output
@@ -475,11 +473,16 @@ ghciwatch --command "cabal v2-repl garg-test-tasty" --after-startup-ghci ':set a
 Some tests, like the Phylo one, use golden testing to ensure that the JSON Phylo we generate is
 the same as an expected one. This allows us to catch regressions in the serialisation or in the algorithm.
 Sometimes, however, we genuinely want to modify the output so that it's the new reference (i.e. the new
-golden reference). To do so, it's enough to run the testsuite passing the `--accept` flag, for example:
+golden reference). 

+If you do want to update the output, go to `test/Test/Offline/Phylo.hs` and set `doUpdateGolden = True`.
+Run tests:
 ```shell
-cabal v2-test garg-test-tasty --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs' --test-option=--pattern='/Phylo/' --test-option=--accept"
+cabal v2-test garg-test --test-show-details=streaming --test-option=--match='/Phylo/'
 ```
+The second time you run tests, they should pass.
+
+**Don't forget to set `doUpdateGolden = False` after updating the golden files!**

 # Async workers <a name="async-workers"></a>


--- a/nix/pkgs.nix
+++ b/nix/pkgs.nix
 # https://nix.dev/tutorials/first-steps/towards-reproducibility-pinning-nixpkgs.html
-{ pkgs ? import (if builtins.elem builtins.currentSystem ["x86_64-darwin" "aarch64-darwin"]
-                 then ./pinned-25.05.darwin.nix
-                 else ./pinned-25.05.nix) {} }:
+{ pkgs ? import
+    (if builtins.elem builtins.currentSystem [ "x86_64-darwin" "aarch64-darwin" ]
+    then ./pinned-25.05.darwin.nix
+    else ./pinned-25.05.nix)
+    { }
+}:

 rec {
  inherit pkgs;
  ghcVersion = "ghc966";
  gargGhc = pkgs.haskell.compiler.${ghcVersion};
  cabal_install = pkgs.haskell.lib.compose.justStaticExecutables pkgs.haskell.packages.${ghcVersion}.cabal-install;
-  
-  graphviz = pkgs.callPackage ./graphviz.nix {};
-  igraph_0_10_4 = pkgs.callPackage ./igraph.nix {};
-  corenlp = pkgs.callPackage ./corenlp.nix { };  # 4.5.9
+
+  graphviz = pkgs.callPackage ./graphviz.nix { };
+  igraph_0_10_4 = pkgs.callPackage ./igraph.nix { };
+  corenlp = pkgs.callPackage ./corenlp.nix { }; # 4.5.9
  cabal2stack = pkgs.callPackage ./cabal2stack.nix { ghc = gargGhc; };
  nng_notls = pkgs.nng.overrideAttrs (old: {
-    cmakeFlags = (old.cmakeFlags or []) ++ [ "-DNNG_ENABLE_TLS=OFF" ];
+    cmakeFlags = (old.cmakeFlags or [ ]) ++ [ "-DNNG_ENABLE_TLS=OFF" ];
  });
-  
+
  hsBuildInputs = [
    gargGhc
    cabal_install
    pkgs.haskellPackages.alex
-    pkgs.haskellPackages.ghcid
    pkgs.haskellPackages.happy
    pkgs.haskellPackages.pretty-show
  ];
@@ -49,7 +51,7 @@ rec {
    libpqxx
    libsodium
    nng_notls
-    nil  # nix language server
+    nil # nix language server
    pcre
    pkg-config
    postgresql
@@ -58,9 +60,9 @@ rec {
    zlib
    zeromq
    curl
-  ] ++ ( lib.optionals stdenv.isDarwin [
-       darwin.apple_sdk.frameworks.Accelerate
-       ]);
+  ] ++ (lib.optionals stdenv.isDarwin [
+    darwin.apple_sdk.frameworks.Accelerate
+  ]);
  libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
  shellHook = ''
    export LD_LIBRARY_PATH="${pkgs.gfortran.cc.lib}:${libPaths}:$LD_LIBRARY_PATH"

--- a/src/Gargantext/Core/Text/Terms/Mono.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono.hs
@@ -18,9 +18,9 @@ module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentenc
 import Data.List qualified as L
 import Data.Set qualified as S
 import Data.Text qualified as T
-import Gargantext.Core
+import Gargantext.Core ( Lang )
 import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
-import Gargantext.Core.Types
+import Gargantext.Core.Types ( TermsWithCount, Terms(..) )
 import Gargantext.Prelude hiding (words)
 import Prelude (String)
 --import Data.Char (isAlphaNum, isSpace)
@@ -43,7 +43,8 @@ monoTexts = L.concat . monoTextsBySentence

 -- | TODO use text2term only
 monoText2term :: Lang -> Text -> Terms
-monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)
+monoText2term lang txt = Terms { _terms_label = [txt]
+                               , _terms_stem = S.singleton $ stem lang PorterAlgorithm txt }

 monoTextsBySentence :: Text -> [[Text]]
 monoTextsBySentence = map T.words

--- a/src/Gargantext/Core/Text/Terms/WithList.hs
+++ b/src/Gargantext/Core/Text/Terms/WithList.hs
@@ -11,7 +11,6 @@ Here is a longer description of this module, containing some
 commentary with @some markup@.

 -}
-{-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE ViewPatterns      #-}

 module Gargantext.Core.Text.Terms.WithList (
@@ -26,6 +25,10 @@ module Gargantext.Core.Text.Terms.WithList (

  -- * Properties
  , prop_patterns_internal_consistency
+
+  -- * For debugging
+  , ReplaceTerms(..)
+  , replaceTerms
  ) where

 import Prelude (show)
@@ -39,7 +42,7 @@ import Data.Text qualified as T
 import GHC.Exts (sortWith)
 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
 import Gargantext.Core (Lang(ZH))
-import Gargantext.Core.Text.Context
+import Gargantext.Core.Text.Context ( Corpus, TermList, Label, MultiTerm )
 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
 import Gargantext.Core.Types (TermsCount)
 import Gargantext.Core.Utils (groupWithCounts)
@@ -58,7 +61,7 @@ data Pattern = Pattern
  }

 instance Show Pattern where
-  show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
+  show Pattern{..} = "Pattern { _pat_length = " <> Prelude.show _pat_length <> ", _pat_terms = " <> Prelude.show _pat_terms <> "}"

 type Patterns = [Pattern]

@@ -66,25 +69,44 @@ type Patterns = [Pattern]

 data ReplaceTerms = KeepAll | LongestOnly

+-- | Given a 'ReplaceTerms' strategy, patterns and a split text,
+-- return matching terms according to strategy.  This function is
+-- usually applied to words in the whole sentence (i.e. 'terms'
+-- variable contains a tokenized sentence, coming from
+-- 'monoTextsBySentence').
 replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
-replaceTerms rplaceTerms pats terms = go 0
+replaceTerms rTerms pats terms =
+  List.concat (
+    mapMaybe (\(ix, _t) ->
+                case IntMap.lookup ix m of
+                  Nothing -> Nothing
+                  -- lst :: [(Int, [Text])]
+                  -- snd <$> lst :: [[Text]]
+                  Just lst -> Just (snd <$> lst)) $ zip [0..] terms
+    )
+--replaceTerms rTerms pats terms = go 0
  where
-    terms_len = length terms
-
-    go ix | ix >= terms_len = []
-          | otherwise =
-      case IntMap.lookup ix m of
-        Nothing -> go (ix + 1)
-        Just (len, term) ->
-          term : go (ix + len)
-
+    -- termsLen :: Int
+    -- termsLen = length terms
+
+    -- go :: Int -> [[Text]]
+    -- go ix | ix >= termsLen = []
+    --       | otherwise =
+    --   case IntMap.lookup ix m of
+    --     Nothing -> go (ix + 1)
+    --     Just (len, term) ->
+    --       term : go (ix + len)
+
+    m :: IntMap [(Int, [Text])]
    m = toMap
-        [ (ix, (len, term))
-        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
-
-    toMap = case rplaceTerms of
-      KeepAll -> IntMap.fromList
-      LongestOnly -> IntMap.fromListWith merge
+        [ (ix, (_pat_length, _pat_terms))
+        | Pattern { .. } <- pats
+        , ix <- KMP.match _pat_table terms ]
+
+    toMap :: [(IntMap.Key, (Int, [Text]))] -> IntMap [(Int, [Text])]
+    toMap kv = case rTerms of
+      KeepAll -> IntMap.fromListWith (<>) (second (:[]) <$> kv)
+      LongestOnly -> IntMap.map (:[]) $ IntMap.fromListWith merge kv
        where
          merge (len1, lab1) (len2, lab2) =
            if len2 < len1 then (len1, lab1) else (len2, lab2)
@@ -97,7 +119,7 @@ buildPatterns :: TermList -> Patterns
 buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
  where
    buildPattern :: Label -> [MultiTerm] -> [Pattern]
-    buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)
+    buildPattern label alts = mapMaybe (mkPattern label) $ map (filter (/= "")) (label : alts)

    mkPattern :: Label -> [Text] -> Maybe Pattern
    mkPattern label alt
@@ -109,7 +131,9 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
        -- which is exactly what we will be given as part of 'termsInText',
        -- that calls 'monoTextsBySentence'. If we don't lower here it
        -- means we won't be getting matches, whereas in theory we could.
-        Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
+        Pattern { _pat_table  = KMP.build $ map T.toLower alt
+                , _pat_length = length alt
+                , _pat_terms  = map T.toLower label }
                         --(Terms label $ Set.empty) -- TODO check stems


@@ -119,9 +143,7 @@ type MatchedText = Text

 termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
 termsInText lang pats (manipulateText lang -> txt) =
-  groupWithCounts $ List.concat
-                  $ map (map unwords)
-                  $ extractTermsWithList pats txt
+  groupWithCounts $ List.concatMap (map unwords) (extractTermsWithList pats txt)

 -- | Manipulates the input 'Text' before passing it to 'termsInText'.
 -- In particular, if the language is Chinese (ZH), we add spaces.
@@ -135,12 +157,11 @@ extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentenc

 -- | Extract terms
 extractTermsWithList' :: Patterns -> Text -> [Text]
-extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
-                           . monoTextsBySentence
+extractTermsWithList' pats txt = map concat $ List.concat $ extractTermsWithList pats txt

 --------------------------------------------------------------------------
 addSpaces :: Text -> Text
-addSpaces = T.unwords . (T.chunksOf 1)
+addSpaces = T.intersperse ' '


 --------------------------------------------------------------------------

--- a/src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
+++ b/src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
@@ -120,15 +120,31 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
  where
    --------
    -- 2) find the local maxima in the quality distribution
+    -- TODO (seeg, #471) head throws errors when list is too short
+    -- (i.e. List.head . List.tail requires at least 2 elements in the
+    -- list). I propose this implementation, but I'm not sure of the
+    -- length of the list
    maxima :: [Bool]
-    maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
+    maxima
+      | List.null qua' = []
+      | List.length qua' == 1 = [True]
+      | otherwise = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++
+                    findMaxima qua' ++
+                    [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
+    -- maxima = if List.length qua' > 1 then
+    --   [snd (List.head qua') > snd (List.head $ List.tail qua')] ++
+    --   (findMaxima qua') ++
+    --   [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
+    --   else
+    --     [True, True]
+    -- maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
    --------
    -- 1.2)
    qua' :: [(Double,Double)]
    qua' = foldl (\acc (s,q) ->
-                    if length acc == 0
+                    if null acc
                      then [(s,q)]
-                      else if (snd (List.last acc)) == q
+                      else if snd (List.last acc) == q
                        then acc
                      else acc ++ [(s,q)]
      ) [] $ zip (Set.toList similarities) qua
@@ -137,10 +153,10 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
    qua :: [Double]
    qua = parMap rpar (\thr ->
                let edges    = filter (\edge -> snd edge >= thr) graph
-                    nodes    = nubOrd $ concat $ map (\((n,n'),_) -> [n,n']) edges
+                    nodes    = nubOrd $ concatMap (\((n,n'),_) -> [n,n']) edges
                    branches = toRelatedComponents nodes edges
                 in toPhyloQuality nbFdt lambda freq branches
-              ) $ (Set.toList similarities)
+              ) $ Set.toList similarities


 {-
@@ -212,7 +228,7 @@ appendGroups f lvl m phylo =
                                                  -- select the cooc of the periods
                                                  (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId])
                                                  -- select and merge the roots count of the periods
-                                                  (foldl (\acc count -> unionWith (+) acc count) empty 
+                                                  (foldl (\acc count -> unionWith (+) acc count) empty
                                                    $ elems $ restrictKeys (getRootsCountByDate phylo) $ periodsToYears [pId]))
                                              ] ) [] phyloCUnit)
                         else
@@ -408,9 +424,9 @@ groupDocsByPeriod' f pds docs =
  let docs'    = groupBy (\d d' -> f d == f d') $ sortOn f docs
      periods  = parMap rpar (inPeriode f docs') pds
   in tracePhylo ("\n" <> "-- | Group "
-             <> show(length docs)
+             <> show (length docs)
             <> " docs by "
-             <> show(length pds) <> " periods" <> "\n" :: Text)
+             <> show (length pds) <> " periods" <> "\n" :: Text)
    $ fromList $ zip pds periods
  where
    --------------------------------------
@@ -427,8 +443,8 @@ groupDocsByPeriod f pds es =
  let periods  = parMap rpar (inPeriode f es) pds

  in  tracePhylo ("\n" <> "-- | Group "
-             <> show(length es) <> " docs by "
-             <> show(length pds) <> " periods" <> "\n" :: Text)
+             <> show (length es) <> " docs by "
+             <> show (length pds) <> " periods" <> "\n" :: Text)
    $ fromList $ zip pds periods
  where
    --------------------------------------
@@ -457,7 +473,7 @@ docsToTermCount docs roots = fromList


 docsToTimeTermCount :: [Document] -> Vector Ngrams  -> (Map Date (Map Int Double))
-docsToTimeTermCount docs roots = 
+docsToTimeTermCount docs roots =
    let docs' = Map.map (\l -> fromList $ map (\lst -> (head' "docsToTimeTermCount" lst, fromIntegral $ length lst))
                             $ group $ D.sort l)
              $ fromListWith (++)
@@ -484,9 +500,9 @@ docsToTimeScaleNb docs =
    let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
        time  = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
    in  tracePhylo ("\n" <> "-- | Group "
-               <> show(length docs)
+               <> show (length docs)
               <> " docs by "
-               <> show(length time)
+               <> show (length time)
               <> " unit of time" <> "\n" :: Text)
      $ unionWith (+) time docs'

@@ -498,7 +514,7 @@ initPhyloScales lvlMax pId =


 setDefault :: PhyloConfig -> TimeUnit -> Int -> PhyloConfig
-setDefault conf timeScale nbDocs = defaultConfig 
+setDefault conf timeScale nbDocs = defaultConfig
                                          { corpusPath   = (corpusPath conf)
                                          , listPath     = (listPath conf)
                                          , outputPath   = (outputPath conf)
@@ -507,11 +523,11 @@ setDefault conf timeScale nbDocs = defaultConfig
                                          , phyloName    = (phyloName conf)
                                          , defaultMode  = True
                                          , timeUnit     = timeScale
-                                          , clique       = Fis (toSupport nbDocs) 3} 
+                                          , clique       = Fis (toSupport nbDocs) 3}
  where
    --------------------------------------
    toSupport :: Int -> Support
-    toSupport n 
+    toSupport n
      | n < 500  = 1
      | n < 1000 = 2
      | n < 2000 = 3
@@ -540,9 +556,9 @@ initPhylo docs conf =
                 else defaultPhyloParam { _phyloParam_config = conf }
        periods = toPeriods (D.sort $ D.nub $ map date docs) (getTimePeriod timeScale) (getTimeStep timeScale)
    in tracePhylo ("\n" <> "-- | Init a phylo out of "
-              <> show(length docs) <> " docs \n" :: Text)
+              <> show (length docs) <> " docs \n" :: Text)
       $ tracePhylo ("\n" <> "-- | lambda "
-                <> show(_qua_granularity $ phyloQuality $ _phyloParam_config params) :: Text) 
+                <> show (_qua_granularity $ phyloQuality $ _phyloParam_config params) :: Text)
       $ Phylo foundations
               docsSources
               docsCounts

--- a/src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
+++ b/src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
--- a/src/Gargantext/Database/Action/Flow/Utils.hs
+++ b/src/Gargantext/Database/Action/Flow/Utils.hs
@@ -11,6 +11,7 @@ Portability : POSIX

 module Gargantext.Database.Action.Flow.Utils
  ( docNgrams
+  , docNgrams'
  , documentIdWithNgrams
  , mapDocumentIdWithNgrams
  , insertDocNgrams
@@ -68,21 +69,29 @@ insertDocNgrams lId m = do



-- Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
+-- | Given language, ngrams type, a list of terms and a
+-- HyperdataDocument, return ngrams that are in this text, with
+-- counts.  This is a pure function (doesn't use corenlp nor
+-- PostgreSQL FTS).
 docNgrams :: Lang
          -> [NT.NgramsTerm]
          -> ContextOnlyId HyperdataDocument
          -> [(MatchedText, TermsCount)]
 docNgrams lang ts doc =
-  (
-    termsInText lang (buildPatternsWith lang ts)
+  docNgrams' lang ts
      $ T.unlines $ catMaybes
        [ doc ^. context_oid_hyperdata . hd_title
        , doc ^. context_oid_hyperdata . hd_abstract
        ]
-  )
+
+-- | Given language, ngrams type, a list of terms and a text, return
+-- ngrams that are in this text, with counts.
+docNgrams' :: Lang
+           -> [NT.NgramsTerm]
+           -> Text
+           -> [(MatchedText, TermsCount)]
+docNgrams' lang ts txt =
+  termsInText lang (buildPatternsWith lang ts) txt


 documentIdWithNgrams :: Monad m
@@ -93,7 +102,8 @@ documentIdWithNgrams f = toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex         d
-      pure $ DocumentIdWithNgrams d e
+      pure $ DocumentIdWithNgrams { documentWithId = d
+                                  , documentNgrams = e }

 mapDocumentIdWithNgrams :: Monad m
                        => ( a -> m (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)) )

--- a/src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
+++ b/src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
@@ -132,6 +132,7 @@ getOccByNgramsOnlyFast_withSample cId int nt ngs =
  HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs


+-- Returns occurrences of ngrams in given corpus/list (for each ngram, a list of contexts is returned)
 getOccByNgramsOnlyFast :: CorpusId
                       -> ListId
                       -> NgramsType

--- a/test-data/phylo/187481.json
+++ b/test-data/phylo/187481.json
--- a/test-data/phylo/nadal.golden.json
+++ b/test-data/phylo/nadal.golden.json
--- a/test-data/phylo/phylo2dot2json.golden.json
+++ b/test-data/phylo/phylo2dot2json.golden.json
--- a/test-data/phylo/small-phylo.golden.json
+++ b/test-data/phylo/small-phylo.golden.json
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -2,14 +2,14 @@

 module Main where

-import Control.Monad
+import Control.Monad ( MonadFail(fail) )
 import Data.Text (isInfixOf)
 import Data.Text qualified as T
 import Gargantext.Prelude hiding (isInfixOf)
-import System.IO
-import System.Posix.Process
-import System.Posix.Signals
+import System.IO ( BufferMode(NoBuffering), hSetBuffering )
 import System.Process
+import System.Posix.Process ( getProcessGroupIDOf )
+import System.Posix.Signals ( keyboardSignal, signalProcessGroup )
 import Test.API qualified as API
 import Test.Database.Operations qualified as DB
 import Test.Database.Transactions qualified as DBT
@@ -39,30 +39,41 @@ import Test.Utils.Crypto               qualified as Crypto
 import Test.Utils.Jobs                 qualified as Jobs


-startCoreNLPServer :: IO ProcessHandle
+startCoreNLPServer :: IO (Maybe ProcessHandle)
 startCoreNLPServer = do
  putText "calling start core nlp"
  devNull <- openFile "/dev/null" WriteMode
  let p = proc "startCoreNLPServer.sh" []
-  (_, _, _, hdl) <- (createProcess $ p { cwd = Nothing
-                    -- NOTE(adn) Issue #451, this one has to stay disabled, because if we
-                    -- turn it on, despite the confusing documentation on the `process` library
-                    -- it will cause the Haskell RTS to completely ignore the Ctrl^c and instead
-                    -- delegate it exclusively to the process here, which means that our CoreNLP
-                    -- server will shut down correctly, but the test running will stop responding
-                    -- to Ctrl^C requests.
-                    , delegate_ctlc = False
-                    , create_group = True
-                    , std_out = UseHandle devNull
-                    , std_err = UseHandle devNull
-                    }) `catch` \e -> case e of
-                                           _ | True <- "does not exist" `isInfixOf` (T.pack . show @SomeException $ e)
-                                             -> fail $ "Cannot execute the 'startCoreNLPServer.sh' script. Make sure you are in a nix environment."
-                                             | otherwise -> throwIO e
-  pure hdl
+  let connect = do
+        (_, _, _, hdl) <- createProcess p { cwd = Nothing
+                                    -- NOTE(adn) Issue #451, this one has to stay disabled, because if we
+                                    -- turn it on, despite the confusing documentation on the `process` library
+                                    -- it will cause the Haskell RTS to completely ignore the Ctrl^c and instead
+                                    -- delegate it exclusively to the process here, which means that our CoreNLP
+                                    -- server will shut down correctly, but the test running will stop responding
+                                    -- to Ctrl^C requests.
+                                          , delegate_ctlc = False
+                                          , create_group = True
+                                          , std_out = UseHandle devNull
+                                          , std_err = UseHandle devNull
+                                          }
+        pure $ Just hdl

-killProcessTree :: ProcessHandle -> IO ()
-killProcessTree ph = do
+  connect `catch` \e -> do
+    putText $ T.pack $ show @SomeException e
+    case e of
+      _ | True <- "does not exist" `isInfixOf` (T.pack . show @SomeException $ e)
+          -> fail $ "Cannot execute the 'startCoreNLPServer.sh' script. Make sure you are in a nix environment."
+        | True <- "Address already in use" `isInfixOf` (T.pack . show @SomeException $ e)
+          -> do
+              putText "Address already in use, but we hope for the best!"
+              pure Nothing
+      
+        | otherwise -> throwIO e
+
+killProcessTree :: Maybe ProcessHandle -> IO ()
+killProcessTree Nothing = pure ()
+killProcessTree (Just ph) = do
  pid <- getPid ph
  case pid of
    Nothing -> putText "Process already terminated"

--- a/test/Test/Ngrams/Terms.hs
+++ b/test/Test/Ngrams/Terms.hs
@@ -4,7 +4,7 @@ module Test.Ngrams.Terms (tests) where

 import Data.HashMap.Strict qualified as HashMap
 import Data.Map.Strict qualified as Map
-import Gargantext.API.Ngrams
+import Gargantext.API.Ngrams.Types ( NgramsTerm )
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Text.Context (TermList)
 import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(..))
@@ -32,6 +32,7 @@ unitTests = describe "Terms tests" $ do
  it "termsInText works 03" testTermsInText03
  it "termsInText works 04 (related to issue #221)" testTermsInText04
  it "extractTermsWithList' works 01" testExtractTermsWithList'01
+  it "extractTermsWithList' works 02 (#471)" testExtractTermsWithList'02
  it "docNgrams works 01" testDocNgrams01
  it "docNgrams works 02" testDocNgrams02
  it "ngramsByDoc works 01" testNgramsByDoc01
@@ -107,12 +108,18 @@ testExtractTermsWithList'01 = do
  let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
  extractTermsWithList' (buildPatterns termList) "Le chat blanc" @?= ["chat blanc"]

+-- #471
+testExtractTermsWithList'02 :: Assertion
+testExtractTermsWithList'02 = do
+  let termList = [(["patients"], []), (["patients", "with"], [])] :: TermList
+  extractTermsWithList' (buildPatterns termList) "patients with problems" @?= ["patients", "patientswith"]
+
 testDocNgrams01 :: Assertion
 testDocNgrams01 = do
  let terms = ["hello", "world"] :: [NgramsTerm]
  let hd = emptyHyperdataDocument { _hd_title = Just "hello world"
                                  , _hd_abstract = Nothing }
-  let ctx = ContextOnlyId 1 hd
+  let ctx = ContextOnlyId { _context_oid_id = 1, _context_oid_hyperdata = hd }
  let dNgrams = docNgrams EN terms ctx
  length dNgrams @?= 2

@@ -121,7 +128,7 @@ testDocNgrams02 = do
  let terms = ["hello", "world"] :: [NgramsTerm]
  let hd = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
                                  , _hd_abstract = Nothing }
-  let ctx = ContextOnlyId 1 hd
+  let ctx = ContextOnlyId { _context_oid_id = 1, _context_oid_hyperdata = hd }
  let dNgrams = docNgrams EN terms ctx
  length dNgrams @?= 2

@@ -130,10 +137,10 @@ testNgramsByDoc01 = do
  let terms = ["hello", "world"] :: [NgramsTerm]
  let hd1 = emptyHyperdataDocument { _hd_title = Just "hello world, kaboom"
                                   , _hd_abstract = Nothing }
-  let ctx1 = ContextOnlyId 1 hd1
+  let ctx1 = ContextOnlyId { _context_oid_id = 1, _context_oid_hyperdata = hd1 }
  let hd2 = emptyHyperdataDocument { _hd_title = Just "world, boom world"
                                   , _hd_abstract = Nothing }
-  let ctx2 = ContextOnlyId 2 hd2
+  let ctx2 = ContextOnlyId { _context_oid_id = 2, _context_oid_hyperdata = hd2 }

  ngramsByDoc EN NgramsTerms terms ctx1 @?=
    HashMap.fromList

--- a/test/Test/Offline/Ngrams.hs
+++ b/test/Test/Offline/Ngrams.hs
 {-# LANGUAGE QuasiQuotes      #-}
 {-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ViewPatterns     #-}
+
 module Test.Offline.Ngrams (tests) where

 import Prelude

 import Control.Lens
+import Data.Char (isSpace)
+import Data.Char qualified as Char
+import Data.Map.Strict (Map)
 import Data.Map.Strict qualified as Map
 import Data.Text qualified as T
+import Data.Tree
 import Gargantext.API.Ngrams (filterNgramsNodes, buildForest, destroyForest, pruneForest)
 import Gargantext.API.Ngrams.Types
 import Gargantext.API.Ngrams.Types qualified as NT
@@ -20,15 +25,12 @@ import Gargantext.Database.Admin.Types.Hyperdata
 import Gargantext.Database.Schema.Context
 import Test.HUnit
 import Test.Hspec
+import Test.Hspec.QuickCheck (prop)
 import Test.Instances ()
 import Test.Ngrams.Query (mkMapTerm)
 import Test.QuickCheck
 import Test.QuickCheck qualified as QC
-import Data.Tree
 import Text.RawString.QQ (r)
-import Data.Char (isSpace)
-import Data.Map.Strict (Map)
-import Test.Hspec.QuickCheck (prop)


 genScientificText :: Gen T.Text
@@ -54,12 +56,12 @@ punctuation = ",.();:-"

 genNgramsTermNonEmpty :: Gen NgramsTermNonEmpty
 genNgramsTermNonEmpty = do
-    singleChar <- arbitrary `suchThat` (\x -> x /= ' ' && isAllowed x)
+    singleChar <- arbitrary `suchThat` isAllowed
    txt        <- filter isAllowed <$> listOf1 genScientificChar
    pure $ NgramsTermNonEmpty $ (T.pack $ singleChar : txt)
  where
    isAllowed :: Char -> Bool
-    isAllowed s = not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s)
+    isAllowed s = not (Char.isSpace s) && not (s `elem` punctuation) && not (s `elem` ws) && not (isSep s)

 -- In order to test the behaviour of 'docNgrams' we create wrappers around 'NgramsTerm' to have two
 -- different 'Arbitrary' flavours, one that always produces non-empty 'Text' fragments, and one that

--- a/test/Test/Offline/Phylo.hs
+++ b/test/Test/Offline/Phylo.hs
-{-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE QuasiQuotes #-}
 {-# LANGUAGE TupleSections #-}
+{-# LANGUAGE TypeApplications #-}

 module Test.Offline.Phylo (tests) where

-import CLI.Phylo.Common
+import CLI.Phylo.Common ( fileToList, fileToDocsDefault )
+import Control.Monad (when)
 import Data.Aeson as JSON
 import Data.Aeson.Encode.Pretty qualified as JSON
 import Data.Aeson.Types qualified as JSON
@@ -14,22 +14,28 @@ import Data.ByteString.Lazy qualified as BIO
 import Data.ByteString.Lazy qualified as BL
 import Data.GraphViz.Attributes.Complete qualified as Graphviz
 import Data.Text.Lazy as TL
-import Data.TreeDiff
+import Data.TreeDiff ( ansiWlEditExprCompact, ediff )
 import Data.Vector qualified as V
-import Gargantext.Core.Text.List.Formats.TSV
+import Gargantext.Core.Text.List.Formats.TSV ( tsvMapTermList )
 import Gargantext.Core.Types.Phylo hiding (Phylo(..))
-import Gargantext.Core.Viz.Phylo hiding (EdgeType(..))
 import Gargantext.Core.Viz.Phylo.API.Tools (readPhylo, phylo2dot2json)
 import Gargantext.Core.Viz.Phylo.Example qualified as Cleopatre
-import Gargantext.Core.Viz.Phylo.PhyloExport
+import Gargantext.Core.Viz.Phylo hiding (EdgeType(..))
+import Gargantext.Core.Viz.Phylo.PhyloExport ( toAttr )
 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
-import Gargantext.Core.Viz.Phylo.PhyloTools
-import Paths_gargantext
+import Gargantext.Core.Viz.Phylo.PhyloTools ( setConfig, ngramsToLabel, relatedComponents )
+import Paths_gargantext ( getDataFileName )
 import Prelude
 import Test.HUnit
 import Test.Hspec
 import Test.Hspec.Golden

+
+-- | Switch to 'True' if you want to update golden tests
+doUpdateGolden :: Bool
+doUpdateGolden = False
+
+
 phyloTestConfig :: PhyloConfig
 phyloTestConfig = PhyloConfig {
    corpusPath = "corpus.csv"
@@ -59,7 +65,7 @@ phyloGolden (fp, actualOutput) =
  Golden {
    output = actualOutput
  , encodePretty = C8.unpack . BIO.toStrict
-  , writeToFile = \_ _ -> pure ()
+  , writeToFile = if doUpdateGolden then \_ new' -> BL.writeFile fp new' else (\_ _ -> pure ())
  , readFromFile = BIO.readFile
  , goldenFile = fp
  , actualFile = Nothing
@@ -149,7 +155,7 @@ testSmallPhyloWithoutLinkExpectedOutput = do
  listPath'      <- getDataFileName "test-data/phylo/small_phylo_ngramslist.tsv"
  (Right config) <- fmap (\pcfg -> pcfg { corpusPath = corpusPath'
                                        , listPath   = listPath'
-                                        }) <$> (JSON.eitherDecodeFileStrict' bpaConfig)
+                                        }) <$> JSON.eitherDecodeFileStrict' bpaConfig
  mapList <- fileToList (listParser config) (listPath config)
  corpus <- fileToDocsDefault (corpusParser config)
                        (corpusPath config)
@@ -157,6 +163,7 @@ testSmallPhyloWithoutLinkExpectedOutput = do
                        mapList
  let actual = setConfig phyloTestConfig $ toPhyloWithoutLink corpus config
  expected   <- setConfig phyloTestConfig <$> (readPhylo =<< getDataFileName "test-data/phylo/small-phylo.golden.json")
+  when doUpdateGolden $ BL.writeFile "test-data/phylo/small-phylo.golden.json" (JSON.encodePretty actual)
  assertBool (show $ ansiWlEditExprCompact $ ediff expected actual) (expected == actual)

 testPhylo2dot2json :: IO (FilePath, GraphDataFuzzy)
@@ -170,45 +177,45 @@ compareGraphDataFuzzy :: GraphData -> GraphData -> Bool
 compareGraphDataFuzzy gd1 gd2 =
  and [ _gd__subgraph_cnt gd1 == _gd__subgraph_cnt gd2
      , _gd_directed gd1      == _gd_directed gd2
-      , and $ Prelude.map (uncurry compareEdgeDataFuzzy) $ Prelude.zip (_gd_edges gd1) (_gd_edges gd2)
-      , and $ Prelude.map (uncurry compareObjectDataFuzzy) $ Prelude.zip (_gd_objects gd1) (_gd_objects gd2)
+      , Prelude.all (uncurry compareEdgeDataFuzzy) (Prelude.zip (_gd_edges gd1) (_gd_edges gd2))
+      , Prelude.all (uncurry compareObjectDataFuzzy) $ Prelude.zip (_gd_objects gd1) (_gd_objects gd2)
      , _gd_strict gd1        == _gd_strict gd2
      , _gd_data gd1 `compareGraphDataDataFuzzy` _gd_data gd2
      ]
  where
    gdd1 `compareEdgeDataFuzzy` ggd2 = case (gdd1, ggd2) of
      (GroupToAncestor gvId1 ecd1 gad1, GroupToAncestor gvId2 ecd2 gad2)
-        -> and [ gvId1 == gvId2, ecd1 `compareEdgeCommonDataFuzzy` ecd2, gad1 == gad2 ]
+        -> (gvId1 == gvId2) && (ecd1 `compareEdgeCommonDataFuzzy` ecd2) && (gad1 == gad2)
      (GroupToAncestor{}, _)
        -> False
      (GroupToGroup gvId1 ecd1 gd1', GroupToGroup gvId2 ecd2 gd2')
-        -> and [ gvId1 == gvId2, ecd1 `compareEdgeCommonDataFuzzy` ecd2, gd1' == gd2' ]
+        -> (gvId1 == gvId2) && (ecd1 `compareEdgeCommonDataFuzzy` ecd2) && (gd1' == gd2')
      (GroupToGroup{}, _)
        -> False
      (BranchToGroup gvId1 ecd1 bgd1, BranchToGroup gvId2 ecd2 bgd2)
-        -> and [ gvId1 == gvId2, ecd1 `compareEdgeCommonDataFuzzy` ecd2, bgd1 == bgd2 ]
+        -> (gvId1 == gvId2) && (ecd1 `compareEdgeCommonDataFuzzy` ecd2) && (bgd1 == bgd2)
      (BranchToGroup{}, _)
        -> False
      (PeriodToPeriod gvId1 ecd1, PeriodToPeriod gvId2 ecd2)
-        -> and [ gvId1 == gvId2, ecd1 `compareEdgeCommonDataFuzzy` ecd2 ]
+        -> (gvId1 == gvId2) && (ecd1 `compareEdgeCommonDataFuzzy` ecd2)
      (PeriodToPeriod{}, _)
        -> False

    gdd1 `compareObjectDataFuzzy` ggd2 = case (gdd1, ggd2) of
      (GroupToNode  gvId1 ncd1 gnd1, GroupToNode  gvId2 ncd2 gnd2)
-        -> and [ gvId1 == gvId2, ncd1 `compareNodeCommonDataFuzzy` ncd2, gnd1 == gnd2 ]
+        -> (gvId1 == gvId2) && (ncd1 `compareNodeCommonDataFuzzy` ncd2) && (gnd1 == gnd2)
      (GroupToNode{}, _)
        -> False
      (BranchToNode gvId1 ncd1 bnd1, BranchToNode gvId2 ncd2 bnd2)
-        -> and [ gvId1 == gvId2, ncd1 `compareNodeCommonDataFuzzy` ncd2, bnd1 == bnd2 ]
+        -> (gvId1 == gvId2) && (ncd1 `compareNodeCommonDataFuzzy` ncd2) && (bnd1 == bnd2)
      (BranchToNode{}, _)
        -> False
      (PeriodToNode gvId1 ncd1 pnd1, PeriodToNode gvId2 ncd2 pnd2)
-        -> and [ gvId1 == gvId2, ncd1 `compareNodeCommonDataFuzzy` ncd2, pnd1 == pnd2 ]
+        -> (gvId1 == gvId2) && (ncd1 `compareNodeCommonDataFuzzy` ncd2) && (pnd1 == pnd2)
      (PeriodToNode{}, _)
        -> False
      (Layer gvId1 gdd1' ld1, Layer gvId2 gdd2 ld2)
-        -> and [ gvId1 == gvId2, gdd1' `compareGraphDataDataFuzzy` gdd2, ld1 == ld2 ]
+        -> (gvId1 == gvId2) && (gdd1' `compareGraphDataDataFuzzy` gdd2) && (ld1 == ld2)
      (Layer{}, _)
        -> False

@@ -222,10 +229,9 @@ compareGraphDataFuzzy gd1 gd2 =

    gdd1 `compareEdgeCommonDataFuzzy` gdd2 =
      -- Excluded fields: pos, width.
-      and [ _ed_color gdd1 == _ed_color gdd2
-          , _ed_head  gdd1 == _ed_head gdd2
-          , _ed_tail  gdd1 == _ed_tail gdd2
-          ]
+      (_ed_color gdd1 == _ed_color gdd2) &&
+      (_ed_head  gdd1 == _ed_head gdd2) &&
+      (_ed_tail  gdd1 == _ed_tail gdd2)

    gdd1 `compareGraphDataDataFuzzy` ggd2 =
      -- Excluded fields: bb, lp, lheight, lwidth.