[BREANKING ngrams] this should fix #471 multi-ngrams terms indexing

However, tests don't pass and I'm not sure if other functionality doesn't break.

[BREANKING ngrams] this should fix #471 multi-ngrams terms indexing
However, tests don't pass and I'm not sure if other functionality doesn't break.
b7d03cf6 · Przemyslaw Kaminski · bd92fece · b7d03cf6 · b7d03cf6 · b7d03cf6
Verified Commit b7d03cf6 authored Jun 05, 2025 by Przemyslaw Kaminski
4 changed files
--- a/src/Gargantext/Core/Text/Terms/Mono.hs
+++ b/src/Gargantext/Core/Text/Terms/Mono.hs
@@ -18,9 +18,9 @@ module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentenc
 import Data.List qualified as L
 import Data.Set qualified as S
 import Data.Text qualified as T
-import Gargantext.Core
+import Gargantext.Core ( Lang )
 import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
-import Gargantext.Core.Types
+import Gargantext.Core.Types ( TermsWithCount, Terms(..) )
 import Gargantext.Prelude hiding (words)
 import Prelude (String)
 --import Data.Char (isAlphaNum, isSpace)
@@ -43,7 +43,8 @@ monoTexts = L.concat . monoTextsBySentence

 -- | TODO use text2term only
 monoText2term :: Lang -> Text -> Terms
-monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)
+monoText2term lang txt = Terms { _terms_label = [txt]
+                               , _terms_stem = S.singleton $ stem lang PorterAlgorithm txt }

 monoTextsBySentence :: Text -> [[Text]]
 monoTextsBySentence = map T.words

--- a/src/Gargantext/Core/Text/Terms/WithList.hs
+++ b/src/Gargantext/Core/Text/Terms/WithList.hs
@@ -11,7 +11,6 @@ Here is a longer description of this module, containing some
 commentary with @some markup@.

 -}
-{-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE ViewPatterns      #-}

 module Gargantext.Core.Text.Terms.WithList (
@@ -26,6 +25,10 @@ module Gargantext.Core.Text.Terms.WithList (

  -- * Properties
  , prop_patterns_internal_consistency
+
+  -- * For debugging
+  , ReplaceTerms(..)
+  , replaceTerms
  ) where

 import Prelude (show)
@@ -39,7 +42,7 @@ import Data.Text qualified as T
 import GHC.Exts (sortWith)
 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
 import Gargantext.Core (Lang(ZH))
-import Gargantext.Core.Text.Context
+import Gargantext.Core.Text.Context ( Corpus, TermList, Label, MultiTerm )
 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
 import Gargantext.Core.Types (TermsCount)
 import Gargantext.Core.Utils (groupWithCounts)
@@ -58,7 +61,7 @@ data Pattern = Pattern
  }

 instance Show Pattern where
-  show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
+  show Pattern{..} = "Pattern { _pat_length = " <> Prelude.show _pat_length <> ", _pat_terms = " <> Prelude.show _pat_terms <> "}"

 type Patterns = [Pattern]

@@ -66,25 +69,44 @@ type Patterns = [Pattern]

 data ReplaceTerms = KeepAll | LongestOnly

+-- | Given a 'ReplaceTerms' strategy, patterns and a split text,
+-- return matching terms according to strategy.  This function is
+-- usually applied to words in the whole sentence (i.e. 'terms'
+-- variable contains a tokenized sentence, coming from
+-- 'monoTextsBySentence').
 replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
-replaceTerms rplaceTerms pats terms = go 0
+replaceTerms rTerms pats terms =
+  List.concat (
+    mapMaybe (\(ix, _t) ->
+                case IntMap.lookup ix m of
+                  Nothing -> Nothing
+                  -- lst :: [(Int, [Text])]
+                  -- snd <$> lst :: [[Text]]
+                  Just lst -> Just (snd <$> lst)) $ zip [0..] terms
+    )
+--replaceTerms rTerms pats terms = go 0
  where
-    terms_len = length terms
-
-    go ix | ix >= terms_len = []
-          | otherwise =
-      case IntMap.lookup ix m of
-        Nothing -> go (ix + 1)
-        Just (len, term) ->
-          term : go (ix + len)
-
+    -- termsLen :: Int
+    -- termsLen = length terms
+
+    -- go :: Int -> [[Text]]
+    -- go ix | ix >= termsLen = []
+    --       | otherwise =
+    --   case IntMap.lookup ix m of
+    --     Nothing -> go (ix + 1)
+    --     Just (len, term) ->
+    --       term : go (ix + len)
+
+    m :: IntMap [(Int, [Text])]
    m = toMap
-        [ (ix, (len, term))
-        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
-
-    toMap = case rplaceTerms of
-      KeepAll -> IntMap.fromList
-      LongestOnly -> IntMap.fromListWith merge
+        [ (ix, (_pat_length, _pat_terms))
+        | Pattern { .. } <- pats
+        , ix <- KMP.match _pat_table terms ]
+
+    toMap :: [(IntMap.Key, (Int, [Text]))] -> IntMap [(Int, [Text])]
+    toMap kv = case rTerms of
+      KeepAll -> IntMap.fromListWith (<>) (second (:[]) <$> kv)
+      LongestOnly -> IntMap.map (:[]) $ IntMap.fromListWith merge kv
        where
          merge (len1, lab1) (len2, lab2) =
            if len2 < len1 then (len1, lab1) else (len2, lab2)
@@ -97,7 +119,7 @@ buildPatterns :: TermList -> Patterns
 buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
  where
    buildPattern :: Label -> [MultiTerm] -> [Pattern]
-    buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)
+    buildPattern label alts = mapMaybe (mkPattern label) $ map (filter (/= "")) (label : alts)

    mkPattern :: Label -> [Text] -> Maybe Pattern
    mkPattern label alt
@@ -109,7 +131,9 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
        -- which is exactly what we will be given as part of 'termsInText',
        -- that calls 'monoTextsBySentence'. If we don't lower here it
        -- means we won't be getting matches, whereas in theory we could.
-        Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
+        Pattern { _pat_table  = KMP.build $ map T.toLower alt
+                , _pat_length = length alt
+                , _pat_terms  = map T.toLower label }
                         --(Terms label $ Set.empty) -- TODO check stems


@@ -119,9 +143,7 @@ type MatchedText = Text

 termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
 termsInText lang pats (manipulateText lang -> txt) =
-  groupWithCounts $ List.concat
-                  $ map (map unwords)
-                  $ extractTermsWithList pats txt
+  groupWithCounts $ List.concatMap (map unwords) (extractTermsWithList pats txt)

 -- | Manipulates the input 'Text' before passing it to 'termsInText'.
 -- In particular, if the language is Chinese (ZH), we add spaces.
@@ -135,12 +157,11 @@ extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentenc

 -- | Extract terms
 extractTermsWithList' :: Patterns -> Text -> [Text]
-extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
-                           . monoTextsBySentence
+extractTermsWithList' pats txt = map concat $ List.concat $ extractTermsWithList pats txt

 --------------------------------------------------------------------------
 addSpaces :: Text -> Text
-addSpaces = T.unwords . (T.chunksOf 1)
+addSpaces = T.intersperse ' '


 --------------------------------------------------------------------------

--- a/src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
+++ b/src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
@@ -120,6 +120,14 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
  where
    --------
    -- 2) find the local maxima in the quality distribution
+    -- TODO (seeg, #471) head throws errors when list is too short.
+    -- I propose this implementation, but I'm not sure of the length of the list
+    -- maxima = if List.length qua' > 1 then
+    --   [snd (List.head qua') > snd (List.head $ List.tail qua')] ++
+    --   (findMaxima qua') ++
+    --   [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
+    --   else
+    --     [True, True]
    maxima :: [Bool]
    maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
    --------

--- a/src/Gargantext/Database/Action/Flow/Utils.hs
+++ b/src/Gargantext/Database/Action/Flow/Utils.hs
@@ -11,6 +11,7 @@ Portability : POSIX

 module Gargantext.Database.Action.Flow.Utils
  ( docNgrams
+  , docNgrams'
  , documentIdWithNgrams
  , insertDocNgrams
  , insertDocs
@@ -68,21 +69,29 @@ insertDocNgrams lId m = do



-- Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
+-- | Given language, ngrams type, a list of terms and a
+-- HyperdataDocument, return ngrams that are in this text, with
+-- counts.  This is a pure function (doesn't use corenlp nor
+-- PostgreSQL FTS).
 docNgrams :: Lang
          -> [NT.NgramsTerm]
          -> ContextOnlyId HyperdataDocument
          -> [(MatchedText, TermsCount)]
 docNgrams lang ts doc =
-  (
-    termsInText lang (buildPatternsWith lang ts)
+  docNgrams' lang ts
      $ T.unlines $ catMaybes
        [ doc ^. context_oid_hyperdata . hd_title
        , doc ^. context_oid_hyperdata . hd_abstract
        ]
-  )
+
+-- | Given language, ngrams type, a list of terms and a text, return
+-- ngrams that are in this text, with counts.
+docNgrams' :: Lang
+           -> [NT.NgramsTerm]
+           -> Text
+           -> [(MatchedText, TermsCount)]
+docNgrams' lang ts txt =
+  termsInText lang (buildPatternsWith lang ts) txt


 documentIdWithNgrams :: HasNodeError err
@@ -94,7 +103,8 @@ documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex         d
-      pure $ DocumentIdWithNgrams d e
+      pure $ DocumentIdWithNgrams { documentWithId = d
+                                  , documentNgrams = e }


 -- | TODO check optimization