[ngrams] declarative ngrams demo

cf12ea50 · Przemyslaw Kaminski · b3910bb4 · cf12ea50 · cf12ea50 · cf12ea50
Verified Commit cf12ea50 authored Jan 15, 2025 by Przemyslaw Kaminski
4 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -213,6 +213,7 @@ library
      Gargantext.Core.Text.Corpus.Parsers.Date
      Gargantext.Core.Text.Corpus.Parsers.TSV
      Gargantext.Core.Text.Corpus.Query
+      Gargantext.Core.Text.DeclarativeNgrams
      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Formats.TSV
      Gargantext.Core.Text.List.Group.WithStem
@@ -793,6 +794,7 @@ test-suite garg-test-tasty
      Test.Core.Text
      Test.Core.Text.Corpus.Query
      Test.Core.Text.Corpus.TSV
+      Test.Core.Text.DeclarativeNgrams
      Test.Core.Text.Examples
      Test.Core.Text.Flow
      Test.Core.Utils

--- a/src/Gargantext/Core/Text/DeclarativeNgrams.hs
+++ b/src/Gargantext/Core/Text/DeclarativeNgrams.hs
+{-|
+Module      : Gargantext.Core.Text.DeclarativeNgrams
+Description : "Declarative" ngrams, i.e. plain text with optional modifiers
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
+-}
+
+{-# OPTIONS_GHC -Wno-deprecations #-}  -- warning about 'undefined' usage is thrown
+
+module Gargantext.Core.Text.DeclarativeNgrams where
+
+
+import Data.Text qualified as T
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude
+
+
+
+data Ngrams =
+  Ngrams { terms :: Text
+         , declaration :: Declaration }
+  deriving (Eq, Show)
+
+data Declaration =
+    Exact
+  | CaseInsensitive
+  | SingularPlural Lang
+  -- TODO: SingularPlural Lang
+  -- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
+  -- TODO Regex
+  -- TODO Combine CaseInsensitive and SingluarPlural
+  -- TODO Combine CaseInsensitive and FrenchDeterminers
+  deriving (Eq, Show)
+
+
+data NgramsWithPos =
+  NgramsWithPos { ngrams :: Ngrams
+                , match  :: Text
+                , pos    :: Int }
+  deriving (Eq, Show)
+
+
+
+
+-- | TODO Use parsec for this?
+-- e.g. matching case-insensitive:
+-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
+findNgrams :: [Ngrams] -> Text -> [NgramsWithPos]
+findNgrams [] _txt = []
+findNgrams (ng:ngs) txt = findNgrams' ng txt 0 <> (findNgrams ngs txt)
+
+findNgrams' :: Ngrams -> Text -> Int -> [NgramsWithPos]
+findNgrams' _ "" _ = []
+findNgrams' ngrams@(Ngrams { terms, declaration }) txt initialPos =
+  foldl (<>) [] $ map handleBreak (breakOnDeclaration declaration terms txt)
+  where
+    handleBreak :: (Text, Text, Text) -> [NgramsWithPos]
+    handleBreak (s, matching, _e) =
+      let pos = initialPos + T.length s
+          txt' = T.drop (T.length s) txt
+          match = T.take (T.length matching) txt'
+          ngramsWithPos = NgramsWithPos { ngrams, pos, match } in
+      (ngramsWithPos : (findNgrams' ngrams (T.drop (T.length matching) txt') (pos + T.length matching)))
+
+-- | Split declaration function in spirit of `T.breakOn` + Declaration type
+breakOnDeclaration :: Declaration -> Text -> Text -> [(Text, Text, Text)]
+breakOnDeclaration Exact terms txt =
+  case T.breakOn terms txt of
+    (_, "") -> []
+    (s, e) -> [(s, terms, T.drop (T.length terms) e)]
+breakOnDeclaration CaseInsensitive terms txt =
+  case T.breakOn (T.toLower terms) (T.toLower txt) of
+    (_, "") -> []
+    (s, e) -> [(s, T.take (T.length terms) e, T.drop (T.length terms) e)]
+breakOnDeclaration (SingularPlural lang) terms txt = singularOrPlural
+  where
+    pf = pluralForm lang terms
+    -- | Singular is a bit tricky, because it's a "subset" of plural,
+    -- without the "s", so we filter things out.
+    -- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
+    singular = breakOnDeclaration Exact terms txt
+    plural = breakOnDeclaration Exact pf txt
+
+    singularOrPlural = case (singular, plural) of
+      ([s], [p]) ->
+        let (ss, _, _) = s
+            (sp, _, _) = p in
+        if T.length ss < T.length sp then [s] else [p]
+      ([s], _) -> [s]
+      (_, p) -> p
+
+pluralForm :: Lang -> Text -> Text
+pluralForm EN "door" = "door"
+pluralForm EN "sheep" = "sheep"
+pluralForm EN term = term <> "s"
+pluralForm _ _ = undefined
--- a/test/Test/Core/Text/DeclarativeNgrams.hs
+++ b/test/Test/Core/Text/DeclarativeNgrams.hs
+{-|
+Module      : Test.Core.Text.DecarativeNgrams
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+-}
+
+module Test.Core.Text.DeclarativeNgrams where
+
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Text.DeclarativeNgrams
+import Gargantext.Prelude
+import Test.Hspec
+
+-- | Core.Text.DeclarativeNgrams tests
+test :: Spec
+test = do
+  describe "test text matching" $ do
+    it "exact match works" $ do
+      let ngrams = Ngrams { terms = "hello", declaration = Exact }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
+      findNgrams [ngrams] "hello world, Hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" } ]
+      findNgrams [ngrams] "hello world, hello, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 20, match = "hello" } ]
+    it "case insensitive works" $ do
+      let ngrams = Ngrams { terms = "Hello", declaration = CaseInsensitive }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
+      findNgrams [ngrams] "hello world, Hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "Hello" } ]
+      findNgrams [ngrams] "hello world, Hello, hEllo" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "Hello" }
+        , NgramsWithPos { ngrams, pos = 20, match = "hEllo" } ]
+    it "singular/plural works" $ do
+      let ngrams = Ngrams { terms = "world", declaration = SingularPlural EN }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
+      findNgrams [ngrams] "hello worlds, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "worlds" } ]
+      findNgrams [ngrams] "hello world, Hello worlds" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" }
+        , NgramsWithPos { ngrams, pos = 19, match = "worlds" } ]
+      findNgrams [ngrams] "hello world, Hello, World" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
+      findNgrams [ngrams] "hello worlds, Hello, world" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "worlds" }
+        , NgramsWithPos { ngrams, pos = 21, match = "world" }]
+         
--- a/test/drivers/tasty/Main.hs
+++ b/test/drivers/tasty/Main.hs
@@ -17,6 +17,7 @@ import qualified Test.Core.Orchestrator          as Orchestrator
 import qualified Test.Core.Similarity             as Similarity
 import qualified Test.Core.Text.Corpus.Query      as CorpusQuery
 import qualified Test.Core.Text.Corpus.TSV        as TSVParser
+import qualified Test.Core.Text.DeclarativeNgrams as DNgrams
 import qualified Test.Core.Utils                  as Utils
 import qualified Test.Core.Worker                 as Worker
 import qualified Test.Graph.Clustering            as Graph
@@ -55,6 +56,7 @@ main = do
  similaritySpec        <- testSpec "Similarity" Similarity.test
  asyncUpdatesSpec      <- testSpec "Notifications" Notifications.test
  occurrencesSpec       <- testSpec "Occurrences" Occurrences.test
+  declarativeNgramsSpec <- testSpec "Declarative Ngrams" DNgrams.test

  protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
    [ utilSpec
@@ -78,4 +80,5 @@ main = do
    , Notifications.qcTests
    , Orchestrator.qcTests
    , NgramsTerms.tests
+    , declarativeNgramsSpec
    ]