[ngrams] declarative ngrams demo

cf12ea50 · Przemyslaw Kaminski · b3910bb4 · cf12ea50 · cf12ea50 · cf12ea50
Verified Commit cf12ea50 authored Jan 15, 2025 by Przemyslaw Kaminski
4 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -213,6 +213,7 @@ library
      Gargantext.Core.Text.Corpus.Parsers.Date
      Gargantext.Core.Text.Corpus.Parsers.TSV
      Gargantext.Core.Text.Corpus.Query
+      Gargantext.Core.Text.DeclarativeNgrams
      Gargantext.Core.Text.List
      Gargantext.Core.Text.List.Formats.TSV
      Gargantext.Core.Text.List.Group.WithStem
@@ -793,6 +794,7 @@ test-suite garg-test-tasty
      Test.Core.Text
      Test.Core.Text.Corpus.Query
      Test.Core.Text.Corpus.TSV
+      Test.Core.Text.DeclarativeNgrams
      Test.Core.Text.Examples
      Test.Core.Text.Flow
      Test.Core.Utils

--- a/src/Gargantext/Core/Text/DeclarativeNgrams.hs
+++ b/src/Gargantext/Core/Text/DeclarativeNgrams.hs
+{-|
+Module      : Gargantext.Core.Text.DeclarativeNgrams
+Description : "Declarative" ngrams, i.e. plain text with optional modifiers
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
+-}
+{-# OPTIONS_GHC -Wno-deprecations #-}  -- warning about 'undefined' usage is thrown
+module Gargantext.Core.Text.DeclarativeNgrams where
+import Data.Text qualified as T
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude
+data Ngrams =
+  Ngrams { terms :: Text
+         , declaration :: Declaration }
+  deriving (Eq, Show)
+data Declaration =
+    Exact
+  | CaseInsensitive
+  | SingularPlural Lang
+  -- TODO: SingularPlural Lang
+  -- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
+  -- TODO Regex
+  -- TODO Combine CaseInsensitive and SingluarPlural
+  -- TODO Combine CaseInsensitive and FrenchDeterminers
+  deriving (Eq, Show)
+data NgramsWithPos =
+  NgramsWithPos { ngrams :: Ngrams
+                , match  :: Text
+                , pos    :: Int }
+  deriving (Eq, Show)
+-- | TODO Use parsec for this?
+-- e.g. matching case-insensitive:
+-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
+findNgrams :: [Ngrams] -> Text -> [NgramsWithPos]
+findNgrams [] _txt = []
+findNgrams (ng:ngs) txt = findNgrams' ng txt 0 <> (findNgrams ngs txt)
+findNgrams' :: Ngrams -> Text -> Int -> [NgramsWithPos]
+findNgrams' _ "" _ = []
+findNgrams' ngrams@(Ngrams { terms, declaration }) txt initialPos =
+  foldl (<>) [] $ map handleBreak (breakOnDeclaration declaration terms txt)
+  where
+    handleBreak :: (Text, Text, Text) -> [NgramsWithPos]
+    handleBreak (s, matching, _e) =
+      let pos = initialPos + T.length s
+          txt' = T.drop (T.length s) txt
+          match = T.take (T.length matching) txt'
+          ngramsWithPos = NgramsWithPos { ngrams, pos, match } in
+      (ngramsWithPos : (findNgrams' ngrams (T.drop (T.length matching) txt') (pos + T.length matching)))
+-- | Split declaration function in spirit of `T.breakOn` + Declaration type
+breakOnDeclaration :: Declaration -> Text -> Text -> [(Text, Text, Text)]
+breakOnDeclaration Exact terms txt =
+  case T.breakOn terms txt of
+    (_, "") -> []
+    (s, e) -> [(s, terms, T.drop (T.length terms) e)]
+breakOnDeclaration CaseInsensitive terms txt =
+  case T.breakOn (T.toLower terms) (T.toLower txt) of
+    (_, "") -> []
+    (s, e) -> [(s, T.take (T.length terms) e, T.drop (T.length terms) e)]
+breakOnDeclaration (SingularPlural lang) terms txt = singularOrPlural
+  where
+    pf = pluralForm lang terms
+    -- | Singular is a bit tricky, because it's a "subset" of plural,
+    -- without the "s", so we filter things out.
+    -- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
+    singular = breakOnDeclaration Exact terms txt
+    plural = breakOnDeclaration Exact pf txt
+    singularOrPlural = case (singular, plural) of
+      ([s], [p]) ->
+        let (ss, _, _) = s
+            (sp, _, _) = p in
+        if T.length ss < T.length sp then [s] else [p]
+      ([s], _) -> [s]
+      (_, p) -> p
+pluralForm :: Lang -> Text -> Text
+pluralForm EN "door" = "door"
+pluralForm EN "sheep" = "sheep"
+pluralForm EN term = term <> "s"
+pluralForm _ _ = undefined
--- a/test/Test/Core/Text/DeclarativeNgrams.hs
+++ b/test/Test/Core/Text/DeclarativeNgrams.hs
+{-|
+Module      : Test.Core.Text.DecarativeNgrams
+Description :
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+module Test.Core.Text.DeclarativeNgrams where
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Text.DeclarativeNgrams
+import Gargantext.Prelude
+import Test.Hspec
+-- | Core.Text.DeclarativeNgrams tests
+test :: Spec
+test = do
+  describe "test text matching" $ do
+    it "exact match works" $ do
+      let ngrams = Ngrams { terms = "hello", declaration = Exact }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
+      findNgrams [ngrams] "hello world, Hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" } ]
+      findNgrams [ngrams] "hello world, hello, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 20, match = "hello" } ]
+    it "case insensitive works" $ do
+      let ngrams = Ngrams { terms = "Hello", declaration = CaseInsensitive }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
+      findNgrams [ngrams] "hello world, Hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "Hello" } ]
+      findNgrams [ngrams] "hello world, Hello, hEllo" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 0, match = "hello" }
+        , NgramsWithPos { ngrams, pos = 13, match = "Hello" }
+        , NgramsWithPos { ngrams, pos = 20, match = "hEllo" } ]
+    it "singular/plural works" $ do
+      let ngrams = Ngrams { terms = "world", declaration = SingularPlural EN }
+      findNgrams [ngrams] "hello world, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
+      findNgrams [ngrams] "hello worlds, hello" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "worlds" } ]
+      findNgrams [ngrams] "hello world, Hello worlds" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" }
+        , NgramsWithPos { ngrams, pos = 19, match = "worlds" } ]
+      findNgrams [ngrams] "hello world, Hello, World" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
+      findNgrams [ngrams] "hello worlds, Hello, world" `shouldBe`
+        [ NgramsWithPos { ngrams, pos = 6, match = "worlds" }
+        , NgramsWithPos { ngrams, pos = 21, match = "world" }]
--- a/test/drivers/tasty/Main.hs
+++ b/test/drivers/tasty/Main.hs
@@ -12,25 +12,26 @@ module Main where
 import Gargantext.Prelude
-import qualified Test.Core.Notifications         as Notifications
+import qualified Test.Core.Notifications          as Notifications
-import qualified Test.Core.Orchestrator          as Orchestrator
+import qualified Test.Core.Orchestrator           as Orchestrator
-import qualified Test.Core.Similarity            as Similarity
+import qualified Test.Core.Similarity             as Similarity
-import qualified Test.Core.Text.Corpus.Query     as CorpusQuery
+import qualified Test.Core.Text.Corpus.Query      as CorpusQuery
-import qualified Test.Core.Text.Corpus.TSV       as TSVParser
+import qualified Test.Core.Text.Corpus.TSV        as TSVParser
-import qualified Test.Core.Utils                 as Utils
+import qualified Test.Core.Text.DeclarativeNgrams as DNgrams
-import qualified Test.Core.Worker                as Worker
+import qualified Test.Core.Utils                  as Utils
-import qualified Test.Graph.Clustering           as Graph
+import qualified Test.Core.Worker                 as Worker
-import qualified Test.Ngrams.Lang.Occurrences    as Occurrences
+import qualified Test.Graph.Clustering            as Graph
-import qualified Test.Ngrams.NLP                 as NLP
+import qualified Test.Ngrams.Lang.Occurrences     as Occurrences
-import qualified Test.Ngrams.Query               as NgramsQuery
+import qualified Test.Ngrams.NLP                  as NLP
-import qualified Test.Ngrams.Terms               as NgramsTerms
+import qualified Test.Ngrams.Query                as NgramsQuery
-import qualified Test.Offline.Errors             as Errors
+import qualified Test.Ngrams.Terms                as NgramsTerms
-import qualified Test.Offline.JSON               as JSON
+import qualified Test.Offline.Errors              as Errors
-import qualified Test.Offline.Phylo              as Phylo
+import qualified Test.Offline.JSON                as JSON
-import qualified Test.Offline.Stemming.Lancaster as Lancaster
+import qualified Test.Offline.Phylo               as Phylo
-import qualified Test.Parsers.Date               as PD
+import qualified Test.Offline.Stemming.Lancaster  as Lancaster
-import qualified Test.Utils.Crypto               as Crypto
+import qualified Test.Parsers.Date                as PD
-import qualified Test.Utils.Jobs                 as Jobs
+import qualified Test.Utils.Crypto                as Crypto
+import qualified Test.Utils.Jobs                  as Jobs
 import System.IO (hGetBuffering, hSetBuffering)
 import Test.Tasty
@@ -46,15 +47,16 @@ protectStdoutBuffering action =
 main :: IO ()
 main = do
-  utilSpec         <- testSpec "Utils" Utils.test
+  utilSpec              <- testSpec "Utils" Utils.test
-  clusteringSpec   <- testSpec "Graph Clustering" Graph.test
+  clusteringSpec        <- testSpec "Graph Clustering" Graph.test
-  dateSplitSpec    <- testSpec "Date split" PD.testDateSplit
+  dateSplitSpec         <- testSpec "Date split" PD.testDateSplit
-  cryptoSpec       <- testSpec "Crypto" Crypto.test
+  cryptoSpec            <- testSpec "Crypto" Crypto.test
-  nlpSpec          <- testSpec "NLP" NLP.test
+  nlpSpec               <- testSpec "NLP" NLP.test
-  jobsSpec         <- testSpec "Jobs" Jobs.test
+  jobsSpec              <- testSpec "Jobs" Jobs.test
-  similaritySpec   <- testSpec "Similarity" Similarity.test
+  similaritySpec        <- testSpec "Similarity" Similarity.test
-  asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
+  asyncUpdatesSpec      <- testSpec "Notifications" Notifications.test
-  occurrencesSpec  <- testSpec "Occurrences" Occurrences.test
+  occurrencesSpec       <- testSpec "Occurrences" Occurrences.test
+  declarativeNgramsSpec <- testSpec "Declarative Ngrams" DNgrams.test
  protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
    [ utilSpec
@@ -78,4 +80,5 @@ main = do
    , Notifications.qcTests
    , Orchestrator.qcTests
    , NgramsTerms.tests
+    , declarativeNgramsSpec
    ]