[ngrams] declarative ngrams demo

parent b3910bb4
Pipeline #7218 passed with stages
in 72 minutes and 35 seconds
......@@ -213,6 +213,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.DeclarativeNgrams
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.List.Group.WithStem
......@@ -793,6 +794,7 @@ test-suite garg-test-tasty
Test.Core.Text
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV
Test.Core.Text.DeclarativeNgrams
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Utils
......
{-|
Module : Gargantext.Core.Text.DeclarativeNgrams
Description : "Declarative" ngrams, i.e. plain text with optional modifiers
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
-}
{-# OPTIONS_GHC -Wno-deprecations #-} -- warning about 'undefined' usage is thrown
module Gargantext.Core.Text.DeclarativeNgrams where
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
data Ngrams =
Ngrams { terms :: Text
, declaration :: Declaration }
deriving (Eq, Show)
data Declaration =
Exact
| CaseInsensitive
| SingularPlural Lang
-- TODO: SingularPlural Lang
-- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
-- TODO Regex
-- TODO Combine CaseInsensitive and SingluarPlural
-- TODO Combine CaseInsensitive and FrenchDeterminers
deriving (Eq, Show)
data NgramsWithPos =
NgramsWithPos { ngrams :: Ngrams
, match :: Text
, pos :: Int }
deriving (Eq, Show)
-- | TODO Use parsec for this?
-- e.g. matching case-insensitive:
-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
findNgrams :: [Ngrams] -> Text -> [NgramsWithPos]
findNgrams [] _txt = []
findNgrams (ng:ngs) txt = findNgrams' ng txt 0 <> (findNgrams ngs txt)
findNgrams' :: Ngrams -> Text -> Int -> [NgramsWithPos]
findNgrams' _ "" _ = []
findNgrams' ngrams@(Ngrams { terms, declaration }) txt initialPos =
foldl (<>) [] $ map handleBreak (breakOnDeclaration declaration terms txt)
where
handleBreak :: (Text, Text, Text) -> [NgramsWithPos]
handleBreak (s, matching, _e) =
let pos = initialPos + T.length s
txt' = T.drop (T.length s) txt
match = T.take (T.length matching) txt'
ngramsWithPos = NgramsWithPos { ngrams, pos, match } in
(ngramsWithPos : (findNgrams' ngrams (T.drop (T.length matching) txt') (pos + T.length matching)))
-- | Split declaration function in spirit of `T.breakOn` + Declaration type
breakOnDeclaration :: Declaration -> Text -> Text -> [(Text, Text, Text)]
breakOnDeclaration Exact terms txt =
case T.breakOn terms txt of
(_, "") -> []
(s, e) -> [(s, terms, T.drop (T.length terms) e)]
breakOnDeclaration CaseInsensitive terms txt =
case T.breakOn (T.toLower terms) (T.toLower txt) of
(_, "") -> []
(s, e) -> [(s, T.take (T.length terms) e, T.drop (T.length terms) e)]
breakOnDeclaration (SingularPlural lang) terms txt = singularOrPlural
where
pf = pluralForm lang terms
-- | Singular is a bit tricky, because it's a "subset" of plural,
-- without the "s", so we filter things out.
-- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
singular = breakOnDeclaration Exact terms txt
plural = breakOnDeclaration Exact pf txt
singularOrPlural = case (singular, plural) of
([s], [p]) ->
let (ss, _, _) = s
(sp, _, _) = p in
if T.length ss < T.length sp then [s] else [p]
([s], _) -> [s]
(_, p) -> p
pluralForm :: Lang -> Text -> Text
pluralForm EN "door" = "door"
pluralForm EN "sheep" = "sheep"
pluralForm EN term = term <> "s"
pluralForm _ _ = undefined
{-|
Module : Test.Core.Text.DecarativeNgrams
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Test.Core.Text.DeclarativeNgrams where
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.DeclarativeNgrams
import Gargantext.Prelude
import Test.Hspec
-- | Core.Text.DeclarativeNgrams tests
test :: Spec
test = do
describe "test text matching" $ do
it "exact match works" $ do
let ngrams = Ngrams { terms = "hello", declaration = Exact }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" } ]
findNgrams [ngrams] "hello world, hello, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hello" } ]
it "case insensitive works" $ do
let ngrams = Ngrams { terms = "Hello", declaration = CaseInsensitive }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" } ]
findNgrams [ngrams] "hello world, Hello, hEllo" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hEllo" } ]
it "singular/plural works" $ do
let ngrams = Ngrams { terms = "world", declaration = SingularPlural EN }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello worlds" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" }
, NgramsWithPos { ngrams, pos = 19, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello, World" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, Hello, world" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" }
, NgramsWithPos { ngrams, pos = 21, match = "world" }]
......@@ -17,6 +17,7 @@ import qualified Test.Core.Orchestrator as Orchestrator
import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Text.DeclarativeNgrams as DNgrams
import qualified Test.Core.Utils as Utils
import qualified Test.Core.Worker as Worker
import qualified Test.Graph.Clustering as Graph
......@@ -55,6 +56,7 @@ main = do
similaritySpec <- testSpec "Similarity" Similarity.test
asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
occurrencesSpec <- testSpec "Occurrences" Occurrences.test
declarativeNgramsSpec <- testSpec "Declarative Ngrams" DNgrams.test
protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
[ utilSpec
......@@ -78,4 +80,5 @@ main = do
, Notifications.qcTests
, Orchestrator.qcTests
, NgramsTerms.tests
, declarativeNgramsSpec
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment