[ngrams] declarative ngrams demo

parent b3910bb4
Pipeline #7218 passed with stages
in 72 minutes and 35 seconds
...@@ -213,6 +213,7 @@ library ...@@ -213,6 +213,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.DeclarativeNgrams
Gargantext.Core.Text.List Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.List.Group.WithStem Gargantext.Core.Text.List.Group.WithStem
...@@ -793,6 +794,7 @@ test-suite garg-test-tasty ...@@ -793,6 +794,7 @@ test-suite garg-test-tasty
Test.Core.Text Test.Core.Text
Test.Core.Text.Corpus.Query Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV Test.Core.Text.Corpus.TSV
Test.Core.Text.DeclarativeNgrams
Test.Core.Text.Examples Test.Core.Text.Examples
Test.Core.Text.Flow Test.Core.Text.Flow
Test.Core.Utils Test.Core.Utils
......
{-|
Module : Gargantext.Core.Text.DeclarativeNgrams
Description : "Declarative" ngrams, i.e. plain text with optional modifiers
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
-}
{-# OPTIONS_GHC -Wno-deprecations #-} -- warning about 'undefined' usage is thrown
module Gargantext.Core.Text.DeclarativeNgrams where
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
data Ngrams =
Ngrams { terms :: Text
, declaration :: Declaration }
deriving (Eq, Show)
data Declaration =
Exact
| CaseInsensitive
| SingularPlural Lang
-- TODO: SingularPlural Lang
-- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
-- TODO Regex
-- TODO Combine CaseInsensitive and SingluarPlural
-- TODO Combine CaseInsensitive and FrenchDeterminers
deriving (Eq, Show)
data NgramsWithPos =
NgramsWithPos { ngrams :: Ngrams
, match :: Text
, pos :: Int }
deriving (Eq, Show)
-- | TODO Use parsec for this?
-- e.g. matching case-insensitive:
-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
findNgrams :: [Ngrams] -> Text -> [NgramsWithPos]
findNgrams [] _txt = []
findNgrams (ng:ngs) txt = findNgrams' ng txt 0 <> (findNgrams ngs txt)
findNgrams' :: Ngrams -> Text -> Int -> [NgramsWithPos]
findNgrams' _ "" _ = []
findNgrams' ngrams@(Ngrams { terms, declaration }) txt initialPos =
foldl (<>) [] $ map handleBreak (breakOnDeclaration declaration terms txt)
where
handleBreak :: (Text, Text, Text) -> [NgramsWithPos]
handleBreak (s, matching, _e) =
let pos = initialPos + T.length s
txt' = T.drop (T.length s) txt
match = T.take (T.length matching) txt'
ngramsWithPos = NgramsWithPos { ngrams, pos, match } in
(ngramsWithPos : (findNgrams' ngrams (T.drop (T.length matching) txt') (pos + T.length matching)))
-- | Split declaration function in spirit of `T.breakOn` + Declaration type
breakOnDeclaration :: Declaration -> Text -> Text -> [(Text, Text, Text)]
breakOnDeclaration Exact terms txt =
case T.breakOn terms txt of
(_, "") -> []
(s, e) -> [(s, terms, T.drop (T.length terms) e)]
breakOnDeclaration CaseInsensitive terms txt =
case T.breakOn (T.toLower terms) (T.toLower txt) of
(_, "") -> []
(s, e) -> [(s, T.take (T.length terms) e, T.drop (T.length terms) e)]
breakOnDeclaration (SingularPlural lang) terms txt = singularOrPlural
where
pf = pluralForm lang terms
-- | Singular is a bit tricky, because it's a "subset" of plural,
-- without the "s", so we filter things out.
-- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
singular = breakOnDeclaration Exact terms txt
plural = breakOnDeclaration Exact pf txt
singularOrPlural = case (singular, plural) of
([s], [p]) ->
let (ss, _, _) = s
(sp, _, _) = p in
if T.length ss < T.length sp then [s] else [p]
([s], _) -> [s]
(_, p) -> p
pluralForm :: Lang -> Text -> Text
pluralForm EN "door" = "door"
pluralForm EN "sheep" = "sheep"
pluralForm EN term = term <> "s"
pluralForm _ _ = undefined
{-|
Module : Test.Core.Text.DecarativeNgrams
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Test.Core.Text.DeclarativeNgrams where
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.DeclarativeNgrams
import Gargantext.Prelude
import Test.Hspec
-- | Core.Text.DeclarativeNgrams tests
test :: Spec
test = do
describe "test text matching" $ do
it "exact match works" $ do
let ngrams = Ngrams { terms = "hello", declaration = Exact }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" } ]
findNgrams [ngrams] "hello world, hello, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hello" } ]
it "case insensitive works" $ do
let ngrams = Ngrams { terms = "Hello", declaration = CaseInsensitive }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" } ]
findNgrams [ngrams] "hello world, Hello, hEllo" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hEllo" } ]
it "singular/plural works" $ do
let ngrams = Ngrams { terms = "world", declaration = SingularPlural EN }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello worlds" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" }
, NgramsWithPos { ngrams, pos = 19, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello, World" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, Hello, world" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" }
, NgramsWithPos { ngrams, pos = 21, match = "world" }]
...@@ -12,25 +12,26 @@ module Main where ...@@ -12,25 +12,26 @@ module Main where
import Gargantext.Prelude import Gargantext.Prelude
import qualified Test.Core.Notifications as Notifications import qualified Test.Core.Notifications as Notifications
import qualified Test.Core.Orchestrator as Orchestrator import qualified Test.Core.Orchestrator as Orchestrator
import qualified Test.Core.Similarity as Similarity import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Text.Corpus.Query as CorpusQuery import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Utils as Utils import qualified Test.Core.Text.DeclarativeNgrams as DNgrams
import qualified Test.Core.Worker as Worker import qualified Test.Core.Utils as Utils
import qualified Test.Graph.Clustering as Graph import qualified Test.Core.Worker as Worker
import qualified Test.Ngrams.Lang.Occurrences as Occurrences import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP import qualified Test.Ngrams.Lang.Occurrences as Occurrences
import qualified Test.Ngrams.Query as NgramsQuery import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Terms as NgramsTerms import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.Errors as Errors import qualified Test.Ngrams.Terms as NgramsTerms
import qualified Test.Offline.JSON as JSON import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.Phylo as Phylo import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Stemming.Lancaster as Lancaster import qualified Test.Offline.Phylo as Phylo
import qualified Test.Parsers.Date as PD import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Utils.Crypto as Crypto import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Jobs as Jobs import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import System.IO (hGetBuffering, hSetBuffering) import System.IO (hGetBuffering, hSetBuffering)
import Test.Tasty import Test.Tasty
...@@ -46,15 +47,16 @@ protectStdoutBuffering action = ...@@ -46,15 +47,16 @@ protectStdoutBuffering action =
main :: IO () main :: IO ()
main = do main = do
utilSpec <- testSpec "Utils" Utils.test utilSpec <- testSpec "Utils" Utils.test
clusteringSpec <- testSpec "Graph Clustering" Graph.test clusteringSpec <- testSpec "Graph Clustering" Graph.test
dateSplitSpec <- testSpec "Date split" PD.testDateSplit dateSplitSpec <- testSpec "Date split" PD.testDateSplit
cryptoSpec <- testSpec "Crypto" Crypto.test cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test jobsSpec <- testSpec "Jobs" Jobs.test
similaritySpec <- testSpec "Similarity" Similarity.test similaritySpec <- testSpec "Similarity" Similarity.test
asyncUpdatesSpec <- testSpec "Notifications" Notifications.test asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
occurrencesSpec <- testSpec "Occurrences" Occurrences.test occurrencesSpec <- testSpec "Occurrences" Occurrences.test
declarativeNgramsSpec <- testSpec "Declarative Ngrams" DNgrams.test
protectStdoutBuffering $ defaultMain $ testGroup "Gargantext" protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
[ utilSpec [ utilSpec
...@@ -78,4 +80,5 @@ main = do ...@@ -78,4 +80,5 @@ main = do
, Notifications.qcTests , Notifications.qcTests
, Orchestrator.qcTests , Orchestrator.qcTests
, NgramsTerms.tests , NgramsTerms.tests
, declarativeNgramsSpec
] ]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment