[ngrams] declarative ngrams demo

parent b3910bb4
Pipeline #7218 passed with stages
in 72 minutes and 35 seconds
......@@ -213,6 +213,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.DeclarativeNgrams
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.List.Group.WithStem
......@@ -793,6 +794,7 @@ test-suite garg-test-tasty
Test.Core.Text
Test.Core.Text.Corpus.Query
Test.Core.Text.Corpus.TSV
Test.Core.Text.DeclarativeNgrams
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Utils
......
{-|
Module : Gargantext.Core.Text.DeclarativeNgrams
Description : "Declarative" ngrams, i.e. plain text with optional modifiers
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
See https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/386#note_12413
-}
{-# OPTIONS_GHC -Wno-deprecations #-} -- warning about 'undefined' usage is thrown
module Gargantext.Core.Text.DeclarativeNgrams where
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
data Ngrams =
Ngrams { terms :: Text
, declaration :: Declaration }
deriving (Eq, Show)
data Declaration =
Exact
| CaseInsensitive
| SingularPlural Lang
-- TODO: SingularPlural Lang
-- TODO: FrenchDeterminers (https://en.wikipedia.org/wiki/French_articles_and_determiners)
-- TODO Regex
-- TODO Combine CaseInsensitive and SingluarPlural
-- TODO Combine CaseInsensitive and FrenchDeterminers
deriving (Eq, Show)
data NgramsWithPos =
NgramsWithPos { ngrams :: Ngrams
, match :: Text
, pos :: Int }
deriving (Eq, Show)
-- | TODO Use parsec for this?
-- e.g. matching case-insensitive:
-- https://stackoverflow.com/questions/12937325/whats-the-cleanest-way-to-do-case-insensitive-parsing-with-text-combinators-par
findNgrams :: [Ngrams] -> Text -> [NgramsWithPos]
findNgrams [] _txt = []
findNgrams (ng:ngs) txt = findNgrams' ng txt 0 <> (findNgrams ngs txt)
findNgrams' :: Ngrams -> Text -> Int -> [NgramsWithPos]
findNgrams' _ "" _ = []
findNgrams' ngrams@(Ngrams { terms, declaration }) txt initialPos =
foldl (<>) [] $ map handleBreak (breakOnDeclaration declaration terms txt)
where
handleBreak :: (Text, Text, Text) -> [NgramsWithPos]
handleBreak (s, matching, _e) =
let pos = initialPos + T.length s
txt' = T.drop (T.length s) txt
match = T.take (T.length matching) txt'
ngramsWithPos = NgramsWithPos { ngrams, pos, match } in
(ngramsWithPos : (findNgrams' ngrams (T.drop (T.length matching) txt') (pos + T.length matching)))
-- | Split declaration function in spirit of `T.breakOn` + Declaration type
breakOnDeclaration :: Declaration -> Text -> Text -> [(Text, Text, Text)]
breakOnDeclaration Exact terms txt =
case T.breakOn terms txt of
(_, "") -> []
(s, e) -> [(s, terms, T.drop (T.length terms) e)]
breakOnDeclaration CaseInsensitive terms txt =
case T.breakOn (T.toLower terms) (T.toLower txt) of
(_, "") -> []
(s, e) -> [(s, T.take (T.length terms) e, T.drop (T.length terms) e)]
breakOnDeclaration (SingularPlural lang) terms txt = singularOrPlural
where
pf = pluralForm lang terms
-- | Singular is a bit tricky, because it's a "subset" of plural,
-- without the "s", so we filter things out.
-- singular = filter (\(_s, m, e) -> T.take (T.length pf) (m <> e) /= pf) $ breakOnDeclaration Exact terms txt
singular = breakOnDeclaration Exact terms txt
plural = breakOnDeclaration Exact pf txt
singularOrPlural = case (singular, plural) of
([s], [p]) ->
let (ss, _, _) = s
(sp, _, _) = p in
if T.length ss < T.length sp then [s] else [p]
([s], _) -> [s]
(_, p) -> p
pluralForm :: Lang -> Text -> Text
pluralForm EN "door" = "door"
pluralForm EN "sheep" = "sheep"
pluralForm EN term = term <> "s"
pluralForm _ _ = undefined
{-|
Module : Test.Core.Text.DecarativeNgrams
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Test.Core.Text.DeclarativeNgrams where
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.DeclarativeNgrams
import Gargantext.Prelude
import Test.Hspec
-- | Core.Text.DeclarativeNgrams tests
test :: Spec
test = do
describe "test text matching" $ do
it "exact match works" $ do
let ngrams = Ngrams { terms = "hello", declaration = Exact }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" } ]
findNgrams [ngrams] "hello world, hello, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hello" } ]
it "case insensitive works" $ do
let ngrams = Ngrams { terms = "Hello", declaration = CaseInsensitive }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "hello" } ]
findNgrams [ngrams] "hello world, Hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" } ]
findNgrams [ngrams] "hello world, Hello, hEllo" `shouldBe`
[ NgramsWithPos { ngrams, pos = 0, match = "hello" }
, NgramsWithPos { ngrams, pos = 13, match = "Hello" }
, NgramsWithPos { ngrams, pos = 20, match = "hEllo" } ]
it "singular/plural works" $ do
let ngrams = Ngrams { terms = "world", declaration = SingularPlural EN }
findNgrams [ngrams] "hello world, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, hello" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello worlds" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" }
, NgramsWithPos { ngrams, pos = 19, match = "worlds" } ]
findNgrams [ngrams] "hello world, Hello, World" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "world" } ]
findNgrams [ngrams] "hello worlds, Hello, world" `shouldBe`
[ NgramsWithPos { ngrams, pos = 6, match = "worlds" }
, NgramsWithPos { ngrams, pos = 21, match = "world" }]
......@@ -12,25 +12,26 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Notifications as Notifications
import qualified Test.Core.Orchestrator as Orchestrator
import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Utils as Utils
import qualified Test.Core.Worker as Worker
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.Lang.Occurrences as Occurrences
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Ngrams.Terms as NgramsTerms
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import qualified Test.Core.Notifications as Notifications
import qualified Test.Core.Orchestrator as Orchestrator
import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Text.DeclarativeNgrams as DNgrams
import qualified Test.Core.Utils as Utils
import qualified Test.Core.Worker as Worker
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.Lang.Occurrences as Occurrences
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Ngrams.Terms as NgramsTerms
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import System.IO (hGetBuffering, hSetBuffering)
import Test.Tasty
......@@ -46,15 +47,16 @@ protectStdoutBuffering action =
main :: IO ()
main = do
utilSpec <- testSpec "Utils" Utils.test
clusteringSpec <- testSpec "Graph Clustering" Graph.test
dateSplitSpec <- testSpec "Date split" PD.testDateSplit
cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test
similaritySpec <- testSpec "Similarity" Similarity.test
asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
occurrencesSpec <- testSpec "Occurrences" Occurrences.test
utilSpec <- testSpec "Utils" Utils.test
clusteringSpec <- testSpec "Graph Clustering" Graph.test
dateSplitSpec <- testSpec "Date split" PD.testDateSplit
cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test
similaritySpec <- testSpec "Similarity" Similarity.test
asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
occurrencesSpec <- testSpec "Occurrences" Occurrences.test
declarativeNgramsSpec <- testSpec "Declarative Ngrams" DNgrams.test
protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
[ utilSpec
......@@ -78,4 +80,5 @@ main = do
, Notifications.qcTests
, Orchestrator.qcTests
, NgramsTerms.tests
, declarativeNgramsSpec
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment