[ngrams] improve function documentation, add types, add unit tests

I want to understand ngrams algorithms better.
parent d7a70fd4
Pipeline #7169 passed with stages
in 55 minutes and 26 seconds
......@@ -371,7 +371,7 @@ https://haskell-language-server.readthedocs.io/en/latest/installation.html
Running the tests can be done via the following command:
```hs
```shell
cabal v2-test --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs'
```
......@@ -383,10 +383,19 @@ The flags have the following meaning:
In order for some tests to run (like the phylo ones) is **required** to install the `gargantext-cli` via:
```hs
```shell
cabal v2-install gargantext:exe:gargantext
```
For tasty, if you want to run specific test (via patterns), use:
```shell
cabal v2-run garg-test-tasty -- -p '/Ngrams/
```
For integration tests, do:
```shel
cabal v2-test garg-test-hspec --test-show-details=streaming --test-option=--match='/some pattern/'
```
### Modifying a golden test to accept a new (expected) output
Some tests, like the Phylo one, use golden testing to ensure that the JSON Phylo we generate is
......@@ -394,7 +403,7 @@ the same as an expected one. This allows us to catch regressions in the serialis
Sometimes, however, we genuinely want to modify the output so that it's the new reference (i.e. the new
golden reference). To do so, it's enough to run the testsuite passing the `--accept` flag, for example:
```hs
```shell
cabal v2-test garg-test-tasty --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs' --test-option=--pattern='/Phylo/' --test-option=--accept"
```
......
......@@ -805,6 +805,7 @@ test-suite garg-test-tasty
Test.Graph.Clustering
Test.Graph.Distance
Test.Instances
Test.Ngrams.Count
Test.Ngrams.Lang
Test.Ngrams.Lang.En
Test.Ngrams.Lang.Fr
......
......@@ -9,6 +9,7 @@ Portability : POSIX
-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
......@@ -108,7 +109,9 @@ getListNgrams nodeIds ngramsType = listNgramsFromRepo nodeIds ngramsType
<$> getRepo nodeIds
getTermsWith :: (HasNodeStory env err m, Eq a, Hashable a)
-- | Fetch terms from repo, gathering terms under the same root (parent).
getTermsWith :: forall a env err m.
(HasNodeStory env err m, Eq a, Hashable a)
=> (NgramsTerm -> a) -> [ListId]
-> NgramsType -> Set ListType
-> m (HashMap a [a])
......@@ -119,6 +122,7 @@ getTermsWith f ls ngt lts = HM.fromListWith (<>)
<$> mapTermListRoot ls ngt
<$> getRepo ls
where
toTreeWith :: (NgramsTerm, (b, Maybe NgramsTerm)) -> (a, [a])
toTreeWith (t, (_lt, maybeRoot)) = case maybeRoot of
Nothing -> (f t, [])
Just r -> (f r, [f t])
......
......@@ -65,6 +65,7 @@ import Data.Set qualified as Set
import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (NgramsTerm)
import Gargantext.Core (Lang(..), NLPServerConfig, withDefaultLanguage)
import Gargantext.Core.Notifications.CentralExchange.Types (HasCentralExchangeNotification(ce_notify), CEMessage(..))
import Gargantext.Core.Config (GargConfig(..), hasConfig)
......@@ -80,6 +81,7 @@ import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Ngrams (NgramsType(NgramsTerms), Ngrams(_ngramsTerms))
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText)
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(MapTerm) )
......@@ -92,7 +94,7 @@ import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContex
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument )
import Gargantext.Database.Admin.Types.Node hiding (DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude (IsDBCmd, DBCmdWithEnv)
import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 )
......@@ -494,19 +496,27 @@ reIndexWith cId lId nt lts = do
let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node
-- Getting [NgramsTerm]
ts <- List.concat
(ts :: [NgramsTerm]) <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- Get all documents of the corpus
docs <- selectDocNodes cId
(docs :: [Context HyperdataDocument]) <- selectDocNodes cId
let
-- fromListWith (<>)
docNgramsMap :: [[((MatchedText, TermsCount), Map NgramsType (Map NodeId Int))]]
docNgramsMap = map (docNgrams corpusLang nt ts) docs
withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]]
withExtractedNgrams =
map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ docNgramsMap
ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))]
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (docNgrams corpusLang nt ts) docs
$ withExtractedNgrams
-- Saving the indexation in database
mapM_ (saveDocNgramsWith lId) ngramsByDoc
......
......@@ -69,6 +69,9 @@ insertDocNgrams lId m = do
-- Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm]
......
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
{-# LANGUAGE TypeApplications #-}
module Test.Ngrams.Count (tests) where
import Gargantext.API.Ngrams
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
import Gargantext.Prelude
import Test.Tasty
import Test.Tasty.HUnit
tests :: TestTree
tests = testGroup "Ngrams" [unitTests]
unitTests :: TestTree
unitTests = testGroup "Count tests"
[ -- Sorting
testCase "Build patterns works 01" testBuildPatterns01
, testCase "Build patterns works 02" testBuildPatterns02
, testCase "termsInText works 01" testTermsInText01
, testCase "termsInText works 02" testTermsInText02
, testCase "termsInText works 03" testTermsInText03
, testCase "termsInText works 04 (related to issue #221)" testTermsInText04
]
-- | Let's document how the `buildPatternsWith` function works.
testBuildPatterns01 :: Assertion
testBuildPatterns01 = do
let terms = ["hello world"] :: [NgramsTerm]
let enPatterns = buildPatternsWith EN terms
length enPatterns @?= 1
let Just pat = head enPatterns
_pat_length pat @?= 2
_pat_terms pat @?= ["hello", "world"]
-- | Let's document how the `buildPatternsWith` function works.
testBuildPatterns02 :: Assertion
testBuildPatterns02 = do
let terms = ["hello", "world"] :: [NgramsTerm]
let enPatterns = buildPatternsWith EN terms
length enPatterns @?= 2
let [pat1, pat2] = enPatterns
_pat_length pat1 @?= 1
_pat_terms pat1 @?= ["hello"]
_pat_length pat2 @?= 1
_pat_terms pat2 @?= ["world"]
-- | Let's document how the `termsInText` function works.
testTermsInText01 :: Assertion
testTermsInText01 = do
let terms = ["hello", "world"] :: [NgramsTerm]
let enPatterns = buildPatternsWith EN terms
let tit = termsInText EN enPatterns "Hello, world!"
length tit @?= 2
let [tit1, tit2] = tit
tit1 @?= ("hello", 1)
tit2 @?= ("world", 1)
-- | Let's document how the `termsInText` function works.
testTermsInText02 :: Assertion
testTermsInText02 = do
let terms = ["hello", "world"] :: [NgramsTerm]
let enPatterns = buildPatternsWith EN terms
let tit = termsInText EN enPatterns "Hello, world, hello!"
length tit @?= 2
let [tit1, tit2] = tit
tit1 @?= ("hello", 2)
tit2 @?= ("world", 1)
-- | Let's document how the `termsInText` function works.
testTermsInText03 :: Assertion
testTermsInText03 = do
let terms = ["hello", "world"] :: [NgramsTerm]
let enPatterns = buildPatternsWith EN terms
let tit = termsInText EN enPatterns "Hello, world, again!"
length tit @?= 2
let [tit1, tit2] = tit
tit1 @?= ("hello", 1)
tit2 @?= ("world", 1)
-- | Let's document how the `termsInText` function works.
-- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/221
testTermsInText04 :: Assertion
testTermsInText04 = do
let terms = ["feuilles de basilic"] :: [NgramsTerm]
let frPatterns = buildPatternsWith FR terms
let tit = termsInText FR frPatterns "Infos pratiques Nombre de personnes 1 personne Quantité1 verre Temps de préparation 5 minutes Degré de difficulté Très facile Coût Abordable Les ingrédients de la recette 4 feuilles de basilic 1 branche de romarin 15 ml de citron jaune 60 ml d'eau gazeuse au mastiqua 90 ml de Bulles de Muscat Jaillance La préparation de la recette Verser dans un verre type long drink le citron jaune, les feuilles de basilic et l'eau gazeuse."
length tit @?= 1
let [tit1] = tit
tit1 @?= ("feuilles de basilic", 2)
......@@ -12,23 +12,24 @@ module Main where
import Gargantext.Prelude
import qualified Test.Core.Notifications as Notifications
import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Text.Corpus.TSV as TSVParser
import qualified Test.Core.Utils as Utils
import qualified Test.Core.Worker as Worker
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.Count as NgramsCount
import qualified Test.Ngrams.Lang.Occurrences as Occurrences
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Errors as Errors
import qualified Test.Offline.JSON as JSON
import qualified Test.Offline.Phylo as Phylo
import qualified Test.Offline.Stemming.Lancaster as Lancaster
import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import qualified Test.Core.Similarity as Similarity
import qualified Test.Core.Notifications as Notifications
import Test.Tasty
import Test.Tasty.Hspec
......@@ -65,4 +66,5 @@ main = do
, Worker.tests
, asyncUpdatesSpec
, Notifications.qcTests
, NgramsCount.tests
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment