[ngrams] annotate types of ngrams algorithms

parent 08c7c91c
Pipeline #7170 passed with stages
in 52 minutes and 15 seconds
......@@ -392,10 +392,15 @@ For tasty, if you want to run specific test (via patterns), use:
cabal v2-run garg-test-tasty -- -p '/Ngrams/
```
For integration tests, do:
```shel
```shell
cabal v2-test garg-test-hspec --test-show-details=streaming --test-option=--match='/some pattern/'
```
You could also use [ghciwatch](https://mercurytechnologies.github.io/ghciwatch/) for testsing:
```shell
ghciwatch --command "cabal v2-repl garg-test-tasty" --after-startup-ghci ':set args "--pattern" "/Ngrams/"' --after-startup-ghci "Main.main" --after-reload-ghci "Main.main" --watch src --watch test
```
### Modifying a golden test to accept a new (expected) output
Some tests, like the Phylo one, use golden testing to ensure that the JSON Phylo we generate is
......
......@@ -264,6 +264,7 @@ library
Gargantext.Core.Worker.Types
Gargantext.Database.Action.Flow
Gargantext.Database.Action.Flow.Types
Gargantext.Database.Action.Flow.Utils
Gargantext.Database.Action.Metrics.TFICF
Gargantext.Database.Action.Search
Gargantext.Database.Action.User
......@@ -286,6 +287,7 @@ library
Gargantext.Database.Query.Table.Node.User
Gargantext.Database.Query.Table.User
Gargantext.Database.Query.Tree.Root
Gargantext.Database.Schema.Context
Gargantext.Database.Schema.Ngrams
Gargantext.Database.Schema.Node
Gargantext.Database.Schema.User
......@@ -413,7 +415,6 @@ library
Gargantext.Database.Action.Flow.Extract
Gargantext.Database.Action.Flow.List
Gargantext.Database.Action.Flow.Pairing
Gargantext.Database.Action.Flow.Utils
Gargantext.Database.Action.Index
Gargantext.Database.Action.Learn
Gargantext.Database.Action.Mail
......@@ -458,7 +459,6 @@ library
Gargantext.Database.Query.Table.NodeNgrams
Gargantext.Database.Query.Tree
Gargantext.Database.Query.Tree.Error
Gargantext.Database.Schema.Context
Gargantext.Database.Schema.ContextNodeNgrams
Gargantext.Database.Schema.ContextNodeNgrams2
Gargantext.Database.Schema.NodeContext
......
......@@ -102,7 +102,7 @@ import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotEx
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert ( ToNode(toNode) ) -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams )
......@@ -502,7 +502,7 @@ reIndexWith cId lId nt lts = do
<$> getTermsWith identity [lId] nt lts
-- Get all documents of the corpus
(docs :: [Context HyperdataDocument]) <- selectDocNodes cId
(docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId
let
-- fromListWith (<>)
......
......@@ -38,7 +38,7 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_hyperdata, context_id)
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..))
import Gargantext.Database.Types ( Indexed(..), index )
import Gargantext.Prelude
......@@ -75,18 +75,18 @@ insertDocNgrams lId m = do
docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> ContextOnlyId HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams lang nt ts doc =
List.zip
(termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
[ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract
]
)
(List.cycle [DM.fromList $ [(nt, DM.singleton (doc ^. context_id) 1 )]])
(List.cycle [DM.fromList $ [(nt, DM.singleton (doc ^. context_oid_id) 1 )]])
documentIdWithNgrams :: HasNodeError err
......
......@@ -47,15 +47,18 @@ instance HasText HyperdataDocument
, _hd_abstract h
]
defaultHyperdataDocument :: HyperdataDocument
defaultHyperdataDocument = case decode docExample of
Just hp -> hp
Nothing -> HyperdataDocument Nothing Nothing
emptyHyperdataDocument :: HyperdataDocument
emptyHyperdataDocument = HyperdataDocument Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
defaultHyperdataDocument :: HyperdataDocument
defaultHyperdataDocument = case decode docExample of
Just hp -> hp
Nothing -> emptyHyperdataDocument
where
docExample :: ByteString
docExample = "{\"doi\":\"sdfds\",\"publication_day\":6,\"language_iso2\":\"en\",\"publication_minute\":0,\"publication_month\":7,\"language_iso3\":\"eng\",\"publication_second\":0,\"authors\":\"Nils Hovdenak, Kjell Haram\",\"publication_year\":2012,\"publication_date\":\"2012-07-06 00:00:00+00:00\",\"language_name\":\"English\",\"realdate_full_\":\"2012 01 12\",\"source\":\"European journal of obstetrics, gynecology, and reproductive biology\",\"abstract\":\"The literature was searched for publications on minerals and vitamins during pregnancy and the possible influence of supplements on pregnancy outcome.\",\"title\":\"Influence of mineral and vitamin supplements on pregnancy outcome.\",\"publication_hour\":0}"
......
......@@ -103,6 +103,7 @@ type ContextTitle = Text
-- | NodePoly indicates that Node has a Polymorphism Type
type Node json = NodePoly NodeId (Maybe Hash) NodeTypeId UserId (Maybe ParentId) NodeName UTCTime json
type Context json = ContextPoly NodeId (Maybe Hash) NodeTypeId UserId (Maybe ParentId) ContextTitle UTCTime json
type ContextOnlyId json = ContextPolyOnlyId NodeId json
-- | NodeSearch (queries)
-- type NodeSearch json = NodePolySearch NodeId NodeTypeId UserId (Maybe ParentId) NodeName UTCTime json (Maybe TSVector)
......
......@@ -22,6 +22,7 @@ module Gargantext.Database.Query.Table.NodeContext
, queryNodeContextTable
, selectDocsDates
, selectDocNodes
, selectDocNodesOnlyId
, selectDocs
, nodeContextsCategory
, nodeContextsScore
......@@ -413,6 +414,15 @@ queryDocNodes cId = proc () -> do
restrict -< (c ^. context_typename) .== sqlInt4 (toDBid NodeDocument)
returnA -< c
selectDocNodesOnlyId :: HasDBid NodeType => CorpusId -> DBCmd err [ContextOnlyId HyperdataDocument]
selectDocNodesOnlyId cId = runOpaQuery (queryDocNodesOnlyId cId)
queryDocNodesOnlyId :: HasDBid NodeType => CorpusId -> O.Select ContextOnlyIdRead
queryDocNodesOnlyId cId = proc () -> do
c <- queryDocNodes cId -< ()
returnA -< ContextOnlyId (c ^. context_id) (c ^. context_hyperdata)
joinInCorpus :: O.Select (ContextRead, MaybeFields NodeContextRead)
joinInCorpus = proc () -> do
c <- queryContextTable -< ()
......
......@@ -52,6 +52,23 @@ $(makeLenses ''ContextPoly)
$(makeAdaptorAndInstance "pContext" ''ContextPoly)
$(makeLensesWith abbreviatedFields ''ContextPoly)
------------------------------------------------------------------------
data ContextPolyOnlyId id hyperdata =
ContextOnlyId { _context_oid_id :: !id
, _context_oid_hyperdata :: !hyperdata }
deriving (Show, Generic)
------------------------------------------------------------------------
-- Automatic instances derivation
$(deriveJSON (unPrefix "_context_oid_") ''ContextPolyOnlyId)
$(makeLenses ''ContextPolyOnlyId)
$(makeAdaptorAndInstance "pContextOnlyId" ''ContextPolyOnlyId)
$(makeLensesWith abbreviatedFields ''ContextPolyOnlyId)
------------------------------------------------------------------------
contextTable :: Table ContextWrite ContextRead
contextTable = Table "contexts" (pContext Context { _context_id = optionalTableField "id"
, _context_hash_id = optionalTableField "hash_id"
......@@ -87,6 +104,10 @@ type ContextRead = ContextPoly (Field SqlInt4 )
(Field SqlText )
(Field SqlTimestamptz )
(Field SqlJsonb )
type ContextOnlyIdRead = ContextPolyOnlyId (Field SqlInt4 )
(Field SqlJsonb )
------------------------------------------------------------------------
-- | Context(Read|Write)Search is slower than Context(Write|Read) use it
-- for full text search only
......
......@@ -6,7 +6,11 @@ module Test.Ngrams.Count (tests) where
import Gargantext.API.Ngrams
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument )
import Gargantext.Database.Schema.Context ( ContextPolyOnlyId(..) )
import Gargantext.Prelude
import Test.Tasty
import Test.Tasty.HUnit
......@@ -20,10 +24,13 @@ unitTests = testGroup "Count tests"
[ -- Sorting
testCase "Build patterns works 01" testBuildPatterns01
, testCase "Build patterns works 02" testBuildPatterns02
, testCase "termsInText works 01" testTermsInText01
, testCase "termsInText works 02" testTermsInText02
, testCase "termsInText works 03" testTermsInText03
, testCase "termsInText works 04 (related to issue #221)" testTermsInText04
, testCase "docNgrams works 01" testDocNgrams01
]
-- | Let's document how the `buildPatternsWith` function works.
......@@ -91,3 +98,12 @@ testTermsInText04 = do
length tit @?= 1
let [tit1] = tit
tit1 @?= ("feuilles de basilic", 2)
testDocNgrams01 :: Assertion
testDocNgrams01 = do
let terms = ["hello", "world"] :: [NgramsTerm]
let hd = emptyHyperdataDocument { _hd_title = Just "hello world"
, _hd_abstract = Nothing }
let ctx = ContextOnlyId 1 hd
let dNgrams = docNgrams EN NgramsTerms terms ctx
length dNgrams @?= 2
......@@ -31,9 +31,18 @@ import qualified Test.Parsers.Date as PD
import qualified Test.Utils.Crypto as Crypto
import qualified Test.Utils.Jobs as Jobs
import System.IO (hGetBuffering, hSetBuffering)
import Test.Tasty
import Test.Tasty.Hspec
-- | https://mercurytechnologies.github.io/ghciwatch/integration/tasty.html
protectStdoutBuffering :: IO a -> IO a
protectStdoutBuffering action =
bracket
(hGetBuffering stdout)
(\bufferMode -> hSetBuffering stdout bufferMode)
(const action)
main :: IO ()
main = do
utilSpec <- testSpec "Utils" Utils.test
......@@ -46,7 +55,7 @@ main = do
asyncUpdatesSpec <- testSpec "Notifications" Notifications.test
occurrencesSpec <- testSpec "Occurrences" Occurrences.test
defaultMain $ testGroup "Gargantext"
protectStdoutBuffering $ defaultMain $ testGroup "Gargantext"
[ utilSpec
, clusteringSpec
, dateSplitSpec
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment