Commit 9006d1a9 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

insertMasterDoc return richer type, no ngrams do not abort insertion

After dwelling a bit on the solution space, I concluded that no ngrams
shouldn't result into the document being skipped. The rationale would be
that if, for example, the NLP server is temporarily unavailable, we
should still insert the doc and regenerate the ngrams at a later stage.
parent ec5cdd78
Pipeline #7918 passed with stages
in 79 minutes and 5 seconds
......@@ -745,6 +745,7 @@ common commonTestDependencies
, cryptohash
, directory ^>= 1.3.7.1
, epo-api-client
, exceptions
, fast-logger ^>= 3.2.2
, filepath ^>= 1.4.2.2
, fmt
......
......@@ -40,6 +40,13 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
, flowAnnuaire
, buildSocialList
, addDocumentsToHyperCorpus
, extractNgramsFromDocument
, insertMasterDoc
, insertMasterDocs
, noUncommittedNgrams
, InsertDocResult(..)
, InsertDocWarning(..)
, DocumentHashId(..)
, reIndexWith
, ngramsByDoc
......@@ -354,21 +361,10 @@ addDocumentsToHyperCorpus jobHandle mb_hyper la corpusId docs = do
-- First extract all the ngrams for the input documents via the nlp server,
-- log errors (if any) and pass the final result to 'insertMasterDocs'.
uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
(failures, ids) <- runDBTx $ do
(f,i) <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs
void $ Doc.add corpusId (map nodeId2ContextId i)
pure (f,i)
forM_ failures $ \failure -> do
case failure of
NgramsNotFound _mb_hashId docId -> do
let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
emitWarning msg jobHandle
DocumentInsertionError err -> do
-- FIXME(adn) we should give some kind of identifier to help the user
$(logLocM) ERROR $ "Error in document insertion: " <> T.pack (displayException err)
let msg = UnsafeMkHumanFriendlyErrorText $ "Skipping document insertion due to an iternal database error"
emitWarning msg jobHandle
pure ids
ids <- insertMasterDocs jobHandle cfg uncommittedNgrams mb_hyper docs
runDBTx $ do
void $ Doc.add corpusId (map nodeId2ContextId ids)
pure ids
------------------------------------------------------------------------
createNodes :: ( HasNodeError err
......@@ -495,10 +491,14 @@ newtype UncommittedNgrams doc = UncommittedNgrams
deriving stock Show
deriving newtype (Semigroup, Monoid)
data InsertDocError
data InsertDocResult
= -- | If available, some warnings
InsertDocSucceded !DocId !(Maybe InsertDocWarning)
deriving (Show, Eq)
data InsertDocWarning
= NgramsNotFound !(Maybe DocumentHashId) !DocId
| DocumentInsertionError !SomeException
deriving Show
deriving (Show, Eq)
extractNgramsFromDocument :: ( UniqParameters doc
, HasText doc
......@@ -528,22 +528,25 @@ extractNgramsFromDocument nlpServer lang doc =
commitNgramsForDocument :: UniqParameters doc
=> UncommittedNgrams doc
-> Indexed ContextId (Node doc)
-> Either InsertDocError CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
-> Either InsertDocWarning CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) (Indexed cIx node) = do
docId <- mb_docId
case Map.lookup docId ng of
Nothing -> Left $ NgramsNotFound (Just docId) (_node_id node)
Nothing -> Left $ NgramsNotFound (Just docId) (contextId2NodeId cIx)
Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
where
mb_docId = case DocumentHashId <$> _node_hash_id node of
Nothing -> Left $ NgramsNotFound Nothing (_node_id node)
Nothing -> Left $ NgramsNotFound Nothing (contextId2NodeId cIx)
Just dId -> Right dId
reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams
-> DocumentIdWithNgrams NodeId doc ExtractedNgrams
reIndex did =
let (Indexed _ a) = documentWithId did
in did { documentWithId = Indexed (contextId2NodeId oldIx) a }
in did { documentWithId = Indexed (contextId2NodeId cIx) a }
noUncommittedNgrams :: UncommittedNgrams docs
noUncommittedNgrams = UncommittedNgrams mempty
extractNgramsFromDocuments :: forall doc env err m.
( HasText doc
......@@ -570,9 +573,10 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
, FlowCorpus doc
, MkCorpus c
, IsDBCmd env err m
, MonadCatch m
, MonadJobStatus m
)
=> GargConfig
=> JobHandle m
-> GargConfig
-> UncommittedNgrams doc
-- ^ The ngrams extracted for /all/ the documents
-- and indexed by the hash of the given document.
......@@ -580,20 +584,22 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
-- with the node being created.
-> Maybe c
-> [doc]
-> m ([InsertDocError], [DocId])
insertMasterDocs cfg uncommittedNgrams c docs =
bimap reverse reverse <$> foldlM go (mempty, mempty) docs
-> m [DocId]
insertMasterDocs jobHandle cfg uncommittedNgrams c docs =
reverse <$> foldlM go mempty docs
where
go :: ([InsertDocError], [DocId]) -> doc -> m ([InsertDocError], [DocId])
go (!errs, !documents) doc = do
res <- try $ runDBTx (insertMasterDoc cfg uncommittedNgrams c doc)
go :: [DocId] -> doc -> m [DocId]
go !documents doc = do
res <- runDBTx (insertMasterDoc cfg uncommittedNgrams c doc)
case res of
Left err
-> pure (DocumentInsertionError err : errs, documents)
Right (Left err)
-> pure (err : errs, documents)
Right (Right d)
-> pure (errs, d : documents)
InsertDocSucceded d Nothing ->
pure (d : documents)
InsertDocSucceded d (Just warning) -> do
case warning of
NgramsNotFound _mb_hashId docId -> do
let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
emitWarning msg jobHandle
pure (d : documents)
insertMasterDoc :: ( HasNodeError err
, UniqParameters doc
......@@ -608,7 +614,7 @@ insertMasterDoc :: ( HasNodeError err
-- with the node being created.
-> Maybe c
-> doc
-> DBUpdate err (Either InsertDocError DocId)
-> DBUpdate err InsertDocResult
insertMasterDoc cfg uncommittedNgrams c h = do
(masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
documentWithId <- insertDoc masterUserId masterCorpusId (toNode masterUserId Nothing h)
......@@ -620,11 +626,11 @@ insertMasterDoc cfg uncommittedNgrams c h = do
-- this will enable global database monitoring
case commitNgramsForDocument uncommittedNgrams documentWithId of
Left failed -> pure $ Left failed
Left failed -> pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) (Just failed)
Right ngramsDocsMap -> do
lId <- getOrMkList masterCorpusId masterUserId
_ <- saveDocNgramsWith lId ngramsDocsMap
pure $ Right (contextId2NodeId $ _index documentWithId)
pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) Nothing
saveDocNgramsWith :: ListId
......
......@@ -66,6 +66,9 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
it "Can add language to Corpus" corpusAddLanguage
describe "With test documents" $ beforeWith addCorpusDocuments $ do
it "Can add documents to a Corpus" corpusAddDocuments
it "Correctly return an error if uncommitted ngrams map is empty" corpusAddDocNgramsEmpty
it "Correctly return the same Id if a document is inserted twice" corpusAddDocAlreadyExisting
it "Can survive failures in a batch" corpusAddDocBatchSurvival
describe "Corpus search" $ do
it "Can stem query terms" stemmingTest
it "Can return the number of docs in a corpus" corpusReturnCount
......@@ -81,7 +84,7 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
it "Publishes the root and its first level children" testPublishRecursiveFirstLevel
it "Publishes the root and its recursive children" testPublishRecursiveNLevel
it "Publishes in a lenient way but it's still considered read-only" testPublishLenientWorks
nodeStoryTests :: Spec
nodeStoryTests = sequential $
-- run 'withTestDB' before _every_ test item
......
......@@ -9,14 +9,19 @@ Portability : POSIX
-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Test.Database.Operations.DocumentSearch where
import Control.Lens
import Control.Monad.Reader
import Data.Aeson.QQ.Simple
import Data.Aeson.Types
import Data.Maybe (fromJust)
import Data.Text qualified as T
import Gargantext.Core
import Gargantext.Core.Config
import Gargantext.Core.NLP (HasNLPServer(..))
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem
import Gargantext.Core.Types
......@@ -30,15 +35,15 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Table.Node.Error (nodeErrorWith)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Query.Table.Node.Error
import Gargantext.Database.Query.Table.NodeContext (selectCountDocs)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Schema.Node (NodePoly(..))
import Network.URI (parseURI)
import Prelude
import Test.Database.Types
import Test.Hspec.Expectations
import Test.HUnit hiding (Node)
import Test.Hspec.Expectations
exampleDocument_01 :: HyperdataDocument
......@@ -114,6 +119,31 @@ exampleDocument_04 = either error id $ parseEither parseJSON $ [aesonQQ|
}
|]
exampleDocument_05 :: HyperdataDocument
exampleDocument_05 = either error id $ parseEither parseJSON $ [aesonQQ|
{
"bdd": "Arxiv"
, "doi": ""
, "url": "http://arxiv.org/pdf/1407.5670v1"
, "title": "GQL for Dummies"
, "source": ""
, "authors": "Ennio Annio"
, "abstract": " This article provides an introduction to GraphQL" , "institutes": ""
, "language_iso2": "EN"
, "publication_date": "2014-07-21T21:20:31Z"
, "publication_year": 2014
}
|]
exampleDocument_06 :: HyperdataDocument
exampleDocument_06 = exampleDocument_05 { _hd_title = Just "GQL Part 1" }
exampleDocument_07 :: HyperdataDocument
exampleDocument_07 = exampleDocument_05 { _hd_title = Just "GQL Part 2" }
exampleDocument_08 :: HyperdataDocument
exampleDocument_08 = exampleDocument_05 { _hd_title = Just "GQL Part 3" }
getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus)
getCorporaWithParentIdOrFail parentId = do
xs <- getCorporaWithParentId parentId
......@@ -131,13 +161,74 @@ addCorpusDocuments env = runTestMonad env $ do
let lang = EN
let docs = [exampleDocument_01, exampleDocument_02, exampleDocument_03, exampleDocument_04]
_ <- addDocumentsToHyperCorpus (test_job_handle env)
ids <- addDocumentsToHyperCorpus (test_job_handle env)
(Just $ _node_hyperdata $ corpus)
(Multi lang)
corpusId
docs
liftIO $ ids `shouldBe` [ UnsafeMkNodeId 1, UnsafeMkNodeId 2, UnsafeMkNodeId 3, UnsafeMkNodeId 4 ]
pure env
corpusAddDocNgramsEmpty :: TestEnv -> Assertion
corpusAddDocNgramsEmpty env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let doc = exampleDocument_05
res <- runDBTx $ insertMasterDoc cfg noUncommittedNgrams (Just $ _node_hyperdata corpus) doc
let wrn = NgramsNotFound (Just (DocumentHashId "\\x0b41caa65782b9570530910d1942a21bb1859386ee4c976e7b4cd0624ee966f3")) (UnsafeMkNodeId 5)
liftIO $ res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) (Just wrn)
corpusAddDocAlreadyExisting :: TestEnv -> Assertion
corpusAddDocAlreadyExisting env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let doc = exampleDocument_05
let la = Mono EN
-- Let's use a bogus NLP server port
let nlp = NLPServerConfig CoreNLP (fromJust $ parseURI "http://localhost:9999")
uncommittedNgrams <- extractNgramsFromDocument nlp la doc
oldCount <- docsInCorpusCount
res <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
interimCount <- docsInCorpusCount
liftIO $ do
interimCount `shouldBe` oldCount + 1
res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
-- The second time the node gets cached and returned.
res1 <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
newCount <- docsInCorpusCount
liftIO $ do
newCount `shouldBe` interimCount
res1 `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
-- | We test that if we add 3 documents, out of which the middle one doesn't have any keyed ngrams
-- the other two are still persisted.
corpusAddDocBatchSurvival :: TestEnv -> Assertion
corpusAddDocBatchSurvival env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let la = Mono EN
nlp <- view (nlpServerGet (_tt_lang la))
oldCount <- docsInCorpusCount
let docs = [exampleDocument_06, exampleDocument_07, exampleDocument_08 ]
uNgrams1 <- extractNgramsFromDocument nlp la exampleDocument_06
uNgrams2 <- extractNgramsFromDocument nlp la exampleDocument_08
savedDocs <- insertMasterDocs (test_job_handle env) cfg (uNgrams1 <> uNgrams2) (Just $ _node_hyperdata corpus) docs
c' <- docsInCorpusCount
liftIO $ do
-- failures `shouldBe` [DocumentInsertionError "InternalNodeError Cannot make node due to: Couldn't find the associated ngrams for document with hash \\x666b8e7bfd7c0af37d630e1097791f7ba438a669ecb6d1cb38014edd0b7a2977, therefore the added document won't have any ngrams."]
c' `shouldBe` oldCount + 3
savedDocs `shouldBe` [ UnsafeMkNodeId 5, UnsafeMkNodeId 6, UnsafeMkNodeId 7 ]
corpusAddDocuments :: TestEnv -> Assertion
corpusAddDocuments env = runTestMonad env $ do
......@@ -247,11 +338,13 @@ corpusSearchDB01 env = do
liftIO $ do
length results `shouldBe` 0 -- doesn't exist, we just check that proper to_tsquery is called
docsInCorpusCount :: TestMonad Int
docsInCorpusCount = runDBQuery $ do
parentId <- getRootId (UserName userMaster)
corpus <- getCorporaWithParentIdOrFail parentId
selectCountDocs (_node_id corpus)
corpusReturnCount :: TestEnv -> Assertion
corpusReturnCount env = do
runTestMonad env $ do
count <- runDBQuery $ do
parentId <- getRootId (UserName userMaster)
corpus <- getCorporaWithParentIdOrFail parentId
selectCountDocs (_node_id corpus)
liftIO $ count @?= 4
count <- runTestMonad env $ docsInCorpusCount
liftIO $ count @?= 4
......@@ -94,7 +94,7 @@ runTestMonadM :: Show err => env -> TestMonadM env err a -> IO a
runTestMonadM env m = do
res <- flip runReaderT env . runExceptT . _TestMonad $ m
case res of
Left err -> throwIO $ userError (show err)
Left err -> throwIO $ userError ("runTestMonadM: " <> show err)
Right x -> pure x
runTestMonad :: TestEnv -> TestMonadM TestEnv BackendInternalError a -> IO a
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment