Commit 9006d1a9 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

insertMasterDoc return richer type, no ngrams do not abort insertion

After dwelling a bit on the solution space, I concluded that no ngrams
shouldn't result into the document being skipped. The rationale would be
that if, for example, the NLP server is temporarily unavailable, we
should still insert the doc and regenerate the ngrams at a later stage.
parent ec5cdd78
Pipeline #7918 passed with stages
in 79 minutes and 5 seconds
...@@ -745,6 +745,7 @@ common commonTestDependencies ...@@ -745,6 +745,7 @@ common commonTestDependencies
, cryptohash , cryptohash
, directory ^>= 1.3.7.1 , directory ^>= 1.3.7.1
, epo-api-client , epo-api-client
, exceptions
, fast-logger ^>= 3.2.2 , fast-logger ^>= 3.2.2
, filepath ^>= 1.4.2.2 , filepath ^>= 1.4.2.2
, fmt , fmt
......
...@@ -40,6 +40,13 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -40,6 +40,13 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
, flowAnnuaire , flowAnnuaire
, buildSocialList , buildSocialList
, addDocumentsToHyperCorpus , addDocumentsToHyperCorpus
, extractNgramsFromDocument
, insertMasterDoc
, insertMasterDocs
, noUncommittedNgrams
, InsertDocResult(..)
, InsertDocWarning(..)
, DocumentHashId(..)
, reIndexWith , reIndexWith
, ngramsByDoc , ngramsByDoc
...@@ -354,21 +361,10 @@ addDocumentsToHyperCorpus jobHandle mb_hyper la corpusId docs = do ...@@ -354,21 +361,10 @@ addDocumentsToHyperCorpus jobHandle mb_hyper la corpusId docs = do
-- First extract all the ngrams for the input documents via the nlp server, -- First extract all the ngrams for the input documents via the nlp server,
-- log errors (if any) and pass the final result to 'insertMasterDocs'. -- log errors (if any) and pass the final result to 'insertMasterDocs'.
uncommittedNgrams <- extractNgramsFromDocuments nlp la docs uncommittedNgrams <- extractNgramsFromDocuments nlp la docs
(failures, ids) <- runDBTx $ do ids <- insertMasterDocs jobHandle cfg uncommittedNgrams mb_hyper docs
(f,i) <- insertMasterDocs cfg uncommittedNgrams mb_hyper docs runDBTx $ do
void $ Doc.add corpusId (map nodeId2ContextId i) void $ Doc.add corpusId (map nodeId2ContextId ids)
pure (f,i) pure ids
forM_ failures $ \failure -> do
case failure of
NgramsNotFound _mb_hashId docId -> do
let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
emitWarning msg jobHandle
DocumentInsertionError err -> do
-- FIXME(adn) we should give some kind of identifier to help the user
$(logLocM) ERROR $ "Error in document insertion: " <> T.pack (displayException err)
let msg = UnsafeMkHumanFriendlyErrorText $ "Skipping document insertion due to an iternal database error"
emitWarning msg jobHandle
pure ids
------------------------------------------------------------------------ ------------------------------------------------------------------------
createNodes :: ( HasNodeError err createNodes :: ( HasNodeError err
...@@ -495,10 +491,14 @@ newtype UncommittedNgrams doc = UncommittedNgrams ...@@ -495,10 +491,14 @@ newtype UncommittedNgrams doc = UncommittedNgrams
deriving stock Show deriving stock Show
deriving newtype (Semigroup, Monoid) deriving newtype (Semigroup, Monoid)
data InsertDocError data InsertDocResult
= -- | If available, some warnings
InsertDocSucceded !DocId !(Maybe InsertDocWarning)
deriving (Show, Eq)
data InsertDocWarning
= NgramsNotFound !(Maybe DocumentHashId) !DocId = NgramsNotFound !(Maybe DocumentHashId) !DocId
| DocumentInsertionError !SomeException deriving (Show, Eq)
deriving Show
extractNgramsFromDocument :: ( UniqParameters doc extractNgramsFromDocument :: ( UniqParameters doc
, HasText doc , HasText doc
...@@ -528,22 +528,25 @@ extractNgramsFromDocument nlpServer lang doc = ...@@ -528,22 +528,25 @@ extractNgramsFromDocument nlpServer lang doc =
commitNgramsForDocument :: UniqParameters doc commitNgramsForDocument :: UniqParameters doc
=> UncommittedNgrams doc => UncommittedNgrams doc
-> Indexed ContextId (Node doc) -> Indexed ContextId (Node doc)
-> Either InsertDocError CommittedNgrams -> Either InsertDocWarning CommittedNgrams
commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do commitNgramsForDocument (UncommittedNgrams ng) (Indexed cIx node) = do
docId <- mb_docId docId <- mb_docId
case Map.lookup docId ng of case Map.lookup docId ng of
Nothing -> Left $ NgramsNotFound (Just docId) (_node_id node) Nothing -> Left $ NgramsNotFound (Just docId) (contextId2NodeId cIx)
Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs] Just ngs -> Right $ mkNodeIdNgramsMap [reIndex ngs]
where where
mb_docId = case DocumentHashId <$> _node_hash_id node of mb_docId = case DocumentHashId <$> _node_hash_id node of
Nothing -> Left $ NgramsNotFound Nothing (_node_id node) Nothing -> Left $ NgramsNotFound Nothing (contextId2NodeId cIx)
Just dId -> Right dId Just dId -> Right dId
reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams reIndex :: DocumentIdWithNgrams DocumentHashId doc ExtractedNgrams
-> DocumentIdWithNgrams NodeId doc ExtractedNgrams -> DocumentIdWithNgrams NodeId doc ExtractedNgrams
reIndex did = reIndex did =
let (Indexed _ a) = documentWithId did let (Indexed _ a) = documentWithId did
in did { documentWithId = Indexed (contextId2NodeId oldIx) a } in did { documentWithId = Indexed (contextId2NodeId cIx) a }
noUncommittedNgrams :: UncommittedNgrams docs
noUncommittedNgrams = UncommittedNgrams mempty
extractNgramsFromDocuments :: forall doc env err m. extractNgramsFromDocuments :: forall doc env err m.
( HasText doc ( HasText doc
...@@ -570,9 +573,10 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err ...@@ -570,9 +573,10 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
, FlowCorpus doc , FlowCorpus doc
, MkCorpus c , MkCorpus c
, IsDBCmd env err m , IsDBCmd env err m
, MonadCatch m , MonadJobStatus m
) )
=> GargConfig => JobHandle m
-> GargConfig
-> UncommittedNgrams doc -> UncommittedNgrams doc
-- ^ The ngrams extracted for /all/ the documents -- ^ The ngrams extracted for /all/ the documents
-- and indexed by the hash of the given document. -- and indexed by the hash of the given document.
...@@ -580,20 +584,22 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err ...@@ -580,20 +584,22 @@ insertMasterDocs :: forall env err doc c m. ( HasNodeError err
-- with the node being created. -- with the node being created.
-> Maybe c -> Maybe c
-> [doc] -> [doc]
-> m ([InsertDocError], [DocId]) -> m [DocId]
insertMasterDocs cfg uncommittedNgrams c docs = insertMasterDocs jobHandle cfg uncommittedNgrams c docs =
bimap reverse reverse <$> foldlM go (mempty, mempty) docs reverse <$> foldlM go mempty docs
where where
go :: ([InsertDocError], [DocId]) -> doc -> m ([InsertDocError], [DocId]) go :: [DocId] -> doc -> m [DocId]
go (!errs, !documents) doc = do go !documents doc = do
res <- try $ runDBTx (insertMasterDoc cfg uncommittedNgrams c doc) res <- runDBTx (insertMasterDoc cfg uncommittedNgrams c doc)
case res of case res of
Left err InsertDocSucceded d Nothing ->
-> pure (DocumentInsertionError err : errs, documents) pure (d : documents)
Right (Left err) InsertDocSucceded d (Just warning) -> do
-> pure (err : errs, documents) case warning of
Right (Right d) NgramsNotFound _mb_hashId docId -> do
-> pure (errs, d : documents) let msg = UnsafeMkHumanFriendlyErrorText $ "Couldn't find the associated ngrams for input document " <> show docId <> ", therefore the added document won't have any ngrams."
emitWarning msg jobHandle
pure (d : documents)
insertMasterDoc :: ( HasNodeError err insertMasterDoc :: ( HasNodeError err
, UniqParameters doc , UniqParameters doc
...@@ -608,7 +614,7 @@ insertMasterDoc :: ( HasNodeError err ...@@ -608,7 +614,7 @@ insertMasterDoc :: ( HasNodeError err
-- with the node being created. -- with the node being created.
-> Maybe c -> Maybe c
-> doc -> doc
-> DBUpdate err (Either InsertDocError DocId) -> DBUpdate err InsertDocResult
insertMasterDoc cfg uncommittedNgrams c h = do insertMasterDoc cfg uncommittedNgrams c h = do
(masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster c
documentWithId <- insertDoc masterUserId masterCorpusId (toNode masterUserId Nothing h) documentWithId <- insertDoc masterUserId masterCorpusId (toNode masterUserId Nothing h)
...@@ -620,11 +626,11 @@ insertMasterDoc cfg uncommittedNgrams c h = do ...@@ -620,11 +626,11 @@ insertMasterDoc cfg uncommittedNgrams c h = do
-- this will enable global database monitoring -- this will enable global database monitoring
case commitNgramsForDocument uncommittedNgrams documentWithId of case commitNgramsForDocument uncommittedNgrams documentWithId of
Left failed -> pure $ Left failed Left failed -> pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) (Just failed)
Right ngramsDocsMap -> do Right ngramsDocsMap -> do
lId <- getOrMkList masterCorpusId masterUserId lId <- getOrMkList masterCorpusId masterUserId
_ <- saveDocNgramsWith lId ngramsDocsMap _ <- saveDocNgramsWith lId ngramsDocsMap
pure $ Right (contextId2NodeId $ _index documentWithId) pure $ InsertDocSucceded (contextId2NodeId $ _index documentWithId) Nothing
saveDocNgramsWith :: ListId saveDocNgramsWith :: ListId
......
...@@ -66,6 +66,9 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx ...@@ -66,6 +66,9 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
it "Can add language to Corpus" corpusAddLanguage it "Can add language to Corpus" corpusAddLanguage
describe "With test documents" $ beforeWith addCorpusDocuments $ do describe "With test documents" $ beforeWith addCorpusDocuments $ do
it "Can add documents to a Corpus" corpusAddDocuments it "Can add documents to a Corpus" corpusAddDocuments
it "Correctly return an error if uncommitted ngrams map is empty" corpusAddDocNgramsEmpty
it "Correctly return the same Id if a document is inserted twice" corpusAddDocAlreadyExisting
it "Can survive failures in a batch" corpusAddDocBatchSurvival
describe "Corpus search" $ do describe "Corpus search" $ do
it "Can stem query terms" stemmingTest it "Can stem query terms" stemmingTest
it "Can return the number of docs in a corpus" corpusReturnCount it "Can return the number of docs in a corpus" corpusReturnCount
...@@ -81,7 +84,7 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx ...@@ -81,7 +84,7 @@ tests = parallel $ around withTestDB $ beforeWith (\ctx -> setupEnvironment ctx
it "Publishes the root and its first level children" testPublishRecursiveFirstLevel it "Publishes the root and its first level children" testPublishRecursiveFirstLevel
it "Publishes the root and its recursive children" testPublishRecursiveNLevel it "Publishes the root and its recursive children" testPublishRecursiveNLevel
it "Publishes in a lenient way but it's still considered read-only" testPublishLenientWorks it "Publishes in a lenient way but it's still considered read-only" testPublishLenientWorks
nodeStoryTests :: Spec nodeStoryTests :: Spec
nodeStoryTests = sequential $ nodeStoryTests = sequential $
-- run 'withTestDB' before _every_ test item -- run 'withTestDB' before _every_ test item
......
...@@ -9,14 +9,19 @@ Portability : POSIX ...@@ -9,14 +9,19 @@ Portability : POSIX
-} -}
{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Test.Database.Operations.DocumentSearch where module Test.Database.Operations.DocumentSearch where
import Control.Lens
import Control.Monad.Reader import Control.Monad.Reader
import Data.Aeson.QQ.Simple import Data.Aeson.QQ.Simple
import Data.Aeson.Types import Data.Aeson.Types
import Data.Maybe (fromJust)
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Config
import Gargantext.Core.NLP (HasNLPServer(..))
import Gargantext.Core.Text.Corpus.Query qualified as API import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem import Gargantext.Core.Text.Terms.Mono.Stem
import Gargantext.Core.Types import Gargantext.Core.Types
...@@ -30,15 +35,15 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document ...@@ -30,15 +35,15 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
import Gargantext.Database.Query.Facet import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Table.Node import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Error (HasNodeError) import Gargantext.Database.Query.Table.Node.Error
import Gargantext.Database.Query.Table.Node.Error (nodeErrorWith)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Query.Table.NodeContext (selectCountDocs) import Gargantext.Database.Query.Table.NodeContext (selectCountDocs)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Schema.Node (NodePoly(..)) import Gargantext.Database.Schema.Node (NodePoly(..))
import Network.URI (parseURI)
import Prelude import Prelude
import Test.Database.Types import Test.Database.Types
import Test.Hspec.Expectations
import Test.HUnit hiding (Node) import Test.HUnit hiding (Node)
import Test.Hspec.Expectations
exampleDocument_01 :: HyperdataDocument exampleDocument_01 :: HyperdataDocument
...@@ -114,6 +119,31 @@ exampleDocument_04 = either error id $ parseEither parseJSON $ [aesonQQ| ...@@ -114,6 +119,31 @@ exampleDocument_04 = either error id $ parseEither parseJSON $ [aesonQQ|
} }
|] |]
exampleDocument_05 :: HyperdataDocument
exampleDocument_05 = either error id $ parseEither parseJSON $ [aesonQQ|
{
"bdd": "Arxiv"
, "doi": ""
, "url": "http://arxiv.org/pdf/1407.5670v1"
, "title": "GQL for Dummies"
, "source": ""
, "authors": "Ennio Annio"
, "abstract": " This article provides an introduction to GraphQL" , "institutes": ""
, "language_iso2": "EN"
, "publication_date": "2014-07-21T21:20:31Z"
, "publication_year": 2014
}
|]
exampleDocument_06 :: HyperdataDocument
exampleDocument_06 = exampleDocument_05 { _hd_title = Just "GQL Part 1" }
exampleDocument_07 :: HyperdataDocument
exampleDocument_07 = exampleDocument_05 { _hd_title = Just "GQL Part 2" }
exampleDocument_08 :: HyperdataDocument
exampleDocument_08 = exampleDocument_05 { _hd_title = Just "GQL Part 3" }
getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus) getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus)
getCorporaWithParentIdOrFail parentId = do getCorporaWithParentIdOrFail parentId = do
xs <- getCorporaWithParentId parentId xs <- getCorporaWithParentId parentId
...@@ -131,13 +161,74 @@ addCorpusDocuments env = runTestMonad env $ do ...@@ -131,13 +161,74 @@ addCorpusDocuments env = runTestMonad env $ do
let lang = EN let lang = EN
let docs = [exampleDocument_01, exampleDocument_02, exampleDocument_03, exampleDocument_04] let docs = [exampleDocument_01, exampleDocument_02, exampleDocument_03, exampleDocument_04]
_ <- addDocumentsToHyperCorpus (test_job_handle env) ids <- addDocumentsToHyperCorpus (test_job_handle env)
(Just $ _node_hyperdata $ corpus) (Just $ _node_hyperdata $ corpus)
(Multi lang) (Multi lang)
corpusId corpusId
docs docs
liftIO $ ids `shouldBe` [ UnsafeMkNodeId 1, UnsafeMkNodeId 2, UnsafeMkNodeId 3, UnsafeMkNodeId 4 ]
pure env pure env
corpusAddDocNgramsEmpty :: TestEnv -> Assertion
corpusAddDocNgramsEmpty env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let doc = exampleDocument_05
res <- runDBTx $ insertMasterDoc cfg noUncommittedNgrams (Just $ _node_hyperdata corpus) doc
let wrn = NgramsNotFound (Just (DocumentHashId "\\x0b41caa65782b9570530910d1942a21bb1859386ee4c976e7b4cd0624ee966f3")) (UnsafeMkNodeId 5)
liftIO $ res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) (Just wrn)
corpusAddDocAlreadyExisting :: TestEnv -> Assertion
corpusAddDocAlreadyExisting env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let doc = exampleDocument_05
let la = Mono EN
-- Let's use a bogus NLP server port
let nlp = NLPServerConfig CoreNLP (fromJust $ parseURI "http://localhost:9999")
uncommittedNgrams <- extractNgramsFromDocument nlp la doc
oldCount <- docsInCorpusCount
res <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
interimCount <- docsInCorpusCount
liftIO $ do
interimCount `shouldBe` oldCount + 1
res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
-- The second time the node gets cached and returned.
res1 <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
newCount <- docsInCorpusCount
liftIO $ do
newCount `shouldBe` interimCount
res1 `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
-- | We test that if we add 3 documents, out of which the middle one doesn't have any keyed ngrams
-- the other two are still persisted.
corpusAddDocBatchSurvival :: TestEnv -> Assertion
corpusAddDocBatchSurvival env = runTestMonad env $ do
cfg <- view hasConfig
parentId <- runDBQuery $ getRootId (UserName userMaster)
[corpus] <- runDBQuery $ getCorporaWithParentId parentId
let la = Mono EN
nlp <- view (nlpServerGet (_tt_lang la))
oldCount <- docsInCorpusCount
let docs = [exampleDocument_06, exampleDocument_07, exampleDocument_08 ]
uNgrams1 <- extractNgramsFromDocument nlp la exampleDocument_06
uNgrams2 <- extractNgramsFromDocument nlp la exampleDocument_08
savedDocs <- insertMasterDocs (test_job_handle env) cfg (uNgrams1 <> uNgrams2) (Just $ _node_hyperdata corpus) docs
c' <- docsInCorpusCount
liftIO $ do
-- failures `shouldBe` [DocumentInsertionError "InternalNodeError Cannot make node due to: Couldn't find the associated ngrams for document with hash \\x666b8e7bfd7c0af37d630e1097791f7ba438a669ecb6d1cb38014edd0b7a2977, therefore the added document won't have any ngrams."]
c' `shouldBe` oldCount + 3
savedDocs `shouldBe` [ UnsafeMkNodeId 5, UnsafeMkNodeId 6, UnsafeMkNodeId 7 ]
corpusAddDocuments :: TestEnv -> Assertion corpusAddDocuments :: TestEnv -> Assertion
corpusAddDocuments env = runTestMonad env $ do corpusAddDocuments env = runTestMonad env $ do
...@@ -247,11 +338,13 @@ corpusSearchDB01 env = do ...@@ -247,11 +338,13 @@ corpusSearchDB01 env = do
liftIO $ do liftIO $ do
length results `shouldBe` 0 -- doesn't exist, we just check that proper to_tsquery is called length results `shouldBe` 0 -- doesn't exist, we just check that proper to_tsquery is called
docsInCorpusCount :: TestMonad Int
docsInCorpusCount = runDBQuery $ do
parentId <- getRootId (UserName userMaster)
corpus <- getCorporaWithParentIdOrFail parentId
selectCountDocs (_node_id corpus)
corpusReturnCount :: TestEnv -> Assertion corpusReturnCount :: TestEnv -> Assertion
corpusReturnCount env = do corpusReturnCount env = do
runTestMonad env $ do count <- runTestMonad env $ docsInCorpusCount
count <- runDBQuery $ do liftIO $ count @?= 4
parentId <- getRootId (UserName userMaster)
corpus <- getCorporaWithParentIdOrFail parentId
selectCountDocs (_node_id corpus)
liftIO $ count @?= 4
...@@ -94,7 +94,7 @@ runTestMonadM :: Show err => env -> TestMonadM env err a -> IO a ...@@ -94,7 +94,7 @@ runTestMonadM :: Show err => env -> TestMonadM env err a -> IO a
runTestMonadM env m = do runTestMonadM env m = do
res <- flip runReaderT env . runExceptT . _TestMonad $ m res <- flip runReaderT env . runExceptT . _TestMonad $ m
case res of case res of
Left err -> throwIO $ userError (show err) Left err -> throwIO $ userError ("runTestMonadM: " <> show err)
Right x -> pure x Right x -> pure x
runTestMonad :: TestEnv -> TestMonadM TestEnv BackendInternalError a -> IO a runTestMonad :: TestEnv -> TestMonadM TestEnv BackendInternalError a -> IO a
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment