Commit d362b468 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Merge branch 'adinapoli/issue-481' into 'dev'

Have `extractNgramsFromDocument` catch the right exception in case extraction fails

Closes #481

See merge request !419
parents 39b1d756 09114e70
Pipeline #7690 passed with stages
in 44 minutes and 16 seconds
...@@ -116,11 +116,11 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams } ...@@ -116,11 +116,11 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity. -- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgrams h where class Monad m => ExtractNgrams m h where
extractNgrams :: NLPServerConfig extractNgrams :: NLPServerConfig
-> TermType Lang -> TermType Lang
-> h -> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount)) -> m (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
------------------------------------------------------------------------ ------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
......
...@@ -11,9 +11,19 @@ Multi-terms are ngrams where n > 1. ...@@ -11,9 +11,19 @@ Multi-terms are ngrams where n > 1.
-} -}
module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP) module Gargantext.Core.Text.Terms.Multi (
multiterms
, Terms(..)
, MultitermsExtractionException(..)
, tokenTag2terms
, multiterms_rake
, tokenTagsWith
, tokenTags
, cleanTextForNLP
)
where where
import Control.Exception.Safe qualified as Safe
import Data.Attoparsec.Text as DAT (space, notChar, string ) import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
...@@ -25,14 +35,23 @@ import Gargantext.Core.Types ( POS(NP), Terms(Terms), TermsWithCount, TokenTag(T ...@@ -25,14 +35,23 @@ import Gargantext.Core.Types ( POS(NP), Terms(Terms), TermsWithCount, TokenTag(T
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.SpacyNLP qualified as SpacyNLP import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
import Network.HTTP.Client
import Replace.Attoparsec.Text as RAT ( streamEdit ) import Replace.Attoparsec.Text as RAT ( streamEdit )
------------------------------------------------------------------- -------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences type NLP_API = Lang -> Text -> IO PosSentences
data MultitermsExtractionException
= MEE_nlp_server_http_exception !NLPServerConfig !HttpException
deriving Show
instance Exception MultitermsExtractionException
------------------------------------------------------------------- -------------------------------------------------------------------
-- | Extracts the terms from the input 'txt'. Throws a
-- 'MultitermExtractionException' in case we fail.
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount] multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do multiterms nsc l txt = handle (\ex -> Safe.throwIO $ MEE_nlp_server_http_exception nsc ex) $ do
let txt' = cleanTextForNLP txt let txt' = cleanTextForNLP txt
if txt' == "" if txt' == ""
then do then do
......
...@@ -56,7 +56,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -56,7 +56,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
import Conduit import Conduit
import Control.Lens ( to, view ) import Control.Lens ( to, view )
import Control.Monad.Catch import Control.Exception.Safe (catch, MonadCatch)
import Data.Conduit qualified as C import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as CL import Data.Conduit.List qualified as CL
...@@ -116,6 +116,7 @@ import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) ) ...@@ -116,6 +116,7 @@ import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) )
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Imports for upgrade function -- Imports for upgrade function
import Gargantext.Database.Query.Tree.Error ( HasTreeError ) import Gargantext.Database.Query.Tree.Error ( HasTreeError )
import Gargantext.Core.Text.Terms.Multi (MultitermsExtractionException)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -256,9 +257,11 @@ flowCorpus :: ( IsDBCmd env err m ...@@ -256,9 +257,11 @@ flowCorpus :: ( IsDBCmd env err m
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, FlowCorpus a , FlowCorpus a
, ExtractNgrams m a
, MonadJobStatus m , MonadJobStatus m
, MonadCatch m , MonadCatch m
, HasCentralExchangeNotification env, Show a ) , HasCentralExchangeNotification env
)
=> MkCorpusUser => MkCorpusUser
-> TermType Lang -> TermType Lang
-> Maybe FlowSocialListWith -> Maybe FlowSocialListWith
...@@ -276,10 +279,11 @@ flow :: forall env err m a c. ...@@ -276,10 +279,11 @@ flow :: forall env err m a c.
, HasTreeError err , HasTreeError err
, HasValidationError err , HasValidationError err
, FlowCorpus a , FlowCorpus a
, ExtractNgrams m a
, MkCorpus c , MkCorpus c
, MonadJobStatus m , MonadJobStatus m
, HasCentralExchangeNotification env , HasCentralExchangeNotification env
, MonadCatch m, Show a , MonadCatch m
) )
=> Maybe c => Maybe c
-> MkCorpusUser -> MkCorpusUser
...@@ -317,9 +321,10 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m ...@@ -317,9 +321,10 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
, HasNodeError err , HasNodeError err
, HasNLPServer env , HasNLPServer env
, FlowCorpus document , FlowCorpus document
, ExtractNgrams m document
, MkCorpus corpus , MkCorpus corpus
, MonadLogger m , MonadLogger m
, MonadCatch m, Show document , MonadCatch m
) )
=> Maybe corpus => Maybe corpus
-> TermType Lang -> TermType Lang
...@@ -471,7 +476,7 @@ data InsertDocError ...@@ -471,7 +476,7 @@ data InsertDocError
extractNgramsFromDocument :: ( UniqParameters doc extractNgramsFromDocument :: ( UniqParameters doc
, HasText doc , HasText doc
, ExtractNgrams doc , ExtractNgrams m doc
, IsDBCmd err env m , IsDBCmd err env m
, MonadLogger m , MonadLogger m
, MonadCatch m , MonadCatch m
...@@ -487,7 +492,7 @@ extractNgramsFromDocument nlpServer lang doc = ...@@ -487,7 +492,7 @@ extractNgramsFromDocument nlpServer lang doc =
-- will still be added to the corpus and we can try to regen the ngrams at a later stage. -- will still be added to the corpus and we can try to regen the ngrams at a later stage.
UncommittedNgrams . Map.singleton docId <$> UncommittedNgrams . Map.singleton docId <$>
(documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc) (documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
`catch` \(e :: SomeException) -> do `catch` \(e :: MultitermsExtractionException) -> do
$(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e $(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
pure $ DocumentIdWithNgrams (Indexed docId doc) mempty pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
) )
...@@ -517,7 +522,7 @@ commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do ...@@ -517,7 +522,7 @@ commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
extractNgramsFromDocuments :: forall doc env err m. extractNgramsFromDocuments :: forall doc env err m.
( HasText doc ( HasText doc
, UniqParameters doc , UniqParameters doc
, ExtractNgrams doc , ExtractNgrams m doc
, IsDBCmd env err m , IsDBCmd env err m
, MonadLogger m , MonadLogger m
, MonadCatch m , MonadCatch m
...@@ -545,7 +550,7 @@ commitNgramsForDocuments ng nodes = ...@@ -545,7 +550,7 @@ commitNgramsForDocuments ng nodes =
insertMasterDocs :: ( HasNodeError err insertMasterDocs :: ( HasNodeError err
, UniqParameters doc , UniqParameters doc
, FlowCorpus doc , FlowCorpus doc
, MkCorpus c, Show doc , MkCorpus c
) )
=> GargConfig => GargConfig
-> UncommittedNgrams doc -> UncommittedNgrams doc
......
...@@ -13,6 +13,7 @@ Portability : POSIX ...@@ -13,6 +13,7 @@ Portability : POSIX
{-# LANGUAGE InstanceSigs #-} {-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE UndecidableInstances #-}
module Gargantext.Database.Action.Flow.Extract module Gargantext.Database.Action.Flow.Extract
...@@ -30,7 +31,6 @@ import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight) ...@@ -30,7 +31,6 @@ import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who ) import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node ) import Gargantext.Database.Admin.Types.Node ( Node )
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag) import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag)
import Gargantext.Database.Schema.Ngrams ( text2ngrams ) import Gargantext.Database.Schema.Ngrams ( text2ngrams )
import Gargantext.Database.Schema.Node (NodePoly(..)) import Gargantext.Database.Schema.Node (NodePoly(..))
...@@ -39,7 +39,7 @@ import Gargantext.Prelude ...@@ -39,7 +39,7 @@ import Gargantext.Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
instance ExtractNgrams HyperdataContact where instance Monad m => ExtractNgrams m HyperdataContact where
extractNgrams _ncs _l = pure . HashMap.mapKeys (cleanExtractedNgrams 255) . extract extractNgrams _ncs _l = pure . HashMap.mapKeys (cleanExtractedNgrams 255) . extract
where where
extract :: HyperdataContact extract :: HyperdataContact
...@@ -55,15 +55,15 @@ instance ExtractNgrams HyperdataContact where ...@@ -55,15 +55,15 @@ instance ExtractNgrams HyperdataContact where
-- | Main ngrams extraction functionality. -- | Main ngrams extraction functionality.
-- For NgramsTerms, this calls NLP server under the hood. -- For NgramsTerms, this calls NLP server under the hood.
-- For Sources, Institutes, Authors, this uses simple split on " ". -- For Sources, Institutes, Authors, this uses simple split on " ".
instance ExtractNgrams HyperdataDocument where instance (Monad m, MonadBase IO m) => ExtractNgrams m HyperdataDocument where
extractNgrams :: NLPServerConfig extractNgrams :: NLPServerConfig
-> TermType Lang -> TermType Lang
-> HyperdataDocument -> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount)) -> m (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgrams ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd extractNgrams ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where where
extractNgramsT' :: HyperdataDocument extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount)) -> m (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do extractNgramsT' doc = do
let source = text2ngrams let source = text2ngrams
$ maybe "Nothing" identity $ maybe "Nothing" identity
...@@ -87,7 +87,7 @@ instance ExtractNgrams HyperdataDocument where ...@@ -87,7 +87,7 @@ instance ExtractNgrams HyperdataDocument where
<> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ] <> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ] <> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgrams a, HasText a) => ExtractNgrams (Node a) instance (ExtractNgrams m a, HasText a) => ExtractNgrams m (Node a)
where where
extractNgrams ncs l (Node { _node_hyperdata = h }) = extractNgrams ncs l h extractNgrams ncs l (Node { _node_hyperdata = h }) = extractNgrams ncs l h
......
...@@ -25,7 +25,6 @@ import Gargantext.Core.NodeStory.Types ( HasNodeStory ) ...@@ -25,7 +25,6 @@ import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text ( HasText ) import Gargantext.Core.Text ( HasText )
import Gargantext.API.Admin.Orchestrator.Types qualified as API import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms ( ExtractNgrams )
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight) import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
...@@ -51,11 +50,10 @@ type FlowCmdM env err m = ...@@ -51,11 +50,10 @@ type FlowCmdM env err m =
) )
type FlowCorpus a = ( UniqParameters a type FlowCorpus a = ( UniqParameters a
, InsertDb a , InsertDb a
, ExtractNgrams a , HasText a
, HasText a , ToNode a
, ToNode a , ToJSON a
, ToJSON a
) )
type FlowInsertDB a = ( AddUniqId a type FlowInsertDB a = ( AddUniqId a
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment