Verified Commit e7735135 authored by Przemyslaw Kaminski

Merge branch 'dev' into 477-dev-flow-zip-file-upload-2

parents 25123b89 d362b468
......@@ -644,7 +644,6 @@ library
, transformers-base ^>= 0.4.6
, tree-diff
, tuple ^>= 0.3.0.2
, unbounded-delays >= 0.1.1 && < 0.2
, unicode-collation >= 0.1.3.5
, unordered-containers ^>= 0.2.16.0
-- needed for Worker / System.Posix.Signals
......
......@@ -15,7 +15,6 @@ Portability : POSIX
{-# LANGUAGE KindSignatures #-} -- for use of Endpoint (name :: Symbol)
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE PartialTypeSignatures #-} -- to automatically use suggested type hole signatures during compilation
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.GraphQL where
......
......@@ -22,8 +22,7 @@ import Data.Morpheus.Types
, ResolverM
, QUERY
)
import Data.Text (pack, unpack)
import Data.Text qualified as Text
import Data.Text (pack)
import Data.Time.Format.ISO8601 (iso8601Show)
import Gargantext.API.Admin.Auth.Types ( AuthenticatedUser )
import Gargantext.API.Auth.PolicyCheck ( nodeWriteChecks, AccessPolicyManager )
......@@ -97,7 +96,7 @@ data ContextsForNgramsArgs
= ContextsForNgramsArgs
{ corpus_id :: Int
, ngrams_terms :: [Text]
, and_logic :: Text
, and_logic :: Bool
} deriving (Generic, GQLType)
data NodeContextCategoryMArgs = NodeContextCategoryMArgs
......@@ -153,9 +152,10 @@ dbNodeContext context_id node_id = do
-- | Returns a list of `ContextGQL` for the given ngrams in the given corpus id.
dbContextForNgrams
:: (IsDBEnvExtra env)
=> Int -> [Text] -> Text -> GqlM e env [ContextGQL]
=> Int -> [Text] -> Bool -> GqlM e env [ContextGQL]
dbContextForNgrams node_id ngrams_terms and_logic = do
contextsForNgramsTerms <- lift $ runDBQuery $ getContextsForNgramsTerms (UnsafeMkNodeId node_id) ngrams_terms ( readMaybe $ unpack $ Text.toTitle and_logic )
contextsForNgramsTerms <- lift $ runDBQuery $
getContextsForNgramsTerms (UnsafeMkNodeId node_id) ngrams_terms and_logic
--lift $ printDebug "[dbContextForNgrams] contextsForNgramsTerms" contextsForNgramsTerms
pure $ toContextGQL <$> contextsForNgramsTerms
......
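The `and_logic` argument is switched from `Text` to `Bool`, so the resolver no longer has to re-parse the flag with `readMaybe . unpack . Text.toTitle`; the GraphQL layer can reject non-Boolean input at the API boundary instead. A minimal standalone sketch of why the old round-trip was fragile (illustrative code only, not from this repository):

```haskell
import qualified Data.Text as Text
import Data.Text (Text, unpack)
import Text.Read (readMaybe)

-- Old-style parsing of the textual "and_logic" flag.
parseAndLogic :: Text -> Maybe Bool
parseAndLogic = readMaybe . unpack . Text.toTitle

main :: IO ()
main = do
  print (parseAndLogic "true")  -- Just True ("true" is title-cased to "True")
  print (parseAndLogic "AND")   -- Nothing: the flag is silently dropped
```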
......@@ -116,11 +116,11 @@ data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
instance Hashable ExtractedNgrams
-- | A typeclass that represents extracting ngrams from an entity.
class ExtractNgrams h where
class Monad m => ExtractNgrams m h where
extractNgrams :: NLPServerConfig
-> TermType Lang
-> h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
-> m (HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
......
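`ExtractNgrams` becomes a multi-parameter class: the monad in which extraction runs is now a class parameter constrained by `Monad m`, instead of being fixed to `DBCmd err`. A toy, self-contained sketch of the same pattern (made-up types, not the Gargantext ones):

```haskell
{-# LANGUAGE MultiParamTypeClasses #-}

import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map

newtype Doc = Doc String

-- The extraction monad is a class parameter rather than a fixed DB monad.
class Monad m => Extract m h where
  extract :: h -> m (Map String Int)

-- An instance may pick IO (e.g. to call an external NLP service) ...
instance Extract IO Doc where
  extract (Doc txt) = pure (Map.fromListWith (+) [(w, 1) | w <- words txt])

-- ... or a pure monad, which is handy for tests.
instance Extract Maybe Doc where
  extract (Doc txt) = Just (Map.fromListWith (+) [(w, 1) | w <- words txt])

main :: IO ()
main = extract (Doc "a b a") >>= print  -- fromList [("a",2),("b",1)]
```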
......@@ -11,9 +11,19 @@ Multi-terms are ngrams where n > 1.
-}
module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi (
multiterms
, Terms(..)
, MultitermsExtractionException(..)
, tokenTag2terms
, multiterms_rake
, tokenTagsWith
, tokenTags
, cleanTextForNLP
)
where
import Control.Exception.Safe qualified as Safe
import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
......@@ -25,14 +35,23 @@ import Gargantext.Core.Types ( POS(NP), Terms(Terms), TermsWithCount, TokenTag(T
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude
import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
import Network.HTTP.Client
import Replace.Attoparsec.Text as RAT ( streamEdit )
-------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences
data MultitermsExtractionException
= MEE_nlp_server_http_exception !NLPServerConfig !HttpException
deriving Show
instance Exception MultitermsExtractionException
-------------------------------------------------------------------
-- | Extracts the terms from the input 'txt'. Throws a
-- 'MultitermsExtractionException' in case we fail.
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do
multiterms nsc l txt = handle (\ex -> Safe.throwIO $ MEE_nlp_server_http_exception nsc ex) $ do
let txt' = cleanTextForNLP txt
if txt' == ""
then do
......
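`multiterms` is now wrapped in `handle`, so an `HttpException` raised while talking to the NLP server is rethrown as the new `MultitermsExtractionException`, which also records the `NLPServerConfig` for diagnostics. A self-contained sketch of this wrap-and-rethrow pattern with `safe-exceptions` (the config and exception types below are stand-ins; the real code wraps `Network.HTTP.Client`'s `HttpException`):

```haskell
{-# LANGUAGE ScopedTypeVariables #-}

import Control.Exception (IOException)
import Control.Exception.Safe (Exception, handle, throwIO, try)

newtype ServerConfig = ServerConfig { serverUrl :: String } deriving Show

-- Domain exception that remembers which server we were talking to.
data ExtractionException = ExtractionFailed ServerConfig IOException
  deriving Show
instance Exception ExtractionException

-- Pretend NLP call that always fails, to exercise the handler.
callNlpServer :: ServerConfig -> String -> IO [String]
callNlpServer _ _ = throwIO (userError "connection refused")

-- Rethrow the low-level failure as the richer domain exception.
extractTerms :: ServerConfig -> String -> IO [String]
extractTerms cfg txt =
  handle (\(e :: IOException) -> throwIO (ExtractionFailed cfg e)) $
    callNlpServer cfg txt

main :: IO ()
main = do
  r <- try (extractTerms (ServerConfig "http://localhost:3000") "some text")
  case r of
    Left (ExtractionFailed cfg e) ->
      putStrLn ("extraction failed for " <> serverUrl cfg <> ": " <> show e)
    Right terms -> print terms
```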
......@@ -54,11 +54,10 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
)
where
import Async.Worker qualified as W
import Conduit
import Control.Concurrent.Timeout qualified as Timeout
import Control.Exception.Safe qualified as CES
import Control.Lens ( to, view )
import Control.Exception.Safe (catch, MonadCatch)
import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources)
import Data.Conduit.List qualified as CL
......@@ -118,6 +117,7 @@ import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..) )
------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Error ( HasTreeError )
import Gargantext.Core.Text.Terms.Multi (MultitermsExtractionException)
------------------------------------------------------------------------
......@@ -258,9 +258,11 @@ flowCorpus :: ( IsDBCmd env err m
, HasTreeError err
, HasValidationError err
, FlowCorpus a
, ExtractNgrams m a
, MonadJobStatus m
, CES.MonadCatch m
, HasCentralExchangeNotification env, Show a )
, MonadCatch m
, HasCentralExchangeNotification env
)
=> MkCorpusUser
-> TermType Lang
-> Maybe FlowSocialListWith
......@@ -278,11 +280,11 @@ flow :: forall env err m a c.
, HasTreeError err
, HasValidationError err
, FlowCorpus a
, ExtractNgrams m a
, MkCorpus c
, MonadJobStatus m
, HasCentralExchangeNotification env
, CES.MonadCatch m
, Show a
, MonadCatch m
)
=> Maybe c
-> MkCorpusUser
......@@ -320,10 +322,10 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
, HasNodeError err
, HasNLPServer env
, FlowCorpus document
, ExtractNgrams m document
, MkCorpus corpus
, MonadLogger m
, CES.MonadCatch m
, Show document
, MonadCatch m
)
=> Maybe corpus
-> TermType Lang
......@@ -475,7 +477,7 @@ data InsertDocError
extractNgramsFromDocument :: ( UniqParameters doc
, HasText doc
, ExtractNgrams doc
, ExtractNgrams m doc
, IsDBCmd err env m
, MonadLogger m
, CES.MonadCatch m
......@@ -491,13 +493,9 @@ extractNgramsFromDocument nlpServer lang doc =
-- will still be added to the corpus and we can try to regen the ngrams at a later stage.
UncommittedNgrams . Map.singleton docId <$>
(documentIdWithNgrams (extractNgrams nlpServer $ withLang lang [doc]) (Indexed docId doc)
`CES.catches`
[ CES.Handler $ \(e :: Timeout.Timeout) -> CES.throw e
, CES.Handler $ \(e :: W.KillWorkerSafely) -> CES.throw e
, CES.Handler $ \(e :: CES.SomeException) -> do
$(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
]
`catch` \(e :: MultitermsExtractionException) -> do
$(logLocM) ERROR $ T.pack $ "Document with hash " <> show docId <> " failed ngrams extraction due to an exception: " <> displayException e
pure $ DocumentIdWithNgrams (Indexed docId doc) mempty
)
where
docId = DocumentHashId $ newUniqIdHash doc
......@@ -525,7 +523,7 @@ commitNgramsForDocument (UncommittedNgrams ng) (Indexed oldIx node) = do
extractNgramsFromDocuments :: forall doc env err m.
( HasText doc
, UniqParameters doc
, ExtractNgrams doc
, ExtractNgrams m doc
, IsDBCmd env err m
, MonadLogger m
, CES.MonadCatch m
......@@ -553,7 +551,7 @@ commitNgramsForDocuments ng nodes =
insertMasterDocs :: ( HasNodeError err
, UniqParameters doc
, FlowCorpus doc
, MkCorpus c, Show doc
, MkCorpus c
)
=> GargConfig
-> UncommittedNgrams doc
......
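In `extractNgramsFromDocument`, the previous `CES.catches` with three handlers (two of which only rethrew `Timeout` and `KillWorkerSafely`) is collapsed into a single `catch` on `MultitermsExtractionException`; any other exception now propagates untouched by default. A small sketch of this narrowed recovery (hypothetical document and exception types):

```haskell
import Control.Exception.Safe (Exception, catch, throwIO)

-- Stand-in for the domain exception that extraction can raise.
newtype ExtractionError = ExtractionError String deriving Show
instance Exception ExtractionError

extract :: String -> IO [String]
extract "bad document" = throwIO (ExtractionError "NLP server unreachable")
extract doc            = pure (words doc)

-- Recover only from ExtractionError: the document is kept with empty ngrams
-- (they can be regenerated later). Any other exception still aborts the job.
extractOrEmpty :: String -> IO [String]
extractOrEmpty doc =
  extract doc `catch` \(ExtractionError msg) -> do
    putStrLn ("ngrams extraction failed: " <> msg)
    pure []

main :: IO ()
main = do
  extractOrEmpty "a good document" >>= print  -- ["a","good","document"]
  extractOrEmpty "bad document"    >>= print  -- []
```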
......@@ -13,6 +13,7 @@ Portability : POSIX
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE UndecidableInstances #-}
module Gargantext.Database.Action.Flow.Extract
......@@ -30,7 +31,6 @@ import Gargantext.Core.Types (POS(NP), TermsCount, TermsWeight)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact, cw_lastName, hc_who )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_authors, hd_bdd, hd_institutes, hd_source )
import Gargantext.Database.Admin.Types.Node ( Node )
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag)
import Gargantext.Database.Schema.Ngrams ( text2ngrams )
import Gargantext.Database.Schema.Node (NodePoly(..))
......@@ -39,7 +39,7 @@ import Gargantext.Prelude
------------------------------------------------------------------------
instance ExtractNgrams HyperdataContact where
instance Monad m => ExtractNgrams m HyperdataContact where
extractNgrams _ncs _l = pure . HashMap.mapKeys (cleanExtractedNgrams 255) . extract
where
extract :: HyperdataContact
......@@ -55,15 +55,15 @@ instance ExtractNgrams HyperdataContact where
-- | Main ngrams extraction functionality.
-- For NgramsTerms, this calls the NLP server under the hood.
-- For Sources, Institutes, Authors, this uses a simple split on " ".
instance ExtractNgrams HyperdataDocument where
instance (Monad m, MonadBase IO m) => ExtractNgrams m HyperdataDocument where
extractNgrams :: NLPServerConfig
-> TermType Lang
-> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
-> m (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgrams ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where
extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
-> m (HashMap.HashMap ExtractedNgrams (Map NgramsType TermsWeight, TermsCount))
extractNgramsT' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......@@ -87,7 +87,7 @@ instance ExtractNgrams HyperdataDocument where
<> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgrams a, HasText a) => ExtractNgrams (Node a)
instance (ExtractNgrams m a, HasText a) => ExtractNgrams m (Node a)
where
extractNgrams ncs l (Node { _node_hyperdata = h }) = extractNgrams ncs l h
......
......@@ -25,7 +25,6 @@ import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text ( HasText )
import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms ( ExtractNgrams )
import Gargantext.Core.Types (HasValidationError, TermsCount, TermsWeight)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
......@@ -51,11 +50,10 @@ type FlowCmdM env err m =
)
type FlowCorpus a = ( UniqParameters a
, InsertDb a
, ExtractNgrams a
, HasText a
, ToNode a
, ToJSON a
, InsertDb a
, HasText a
, ToNode a
, ToJSON a
)
type FlowInsertDB a = ( AddUniqId a
......
......@@ -151,9 +151,9 @@ data ContextForNgramsTerms =
getContextsForNgramsTerms :: HasNodeError err
=> NodeId
-> [Text]
-> Maybe Bool
-> Bool
-> DBQuery err x [ContextForNgramsTerms]
getContextsForNgramsTerms cId ngramsTerms (Just True) = do
getContextsForNgramsTerms cId ngramsTerms True = do
let terms_length = length ngramsTerms
res <- mkPGQuery query (cId, PGS.In ngramsTerms, terms_length)
pure $ (\( _cfnt_nodeId
......
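`getContextsForNgramsTerms` now takes a plain `Bool` instead of the `Maybe Bool` that fell out of the old text parsing; the `True` branch keeps the query that requires every requested term to match. A purely illustrative sketch of what the flag presumably selects (simplified in-memory matching, not the SQL actually used):

```haskell
import Data.List (isInfixOf)

-- and_logic = True: a context must contain all requested terms;
-- and_logic = False: containing any one of them is enough.
matchesTerms :: Bool -> [String] -> String -> Bool
matchesTerms andLogic terms context
  | andLogic  = all (`isInfixOf` context) terms
  | otherwise = any (`isInfixOf` context) terms

main :: IO ()
main = do
  print (matchesTerms True  ["cat", "dog"] "the cat chased the dog")  -- True
  print (matchesTerms True  ["cat", "dog"] "just a cat")              -- False
  print (matchesTerms False ["cat", "dog"] "just a cat")              -- True
```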