Commit 03b5f28d authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

WIP - Add Gargantext.Core.Text.Corpus.Query

It's not fully wired in, `convertQuery` for Arxiv is implemented as
`undefined`.
parent b3fb1a16
......@@ -53,6 +53,7 @@ library
Gargantext.Core.Text
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
......@@ -382,6 +383,7 @@ library
, blaze-html
, blaze-markup
, blaze-svg
, boolexpr
, bytestring
, case-insensitive
, cassava
......
......@@ -81,6 +81,7 @@ library:
- Gargantext.Core.Text
- Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.API
- Gargantext.Core.Text.Corpus.Query
- Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
......@@ -166,6 +167,7 @@ library:
- blaze-html
- blaze-markup
- blaze-svg
- boolexpr
- bytestring
- case-insensitive
- cassava
......
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Admin.Orchestrator.Types
where
......@@ -34,8 +34,7 @@ instance Arbitrary a => Arbitrary (JobOutput a) where
-- | Main Types
-- TODO IsidoreAuth
data ExternalAPIs = All
| PubMed
data ExternalAPIs = PubMed
| Arxiv
| HAL
| IsTex
......
......@@ -65,6 +65,7 @@ import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
......@@ -136,7 +137,7 @@ info = ApiInfo API.externalAPIs
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
{ _wq_query :: !Text
{ _wq_query :: !API.RawQuery
, _wq_databases :: !Database
, _wq_datafield :: !(Maybe Datafield)
, _wq_lang :: !Lang
......@@ -182,7 +183,7 @@ addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> WithQuery
-> Maybe Integer
-> Maybe API.Limit
-> JobHandle m
-> m ()
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
......
......@@ -150,7 +150,7 @@ insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) =
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> API.Query
-> API.RawQuery
-> Lang
-> JobHandle m
-> m ()
......@@ -183,7 +183,7 @@ triggerSearxSearch user cId q l jobHandle = do
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
, _fsp_manager = manager
, _fsp_pageno = page
, _fsp_query = q
, _fsp_query = API.getRawQuery q
, _fsp_url = surl }
insertSearxResponse user cId listId l res
......
......@@ -11,7 +11,7 @@ import GHC.Generics (Generic)
import Gargantext.Prelude
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import qualified Gargantext.API.Admin.Orchestrator.Types as Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (DataOrigin(..))
......@@ -28,12 +28,12 @@ instance ToSchema Database where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin Arxiv = ExternalOrigin T.Arxiv
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
database2origin Empty = InternalOrigin Types.IsTex
database2origin PubMed = ExternalOrigin Types.PubMed
database2origin Arxiv = ExternalOrigin Types.Arxiv
database2origin HAL = ExternalOrigin Types.HAL
database2origin IsTex = ExternalOrigin Types.IsTex
database2origin Isidore = ExternalOrigin Types.Isidore
------------------------------------------------------------------------
data Datafield = Gargantext
......
......@@ -290,7 +290,7 @@ addCorpusWithQuery :: User -> ServerT New.AddWithQuery (GargM Env GargError)
addCorpusWithQuery user cid =
serveJobsAPI AddCorpusQueryJob $ \jHandle q -> do
limit <- view $ hasConfig . gc_max_docs_scrapers
New.addToCorpusWithQuery user cid q (Just limit) jHandle
New.addToCorpusWithQuery user cid q (Just $ fromIntegral limit) jHandle
{- let log' x = do
printDebug "addToCorpusWithQuery" x
liftBase $ log x
......
......@@ -11,16 +11,19 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API
( ExternalAPIs(..)
, Query
, Limit
, Corpus.RawQuery(..)
, Corpus.Limit(..)
, GetCorpusError(..)
, get
, externalAPIs
) where
import Conduit
import Control.Lens ((^.))
import Data.Bifunctor
import Data.Either (Either(..))
import Data.Maybe
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
......@@ -31,29 +34,35 @@ import qualified Gargantext.Core.Text.Corpus.API.Hal as HAL
import qualified Gargantext.Core.Text.Corpus.API.Isidore as ISIDORE
import qualified Gargantext.Core.Text.Corpus.API.Istex as ISTEX
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as PUBMED
import qualified Gargantext.Core.Text.Corpus.Query as Corpus
import Servant.Client (ClientError)
data GetCorpusError
= -- | We couldn't parse the user input query into something meaningful.
InvalidInputQuery !Corpus.RawQuery !T.Text
-- | The external service returned an error.
| ExternalAPIError !ClientError
deriving (Show, Eq)
-- | Get External API metadata main function
get :: GargConfig
-> ExternalAPIs
-> Lang
-> Query
-> Maybe Limit
-> Corpus.RawQuery
-> Maybe Corpus.Limit
-- -> IO [HyperdataDocument]
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get cfg PubMed _la q limit = PUBMED.get (cfg ^. gc_pubmed_api_key) q limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
get _ Arxiv la q limit = Arxiv.get la q (fromIntegral <$> limit)
get _ HAL la q limit = HAL.getC la q limit
get _ IsTex la q limit = do
docs <- ISTEX.get la q limit
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
get _ Isidore la q limit = do
docs <- ISIDORE.get la (fromIntegral <$> limit) (Just q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
get _ externalApi _ _ _ = panic $ "[G.C.T.Corpus.API] This options are note taken into account: " <> (cs $ show externalApi)
-- | Some Sugar for the documentation
type Query = PUBMED.Query
type Limit = PUBMED.Limit
-> IO (Either GetCorpusError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get cfg externalAPI la q limit = do
case Corpus.parseQuery q of
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of
PubMed -> first ExternalAPIError <$>
PUBMED.get (cfg ^. gc_pubmed_api_key) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery (Corpus.getLimit <$> limit)
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
......@@ -10,34 +10,41 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans -fno-warn-unused-top-binds #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Corpus.API.Arxiv
where
import Conduit
import Data.Either (Either(..))
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as Text
import Servant.Client (ClientError)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Corpus.Query as Corpus
import qualified Arxiv as Arxiv
import qualified Network.Api.Arxiv as Ax
type Query = Text
type Limit = Arxiv.Limit
-- | Converts a Gargantext's generic boolean query into an Arxiv Query.
convertQuery :: Corpus.Query -> Ax.Query
convertQuery _q = undefined
-- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs
get :: Lang -> Query -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la q _l = do
(cnt, resC) <- Arxiv.apiSimpleC Nothing [Text.unpack q]
pure $ Right (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
get :: Lang
-> Corpus.Query
-> Maybe Arxiv.Limit
-> IO (Maybe Integer, ConduitT () HyperdataDocument IO ())
get la (convertQuery -> query) limit = do
(cnt, resC) <- case limit of
Nothing -> Arxiv.searchAxv' query
(Just l) -> do (cnt, res) <- Arxiv.searchAxv' query
pure (cnt, res .| takeC l)
pure $ (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
toDoc :: Lang -> Arxiv.Result -> HyperdataDocument
toDoc l (Arxiv.Result { abstract
......
......@@ -27,14 +27,14 @@ import qualified HAL as HAL
import qualified HAL.Client as HAL
import qualified HAL.Doc.Corpus as HAL
get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith q (Just 0) ml
eDocs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml)
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Integer -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC q (Just 0) ml
eRes <- HAL.getMetadataWithC q (Just 0) (fromIntegral <$> ml)
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
......
......@@ -29,7 +29,7 @@ import qualified ISTEX as ISTEX
import qualified ISTEX.Client as ISTEX
type Query = Text
type MaxResults = Maybe Integer
type MaxResults = Maybe Int
get :: Lang -> Query -> MaxResults -> IO [HyperdataDocument]
get la query' maxResults = do
......@@ -57,7 +57,7 @@ get la query' maxResults = do
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
eDocs <- ISTEX.getMetadataWith query (fromIntegral <$> maxResults)
eDocs <- ISTEX.getMetadataWith query maxResults
-- printDebug "[Istex.get] will print length" (0 :: Int)
case eDocs of
Left _ -> pure ()
......
......@@ -11,6 +11,7 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Pubmed
( get )
where
import Conduit
......@@ -31,7 +32,7 @@ import PUBMED.Types (Config(..))
type Query = Text
type Limit = Integer
type Limit = Int
-- | TODO put default pubmed query in gargantext.ini
......
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Text.Corpus.Query (
Query -- * opaque
, RawQuery(..)
, Limit(..)
, getQuery
, parseQuery
, ExternalAPIs(..)
) where
import Data.Bifunctor
import Data.String
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.Core.Types
import Prelude
import qualified Data.Aeson as Aeson
import qualified Data.BoolExpr as BoolExpr
import qualified Data.BoolExpr.Parser as BoolExpr
import qualified Data.Swagger as Swagger
import qualified Data.Text as T
import qualified Servant.API as Servant
import qualified Text.Parsec as P
-- | A raw query, as typed by the user from the frontend.
newtype RawQuery = RawQuery { getRawQuery :: T.Text }
deriving newtype ( Show, Eq, IsString
, Servant.FromHttpApiData, Servant.ToHttpApiData
, Aeson.FromJSON, Aeson.ToJSON
, Swagger.ToParamSchema, Swagger.ToSchema)
-- | A limit to the number of results we want to retrieve.
newtype Limit = Limit { getLimit :: Int }
deriving newtype ( Show, Eq, Num
, Servant.FromHttpApiData, Servant.ToHttpApiData
, Aeson.FromJSON, Aeson.ToJSON
, Swagger.ToParamSchema, Swagger.ToSchema)
-- | An opaque wrapper around a 'Query' type which can be parsed from a boolean
-- expression like (a AND b) OR c, and which can be interpreted in many ways
-- according to the particular service we are targeting.
newtype Query = Query { getQuery :: (BoolExpr.CNF Term) }
deriving Show
-- | Parses an input 'Text' into a 'Query', reporting an error if it fails.
parseQuery :: RawQuery -> Either String Query
parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $
P.runParser (BoolExpr.parseBoolExpr (Term . T.pack <$> BoolExpr.identifier)) () "Corpus.Query" (T.unpack txt)
......@@ -13,11 +13,12 @@ commentary with @some markup@.
------------------------------------------------------------------------
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term, Terms(..), TermsCount, TermsWithCount
, Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, Label, Stems
, HasInvalidError(..), assertValid
......@@ -38,6 +39,7 @@ import Data.Maybe
import Data.Monoid
import Data.Semigroup
import Data.Set (Set, empty)
import Data.String
import Data.Swagger (ToParamSchema)
import Data.Swagger (ToSchema(..))
import Data.Text (Text, unpack)
......@@ -63,7 +65,10 @@ data Ordering = Down | Up
------------------------------------------------------------------------
type Name = Text
type Term = Text
newtype Term = Term { getTerm :: Text }
deriving newtype (IsString, Show)
type Stems = Set Text
type Label = [Text]
......
......@@ -64,7 +64,6 @@ import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
......@@ -151,9 +150,9 @@ printDataText (DataNew (maybeInt, conduitData)) = do
getDataText :: FlowCmdM env err m
=> DataOrigin
-> TermType Lang
-> API.Query
-> API.RawQuery
-> Maybe API.Limit
-> m (Either ClientError DataText)
-> m (Either API.GetCorpusError DataText)
getDataText (ExternalOrigin api) la q li = do
cfg <- view $ hasConfig
eRes <- liftBase $ API.get cfg api (_tt_lang la) q li
......@@ -164,13 +163,13 @@ getDataText (InternalOrigin _) _la q _li = do
(UserName userMaster)
(Left "")
(Nothing :: Maybe HyperdataCorpus)
ids <- map fst <$> searchDocInDatabase cId (stemIt q)
ids <- map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q)
pure $ Right $ DataOld ids
getDataText_Debug :: FlowCmdM env err m
=> DataOrigin
-> TermType Lang
-> API.Query
-> API.RawQuery
-> Maybe API.Limit
-> m ()
getDataText_Debug a l q li = do
......
......@@ -37,6 +37,7 @@ extra-deps:
- HSvm-0.1.1.3.22
- hsparql-0.3.8
- ghc-clippy-plugin-0.0.0.1
- boolexpr-0.2
#- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
# commit: 791c2a7046a3760f8ae5fabdbd708f61caa63741
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment