[API] fixes to pagination (usehistory=y)

parent 4ade4957
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0.
-- This file has been generated from package.yaml by hpack version 0.35.2.
--
-- see: https://github.com/sol/hpack
......@@ -19,10 +19,6 @@ extra-source-files:
README.md
ChangeLog.md
data-files:
test/test-corpus/pubmed-29768149.xml
test/test-corpus/pubmed_long.xml
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
......@@ -32,6 +28,7 @@ library
PUBMED
PUBMED.Client
PUBMED.Parser
PUBMED.ServantClientLogging
PUBMED.Types
other-modules:
Paths_crawlerPubMed
......@@ -47,6 +44,7 @@ library
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, data-time-segment
......@@ -61,6 +59,7 @@ library
, optparse-applicative
, servant
, servant-client
, servant-client-core
, taggy
, taggy-lens
, text
......@@ -81,6 +80,7 @@ executable crawlerPubMed-exe
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, crawlerPubMed
......@@ -96,6 +96,7 @@ executable crawlerPubMed-exe
, optparse-applicative
, servant
, servant-client
, servant-client-core
, taggy
, taggy-lens
, text
......@@ -119,6 +120,7 @@ test-suite crawlerPubMed-test
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, crawlerPubMed
......@@ -134,19 +136,9 @@ test-suite crawlerPubMed-test
, optparse-applicative
, servant
, servant-client
, tasty
, tasty-hunit
, servant-client-core
, taggy
, taggy-lens
, text
, time
default-language: Haskell2010
benchmark bench-foo
type: exitcode-stdio-1.0
hs-source-dirs: bench
main-is: Main.hs
build-depends:
base
, crawlerPubMed
, tasty-bench
......@@ -29,6 +29,7 @@ description: Please see the README on GitHub at <https://gitlab.iscpif.f
dependencies:
- attoparsec
- base >= 4.7 && < 5
- binary
- bytestring
- conduit
- data-time-segment
......@@ -43,6 +44,7 @@ dependencies:
- optparse-applicative
- servant
- servant-client
- servant-client-core
- taggy
- taggy-lens
- text
......
......@@ -16,7 +16,7 @@ import PUBMED.Client
import PUBMED.Parser
import Panic (panic)
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv, ClientError, Scheme(..))
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv(..), ClientError, Scheme(..))
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List as List
import qualified Data.Text as T
......@@ -24,6 +24,7 @@ import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import PUBMED.Types
import PUBMED.ServantClientLogging
pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov"
......@@ -35,7 +36,8 @@ pmPort = 443
defaultClientEnv :: IO ClientEnv
defaultClientEnv = do
manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
let env = mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
pure $ addLoggingToClientEnv env
-- | API main function
getMetadataWith :: Env (Either Text [PubMed])
......@@ -43,11 +45,11 @@ getMetadataWith = runSimpleFindPubmedAbstractRequest Nothing
getMetadataWithC :: Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC = do
config@(Config { apiKey, query, perPage }) <- ask
config@(Config { apiKey, query, perPage, mWebEnv }) <- ask
liftIO $ do
env <- defaultClientEnv
-- First, estimate the total number of documents
eRes <- runClientM (search apiKey (Just query) Nothing (Just 1)) env
eRes <- runClientM (searchWithHistory apiKey (Just query) Nothing (Just 1) (Just "y") mWebEnv) env
pure $ get' config env (fromMaybe defaultPerPage perPage) <$> eRes
where
get' :: Config
......@@ -59,18 +61,20 @@ getMetadataWithC = do
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (\pageNum -> runReaderT (getPage pageNum) config))
.| concatMapMC (\pageNum -> runReaderT (getPage pageNum) $ config { mWebEnv = maybe mWebEnv' Just (mWebEnv config)}))
where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
resEncoded = TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
mWebEnv' = parseWebEnv resEncoded
numResults = fromMaybe 0 $ parseDocCount resEncoded
numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: Integer
getPage :: Offset
-> Env [PubMed]
getPage pageNum = do
(Config { perPage = mPerPage }) <- ask
(Config { perPage = mPerPage, query }) <- ask
let perPage = fromMaybe defaultPerPage mPerPage
let offset = fromIntegral $ pageNum * perPage
liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage
liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage <> ", query: " <> T.unpack query
eDocs <- runSimpleFindPubmedAbstractRequest (Just offset)
case eDocs of
Left err -> panic $ "[getPage] error: " <> show err
......@@ -118,7 +122,7 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Maybe Integer
runSimpleFindPubmedAbstractRequest :: Maybe Offset
-> Env (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest offset = do
eDocIds <- searchDocIds offset
......@@ -128,17 +132,18 @@ runSimpleFindPubmedAbstractRequest offset = do
liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds
runMultipleFPAR docIds
searchDocIds :: Maybe Integer -> Env (Either Text [Integer])
searchDocIds :: Maybe Offset -> Env (Either Text [Integer])
searchDocIds offset = do
(Config { apiKey, query, perPage }) <- ask
(Config { apiKey, query, perPage, mWebEnv }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM
(search apiKey (Just query) offset perPage)
(searchWithHistory apiKey (Just query) offset perPage (Just "y") mWebEnv)
env
case res of
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
liftIO $ print $ "[searchDocIds] docs" <> show docs
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
......
......@@ -7,7 +7,7 @@ import Servant.Client
import qualified Data.Text as T
import qualified Network.HTTP.Media as M
import PUBMED.Types (APIKey, Query, PerPage)
import PUBMED.Types (APIKey, Offset, Query, PerPage, WebEnv)
data DB = PUBMED
......@@ -21,13 +21,26 @@ instance MimeUnrender BsXml BsXml where
mimeUnrender _ = Right . BsXml
type PUBMEDAPI =
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "api_key" APIKey
:> QueryParam "term" T.Text
:> QueryParam "retstart" Integer
:> QueryParam "retmax" Integer
:> QueryParam "term" Query
:> QueryParam "retstart" Offset
:> QueryParam "retmax" PerPage
:> Get '[BsXml] BsXml
:<|>
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "api_key" APIKey
:> QueryParam "term" Query
:> QueryParam "retstart" Offset
:> QueryParam "retmax" PerPage
:> QueryParam "usehistory" T.Text
:> QueryParam "WebEnv" WebEnv
:> Get '[BsXml] BsXml
:<|>
"efetch.fcgi"
......@@ -41,14 +54,23 @@ pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy
search :: Maybe APIKey
-> Maybe T.Text
-> Maybe Integer
-> Maybe Integer
-> Maybe Query
-> Maybe Offset
-> Maybe PerPage
-> ClientM BsXml
searchWithHistory :: Maybe APIKey
-> Maybe Query
-> Maybe Offset
-> Maybe PerPage
-> Maybe T.Text
-> Maybe WebEnv
-> ClientM BsXml
fetch :: Maybe APIKey
-> Maybe T.Text
-> Maybe T.Text
-> [Integer]
-> ClientM BsXml
search :<|> fetch = client pubmedApi
search :<|> searchWithHistory :<|> fetch = client pubmedApi
......@@ -35,6 +35,9 @@ parseDocIds txt = map parseId parsed
parseDocCount :: TL.Text -> Maybe Integer
parseDocCount txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "Count" . TTL.contents
parseWebEnv :: TL.Text -> Maybe T.Text
parseWebEnv txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "WebEnv" . TTL.contents
data PubMed =
PubMed { pubmed_id :: Int
, pubmed_article :: PubMedArticle
......
module PUBMED.ServantClientLogging where
import qualified Data.Binary.Builder as DBB
import Debug.Trace (trace)
import Servant.Client (ClientEnv(..))
import Servant.Client.Core.Request (RequestF(..))
addLoggingToClientEnv :: ClientEnv -> ClientEnv
addLoggingToClientEnv env@(ClientEnv { makeClientRequest = mcr }) = env { makeClientRequest }
where
makeClientRequest baseUrl req =
trace ("req: " <> showRequestPath req <> " " <> show (requestQueryString req)) $ mcr baseUrl req
showRequestPath req =
show $ DBB.toLazyByteString $ requestPath req
......@@ -5,17 +5,21 @@ import Data.Text (Text)
type APIKey = Text
type Offset = Integer
type Query = Text
type PerPage = Integer
type WebEnv = Text
data Config = Config {
apiKey :: Maybe APIKey
, query :: Query
, perPage :: Maybe PerPage
, mWebEnv :: Maybe WebEnv
}
type Env = ReaderT Config IO
-- | This is the default `retmax` value in ESearch.
defaultPerPage :: PerPage
defaultPerPage = 200
defaultPerPage = 20
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment