[API] fixes to pagination (usehistory=y)

parent 4ade4957
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0. -- This file has been generated from package.yaml by hpack version 0.35.2.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
...@@ -19,10 +19,6 @@ extra-source-files: ...@@ -19,10 +19,6 @@ extra-source-files:
README.md README.md
ChangeLog.md ChangeLog.md
data-files:
test/test-corpus/pubmed-29768149.xml
test/test-corpus/pubmed_long.xml
source-repository head source-repository head
type: git type: git
location: https://github.com/gitlab/crawlerPubMed location: https://github.com/gitlab/crawlerPubMed
...@@ -32,6 +28,7 @@ library ...@@ -32,6 +28,7 @@ library
PUBMED PUBMED
PUBMED.Client PUBMED.Client
PUBMED.Parser PUBMED.Parser
PUBMED.ServantClientLogging
PUBMED.Types PUBMED.Types
other-modules: other-modules:
Paths_crawlerPubMed Paths_crawlerPubMed
...@@ -47,6 +44,7 @@ library ...@@ -47,6 +44,7 @@ library
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
, binary
, bytestring , bytestring
, conduit , conduit
, data-time-segment , data-time-segment
...@@ -61,6 +59,7 @@ library ...@@ -61,6 +59,7 @@ library
, optparse-applicative , optparse-applicative
, servant , servant
, servant-client , servant-client
, servant-client-core
, taggy , taggy
, taggy-lens , taggy-lens
, text , text
...@@ -81,6 +80,7 @@ executable crawlerPubMed-exe ...@@ -81,6 +80,7 @@ executable crawlerPubMed-exe
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
, binary
, bytestring , bytestring
, conduit , conduit
, crawlerPubMed , crawlerPubMed
...@@ -96,6 +96,7 @@ executable crawlerPubMed-exe ...@@ -96,6 +96,7 @@ executable crawlerPubMed-exe
, optparse-applicative , optparse-applicative
, servant , servant
, servant-client , servant-client
, servant-client-core
, taggy , taggy
, taggy-lens , taggy-lens
, text , text
...@@ -119,6 +120,7 @@ test-suite crawlerPubMed-test ...@@ -119,6 +120,7 @@ test-suite crawlerPubMed-test
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
, binary
, bytestring , bytestring
, conduit , conduit
, crawlerPubMed , crawlerPubMed
...@@ -134,19 +136,9 @@ test-suite crawlerPubMed-test ...@@ -134,19 +136,9 @@ test-suite crawlerPubMed-test
, optparse-applicative , optparse-applicative
, servant , servant
, servant-client , servant-client
, tasty , servant-client-core
, tasty-hunit
, taggy , taggy
, taggy-lens , taggy-lens
, text , text
, time , time
default-language: Haskell2010 default-language: Haskell2010
benchmark bench-foo
type: exitcode-stdio-1.0
hs-source-dirs: bench
main-is: Main.hs
build-depends:
base
, crawlerPubMed
, tasty-bench
...@@ -29,6 +29,7 @@ description: Please see the README on GitHub at <https://gitlab.iscpif.f ...@@ -29,6 +29,7 @@ description: Please see the README on GitHub at <https://gitlab.iscpif.f
dependencies: dependencies:
- attoparsec - attoparsec
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- binary
- bytestring - bytestring
- conduit - conduit
- data-time-segment - data-time-segment
...@@ -43,6 +44,7 @@ dependencies: ...@@ -43,6 +44,7 @@ dependencies:
- optparse-applicative - optparse-applicative
- servant - servant
- servant-client - servant-client
- servant-client-core
- taggy - taggy
- taggy-lens - taggy-lens
- text - text
......
...@@ -16,7 +16,7 @@ import PUBMED.Client ...@@ -16,7 +16,7 @@ import PUBMED.Client
import PUBMED.Parser import PUBMED.Parser
import Panic (panic) import Panic (panic)
import Prelude hiding (takeWhile) import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv, ClientError, Scheme(..)) import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv(..), ClientError, Scheme(..))
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Text as T import qualified Data.Text as T
...@@ -24,6 +24,7 @@ import qualified Data.Text.Encoding as TE ...@@ -24,6 +24,7 @@ import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy as TL
import PUBMED.Types import PUBMED.Types
import PUBMED.ServantClientLogging
pmHost :: String pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov" pmHost = "eutils.ncbi.nlm.nih.gov"
...@@ -35,7 +36,8 @@ pmPort = 443 ...@@ -35,7 +36,8 @@ pmPort = 443
defaultClientEnv :: IO ClientEnv defaultClientEnv :: IO ClientEnv
defaultClientEnv = do defaultClientEnv = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath let env = mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
pure $ addLoggingToClientEnv env
-- | API main function -- | API main function
getMetadataWith :: Env (Either Text [PubMed]) getMetadataWith :: Env (Either Text [PubMed])
...@@ -43,11 +45,11 @@ getMetadataWith = runSimpleFindPubmedAbstractRequest Nothing ...@@ -43,11 +45,11 @@ getMetadataWith = runSimpleFindPubmedAbstractRequest Nothing
getMetadataWithC :: Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ())) getMetadataWithC :: Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC = do getMetadataWithC = do
config@(Config { apiKey, query, perPage }) <- ask config@(Config { apiKey, query, perPage, mWebEnv }) <- ask
liftIO $ do liftIO $ do
env <- defaultClientEnv env <- defaultClientEnv
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runClientM (search apiKey (Just query) Nothing (Just 1)) env eRes <- runClientM (searchWithHistory apiKey (Just query) Nothing (Just 1) (Just "y") mWebEnv) env
pure $ get' config env (fromMaybe defaultPerPage perPage) <$> eRes pure $ get' config env (fromMaybe defaultPerPage perPage) <$> eRes
where where
get' :: Config get' :: Config
...@@ -59,18 +61,20 @@ getMetadataWithC = do ...@@ -59,18 +61,20 @@ getMetadataWithC = do
( Just numResults ( Just numResults
, yieldMany [0..] , yieldMany [0..]
.| takeC (fromInteger numPages) .| takeC (fromInteger numPages)
.| concatMapMC (\pageNum -> runReaderT (getPage pageNum) config)) .| concatMapMC (\pageNum -> runReaderT (getPage pageNum) $ config { mWebEnv = maybe mWebEnv' Just (mWebEnv config)}))
where where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res resEncoded = TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
mWebEnv' = parseWebEnv resEncoded
numResults = fromMaybe 0 $ parseDocCount resEncoded
numPages = numResults `div` (fromIntegral perPage) + 1 numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: Integer getPage :: Offset
-> Env [PubMed] -> Env [PubMed]
getPage pageNum = do getPage pageNum = do
(Config { perPage = mPerPage }) <- ask (Config { perPage = mPerPage, query }) <- ask
let perPage = fromMaybe defaultPerPage mPerPage let perPage = fromMaybe defaultPerPage mPerPage
let offset = fromIntegral $ pageNum * perPage let offset = fromIntegral $ pageNum * perPage
liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage <> ", query: " <> T.unpack query
eDocs <- runSimpleFindPubmedAbstractRequest (Just offset) eDocs <- runSimpleFindPubmedAbstractRequest (Just offset)
case eDocs of case eDocs of
Left err -> panic $ "[getPage] error: " <> show err Left err -> panic $ "[getPage] error: " <> show err
...@@ -118,7 +122,7 @@ runSimpleFetchPubmedAbstractRequest ids = do ...@@ -118,7 +122,7 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed]) -- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed -- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Maybe Integer runSimpleFindPubmedAbstractRequest :: Maybe Offset
-> Env (Either Text [PubMed]) -> Env (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest offset = do runSimpleFindPubmedAbstractRequest offset = do
eDocIds <- searchDocIds offset eDocIds <- searchDocIds offset
...@@ -128,17 +132,18 @@ runSimpleFindPubmedAbstractRequest offset = do ...@@ -128,17 +132,18 @@ runSimpleFindPubmedAbstractRequest offset = do
liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds
runMultipleFPAR docIds runMultipleFPAR docIds
searchDocIds :: Maybe Integer -> Env (Either Text [Integer]) searchDocIds :: Maybe Offset -> Env (Either Text [Integer])
searchDocIds offset = do searchDocIds offset = do
(Config { apiKey, query, perPage }) <- ask (Config { apiKey, query, perPage, mWebEnv }) <- ask
liftIO $ do liftIO $ do
env <- defaultClientEnv env <- defaultClientEnv
res <- runClientM res <- runClientM
(search apiKey (Just query) offset perPage) (searchWithHistory apiKey (Just query) offset perPage (Just "y") mWebEnv)
env env
case res of case res of
(Left err) -> pure (Left $ T.pack $ show err) (Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do (Right (BsXml docs)) -> do
liftIO $ print $ "[searchDocIds] docs" <> show docs
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs --pure $ Right $ runParser parseDocId docs
......
...@@ -7,7 +7,7 @@ import Servant.Client ...@@ -7,7 +7,7 @@ import Servant.Client
import qualified Data.Text as T import qualified Data.Text as T
import qualified Network.HTTP.Media as M import qualified Network.HTTP.Media as M
import PUBMED.Types (APIKey, Query, PerPage) import PUBMED.Types (APIKey, Offset, Query, PerPage, WebEnv)
data DB = PUBMED data DB = PUBMED
...@@ -21,13 +21,26 @@ instance MimeUnrender BsXml BsXml where ...@@ -21,13 +21,26 @@ instance MimeUnrender BsXml BsXml where
mimeUnrender _ = Right . BsXml mimeUnrender _ = Right . BsXml
type PUBMEDAPI = type PUBMEDAPI =
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi" "esearch.fcgi"
-- :> QueryParam "db" DB -- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed -- not mandatory since the base db is pubmed
:> QueryParam "api_key" APIKey :> QueryParam "api_key" APIKey
:> QueryParam "term" T.Text :> QueryParam "term" Query
:> QueryParam "retstart" Integer :> QueryParam "retstart" Offset
:> QueryParam "retmax" Integer :> QueryParam "retmax" PerPage
:> Get '[BsXml] BsXml
:<|>
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "api_key" APIKey
:> QueryParam "term" Query
:> QueryParam "retstart" Offset
:> QueryParam "retmax" PerPage
:> QueryParam "usehistory" T.Text
:> QueryParam "WebEnv" WebEnv
:> Get '[BsXml] BsXml :> Get '[BsXml] BsXml
:<|> :<|>
"efetch.fcgi" "efetch.fcgi"
...@@ -41,14 +54,23 @@ pubmedApi :: Proxy PUBMEDAPI ...@@ -41,14 +54,23 @@ pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy pubmedApi = Proxy
search :: Maybe APIKey search :: Maybe APIKey
-> Maybe T.Text -> Maybe Query
-> Maybe Integer -> Maybe Offset
-> Maybe Integer -> Maybe PerPage
-> ClientM BsXml -> ClientM BsXml
searchWithHistory :: Maybe APIKey
-> Maybe Query
-> Maybe Offset
-> Maybe PerPage
-> Maybe T.Text
-> Maybe WebEnv
-> ClientM BsXml
fetch :: Maybe APIKey fetch :: Maybe APIKey
-> Maybe T.Text -> Maybe T.Text
-> Maybe T.Text -> Maybe T.Text
-> [Integer] -> [Integer]
-> ClientM BsXml -> ClientM BsXml
search :<|> fetch = client pubmedApi
search :<|> searchWithHistory :<|> fetch = client pubmedApi
...@@ -35,6 +35,9 @@ parseDocIds txt = map parseId parsed ...@@ -35,6 +35,9 @@ parseDocIds txt = map parseId parsed
parseDocCount :: TL.Text -> Maybe Integer parseDocCount :: TL.Text -> Maybe Integer
parseDocCount txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "Count" . TTL.contents parseDocCount txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "Count" . TTL.contents
parseWebEnv :: TL.Text -> Maybe T.Text
parseWebEnv txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "WebEnv" . TTL.contents
data PubMed = data PubMed =
PubMed { pubmed_id :: Int PubMed { pubmed_id :: Int
, pubmed_article :: PubMedArticle , pubmed_article :: PubMedArticle
......
module PUBMED.ServantClientLogging where
import qualified Data.Binary.Builder as DBB
import Debug.Trace (trace)
import Servant.Client (ClientEnv(..))
import Servant.Client.Core.Request (RequestF(..))
addLoggingToClientEnv :: ClientEnv -> ClientEnv
addLoggingToClientEnv env@(ClientEnv { makeClientRequest = mcr }) = env { makeClientRequest }
where
makeClientRequest baseUrl req =
trace ("req: " <> showRequestPath req <> " " <> show (requestQueryString req)) $ mcr baseUrl req
showRequestPath req =
show $ DBB.toLazyByteString $ requestPath req
...@@ -5,17 +5,21 @@ import Data.Text (Text) ...@@ -5,17 +5,21 @@ import Data.Text (Text)
type APIKey = Text type APIKey = Text
type Offset = Integer
type Query = Text type Query = Text
type PerPage = Integer type PerPage = Integer
type WebEnv = Text
data Config = Config { data Config = Config {
apiKey :: Maybe APIKey apiKey :: Maybe APIKey
, query :: Query , query :: Query
, perPage :: Maybe PerPage , perPage :: Maybe PerPage
, mWebEnv :: Maybe WebEnv
} }
type Env = ReaderT Config IO type Env = ReaderT Config IO
-- | This is the default `retmax` value in ESearch.
defaultPerPage :: PerPage defaultPerPage :: PerPage
defaultPerPage = 200 defaultPerPage = 20
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment