Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pubmed
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
2
Issues
2
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
pubmed
Commits
234ad423
Verified
Commit
234ad423
authored
Apr 04, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[API] fixes to pagination (usehistory=y)
parent
4ade4957
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
81 additions
and
38 deletions
+81
-38
crawlerPubMed.cabal
crawlerPubMed.cabal
+8
-16
package.yaml
package.yaml
+2
-0
PUBMED.hs
src/PUBMED.hs
+18
-13
Client.hs
src/PUBMED/Client.hs
+30
-8
Parser.hs
src/PUBMED/Parser.hs
+3
-0
ServantClientLogging.hs
src/PUBMED/ServantClientLogging.hs
+15
-0
Types.hs
src/PUBMED/Types.hs
+5
-1
No files found.
crawlerPubMed.cabal
View file @
234ad423
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.
0
.
-- This file has been generated from package.yaml by hpack version 0.35.
2
.
--
-- see: https://github.com/sol/hpack
...
...
@@ -19,10 +19,6 @@ extra-source-files:
README.md
ChangeLog.md
data-files:
test/test-corpus/pubmed-29768149.xml
test/test-corpus/pubmed_long.xml
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
...
...
@@ -32,6 +28,7 @@ library
PUBMED
PUBMED.Client
PUBMED.Parser
PUBMED.ServantClientLogging
PUBMED.Types
other-modules:
Paths_crawlerPubMed
...
...
@@ -47,6 +44,7 @@ library
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, data-time-segment
...
...
@@ -61,6 +59,7 @@ library
, optparse-applicative
, servant
, servant-client
, servant-client-core
, taggy
, taggy-lens
, text
...
...
@@ -81,6 +80,7 @@ executable crawlerPubMed-exe
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, crawlerPubMed
...
...
@@ -96,6 +96,7 @@ executable crawlerPubMed-exe
, optparse-applicative
, servant
, servant-client
, servant-client-core
, taggy
, taggy-lens
, text
...
...
@@ -119,6 +120,7 @@ test-suite crawlerPubMed-test
build-depends:
attoparsec
, base >=4.7 && <5
, binary
, bytestring
, conduit
, crawlerPubMed
...
...
@@ -134,19 +136,9 @@ test-suite crawlerPubMed-test
, optparse-applicative
, servant
, servant-client
, tasty
, tasty-hunit
, servant-client-core
, taggy
, taggy-lens
, text
, time
default-language: Haskell2010
benchmark bench-foo
type: exitcode-stdio-1.0
hs-source-dirs: bench
main-is: Main.hs
build-depends:
base
, crawlerPubMed
, tasty-bench
package.yaml
View file @
234ad423
...
...
@@ -29,6 +29,7 @@ description: Please see the README on GitHub at <https://gitlab.iscpif.f
dependencies
:
-
attoparsec
-
base >= 4.7 && < 5
-
binary
-
bytestring
-
conduit
-
data-time-segment
...
...
@@ -43,6 +44,7 @@ dependencies:
-
optparse-applicative
-
servant
-
servant-client
-
servant-client-core
-
taggy
-
taggy-lens
-
text
...
...
src/PUBMED.hs
View file @
234ad423
...
...
@@ -16,7 +16,7 @@ import PUBMED.Client
import
PUBMED.Parser
import
Panic
(
panic
)
import
Prelude
hiding
(
takeWhile
)
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
ClientEnv
,
ClientError
,
Scheme
(
..
))
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
ClientEnv
(
..
)
,
ClientError
,
Scheme
(
..
))
import
qualified
Data.ByteString.Lazy
as
LBS
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
T
...
...
@@ -24,6 +24,7 @@ import qualified Data.Text.Encoding as TE
import
qualified
Data.Text.Lazy
as
TL
import
PUBMED.Types
import
PUBMED.ServantClientLogging
pmHost
::
String
pmHost
=
"eutils.ncbi.nlm.nih.gov"
...
...
@@ -35,7 +36,8 @@ pmPort = 443
defaultClientEnv
::
IO
ClientEnv
defaultClientEnv
=
do
manager'
<-
newManager
tlsManagerSettings
pure
$
mkClientEnv
manager'
$
BaseUrl
Https
pmHost
pmPort
pmSearchPath
let
env
=
mkClientEnv
manager'
$
BaseUrl
Https
pmHost
pmPort
pmSearchPath
pure
$
addLoggingToClientEnv
env
-- | API main function
getMetadataWith
::
Env
(
Either
Text
[
PubMed
])
...
...
@@ -43,11 +45,11 @@ getMetadataWith = runSimpleFindPubmedAbstractRequest Nothing
getMetadataWithC
::
Env
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
PubMed
IO
()
))
getMetadataWithC
=
do
config
@
(
Config
{
apiKey
,
query
,
perPage
})
<-
ask
config
@
(
Config
{
apiKey
,
query
,
perPage
,
mWebEnv
})
<-
ask
liftIO
$
do
env
<-
defaultClientEnv
-- First, estimate the total number of documents
eRes
<-
runClientM
(
search
apiKey
(
Just
query
)
Nothing
(
Just
1
)
)
env
eRes
<-
runClientM
(
search
WithHistory
apiKey
(
Just
query
)
Nothing
(
Just
1
)
(
Just
"y"
)
mWebEnv
)
env
pure
$
get'
config
env
(
fromMaybe
defaultPerPage
perPage
)
<$>
eRes
where
get'
::
Config
...
...
@@ -59,18 +61,20 @@ getMetadataWithC = do
(
Just
numResults
,
yieldMany
[
0
..
]
.|
takeC
(
fromInteger
numPages
)
.|
concatMapMC
(
\
pageNum
->
runReaderT
(
getPage
pageNum
)
config
))
.|
concatMapMC
(
\
pageNum
->
runReaderT
(
getPage
pageNum
)
$
config
{
mWebEnv
=
maybe
mWebEnv'
Just
(
mWebEnv
config
)}
))
where
numResults
=
fromMaybe
0
$
parseDocCount
$
TL
.
fromStrict
$
TE
.
decodeUtf8
$
LBS
.
toStrict
res
resEncoded
=
TL
.
fromStrict
$
TE
.
decodeUtf8
$
LBS
.
toStrict
res
mWebEnv'
=
parseWebEnv
resEncoded
numResults
=
fromMaybe
0
$
parseDocCount
resEncoded
numPages
=
numResults
`
div
`
(
fromIntegral
perPage
)
+
1
getPage
::
Integer
getPage
::
Offset
->
Env
[
PubMed
]
getPage
pageNum
=
do
(
Config
{
perPage
=
mPerPage
})
<-
ask
(
Config
{
perPage
=
mPerPage
,
query
})
<-
ask
let
perPage
=
fromMaybe
defaultPerPage
mPerPage
let
offset
=
fromIntegral
$
pageNum
*
perPage
liftIO
$
print
$
"[getPage] getting page "
<>
show
pageNum
<>
", offset: "
<>
show
offset
<>
", perPage: "
<>
show
perPage
liftIO
$
print
$
"[getPage] getting page "
<>
show
pageNum
<>
", offset: "
<>
show
offset
<>
", perPage: "
<>
show
perPage
<>
", query: "
<>
T
.
unpack
query
eDocs
<-
runSimpleFindPubmedAbstractRequest
(
Just
offset
)
case
eDocs
of
Left
err
->
panic
$
"[getPage] error: "
<>
show
err
...
...
@@ -118,7 +122,7 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest
::
Maybe
Integer
runSimpleFindPubmedAbstractRequest
::
Maybe
Offset
->
Env
(
Either
Text
[
PubMed
])
runSimpleFindPubmedAbstractRequest
offset
=
do
eDocIds
<-
searchDocIds
offset
...
...
@@ -128,17 +132,18 @@ runSimpleFindPubmedAbstractRequest offset = do
liftIO
$
print
$
"[runSimpleFindPubmedAbstractRequest] docIds"
<>
show
docIds
runMultipleFPAR
docIds
searchDocIds
::
Maybe
Integer
->
Env
(
Either
Text
[
Integer
])
searchDocIds
::
Maybe
Offset
->
Env
(
Either
Text
[
Integer
])
searchDocIds
offset
=
do
(
Config
{
apiKey
,
query
,
perPage
})
<-
ask
(
Config
{
apiKey
,
query
,
perPage
,
mWebEnv
})
<-
ask
liftIO
$
do
env
<-
defaultClientEnv
res
<-
runClientM
(
search
apiKey
(
Just
query
)
offset
perPage
)
(
search
WithHistory
apiKey
(
Just
query
)
offset
perPage
(
Just
"y"
)
mWebEnv
)
env
case
res
of
(
Left
err
)
->
pure
(
Left
$
T
.
pack
$
show
err
)
(
Right
(
BsXml
docs
))
->
do
liftIO
$
print
$
"[searchDocIds] docs"
<>
show
docs
pure
$
Right
$
parseDocIds
$
TL
.
fromStrict
$
TE
.
decodeUtf8
$
LBS
.
toStrict
docs
--pure $ Right $ runParser parseDocId docs
...
...
src/PUBMED/Client.hs
View file @
234ad423
...
...
@@ -7,7 +7,7 @@ import Servant.Client
import
qualified
Data.Text
as
T
import
qualified
Network.HTTP.Media
as
M
import
PUBMED.Types
(
APIKey
,
Query
,
PerPage
)
import
PUBMED.Types
(
APIKey
,
Offset
,
Query
,
PerPage
,
WebEnv
)
data
DB
=
PUBMED
...
...
@@ -21,13 +21,26 @@ instance MimeUnrender BsXml BsXml where
mimeUnrender
_
=
Right
.
BsXml
type
PUBMEDAPI
=
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:>
QueryParam
"api_key"
APIKey
:>
QueryParam
"term"
T
.
Text
:>
QueryParam
"retstart"
Integer
:>
QueryParam
"retmax"
Integer
:>
QueryParam
"term"
Query
:>
QueryParam
"retstart"
Offset
:>
QueryParam
"retmax"
PerPage
:>
Get
'[
B
sXml
]
BsXml
:<|>
-- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:>
QueryParam
"api_key"
APIKey
:>
QueryParam
"term"
Query
:>
QueryParam
"retstart"
Offset
:>
QueryParam
"retmax"
PerPage
:>
QueryParam
"usehistory"
T
.
Text
:>
QueryParam
"WebEnv"
WebEnv
:>
Get
'[
B
sXml
]
BsXml
:<|>
"efetch.fcgi"
...
...
@@ -41,14 +54,23 @@ pubmedApi :: Proxy PUBMEDAPI
pubmedApi
=
Proxy
search
::
Maybe
APIKey
->
Maybe
T
.
Text
->
Maybe
Integer
->
Maybe
Integer
->
Maybe
Query
->
Maybe
Offset
->
Maybe
PerPage
->
ClientM
BsXml
searchWithHistory
::
Maybe
APIKey
->
Maybe
Query
->
Maybe
Offset
->
Maybe
PerPage
->
Maybe
T
.
Text
->
Maybe
WebEnv
->
ClientM
BsXml
fetch
::
Maybe
APIKey
->
Maybe
T
.
Text
->
Maybe
T
.
Text
->
[
Integer
]
->
ClientM
BsXml
search
:<|>
fetch
=
client
pubmedApi
search
:<|>
searchWithHistory
:<|>
fetch
=
client
pubmedApi
src/PUBMED/Parser.hs
View file @
234ad423
...
...
@@ -35,6 +35,9 @@ parseDocIds txt = map parseId parsed
parseDocCount
::
TL
.
Text
->
Maybe
Integer
parseDocCount
txt
=
TR
.
readMaybe
$
T
.
unpack
$
txt
^.
TTL
.
html
.
TTL
.
allNamed
(
only
"eSearchResult"
)
.
namedEl
"Count"
.
TTL
.
contents
parseWebEnv
::
TL
.
Text
->
Maybe
T
.
Text
parseWebEnv
txt
=
TR
.
readMaybe
$
T
.
unpack
$
txt
^.
TTL
.
html
.
TTL
.
allNamed
(
only
"eSearchResult"
)
.
namedEl
"WebEnv"
.
TTL
.
contents
data
PubMed
=
PubMed
{
pubmed_id
::
Int
,
pubmed_article
::
PubMedArticle
...
...
src/PUBMED/ServantClientLogging.hs
0 → 100644
View file @
234ad423
module
PUBMED.ServantClientLogging
where
import
qualified
Data.Binary.Builder
as
DBB
import
Debug.Trace
(
trace
)
import
Servant.Client
(
ClientEnv
(
..
))
import
Servant.Client.Core.Request
(
RequestF
(
..
))
addLoggingToClientEnv
::
ClientEnv
->
ClientEnv
addLoggingToClientEnv
env
@
(
ClientEnv
{
makeClientRequest
=
mcr
})
=
env
{
makeClientRequest
}
where
makeClientRequest
baseUrl
req
=
trace
(
"req: "
<>
showRequestPath
req
<>
" "
<>
show
(
requestQueryString
req
))
$
mcr
baseUrl
req
showRequestPath
req
=
show
$
DBB
.
toLazyByteString
$
requestPath
req
src/PUBMED/Types.hs
View file @
234ad423
...
...
@@ -5,17 +5,21 @@ import Data.Text (Text)
type
APIKey
=
Text
type
Offset
=
Integer
type
Query
=
Text
type
PerPage
=
Integer
type
WebEnv
=
Text
data
Config
=
Config
{
apiKey
::
Maybe
APIKey
,
query
::
Query
,
perPage
::
Maybe
PerPage
,
mWebEnv
::
Maybe
WebEnv
}
type
Env
=
ReaderT
Config
IO
-- | This is the default `retmax` value in ESearch.
defaultPerPage
::
PerPage
defaultPerPage
=
20
0
defaultPerPage
=
20
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment