Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pubmed
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
2
Issues
2
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
pubmed
Commits
6495eb7a
Commit
6495eb7a
authored
Feb 24, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[conduit] implement conduit fetch for pubmed
parent
9cdba642
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
61 additions
and
15 deletions
+61
-15
Main.hs
app/Main.hs
+1
-1
crawlerPubMed.cabal
crawlerPubMed.cabal
+4
-0
package.yaml
package.yaml
+1
-0
PUBMED.hs
src/PUBMED.hs
+40
-8
Client.hs
src/PUBMED/Client.hs
+2
-1
Parser.hs
src/PUBMED/Parser.hs
+13
-5
No files found.
app/Main.hs
View file @
6495eb7a
...
...
@@ -4,4 +4,4 @@ import PUBMED (getMetadataWith)
main
::
IO
()
main
=
getMetadataWith
"bisphenol"
(
Just
100
)
>>=
print
main
=
getMetadataWith
"bisphenol"
Nothing
(
Just
100
)
>>=
print
crawlerPubMed.cabal
View file @
6495eb7a
...
...
@@ -28,6 +28,7 @@ library
PUBMED
PUBMED.Client
PUBMED.Parser
PUBMED.Test
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
...
...
@@ -46,6 +47,7 @@ library
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
...
...
@@ -78,6 +80,7 @@ executable crawlerPubMed-exe
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
...
...
@@ -113,6 +116,7 @@ test-suite crawlerPubMed-test
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
...
...
package.yaml
View file @
6495eb7a
...
...
@@ -33,6 +33,7 @@ dependencies:
-
data-time-segment
-
either
-
exceptions
-
ghc
-
http-client
-
http-client-tls
-
http-media
...
...
src/PUBMED.hs
View file @
6495eb7a
module
PUBMED
where
import
Conduit
import
Control.Applicative
import
Data.Attoparsec.ByteString
import
Data.Attoparsec.ByteString.Char8
(
anyChar
)
import
Data.ByteString.Char8
(
pack
)
import
Data.ByteString.Lazy
(
ByteString
)
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
(
Text
)
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
PUBMED.Client
import
PUBMED.Parser
import
Prelude
hiding
(
takeWhile
)
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
Scheme
(
..
))
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
ClientEnv
,
ClientError
,
Scheme
(
..
))
import
qualified
Data.ByteString.Lazy
as
LBS
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
T
...
...
@@ -25,14 +27,43 @@ pmSearchPath = "entrez/eutils"
pmPort
::
Int
pmPort
=
443
batchSize
::
Int
batchSize
=
2000
defaultEnv
::
IO
ClientEnv
defaultEnv
=
do
manager'
<-
newManager
tlsManagerSettings
pure
$
mkClientEnv
manager'
$
BaseUrl
Https
pmHost
pmPort
pmSearchPath
-- | API main function
getMetadataWith
::
Text
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
getMetadataWith
::
Text
->
Maybe
Integer
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
getMetadataWith
=
runSimpleFindPubmedAbstractRequest
getMetadataWithC
::
Text
->
Maybe
Limit
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
PubMed
IO
()
))
getMetadataWithC
query
limit
=
do
env
<-
defaultEnv
-- First, estimate the total number of documents
eRes
<-
runClientM
(
search
(
Just
query
)
Nothing
(
Just
1
))
env
pure
$
get'
env
query
limit
batchSize
<$>
eRes
where
get'
::
ClientEnv
->
Text
->
Maybe
Limit
->
Int
->
BsXml
->
(
Maybe
Integer
,
ConduitT
()
PubMed
IO
()
)
get'
env
q
l
perPage
(
BsXml
res
)
=
(
Just
numResults
,
yieldMany
[
0
..
]
.|
takeC
(
fromInteger
numPages
)
.|
concatMapMC
(
getPage
env
q
perPage
))
where
numResults
=
fromMaybe
0
$
parseDocCount
$
TL
.
fromStrict
$
TE
.
decodeUtf8
$
LBS
.
toStrict
res
numPages
=
numResults
`
div
`
(
fromIntegral
perPage
)
+
1
getPage
::
ClientEnv
->
Text
->
Int
->
Int
->
IO
[
PubMed
]
getPage
env
q
perPage
pageNum
=
do
let
offset
=
fromIntegral
$
pageNum
*
perPage
eDocs
<-
runSimpleFindPubmedAbstractRequest
q
(
Just
offset
)
(
Just
$
fromIntegral
pageNum
)
pure
$
case
eDocs
of
Left
err
->
[]
Right
docs
->
docs
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub
::
Parser
ByteString
...
...
@@ -57,6 +88,7 @@ type Limit = Integer
runMultipleFPAR
::
[
Integer
]
->
IO
(
Either
Text
[
PubMed
])
runMultipleFPAR
[]
=
pure
$
Right
[]
runMultipleFPAR
ids
=
List
.
foldl1'
concat'
<$>
mapM
runSimpleFetchPubmedAbstractRequest
(
by
300
ids
)
where
...
...
@@ -90,18 +122,18 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest
::
Text
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
runSimpleFindPubmedAbstractRequest
query
limit
=
do
eDocIds
<-
searchDocIds
query
limit
runSimpleFindPubmedAbstractRequest
::
Text
->
Maybe
Integer
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
runSimpleFindPubmedAbstractRequest
query
offset
limit
=
do
eDocIds
<-
searchDocIds
query
offset
limit
case
eDocIds
of
Left
err
->
pure
$
Left
err
Right
docIds
->
runMultipleFPAR
docIds
searchDocIds
::
Text
->
Maybe
Limit
->
IO
(
Either
Text
[
Integer
])
searchDocIds
query
limit
=
do
searchDocIds
::
Text
->
Maybe
Integer
->
Maybe
Limit
->
IO
(
Either
Text
[
Integer
])
searchDocIds
query
offset
limit
=
do
env
<-
defaultEnv
res
<-
runClientM
(
search
(
Just
query
)
limit
)
(
search
(
Just
query
)
offset
limit
)
env
case
res
of
(
Left
err
)
->
pure
(
Left
$
T
.
pack
$
show
err
)
...
...
src/PUBMED/Client.hs
View file @
6495eb7a
...
...
@@ -23,6 +23,7 @@ type PUBMEDAPI =
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:>
QueryParam
"term"
T
.
Text
:>
QueryParam
"retstart"
Integer
:>
QueryParam
"retmax"
Integer
:>
Get
'[
B
sXml
]
BsXml
:<|>
...
...
@@ -35,7 +36,7 @@ type PUBMEDAPI =
pubmedApi
::
Proxy
PUBMEDAPI
pubmedApi
=
Proxy
search
::
Maybe
T
.
Text
->
Maybe
Integer
->
ClientM
BsXml
search
::
Maybe
T
.
Text
->
Maybe
Integer
->
Maybe
Integer
->
ClientM
BsXml
fetch
::
Maybe
T
.
Text
->
Maybe
T
.
Text
...
...
src/PUBMED/Parser.hs
View file @
6495eb7a
...
...
@@ -3,7 +3,7 @@
module
PUBMED.Parser
where
import
Control.Lens
((
^?
),
(
^.
),
(
^..
),
only
,
to
,
ix
,
prism'
,
Prism
'
)
import
Data.Maybe
(
catMaybes
)
import
Data.Maybe
(
catMaybes
,
fromMaybe
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.Time.Segment
(
jour
)
import
Prelude
hiding
(
head
)
...
...
@@ -12,6 +12,8 @@ import qualified Data.ByteString.Lazy as DBL
import
qualified
Data.Text
as
T
import
qualified
Data.Text.Lazy
as
TL
import
qualified
Data.Text.Lazy.IO
as
TLIO
import
Panic
(
panic
)
import
qualified
Text.Read
as
TR
import
qualified
Text.Taggy.Lens
as
TTL
namedEl
name
=
TTL
.
elements
.
TTL
.
named
(
only
name
)
...
...
@@ -23,9 +25,15 @@ contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
deepContent
=
TTL
.
children
.
traverse
.
contentWithChildren
parseDocIds
::
TL
.
Text
->
[
Integer
]
parseDocIds
txt
=
map
(
\
s
->
read
(
T
.
unpack
s
)
::
Integer
)
parsed
parseDocIds
txt
=
map
parseId
parsed
where
parsed
=
txt
^..
TTL
.
html
.
TTL
.
allNamed
(
only
"eSearchResult"
)
.
namedEl
"IdList"
.
namedEl
"Id"
.
TTL
.
contents
parseId
s
=
case
(
TR
.
readMaybe
(
T
.
unpack
s
)
::
Maybe
Integer
)
of
Nothing
->
panic
$
"Can't read doc id from: "
<>
(
T
.
unpack
s
)
Just
cnt
->
cnt
parseDocCount
::
TL
.
Text
->
Maybe
Integer
parseDocCount
txt
=
TR
.
readMaybe
$
T
.
unpack
$
txt
^.
TTL
.
html
.
TTL
.
allNamed
(
only
"eSearchResult"
)
.
namedEl
"Count"
.
TTL
.
contents
data
PubMed
=
PubMed
{
pubmed_article
::
PubMedArticle
...
...
@@ -73,9 +81,9 @@ parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
,
pubmedDate_month
=
m
,
pubmedDate_day
=
d
}
where
y
=
read
$
T
.
unpack
$
el
^.
namedEl
"Year"
.
TTL
.
contents
m
=
read
$
T
.
unpack
$
el
^.
namedEl
"Month"
.
TTL
.
contents
d
=
read
$
T
.
unpack
$
el
^.
namedEl
"Day"
.
TTL
.
contents
y
=
fromMaybe
1
$
TR
.
readMaybe
$
T
.
unpack
$
el
^.
namedEl
"Year"
.
TTL
.
contents
m
=
fromMaybe
1
$
TR
.
readMaybe
$
T
.
unpack
$
el
^.
namedEl
"Month"
.
TTL
.
contents
d
=
fromMaybe
1
$
TR
.
readMaybe
$
T
.
unpack
$
el
^.
namedEl
"Day"
.
TTL
.
contents
pubMedArticle
el
=
PubMedArticle
{
pubmed_title
=
Just
$
el
^.
title
,
pubmed_journal
=
el
^?
journalTitle
,
pubmed_abstract
=
el
^..
abstract
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment