Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
pubmed
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
2
Issues
2
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
pubmed
Commits
06476735
Commit
06476735
authored
Jul 05, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[API] adding limit and better error report (need more error info).
parent
c2e3e77c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
52 additions
and
75 deletions
+52
-75
Main.hs
app/Main.hs
+2
-57
crawlerPubMed.cabal
crawlerPubMed.cabal
+10
-4
PUBMED.hs
src/PUBMED.hs
+38
-12
Client.hs
src/PUBMED/Client.hs
+2
-2
No files found.
app/Main.hs
View file @
06476735
...
@@ -2,63 +2,8 @@
...
@@ -2,63 +2,8 @@
module
Main
where
module
Main
where
import
PUBMED.Client
import
PUBMED
(
crawler
)
import
PUBMED.Parser
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
Scheme
(
..
))
import
Text.XML
(
parseLBS_
,
def
)
import
Text.XML.Cursor
(
fromDocument
,
Cursor
)
import
Data.Conduit
(
ConduitT
)
import
Data.ByteString.Lazy
(
ByteString
)
import
Data.ByteString.Char8
(
pack
)
import
Control.Monad.Catch
(
MonadThrow
)
import
Control.Applicative
import
Data.Attoparsec.ByteString
import
Data.Attoparsec.ByteString.Char8
(
anyChar
)
import
qualified
Data.ByteString.Lazy
as
LBS
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Text
as
T
runParser
::
Show
res
=>
(
Cursor
->
res
)
->
LBS
.
ByteString
->
res
runParser
parser
=
parser
.
fromDocument
.
parseLBS_
def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub
::
Parser
ByteString
removeSub
=
do
dt
<-
many
textWithBalise
pure
$
LBS
.
fromStrict
$
pack
$
concat
dt
where
textWithBalise
=
manyTill
anyChar
(
try
subs
)
subs
=
sub
<|>
asub
-- <|> isEndOfInput
sub
=
string
"<sub>"
asub
=
string
"</sub>"
runSimpleFindPubmedAbstractRequest
::
T
.
Text
->
IO
[
PubMed
]
runSimpleFindPubmedAbstractRequest
rq
=
do
manager'
<-
newManager
tlsManagerSettings
res
<-
runClientM
(
search
(
Just
rq
))
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
case
res
of
(
Left
err
)
->
return
[]
(
Right
(
BsXml
docs
))
->
do
let
docIds
=
runParser
parseDocId
docs
res'
<-
runClientM
(
fetch
(
Just
"pubmed"
)
(
Just
"abstract"
)
docIds
)
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
case
res'
of
(
Left
err
)
->
return
[]
(
Right
(
BsXml
abstracts
))
->
do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case
(
parseOnly
removeSub
$
LBS
.
toStrict
abstracts
<>
"</sub>"
)
of
(
Left
_
)
->
return
[]
(
Right
v
)
->
pubMedParser
v
main
::
IO
()
main
::
IO
()
main
=
do
main
=
crawler
"bisphenol"
(
Just
3
)
>>=
print
pubmeds
<-
runSimpleFindPubmedAbstractRequest
"bisphenol"
print
pubmeds
crawlerPubMed.cabal
View file @
06476735
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
--
--
-- see: https://github.com/sol/hpack
-- see: https://github.com/sol/hpack
--
--
-- hash:
2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
-- hash:
580738bee1b100e9a792dbb063ed94632428cf9c69f8ea4889aba5e65415e2fe
name: crawlerPubMed
name: crawlerPubMed
version: 0.1.0.0
version: 0.1.0.0
...
@@ -34,10 +34,12 @@ library
...
@@ -34,10 +34,12 @@ library
hs-source-dirs:
hs-source-dirs:
src
src
build-depends:
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, bytestring
, conduit
, conduit
, data-time-segment
, data-time-segment
, either
, exceptions
, exceptions
, http-client
, http-client
, http-client-tls
, http-client-tls
...
@@ -59,11 +61,13 @@ executable crawlerPubMed-exe
...
@@ -59,11 +61,13 @@ executable crawlerPubMed-exe
app
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, bytestring
, conduit
, conduit
, crawlerPubMed
, crawlerPubMed
, data-time-segment
, data-time-segment
, either
, exceptions
, exceptions
, http-client
, http-client
, http-client-tls
, http-client-tls
...
@@ -86,11 +90,13 @@ test-suite crawlerPubMed-test
...
@@ -86,11 +90,13 @@ test-suite crawlerPubMed-test
test
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, bytestring
, conduit
, conduit
, crawlerPubMed
, crawlerPubMed
, data-time-segment
, data-time-segment
, either
, exceptions
, exceptions
, http-client
, http-client
, http-client-tls
, http-client-tls
...
...
src/PUBMED.hs
View file @
06476735
...
@@ -2,41 +2,67 @@
...
@@ -2,41 +2,67 @@
module
PUBMED
where
module
PUBMED
where
import
Data.Text
(
Text
)
import
PUBMED.Client
import
PUBMED.Client
import
PUBMED.Parser
import
PUBMED.Parser
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
Scheme
(
..
))
import
Servant.Client
(
runClientM
,
mkClientEnv
,
BaseUrl
(
..
),
Scheme
(
..
))
import
Text.XML
(
parseLBS_
,
def
)
import
Text.XML
(
parseLBS_
,
def
)
import
Text.XML.Cursor
(
fromDocument
,
Cursor
)
import
Text.XML.Cursor
(
fromDocument
,
Cursor
)
import
Data.Conduit
(
ConduitT
)
import
Data.ByteString.Lazy
(
ByteString
)
import
Data.ByteString.Char8
(
pack
)
import
Control.Monad.Catch
(
MonadThrow
)
import
Control.Applicative
import
Data.Attoparsec.ByteString
import
Data.Attoparsec.ByteString.Char8
(
anyChar
)
import
qualified
Data.ByteString.Lazy
as
LBS
import
qualified
Data.ByteString.Lazy
as
LBS
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Text
as
T
import
qualified
Data.Text
as
T
runParser
::
Show
res
=>
(
Cursor
->
res
)
->
LBS
.
ByteString
->
res
runParser
::
Show
res
=>
(
Cursor
->
res
)
->
LBS
.
ByteString
->
res
runParser
parser
=
parser
.
fromDocument
.
parseLBS_
def
runParser
parser
=
parser
.
fromDocument
.
parseLBS_
def
crawler
::
T
.
Text
->
IO
[
PubMed
]
-- | TODO this parser need at least one subs at the end
crawler
rq
=
do
-- (use endOfInput)
removeSub
::
Parser
ByteString
removeSub
=
do
dt
<-
many
textWithBalise
pure
$
LBS
.
fromStrict
$
pack
$
concat
dt
where
textWithBalise
=
manyTill
anyChar
(
try
subs
)
subs
=
sub
<|>
asub
-- <|> isEndOfInput
sub
=
string
"<sub>"
asub
=
string
"</sub>"
type
Query
=
Text
type
Limit
=
Integer
crawler
::
Text
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
crawler
=
runSimpleFindPubmedAbstractRequest
runSimpleFindPubmedAbstractRequest
::
Text
->
Maybe
Limit
->
IO
(
Either
Text
[
PubMed
])
runSimpleFindPubmedAbstractRequest
query
limit
=
do
manager'
<-
newManager
tlsManagerSettings
manager'
<-
newManager
tlsManagerSettings
res
<-
runClientM
res
<-
runClientM
(
search
(
Just
rq
)
)
(
search
(
Just
query
)
limit
)
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
case
res
of
case
res
of
(
Left
err
)
->
return
[]
(
Left
_
)
->
pure
(
Left
"Error: PubMed Internet connection"
)
(
Right
(
BsXml
docs
))
->
do
(
Right
(
BsXml
docs
))
->
do
let
docIds
=
runParser
parseDocId
docs
let
docIds
=
runParser
parseDocId
docs
res'
<-
runClientM
res'
<-
runClientM
(
fetch
(
Just
"pubmed"
)
(
Just
"abstract"
)
docIds
)
(
fetch
(
Just
"pubmed"
)
(
Just
"abstract"
)
docIds
)
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
(
mkClientEnv
manager'
$
BaseUrl
Https
"eutils.ncbi.nlm.nih.gov"
443
"entrez/eutils"
)
case
res'
of
case
res'
of
(
Left
err
)
->
return
[]
(
Left
_
)
->
pure
(
Left
"Error: PubMed API connection"
)
(
Right
(
BsXml
abstracts
))
->
(
Right
(
BsXml
abstracts
))
->
do
pubMedParser
abstracts
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case
(
parseOnly
removeSub
$
LBS
.
toStrict
abstracts
<>
"</sub>"
)
of
(
Left
err
)
->
pure
(
Left
"Error: PubMed parser"
)
(
Right
v
)
->
Right
<$>
pubMedParser
v
src/PUBMED/Client.hs
View file @
06476735
...
@@ -30,6 +30,7 @@ type PUBMEDAPI =
...
@@ -30,6 +30,7 @@ type PUBMEDAPI =
-- :> QueryParam "db" DB
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
-- not mandatory since the base db is pubmed
:>
QueryParam
"term"
T
.
Text
:>
QueryParam
"term"
T
.
Text
:>
QueryParam
"retmax"
Integer
:>
Get
'[
B
sXml
]
BsXml
:>
Get
'[
B
sXml
]
BsXml
:<|>
:<|>
"efetch.fcgi"
"efetch.fcgi"
...
@@ -41,8 +42,7 @@ type PUBMEDAPI =
...
@@ -41,8 +42,7 @@ type PUBMEDAPI =
pubmedApi
::
Proxy
PUBMEDAPI
pubmedApi
::
Proxy
PUBMEDAPI
pubmedApi
=
Proxy
pubmedApi
=
Proxy
search
::
Maybe
T
.
Text
search
::
Maybe
T
.
Text
->
Maybe
Integer
->
ClientM
BsXml
->
ClientM
BsXml
fetch
::
Maybe
T
.
Text
fetch
::
Maybe
T
.
Text
->
Maybe
T
.
Text
->
Maybe
T
.
Text
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment