Commit 06476735 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[API] add a result limit and improve error reporting (more detailed error info still needed).

parent c2e3e77c
......@@ -2,63 +2,8 @@
module Main where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow)
import PUBMED (crawler)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
-- | Parse a lazy ByteString as XML and apply an extraction function to
-- the document's root cursor.  'parseLBS_' is the throwing variant, so
-- malformed XML raises an exception rather than returning an error.
--
-- NOTE(review): the former @Show res@ constraint was never used by the
-- body; dropping it is strictly more general for callers.
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO: this parser needs at least one sub tag at the end of the input
-- (use endOfInput instead)
-- | Strip every "<sub>" and "</sub>" tag from the input, keeping the
-- text in between.  The input must terminate with one of those tags,
-- otherwise the trailing text segment is dropped (callers append a
-- final "</sub>" to guarantee this).
removeSub :: Parser ByteString
removeSub = do
  chunks <- many betweenTags
  pure (LBS.fromStrict (pack (concat chunks)))
  where
    betweenTags = manyTill anyChar (try tag)
    tag   = open <|> close
    open  = string "<sub>"
    close = string "</sub>"
-- | Search PubMed for a query, then fetch and parse the abstracts of
-- every matching document.  Any transport or parse failure is silently
-- collapsed to the empty list.
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
  mgr <- newManager tlsManagerSettings
  -- Both requests target the same NCBI E-utilities base URL.
  let env = mkClientEnv mgr $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  searchRes <- runClientM (search (Just rq)) env
  case searchRes of
    Left _ -> return []
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      fetchRes <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case fetchRes of
        Left _ -> return []
        Right (BsXml abstracts) ->
          -- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
          case parseOnly removeSub (LBS.toStrict abstracts <> "</sub>") of
            Left _  -> return []
            Right v -> pubMedParser v
-- | Entry point: crawl PubMed for the "bisphenol" query, limited to 3
-- results, and print either the error message or the parsed abstracts.
--
-- Fix: the span contained two competing definitions of 'main' (pre- and
-- post-commit diff lines interleaved); only the post-commit one is kept.
main :: IO ()
main = crawler "bisphenol" (Just 3) >>= print
......@@ -2,7 +2,7 @@
--
-- see: https://github.com/sol/hpack
--
-- hash: 2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
-- hash: 580738bee1b100e9a792dbb063ed94632428cf9c69f8ea4889aba5e65415e2fe
name: crawlerPubMed
version: 0.1.0.0
......@@ -34,10 +34,12 @@ library
hs-source-dirs:
src
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
......@@ -59,11 +61,13 @@ executable crawlerPubMed-exe
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
......@@ -86,11 +90,13 @@ test-suite crawlerPubMed-test
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
......
......@@ -2,41 +2,67 @@
module PUBMED where
import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
-- | Parse a lazy ByteString as XML and apply an extraction function to
-- the document's root cursor.  'parseLBS_' is the throwing variant, so
-- malformed XML raises an exception rather than returning an error.
--
-- NOTE(review): the former @Show res@ constraint was never used by the
-- body; dropping it is strictly more general for callers.
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
crawler :: T.Text -> IO [PubMed]
crawler rq = do
-- | TODO: this parser needs at least one sub tag at the end of the input
-- (use endOfInput instead)
-- | Strip every "<sub>" and "</sub>" tag from the input, keeping the
-- text in between.  The input must terminate with one of those tags,
-- otherwise the trailing text segment is dropped (callers append a
-- final "</sub>" to guarantee this).
removeSub :: Parser ByteString
removeSub = do
  chunks <- many betweenTags
  pure (LBS.fromStrict (pack (concat chunks)))
  where
    betweenTags = manyTill anyChar (try tag)
    tag   = open <|> close
    open  = string "<sub>"
    close = string "</sub>"
-- | A PubMed full-text search term.
type Query = Text

-- | Maximum number of results to request (PubMed's "retmax" parameter).
type Limit = Integer

-- | Search PubMed for a query and return the parsed abstracts, capped
-- at an optional limit.  Failures are reported as a 'Left' message.
--
-- Fix: the 'Query' alias was declared but not used in this signature;
-- using it here is interface-identical (type aliases are transparent)
-- and keeps the aliases meaningful.
crawler :: Query -> Maybe Limit -> IO (Either Text [PubMed])
crawler = runSimpleFindPubmedAbstractRequest
-- | Perform the two-step PubMed workflow:
--
--   1. esearch: look up document ids for the query, optionally capped
--      by the limit ("retmax");
--   2. efetch:  download the abstracts for those ids and parse them.
--
-- Every failure mode is collapsed into a human-readable 'Left' message;
-- the underlying error details are dropped (TODO: surface more info).
--
-- Fix: the span interleaved stale pre-commit diff lines (duplicate
-- 'search' call and duplicate 'Left' alternatives), which cannot
-- compile; this is the reconstructed post-commit version.
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do
  manager' <- newManager tlsManagerSettings
  -- Both requests target the same NCBI E-utilities base URL.
  let env = mkClientEnv manager' $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  res <- runClientM (search (Just query) limit) env
  case res of
    Left _ -> pure (Left "Error: PubMed Internet connection")
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      res' <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case res' of
        Left _ -> pure (Left "Error: PubMed API connection")
        Right (BsXml abstracts) ->
          -- Abstracts may contain <sub>/</sub> markup; strip it.  A
          -- trailing "</sub>" is appended because 'removeSub' needs at
          -- least one terminating tag (TODO: use endOfInput instead).
          case parseOnly removeSub (LBS.toStrict abstracts <> "</sub>") of
            Left _  -> pure (Left "Error: PubMed parser")
            Right v -> Right <$> pubMedParser v
......@@ -30,6 +30,7 @@ type PUBMEDAPI =
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "term" T.Text
:> QueryParam "retmax" Integer
:> Get '[BsXml] BsXml
:<|>
"efetch.fcgi"
......@@ -41,8 +42,7 @@ type PUBMEDAPI =
-- | 'Proxy' for the 'PUBMEDAPI' type — presumably passed to servant's
-- 'client' to derive 'search' and 'fetch'; the derivation site is
-- outside this view (TODO confirm).
pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy
-- | Client function for the esearch.fcgi endpoint: optional search
-- "term" and optional "retmax" result limit.
--
-- Fix: the span retained two conflicting type signatures for 'search'
-- (the stale pre-commit one alongside the new one); only the
-- post-commit two-argument signature is kept.
search :: Maybe T.Text -> Maybe Integer -> ClientM BsXml
fetch :: Maybe T.Text
-> Maybe T.Text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment