Commit 06476735 authored by Alexandre Delanoë

[API] adding limit and better error reporting (more detailed error info still needed).

parent c2e3e77c
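In short, the library's crawler entry point gains an optional result limit and now reports failures instead of returning an empty list, and the Servant search client gains the corresponding retmax parameter. The signatures, taken from the hunks below (old first, then new):

    -- before this commit
    crawler :: T.Text -> IO [PubMed]
    search  :: Maybe T.Text -> ClientM BsXml

    -- after: optional limit (retmax) and explicit error reporting
    type Limit = Integer
    crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed])
    search  :: Maybe T.Text -> Maybe Integer -> ClientM BsXml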
Main module (executable):

@@ -2,63 +2,8 @@
 module Main where
-import PUBMED.Client
-import PUBMED.Parser
-import Network.HTTP.Client (newManager)
-import Network.HTTP.Client.TLS (tlsManagerSettings)
-import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
-import Text.XML (parseLBS_, def)
-import Text.XML.Cursor (fromDocument, Cursor)
-import Data.Conduit (ConduitT)
-import Data.ByteString.Lazy (ByteString)
-import Data.ByteString.Char8 (pack)
-import Control.Monad.Catch (MonadThrow)
-import Control.Applicative
-import Data.Attoparsec.ByteString
-import Data.Attoparsec.ByteString.Char8 (anyChar)
-import qualified Data.ByteString.Lazy as LBS
-import qualified Data.ByteString as DB
-import qualified Data.Text as T
+import PUBMED (crawler)
-runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
-runParser parser = parser . fromDocument . parseLBS_ def
--- | TODO this parser need at least one subs at the end
--- (use endOfInput)
-removeSub :: Parser ByteString
-removeSub = do
-  dt <- many textWithBalise
-  pure $ LBS.fromStrict $ pack $ concat dt
-  where
-    textWithBalise = manyTill anyChar (try subs)
-    subs = sub <|> asub -- <|> isEndOfInput
-    sub = string "<sub>"
-    asub = string "</sub>"
-runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
-runSimpleFindPubmedAbstractRequest rq = do
-  manager' <- newManager tlsManagerSettings
-  res <- runClientM
-           (search (Just rq))
-           (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
-  case res of
-    (Left err) -> return []
-    (Right (BsXml docs)) -> do
-      let docIds = runParser parseDocId docs
-      res' <- runClientM
-                (fetch (Just "pubmed") (Just "abstract") docIds)
-                (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
-      case res' of
-        (Left err) -> return []
-        (Right (BsXml abstracts)) -> do
-          -- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
-          case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
-            (Left _) -> return []
-            (Right v) -> pubMedParser v
 main :: IO ()
-main = do
-  pubmeds <- runSimpleFindPubmedAbstractRequest "bisphenol"
-  print pubmeds
+main = crawler "bisphenol" (Just 3) >>= print
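The executable now only delegates to the library. A minimal sketch of calling the new entry point and handling the Either result explicitly rather than piping it straight to print (only crawler and its error strings come from this commit, the rest is illustrative):

    {-# LANGUAGE OverloadedStrings #-}
    module Main where

    import qualified Data.Text.IO as TIO
    import PUBMED (crawler)

    main :: IO ()
    main = do
      -- query "bisphenol", keep at most 3 results (retmax), report errors
      result <- crawler "bisphenol" (Just 3)
      case result of
        Left err      -> TIO.putStrLn err   -- e.g. "Error: PubMed Internet connection"
        Right pubmeds -> print pubmeds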
crawlerPubMed.cabal (generated by hpack):

@@ -2,7 +2,7 @@
 --
 -- see: https://github.com/sol/hpack
 --
--- hash: 2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
+-- hash: 580738bee1b100e9a792dbb063ed94632428cf9c69f8ea4889aba5e65415e2fe
 name:           crawlerPubMed
 version:        0.1.0.0
@@ -34,10 +34,12 @@ library
   hs-source-dirs:
       src
   build-depends:
-      base >=4.7 && <5
+      attoparsec
+    , base >=4.7 && <5
     , bytestring
     , conduit
     , data-time-segment
+    , either
     , exceptions
     , http-client
     , http-client-tls
@@ -59,11 +61,13 @@ executable crawlerPubMed-exe
       app
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   build-depends:
-      base >=4.7 && <5
+      attoparsec
+    , base >=4.7 && <5
     , bytestring
     , conduit
     , crawlerPubMed
     , data-time-segment
+    , either
     , exceptions
     , http-client
     , http-client-tls
@@ -86,11 +90,13 @@ test-suite crawlerPubMed-test
       test
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   build-depends:
-      base >=4.7 && <5
+      attoparsec
+    , base >=4.7 && <5
     , bytestring
     , conduit
     , crawlerPubMed
     , data-time-segment
+    , either
     , exceptions
     , http-client
     , http-client-tls
PUBMED module (library):

@@ -2,41 +2,67 @@
 module PUBMED where
+import Data.Text (Text)
 import PUBMED.Client
 import PUBMED.Parser
 import Network.HTTP.Client (newManager)
 import Network.HTTP.Client.TLS (tlsManagerSettings)
 import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
 import Text.XML (parseLBS_, def)
 import Text.XML.Cursor (fromDocument, Cursor)
+import Data.Conduit (ConduitT)
+import Data.ByteString.Lazy (ByteString)
+import Data.ByteString.Char8 (pack)
+import Control.Monad.Catch (MonadThrow)
+import Control.Applicative
+import Data.Attoparsec.ByteString
+import Data.Attoparsec.ByteString.Char8 (anyChar)
 import qualified Data.ByteString.Lazy as LBS
+import qualified Data.ByteString as DB
 import qualified Data.Text as T
 runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
 runParser parser = parser . fromDocument . parseLBS_ def
-crawler :: T.Text -> IO [PubMed]
-crawler rq = do
+-- | TODO this parser need at least one subs at the end
+-- (use endOfInput)
+removeSub :: Parser ByteString
+removeSub = do
+  dt <- many textWithBalise
+  pure $ LBS.fromStrict $ pack $ concat dt
+  where
+    textWithBalise = manyTill anyChar (try subs)
+    subs = sub <|> asub -- <|> isEndOfInput
+    sub = string "<sub>"
+    asub = string "</sub>"
+type Query = Text
+type Limit = Integer
+crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed])
+crawler = runSimpleFindPubmedAbstractRequest
+runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
+runSimpleFindPubmedAbstractRequest query limit = do
   manager' <- newManager tlsManagerSettings
   res <- runClientM
-           (search (Just rq))
+           (search (Just query) limit)
            (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
   case res of
-    (Left err) -> return []
+    (Left _) -> pure (Left "Error: PubMed Internet connection")
     (Right (BsXml docs)) -> do
       let docIds = runParser parseDocId docs
       res' <- runClientM
                 (fetch (Just "pubmed") (Just "abstract") docIds)
                 (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
       case res' of
-        (Left err) -> return []
-        (Right (BsXml abstracts)) ->
-          pubMedParser abstracts
+        (Left _) -> pure (Left "Error: PubMed API connection")
+        (Right (BsXml abstracts)) -> do
+          -- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
+          case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
+            (Left err) -> pure (Left "Error: PubMed parser")
+            (Right v) -> Right <$> pubMedParser v
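removeSub strips the <sub>/</sub> markup that PubMed leaves inside abstracts before they are handed to pubMedParser. Each textWithBalise chunk only terminates on one of those tags, which is why the caller appends a final "</sub>" so the trailing text is not lost (hence the TODO about endOfInput). A small sketch of that behaviour in isolation, assuming removeSub is visible to the caller (module PUBMED has no explicit export list, so it is exported):

    {-# LANGUAGE OverloadedStrings #-}

    import Data.Attoparsec.ByteString (parseOnly)
    import PUBMED (removeSub)

    -- Illustrative only: runs removeSub exactly as the crawler does,
    -- appending a closing "</sub>" before parsing.
    demo :: IO ()
    demo =
      case parseOnly removeSub ("H<sub>2</sub>O boils at 100 degrees" <> "</sub>") of
        Left e   -> putStrLn ("parse error: " ++ e)
        Right bs -> print bs   -- "H2O boils at 100 degrees"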
Servant API definition (PUBMEDAPI):

@@ -30,6 +30,7 @@ type PUBMEDAPI =
         -- :> QueryParam "db" DB
         -- not mandatory since the base db is pubmed
         :> QueryParam "term" T.Text
+        :> QueryParam "retmax" Integer
         :> Get '[BsXml] BsXml
   :<|>
     "efetch.fcgi"
@@ -41,8 +42,7 @@ type PUBMEDAPI =
 pubmedApi :: Proxy PUBMEDAPI
 pubmedApi = Proxy
-search :: Maybe T.Text
-       -> ClientM BsXml
+search :: Maybe T.Text -> Maybe Integer -> ClientM BsXml
 fetch :: Maybe T.Text
       -> Maybe T.Text
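The added QueryParam "retmax" Integer is what carries the limit to the search endpoint: servant-client derives one more Maybe Integer argument for search, a Just value is rendered as &retmax=N in the query string, and Nothing omits the parameter so PubMed falls back to its default. A sketch of a call site, assuming search and BsXml are exported by PUBMED.Client as the unqualified imports elsewhere in this commit suggest:

    {-# LANGUAGE OverloadedStrings #-}

    import Servant.Client (ClientM)
    import PUBMED.Client (BsXml, search)

    -- Ask for at most three document ids; the generated request ends in
    -- "...&term=bisphenol&retmax=3".
    searchBisphenol :: ClientM BsXml
    searchBisphenol = search (Just "bisphenol") (Just 3)

runSimpleFindPubmedAbstractRequest above simply passes its limit through as search (Just query) limit.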