Commit 6495eb7a authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] implement conduit fetch for pubmed

parent 9cdba642
......@@ -4,4 +4,4 @@ import PUBMED (getMetadataWith)
main :: IO ()
main = getMetadataWith "bisphenol" (Just 100) >>= print
main = getMetadataWith "bisphenol" Nothing (Just 100) >>= print
......@@ -28,6 +28,7 @@ library
PUBMED
PUBMED.Client
PUBMED.Parser
PUBMED.Test
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
......@@ -46,6 +47,7 @@ library
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
......@@ -78,6 +80,7 @@ executable crawlerPubMed-exe
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
......@@ -113,6 +116,7 @@ test-suite crawlerPubMed-test
, data-time-segment
, either
, exceptions
, ghc
, http-client
, http-client-tls
, http-media
......
......@@ -33,6 +33,7 @@ dependencies:
- data-time-segment
- either
- exceptions
- ghc
- http-client
- http-client-tls
- http-media
......
module PUBMED where
import Conduit
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
import Data.ByteString.Lazy (ByteString)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import PUBMED.Client
import PUBMED.Parser
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv, ClientError, Scheme(..))
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List as List
import qualified Data.Text as T
......@@ -25,14 +27,43 @@ pmSearchPath = "entrez/eutils"
pmPort :: Int
pmPort = 443
batchSize :: Int
batchSize = 2000
defaultEnv :: IO ClientEnv
defaultEnv = do
manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
-- | API main function
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest
getMetadataWithC :: Text -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC query limit = do
env <- defaultEnv
-- First, estimate the total number of documents
eRes <- runClientM (search (Just query) Nothing (Just 1)) env
pure $ get' env query limit batchSize <$> eRes
where
get' :: ClientEnv -> Text -> Maybe Limit -> Int -> BsXml -> (Maybe Integer, ConduitT () PubMed IO ())
get' env q l perPage (BsXml res) =
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage env q perPage))
where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: ClientEnv -> Text -> Int -> Int -> IO [PubMed]
getPage env q perPage pageNum = do
let offset = fromIntegral $ pageNum * perPage
eDocs <- runSimpleFindPubmedAbstractRequest q (Just offset) (Just $ fromIntegral pageNum)
pure $ case eDocs of
Left err -> []
Right docs -> docs
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
......@@ -57,6 +88,7 @@ type Limit = Integer
runMultipleFPAR :: [Integer]
-> IO (Either Text [PubMed])
runMultipleFPAR [] = pure $ Right []
runMultipleFPAR ids = List.foldl1' concat'
<$> mapM runSimpleFetchPubmedAbstractRequest (by 300 ids)
where
......@@ -90,18 +122,18 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do
eDocIds <- searchDocIds query limit
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query offset limit = do
eDocIds <- searchDocIds query offset limit
case eDocIds of
Left err -> pure $ Left err
Right docIds -> runMultipleFPAR docIds
searchDocIds :: Text -> Maybe Limit -> IO (Either Text [Integer])
searchDocIds query limit = do
searchDocIds :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [Integer])
searchDocIds query offset limit = do
env <- defaultEnv
res <- runClientM
(search (Just query) limit)
(search (Just query) offset limit)
env
case res of
(Left err) -> pure (Left $ T.pack $ show err)
......
......@@ -23,6 +23,7 @@ type PUBMEDAPI =
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "term" T.Text
:> QueryParam "retstart" Integer
:> QueryParam "retmax" Integer
:> Get '[BsXml] BsXml
:<|>
......@@ -35,7 +36,7 @@ type PUBMEDAPI =
pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy
search :: Maybe T.Text -> Maybe Integer -> ClientM BsXml
search :: Maybe T.Text -> Maybe Integer -> Maybe Integer -> ClientM BsXml
fetch :: Maybe T.Text
-> Maybe T.Text
......
......@@ -3,7 +3,7 @@
module PUBMED.Parser where
import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism')
import Data.Maybe (catMaybes)
import Data.Maybe (catMaybes, fromMaybe)
import Data.Time (UTCTime(..))
import Data.Time.Segment (jour)
import Prelude hiding (head)
......@@ -12,6 +12,8 @@ import qualified Data.ByteString.Lazy as DBL
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import Panic (panic)
import qualified Text.Read as TR
import qualified Text.Taggy.Lens as TTL
namedEl name = TTL.elements . TTL.named (only name)
......@@ -23,9 +25,15 @@ contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
deepContent = TTL.children . traverse . contentWithChildren
parseDocIds :: TL.Text -> [Integer]
parseDocIds txt = map (\s -> read (T.unpack s) :: Integer) parsed
parseDocIds txt = map parseId parsed
where
parsed = txt ^.. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "IdList" . namedEl "Id" . TTL.contents
parseId s = case (TR.readMaybe (T.unpack s) :: Maybe Integer) of
Nothing -> panic $ "Can't read doc id from: " <> (T.unpack s)
Just cnt -> cnt
parseDocCount :: TL.Text -> Maybe Integer
parseDocCount txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "Count" . TTL.contents
data PubMed =
PubMed { pubmed_article :: PubMedArticle
......@@ -73,9 +81,9 @@ parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
, pubmedDate_month = m
, pubmedDate_day = d }
where
y = read $ T.unpack $ el ^. namedEl "Year" . TTL.contents
m = read $ T.unpack $ el ^. namedEl "Month" . TTL.contents
d = read $ T.unpack $ el ^. namedEl "Day" . TTL.contents
y = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Year" . TTL.contents
m = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Month" . TTL.contents
d = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Day" . TTL.contents
pubMedArticle el = PubMedArticle { pubmed_title = Just $ el ^. title
, pubmed_journal = el ^? journalTitle
, pubmed_abstract = el ^.. abstract
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment