Commit 47ed870c authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Refactoring

Added Env type to make a ReaderT wrapper around Config.

Also, Main.hs refactored to accept command line parameters, this makes
testing things out easier.
parent 63eb8187
module Main where module Main where
import Data.Semigroup ((<>)) import Conduit
import Control.Monad.Reader (runReaderT)
import qualified Data.Text as Text import qualified Data.Text as Text
import Options.Applicative import Options.Applicative
import System.Environment (getArgs)
import PUBMED (getMetadataWith, getMetadataWithC) import PUBMED (getMetadataWith, getMetadataWithC)
import PUBMED.Types import PUBMED.Types
...@@ -23,32 +23,33 @@ data Command ...@@ -23,32 +23,33 @@ data Command
| Fetch FetchParams | Fetch FetchParams
metaParams :: Parser Command metaParams :: Parser Command
metaParams = do metaParams = Meta <$>
api_key <- optional (strOption (MetaParams
<$> optional (strOption
( long "api-key" ( long "api-key"
<> metavar "API_KEY" <> metavar "API_KEY"
<> help "Pubmed API key")) <> help "Pubmed API key"))
term <- strArgument (metavar "term") <*> strArgument (metavar "term"))
pure $ Meta (MetaParams api_key term)
fetchParams :: Parser Command fetchParams :: Parser Command
fetchParams = do fetchParams = Fetch <$>
api_key <- optional (strOption (FetchParams
<$> optional (strOption
( long "api-key" ( long "api-key"
<> metavar "API_KEY" <> metavar "API_KEY"
<> help "Pubmed API key")) <> help "Pubmed API key"))
limit <- option auto <*> option auto
( long "limit" ( long "limit"
<> help "Maximum number of documents" <> help "Maximum number of documents"
<> showDefault <> showDefault
<> value 100 <> value 100
<> metavar "INT" ) <> metavar "INT" )
term <- strArgument (metavar "term") <*> strArgument (metavar "term"))
pure $ Fetch (FetchParams api_key limit term)
params :: Parser Command params :: Parser Command
params = metaParams <|> fetchParams params = subparser
(command "meta" (info metaParams (progDesc "Download only metadata"))
<> command "fetch" (info fetchParams (progDesc "Fetch docs")))
opts :: ParserInfo Command opts :: ParserInfo Command
opts = info (params <**> helper) opts = info (params <**> helper)
...@@ -64,14 +65,16 @@ run :: Command -> IO () ...@@ -64,14 +65,16 @@ run :: Command -> IO ()
run (Meta (MetaParams { mp_mAPIKey, mp_term })) = do run (Meta (MetaParams { mp_mAPIKey, mp_term })) = do
let config = Config { mAPIKey = Text.pack <$> mp_mAPIKey } let config = Config { mAPIKey = Text.pack <$> mp_mAPIKey }
eDocs <- getMetadataWithC config (Text.pack mp_term) Nothing eDocs <- runReaderT (getMetadataWithC (Text.pack mp_term) Nothing) config
case eDocs of case eDocs of
Left err -> print err Left err -> print err
Right (count, _docs) -> print $ show count Right (count, _docsC) -> print $ show count
run (Fetch (FetchParams { fp_mAPIKey, fp_term, fp_limit })) = do run (Fetch (FetchParams { fp_mAPIKey, fp_term, fp_limit })) = do
let config = Config { mAPIKey = Text.pack <$> fp_mAPIKey } let config = Config { mAPIKey = Text.pack <$> fp_mAPIKey }
eDocs <- getMetadataWithC config (Text.pack fp_term) (Just $ fromIntegral fp_limit) eDocs <- runReaderT (getMetadataWithC (Text.pack fp_term) (Just $ fromIntegral fp_limit)) config
case eDocs of case eDocs of
Left err -> print err Left err -> print err
Right (count, docs) -> print $ show count Right (count, docsC) -> do
print $ show count
runConduit $ docsC .| mapM_C print
...@@ -53,6 +53,7 @@ library ...@@ -53,6 +53,7 @@ library
, http-client-tls , http-client-tls
, http-media , http-media
, lens , lens
, mtl
, optparse-applicative , optparse-applicative
, protolude , protolude
, servant , servant
...@@ -88,6 +89,7 @@ executable crawlerPubMed-exe ...@@ -88,6 +89,7 @@ executable crawlerPubMed-exe
, http-client-tls , http-client-tls
, http-media , http-media
, lens , lens
, mtl
, optparse-applicative , optparse-applicative
, protolude , protolude
, servant , servant
...@@ -126,6 +128,7 @@ test-suite crawlerPubMed-test ...@@ -126,6 +128,7 @@ test-suite crawlerPubMed-test
, http-client-tls , http-client-tls
, http-media , http-media
, lens , lens
, mtl
, optparse-applicative , optparse-applicative
, protolude , protolude
, servant , servant
......
...@@ -39,6 +39,7 @@ dependencies: ...@@ -39,6 +39,7 @@ dependencies:
- http-client-tls - http-client-tls
- http-media - http-media
- lens - lens
- mtl
- optparse-applicative - optparse-applicative
- protolude - protolude
- servant - servant
......
...@@ -3,6 +3,7 @@ module PUBMED where ...@@ -3,6 +3,7 @@ module PUBMED where
import Conduit import Conduit
import Control.Applicative import Control.Applicative
import Control.Concurrent (threadDelay) import Control.Concurrent (threadDelay)
import Control.Monad.Reader (ask, runReaderT)
import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar) import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
...@@ -34,48 +35,55 @@ pmPort = 443 ...@@ -34,48 +35,55 @@ pmPort = 443
batchSize :: Int batchSize :: Int
batchSize = 200 batchSize = 200
defaultEnv :: IO ClientEnv defaultClientEnv :: IO ClientEnv
defaultEnv = do defaultClientEnv = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
-- | API main function -- | API main function
getMetadataWith :: Config -> Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed]) getMetadataWith :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest getMetadataWith = runSimpleFindPubmedAbstractRequest
getMetadataWithC :: Config getMetadataWithC :: Text
-> Text
-> Maybe Limit -> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () PubMed IO ())) -> Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC config@(Config { mAPIKey = mAPIKey }) query limit = do getMetadataWithC query limit = do
env <- defaultEnv config@(Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runClientM (search mAPIKey (Just query) Nothing (Just 1)) env eRes <- runClientM (search mAPIKey (Just query) Nothing (Just 1)) env
pure $ get' env query limit batchSize <$> eRes pure $ get' config env query limit batchSize <$> eRes
where where
get' :: ClientEnv -> Text -> Maybe Limit -> Int -> BsXml -> (Maybe Integer, ConduitT () PubMed IO ()) get' :: Config
get' env q l perPage (BsXml res) = -> ClientEnv
-> Text
-> Maybe Limit
-> Int
-> BsXml
-> (Maybe Integer, ConduitT () PubMed IO ())
get' config env q l perPage (BsXml res) =
( Just numResults ( Just numResults
, yieldMany [0..] , yieldMany [0..]
.| takeC (fromInteger numPages) .| takeC (fromInteger numPages)
.| concatMapMC (getPage env config q perPage)) .| concatMapMC (\pageNum -> runReaderT (getPage q perPage pageNum) config))
where where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
numPages = numResults `div` (fromIntegral perPage) + 1 numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: ClientEnv getPage :: Text
-> Config
-> Text
-> Int -> Int
-> Int -> Int
-> IO [PubMed] -> Env [PubMed]
getPage env config q perPage pageNum = do getPage q perPage pageNum = do
let offset = fromIntegral $ pageNum * perPage let offset = fromIntegral $ pageNum * perPage
eDocs <- runSimpleFindPubmedAbstractRequest config q (Just offset) (Just $ fromIntegral perPage) liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage
case eDocs of eDocs <- runSimpleFindPubmedAbstractRequest q (Just offset) (Just $ fromIntegral perPage)
liftIO $ case eDocs of
Left err -> panic $ "[getPage] error: " <> show err Left err -> panic $ "[getPage] error: " <> show err
Right docs -> do Right docs -> liftIO $ do
_ <- threadDelay 2000000 -- One seconds _ <- threadDelay 2000000 -- One seconds
print $ "[getPage] docs length: " <> show (length docs)
pure docs pure docs
-- | TODO this parser need at least one subs at the end -- | TODO this parser need at least one subs at the end
...@@ -101,26 +109,27 @@ type Query = Text ...@@ -101,26 +109,27 @@ type Query = Text
type Limit = Integer type Limit = Integer
runMultipleFPAR :: [Integer] runMultipleFPAR :: [Integer]
-> IO (Either Text [PubMed]) -> Env (Either Text [PubMed])
runMultipleFPAR [] = pure $ Right [] runMultipleFPAR [] = pure $ Right []
runMultipleFPAR ids = List.foldl1' concat' runMultipleFPAR ids = do
<$> mapM runSimpleFetchPubmedAbstractRequest (by 300 ids) List.foldl1' concat' <$> mapM runSimpleFetchPubmedAbstractRequest (by batchSize ids)
where where
by _ [] = [] by _ [] = []
by n ns = head' : ( by n tail') by n ns = head' : ( by n tail')
where where
(head',tail') = List.splitAt n ns (head', tail') = List.splitAt n ns
concat' (Right n) (Right m) = Right $ n <> m concat' (Right n) (Right m) = Right $ n <> m
concat' n m = n <> m concat' n m = n <> m
runSimpleFetchPubmedAbstractRequest :: runSimpleFetchPubmedAbstractRequest :: [Integer]
[Integer] -> Env (Either Text [PubMed])
-> IO (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do runSimpleFetchPubmedAbstractRequest ids = do
env <- defaultEnv (Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM res <- runClientM
(fetch Nothing (Just "pubmed") (Just "abstract") ids) (fetch mAPIKey (Just "pubmed") (Just "abstract") ids)
env env
case res of case res of
(Left err) -> pure (Left . T.pack $ show err) (Left err) -> pure (Left . T.pack $ show err)
...@@ -136,20 +145,23 @@ runSimpleFetchPubmedAbstractRequest ids = do ...@@ -136,20 +145,23 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed]) -- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed -- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Config runSimpleFindPubmedAbstractRequest :: Text
-> Text
-> Maybe Integer -> Maybe Integer
-> Maybe Limit -> Maybe Limit
-> IO (Either Text [PubMed]) -> Env (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest config query offset limit = do runSimpleFindPubmedAbstractRequest query offset limit = do
eDocIds <- searchDocIds config query offset limit eDocIds <- searchDocIds query offset limit
case eDocIds of case eDocIds of
Left err -> pure $ Left err Left err -> pure $ Left err
Right docIds -> runMultipleFPAR docIds Right docIds -> do
liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds
searchDocIds :: Config -> Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [Integer]) runMultipleFPAR docIds
searchDocIds (Config { mAPIKey = mAPIKey }) query offset limit = do
env <- defaultEnv searchDocIds :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [Integer])
searchDocIds query offset limit = do
(Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM res <- runClientM
(search mAPIKey (Just query) offset limit) (search mAPIKey (Just query) offset limit)
env env
......
module PUBMED.Types where module PUBMED.Types where
import Control.Monad.Reader (ReaderT)
import Data.Text (Text) import Data.Text (Text)
data Config = Config { data Config = Config {
mAPIKey :: Maybe Text mAPIKey :: Maybe Text
} }
type Env = ReaderT Config IO
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment