Commit 47ed870c authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Refactoring

Added Env type to make a ReaderT wrapper around Config.

Also, Main.hs refactored to accept command line parameters, this makes
testing things out easier.
parent 63eb8187
module Main where
import Data.Semigroup ((<>))
import Conduit
import Control.Monad.Reader (runReaderT)
import qualified Data.Text as Text
import Options.Applicative
import System.Environment (getArgs)
import PUBMED (getMetadataWith, getMetadataWithC)
import PUBMED.Types
......@@ -23,32 +23,33 @@ data Command
| Fetch FetchParams
metaParams :: Parser Command
metaParams = do
api_key <- optional (strOption
( long "api-key"
<> metavar "API_KEY"
<> help "Pubmed API key"))
term <- strArgument (metavar "term")
pure $ Meta (MetaParams api_key term)
metaParams = Meta <$>
(MetaParams
<$> optional (strOption
( long "api-key"
<> metavar "API_KEY"
<> help "Pubmed API key"))
<*> strArgument (metavar "term"))
fetchParams :: Parser Command
fetchParams = do
api_key <- optional (strOption
( long "api-key"
<> metavar "API_KEY"
<> help "Pubmed API key"))
limit <- option auto
fetchParams = Fetch <$>
(FetchParams
<$> optional (strOption
( long "api-key"
<> metavar "API_KEY"
<> help "Pubmed API key"))
<*> option auto
( long "limit"
<> help "Maximum number of documents"
<> showDefault
<> value 100
<> metavar "INT" )
term <- strArgument (metavar "term")
pure $ Fetch (FetchParams api_key limit term)
<> help "Maximum number of documents"
<> showDefault
<> value 100
<> metavar "INT" )
<*> strArgument (metavar "term"))
params :: Parser Command
params = metaParams <|> fetchParams
params = subparser
(command "meta" (info metaParams (progDesc "Download only metadata"))
<> command "fetch" (info fetchParams (progDesc "Fetch docs")))
opts :: ParserInfo Command
opts = info (params <**> helper)
......@@ -64,14 +65,16 @@ run :: Command -> IO ()
run (Meta (MetaParams { mp_mAPIKey, mp_term })) = do
let config = Config { mAPIKey = Text.pack <$> mp_mAPIKey }
eDocs <- getMetadataWithC config (Text.pack mp_term) Nothing
eDocs <- runReaderT (getMetadataWithC (Text.pack mp_term) Nothing) config
case eDocs of
Left err -> print err
Right (count, _docs) -> print $ show count
Right (count, _docsC) -> print $ show count
run (Fetch (FetchParams { fp_mAPIKey, fp_term, fp_limit })) = do
let config = Config { mAPIKey = Text.pack <$> fp_mAPIKey }
eDocs <- getMetadataWithC config (Text.pack fp_term) (Just $ fromIntegral fp_limit)
eDocs <- runReaderT (getMetadataWithC (Text.pack fp_term) (Just $ fromIntegral fp_limit)) config
case eDocs of
Left err -> print err
Right (count, docs) -> print $ show count
Right (count, docsC) -> do
print $ show count
runConduit $ docsC .| mapM_C print
......@@ -53,6 +53,7 @@ library
, http-client-tls
, http-media
, lens
, mtl
, optparse-applicative
, protolude
, servant
......@@ -88,6 +89,7 @@ executable crawlerPubMed-exe
, http-client-tls
, http-media
, lens
, mtl
, optparse-applicative
, protolude
, servant
......@@ -126,6 +128,7 @@ test-suite crawlerPubMed-test
, http-client-tls
, http-media
, lens
, mtl
, optparse-applicative
, protolude
, servant
......
......@@ -39,6 +39,7 @@ dependencies:
- http-client-tls
- http-media
- lens
- mtl
- optparse-applicative
- protolude
- servant
......
......@@ -3,6 +3,7 @@ module PUBMED where
import Conduit
import Control.Applicative
import Control.Concurrent (threadDelay)
import Control.Monad.Reader (ask, runReaderT)
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
......@@ -34,49 +35,56 @@ pmPort = 443
batchSize :: Int
batchSize = 200
defaultEnv :: IO ClientEnv
defaultEnv = do
defaultClientEnv :: IO ClientEnv
defaultClientEnv = do
manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
-- | API main function
getMetadataWith :: Config -> Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest
getMetadataWithC :: Config
-> Text
getMetadataWithC :: Text
-> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC config@(Config { mAPIKey = mAPIKey }) query limit = do
env <- defaultEnv
-- First, estimate the total number of documents
eRes <- runClientM (search mAPIKey (Just query) Nothing (Just 1)) env
pure $ get' env query limit batchSize <$> eRes
-> Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC query limit = do
config@(Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
-- First, estimate the total number of documents
eRes <- runClientM (search mAPIKey (Just query) Nothing (Just 1)) env
pure $ get' config env query limit batchSize <$> eRes
where
get' :: ClientEnv -> Text -> Maybe Limit -> Int -> BsXml -> (Maybe Integer, ConduitT () PubMed IO ())
get' env q l perPage (BsXml res) =
get' :: Config
-> ClientEnv
-> Text
-> Maybe Limit
-> Int
-> BsXml
-> (Maybe Integer, ConduitT () PubMed IO ())
get' config env q l perPage (BsXml res) =
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage env config q perPage))
.| concatMapMC (\pageNum -> runReaderT (getPage q perPage pageNum) config))
where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: ClientEnv
-> Config
-> Text
getPage :: Text
-> Int
-> Int
-> IO [PubMed]
getPage env config q perPage pageNum = do
-> Env [PubMed]
getPage q perPage pageNum = do
let offset = fromIntegral $ pageNum * perPage
eDocs <- runSimpleFindPubmedAbstractRequest config q (Just offset) (Just $ fromIntegral perPage)
case eDocs of
Left err -> panic $ "[getPage] error: " <> show err
Right docs -> do
_ <- threadDelay 2000000 -- One seconds
pure docs
liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage
eDocs <- runSimpleFindPubmedAbstractRequest q (Just offset) (Just $ fromIntegral perPage)
liftIO $ case eDocs of
Left err -> panic $ "[getPage] error: " <> show err
Right docs -> liftIO $ do
_ <- threadDelay 2000000 -- One seconds
print $ "[getPage] docs length: " <> show (length docs)
pure docs
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
......@@ -101,63 +109,67 @@ type Query = Text
type Limit = Integer
runMultipleFPAR :: [Integer]
-> IO (Either Text [PubMed])
-> Env (Either Text [PubMed])
runMultipleFPAR [] = pure $ Right []
runMultipleFPAR ids = List.foldl1' concat'
<$> mapM runSimpleFetchPubmedAbstractRequest (by 300 ids)
runMultipleFPAR ids = do
List.foldl1' concat' <$> mapM runSimpleFetchPubmedAbstractRequest (by batchSize ids)
where
by _ [] = []
by n ns = head' : ( by n tail')
where
(head',tail') = List.splitAt n ns
(head', tail') = List.splitAt n ns
concat' (Right n) (Right m) = Right $ n <> m
concat' n m = n <> m
runSimpleFetchPubmedAbstractRequest ::
[Integer]
-> IO (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest :: [Integer]
-> Env (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do
env <- defaultEnv
res <- runClientM
(fetch Nothing (Just "pubmed") (Just "abstract") ids)
env
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
pure $ Right parsed
(Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM
(fetch mAPIKey (Just "pubmed") (Just "abstract") ids)
env
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
pure $ Right parsed
-- parsed <- catch (pubMedParser v) ((\e -> do
-- _ <- print e
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Config
-> Text
runSimpleFindPubmedAbstractRequest :: Text
-> Maybe Integer
-> Maybe Limit
-> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest config query offset limit = do
eDocIds <- searchDocIds config query offset limit
-> Env (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query offset limit = do
eDocIds <- searchDocIds query offset limit
case eDocIds of
Left err -> pure $ Left err
Right docIds -> runMultipleFPAR docIds
searchDocIds :: Config -> Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [Integer])
searchDocIds (Config { mAPIKey = mAPIKey }) query offset limit = do
env <- defaultEnv
res <- runClientM
(search mAPIKey (Just query) offset limit)
env
case res of
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
Right docIds -> do
liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds
runMultipleFPAR docIds
searchDocIds :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [Integer])
searchDocIds query offset limit = do
(Config { mAPIKey = mAPIKey }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM
(search mAPIKey (Just query) offset limit)
env
case res of
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
module PUBMED.Types where
import Control.Monad.Reader (ReaderT)
import Data.Text (Text)
data Config = Config {
mAPIKey :: Maybe Text
}
type Env = ReaderT Config IO
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment