Commit 31cb4d28 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Main.hs works now, returning list of docs

parent 47ed870c
......@@ -63,18 +63,22 @@ main = run =<< execParser opts
run :: Command -> IO ()
run (Meta (MetaParams { mp_mAPIKey, mp_term })) = do
let config = Config { mAPIKey = Text.pack <$> mp_mAPIKey }
let config = Config { apiKey = Text.pack <$> mp_mAPIKey
, query = Text.pack mp_term
, perPage = Nothing }
eDocs <- runReaderT (getMetadataWithC (Text.pack mp_term) Nothing) config
eDocs <- runReaderT getMetadataWithC config
case eDocs of
Left err -> print err
Right (count, _docsC) -> print $ show count
run (Fetch (FetchParams { fp_mAPIKey, fp_term, fp_limit })) = do
let config = Config { mAPIKey = Text.pack <$> fp_mAPIKey }
let config = Config { apiKey = Text.pack <$> fp_mAPIKey
, query = Text.pack fp_term
, perPage = Nothing }
eDocs <- runReaderT (getMetadataWithC (Text.pack fp_term) (Just $ fromIntegral fp_limit)) config
eDocs <- runReaderT getMetadataWithC config
case eDocs of
Left err -> print err
Right (count, docsC) -> do
print $ show count
runConduit $ docsC .| mapM_C print
runConduit $ docsC .| takeC fp_limit .| mapM_C print
......@@ -32,87 +32,60 @@ pmSearchPath = "entrez/eutils"
pmPort :: Int
pmPort = 443
batchSize :: Int
batchSize = 200
defaultClientEnv :: IO ClientEnv
defaultClientEnv = do
manager' <- newManager tlsManagerSettings
pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
-- | API main function
getMetadataWith :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest
getMetadataWith :: Env (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest Nothing
getMetadataWithC :: Text
-> Maybe Limit
-> Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC query limit = do
config@(Config { mAPIKey = mAPIKey }) <- ask
getMetadataWithC :: Env (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
getMetadataWithC = do
config@(Config { apiKey, query, perPage }) <- ask
liftIO $ do
env <- defaultClientEnv
-- First, estimate the total number of documents
eRes <- runClientM (search mAPIKey (Just query) Nothing (Just 1)) env
pure $ get' config env query limit batchSize <$> eRes
eRes <- runClientM (search apiKey (Just query) Nothing (Just 1)) env
pure $ get' config env (fromMaybe defaultPerPage perPage) <$> eRes
where
get' :: Config
-> ClientEnv
-> Text
-> Maybe Limit
-> Int
-> PerPage
-> BsXml
-> (Maybe Integer, ConduitT () PubMed IO ())
get' config env q l perPage (BsXml res) =
get' config env perPage (BsXml res) =
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (\pageNum -> runReaderT (getPage q perPage pageNum) config))
.| concatMapMC (\pageNum -> runReaderT (getPage pageNum) config))
where
numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
numPages = numResults `div` (fromIntegral perPage) + 1
getPage :: Text
-> Int
-> Int
getPage :: Integer
-> Env [PubMed]
getPage q perPage pageNum = do
getPage pageNum = do
(Config { perPage = mPerPage }) <- ask
let perPage = fromMaybe defaultPerPage mPerPage
let offset = fromIntegral $ pageNum * perPage
liftIO $ print $ "[getPage] getting page " <> show pageNum <> ", offset: " <> show offset <> ", perPage: " <> show perPage
eDocs <- runSimpleFindPubmedAbstractRequest q (Just offset) (Just $ fromIntegral perPage)
liftIO $ case eDocs of
Left err -> panic $ "[getPage] error: " <> show err
Right docs -> liftIO $ do
_ <- threadDelay 2000000 -- One seconds
print $ "[getPage] docs length: " <> show (length docs)
pure docs
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
removeSub = do
dt <- many textWithBalise
eo <- manyTill anyChar endOfInput
pure $ LBS.fromStrict $ pack $ concat dt <> eo
where
textWithBalise =
manyTill anyChar (sub <|> asub)
sub = string "<sub>"
<|> string "<sup>"
<|> string "<i>"
<|> string "<b>"
asub = string "</sub>"
<|> string "</sup>"
<|> string "</i>"
<|> string "</b>"
type Query = Text
type Limit = Integer
eDocs <- runSimpleFindPubmedAbstractRequest (Just offset)
case eDocs of
Left err -> panic $ "[getPage] error: " <> show err
Right docs -> liftIO $ do
_ <- threadDelay 2000000 -- One seconds
print $ "[getPage] docs length: " <> show (length docs)
pure docs
runMultipleFPAR :: [Integer]
-> Env (Either Text [PubMed])
runMultipleFPAR [] = pure $ Right []
runMultipleFPAR ids = do
List.foldl1' concat' <$> mapM runSimpleFetchPubmedAbstractRequest (by batchSize ids)
(Config { perPage = mPerPage }) <- ask
let perPage = fromInteger $ fromMaybe defaultPerPage mPerPage
List.foldl1' concat' <$> mapM runSimpleFetchPubmedAbstractRequest (by perPage ids)
where
by _ [] = []
by n ns = head' : ( by n tail')
......@@ -125,11 +98,11 @@ runMultipleFPAR ids = do
runSimpleFetchPubmedAbstractRequest :: [Integer]
-> Env (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do
(Config { mAPIKey = mAPIKey }) <- ask
(Config { apiKey }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM
(fetch mAPIKey (Just "pubmed") (Just "abstract") ids)
(fetch apiKey (Just "pubmed") (Just "abstract") ids)
env
case res of
(Left err) -> pure (Left . T.pack $ show err)
......@@ -145,25 +118,23 @@ runSimpleFetchPubmedAbstractRequest ids = do
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text
-> Maybe Integer
-> Maybe Limit
runSimpleFindPubmedAbstractRequest :: Maybe Integer
-> Env (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query offset limit = do
eDocIds <- searchDocIds query offset limit
runSimpleFindPubmedAbstractRequest offset = do
eDocIds <- searchDocIds offset
case eDocIds of
Left err -> pure $ Left err
Right docIds -> do
liftIO $ print $ "[runSimpleFindPubmedAbstractRequest] docIds" <> show docIds
runMultipleFPAR docIds
searchDocIds :: Text -> Maybe Integer -> Maybe Limit -> Env (Either Text [Integer])
searchDocIds query offset limit = do
(Config { mAPIKey = mAPIKey }) <- ask
searchDocIds :: Maybe Integer -> Env (Either Text [Integer])
searchDocIds offset = do
(Config { apiKey, query, perPage }) <- ask
liftIO $ do
env <- defaultClientEnv
res <- runClientM
(search mAPIKey (Just query) offset limit)
(search apiKey (Just query) offset perPage)
env
case res of
(Left err) -> pure (Left $ T.pack $ show err)
......@@ -173,3 +144,23 @@ searchDocIds query offset limit = do
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
removeSub = do
dt <- many textWithBalise
eo <- manyTill anyChar endOfInput
pure $ LBS.fromStrict $ pack $ concat dt <> eo
where
textWithBalise =
manyTill anyChar (sub <|> asub)
sub = string "<sub>"
<|> string "<sup>"
<|> string "<i>"
<|> string "<b>"
asub = string "</sub>"
<|> string "</sup>"
<|> string "</i>"
<|> string "</b>"
......@@ -7,6 +7,8 @@ import Servant.Client
import qualified Data.Text as T
import qualified Network.HTTP.Media as M
import PUBMED.Types (APIKey, Query, PerPage)
data DB = PUBMED
newtype BsXml = BsXml ByteString
......@@ -22,14 +24,14 @@ type PUBMEDAPI =
"esearch.fcgi"
-- :> QueryParam "db" DB
-- not mandatory since the base db is pubmed
:> QueryParam "api_key" T.Text
:> QueryParam "api_key" APIKey
:> QueryParam "term" T.Text
:> QueryParam "retstart" Integer
:> QueryParam "retmax" Integer
:> Get '[BsXml] BsXml
:<|>
"efetch.fcgi"
:> QueryParam "api_key" T.Text
:> QueryParam "api_key" APIKey
:> QueryParam "db" T.Text
:> QueryParam "rettype" T.Text
:> QueryParams "id" Integer
......@@ -38,13 +40,13 @@ type PUBMEDAPI =
pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy
search :: Maybe T.Text
search :: Maybe APIKey
-> Maybe T.Text
-> Maybe Integer
-> Maybe Integer
-> ClientM BsXml
fetch :: Maybe T.Text
fetch :: Maybe APIKey
-> Maybe T.Text
-> Maybe T.Text
-> [Integer]
......
......@@ -16,7 +16,7 @@ import Panic (panic)
import qualified Text.Read as TR
import qualified Text.Taggy.Lens as TTL
namedEl name = TTL.elements . TTL.named (only name)
namedEl name = TTL.elements . TTL.named (only name)
contentWithChildren :: Prism' Node T.Text
contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
......
......@@ -3,8 +3,19 @@ module PUBMED.Types where
import Control.Monad.Reader (ReaderT)
import Data.Text (Text)
type APIKey = Text
type Query = Text
type PerPage = Integer
data Config = Config {
mAPIKey :: Maybe Text
apiKey :: Maybe APIKey
, query :: Query
, perPage :: Maybe PerPage
}
type Env = ReaderT Config IO
defaultPerPage :: PerPage
defaultPerPage = 200
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment