Commit c2e3e77c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE]

parents dcaa0f5d 8504d4c5
......@@ -2,7 +2,63 @@
module Main where
import qualified PUBMED as PubMed
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
removeSub = do
dt <- many textWithBalise
pure $ LBS.fromStrict $ pack $ concat dt
where
textWithBalise = manyTill anyChar (try subs)
subs = sub <|> asub -- <|> isEndOfInput
sub = string "<sub>"
asub = string "</sub>"
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(search (Just rq))
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> return []
(Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs
res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err) -> return []
(Right (BsXml abstracts)) -> do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
(Left _) -> return []
(Right v) -> pubMedParser v
main :: IO ()
main = PubMed.crawler "organ" >>= print
main = do
pubmeds <- runSimpleFindPubmedAbstractRequest "bisphenol"
print pubmeds
......@@ -35,6 +35,8 @@ dependencies:
- time
- data-time-segment
- protolude
- attoparsec
- either
library:
source-dirs: src
......
......@@ -43,6 +43,7 @@ pubmedApi = Proxy
search :: Maybe T.Text
-> ClientM BsXml
fetch :: Maybe T.Text
-> Maybe T.Text
-> [Integer]
......
......@@ -38,7 +38,8 @@ parseDocId cursor = fst <$>
identity :: a -> a
identity x = x
manyTagsUntil :: MonadThrow m => Name
manyTagsUntil :: MonadThrow m =>
Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
......@@ -97,11 +98,11 @@ pubMedParser bstring = runConduit $ parseLBS def bstring
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet =
force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle =
tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
manyTagsUntil "PubmedArticle" parsePubMedArticle'
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment