You need to sign in or sign up before continuing.
Commit c2e3e77c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE]

parents dcaa0f5d 8504d4c5
...@@ -2,7 +2,63 @@ ...@@ -2,7 +2,63 @@
module Main where module Main where
import qualified PUBMED as PubMed import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
removeSub = do
dt <- many textWithBalise
pure $ LBS.fromStrict $ pack $ concat dt
where
textWithBalise = manyTill anyChar (try subs)
subs = sub <|> asub -- <|> isEndOfInput
sub = string "<sub>"
asub = string "</sub>"
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(search (Just rq))
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> return []
(Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs
res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err) -> return []
(Right (BsXml abstracts)) -> do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
(Left _) -> return []
(Right v) -> pubMedParser v
main :: IO () main :: IO ()
main = PubMed.crawler "organ" >>= print main = do
pubmeds <- runSimpleFindPubmedAbstractRequest "bisphenol"
print pubmeds
...@@ -35,6 +35,8 @@ dependencies: ...@@ -35,6 +35,8 @@ dependencies:
- time - time
- data-time-segment - data-time-segment
- protolude - protolude
- attoparsec
- either
library: library:
source-dirs: src source-dirs: src
......
...@@ -43,6 +43,7 @@ pubmedApi = Proxy ...@@ -43,6 +43,7 @@ pubmedApi = Proxy
search :: Maybe T.Text search :: Maybe T.Text
-> ClientM BsXml -> ClientM BsXml
fetch :: Maybe T.Text fetch :: Maybe T.Text
-> Maybe T.Text -> Maybe T.Text
-> [Integer] -> [Integer]
......
...@@ -38,7 +38,8 @@ parseDocId cursor = fst <$> ...@@ -38,7 +38,8 @@ parseDocId cursor = fst <$>
identity :: a -> a identity :: a -> a
identity x = x identity x = x
manyTagsUntil :: MonadThrow m => Name manyTagsUntil :: MonadThrow m =>
Name
-> ConduitT Event o m b -> ConduitT Event o m b
-> ConduitT Event o m (Maybe b) -> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do manyTagsUntil name f = do
...@@ -97,11 +98,11 @@ pubMedParser bstring = runConduit $ parseLBS def bstring ...@@ -97,11 +98,11 @@ pubMedParser bstring = runConduit $ parseLBS def bstring
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m () parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet = parseArticleSet =
force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed) parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle = parsePubMedArticle =
tagIgnoreAttrs "PubmedArticle" parsePubMedArticle' manyTagsUntil "PubmedArticle" parsePubMedArticle'
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do parsePubMedArticle' = do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment