Commit 9a375182 authored by Mael NICOLAS

add parser for search, took gg's parser for fetch

parent eb248441
@@ -3,17 +3,33 @@
module Main where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
main :: IO ()
main = do
  manager' <- newManager tlsManagerSettings
  res <- runClientM
           (fetch (Just "pubmed") ["31059770", "31059556"])
           (search (Just "organ"))
           (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
  case res of
    (Left err) -> print err
    (Right ok) -> print ok
    (Right (BsXml docs)) -> do
      let docIds = runParser parseDocId docs
      res' <- runClientM
                (fetch (Just "pubmed") (Just "abstract") docIds)
                (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
      case res' of
        (Left err) -> print err
        (Right (BsXml abstracts)) -> do
          pmeds <- pubMedParser abstracts
          print pmeds
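-- Flow of the two calls above: the search request returns ESearch XML, from
-- which parseDocId extracts the matching document ids; those ids are then
-- passed to fetch, and pubMedParser turns the returned PubmedArticleSet XML
-- into a list of PubMed records.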
@@ -29,6 +29,12 @@ dependencies:
- http-client
- http-client-tls
- http-media
- exceptions
- conduit
- xml-types
- time
- data-time-segment
- protolude
library:
  source-dirs: src
......
module PUBMED where
import PUBMED.Client
import PUBMED.Parser
@@ -34,7 +34,8 @@ type PUBMEDAPI =
  :<|>
  "efetch.fcgi"
    :> QueryParam "db" T.Text
    :> QueryParams "id" T.Text
    :> QueryParam "rettype" T.Text
    :> QueryParams "id" Integer
    :> Get '[BsXml] BsXml
pubmedApi :: Proxy PUBMEDAPI
@@ -43,6 +44,7 @@ pubmedApi = Proxy
search :: Maybe T.Text
       -> ClientM BsXml

fetch :: Maybe T.Text
      -> [T.Text]
      -> Maybe T.Text
      -> [Integer]
      -> ClientM BsXml
search :<|> fetch = client pubmedApi
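-- Rough sketch of the requests the generated client issues (illustrative
-- only; the search endpoint and its query-parameter name are defined above
-- this hunk, and 'QueryParams "id"' repeats the parameter once per element):
--
--   search (Just "organ")
--     => GET .../entrez/eutils/esearch.fcgi?<term-param>=organ
--   fetch (Just "pubmed") (Just "abstract") [31059770, 31059556]
--     => GET .../entrez/eutils/efetch.fcgi?db=pubmed&rettype=abstract&id=31059770&id=31059556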
{-# LANGUAGE OverloadedStrings #-}
module PUBMED.Parser where
import Text.XML.Stream.Parse
import qualified Text.XML.Cursor as C -- ((&/), (&//), Cursor, content, element)
import Text.XML (Name)
import Data.Either (rights)
import Data.Maybe (Maybe)
import Data.Monoid (mconcat)
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import GHC.IO (FilePath)
import Protolude (head)
import Prelude hiding (head)
import Control.Monad.Catch (MonadThrow)
import Control.Monad (join)
import qualified Data.Text as T
import qualified Data.Text.Read as T
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Conduit.List as CL
parseDocId :: C.Cursor -> [Integer]
parseDocId cursor = fst <$>
  rights (T.decimal
          <$> filter notNullOrEOL (rawElement cursor)
         )
  where
    rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
    notNullOrEOL t = not (T.null t) && t /= "\n"
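-- Example (assumed ESearch response shape; this sample document is not part
-- of the commit):
--
--   <eSearchResult>
--     <IdList>
--       <Id>31059770</Id>
--       <Id>31059556</Id>
--     </IdList>
--   </eSearchResult>
--
-- parseDocId applied to a cursor over this document yields
-- [31059770, 31059556]: text nodes that are empty or a bare newline are
-- dropped by notNullOrEOL, and anything that does not parse as a decimal
-- is discarded by 'rights'.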
identity :: a -> a
identity x = x
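-- | Skip (and discard) sibling elements until the given tag is reached, then
-- run the supplied parser on that tag's subtree; returns Nothing when the tag
-- never appears before the enclosing element closes. For instance,
-- parseArticle below uses manyTagsUntil "Journal" to reach <Journal> inside
-- <Article>.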
manyTagsUntil :: MonadThrow m => Name
              -> ConduitT Event o m b
              -> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
  _ <- manyTagsUntil_ name
  tagIgnoreAttrs (matching (== name)) f
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that consumes everything but the given tag,
-- useful because we have to consume all of the data in the stream.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
data PubMed =
  PubMed { pubmed_article :: PubMedArticle
         , pubmed_date    :: PubMedData
         } deriving Show

data PubMedArticle =
  PubMedArticle { pubmed_title    :: Maybe T.Text
                , pubmed_journal  :: Maybe T.Text
                , pubmed_abstract :: Maybe [T.Text]
                }
  deriving (Show)

data PubMedData =
  PubMedData { pubmedData_date  :: UTCTime
             , pubmedData_year  :: Integer
             , pubmedData_month :: Int
             , pubmedData_day   :: Int
             } deriving (Show)
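-- How these records map onto the EFetch XML (as parsed below): pubmed_title,
-- pubmed_journal and pubmed_abstract come from <ArticleTitle>,
-- <Journal>/<Title> and <Abstract>/<AbstractText> respectively, while the
-- PubMedData fields are built from the last <PubMedPubDate> found under
-- <PubmedData>/<History> (defaulting to year/month/day 1 when none is found).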
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do
  input <- DBL.readFile fp
  pubMedParser input

pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
                                 .| parseArticleSet
                                 .| CL.consume
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet = do
  as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
  return as

parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle = do
  articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
  return articles
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed)
parsePubMedArticle' = do
  article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
  dates <- tagIgnoreAttrs "PubmedData" $ do
    dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
      y' <- force "Year"  $ tagIgnoreAttrs "Year"  content
      m' <- force "Month" $ tagIgnoreAttrs "Month" content
      d' <- force "Day"   $ tagIgnoreAttrs "Day"   content
      _  <- many $ ignoreAnyTreeContent
      return (read $ unpack y', read $ unpack m', read $ unpack d')
    _ <- many $ ignoreAnyTreeContent
    return dates'
  _ <- many $ ignoreAnyTreeContent
  -- take the last date listed under <History>, falling back to 1/1/1 when no
  -- <PubMedPubDate> was found
  let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
  return $ PubMed (article) (PubMedData (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
  a <- force "article" $ manyTagsUntil "Article" parseArticle
  _ <- many $ ignoreAnyTreeContent
  return a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
  journal <- force "journal" $ manyTagsUntil "Journal" $ do
    j <- manyTagsUntil "Title" content
    _ <- many $ ignoreAnyTreeContent
    return j
  title <- do
    t <- manyTagsUntil "ArticleTitle" content
    return t
  abstracts <- do
    as <- manyTagsUntil "Abstract" $ many $ do
      txt <- tagIgnoreAttrs "AbstractText" $ do
        c <- content
        _ <- many $ ignoreAnyTreeContent
        return c
      _ <- many $ ignoreAnyTreeContent
      return txt
    return as
  -- TODO add authors
  _ <- many $ ignoreAnyTreeContent
  return $ PubMedArticle title journal abstracts
pubMedData :: DBL.ByteString
pubMedData = mconcat
  [ "<?xml version=\"1.0\"?>\n"
  , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
  , "<PubmedArticleSet>\n"
  , "<PubmedArticle>\n"
  , "  <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
  , "    <PMID Version=\"1\">30357468</PMID>\n"
  , "    <DateRevised>\n"
  , "      <Year>2018</Year>\n"
  , "    </DateRevised>\n"
  , "    <Article PubModel=\"Print-Electronic\">\n"
  , "      <Journal>\n"
  , "        <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
  , "        <Title>European journal of pediatrics</Title>\n"
  , "      </Journal>\n"
  , "      <ArticleTitle>Title of the Article</ArticleTitle>\n"
  , "      <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
  , "      <Abstract>\n"
  , "        <AbstractText>Abstract Text.</AbstractText>\n"
  , "      </Abstract>\n"
  , "      <AuthorList>\n"
  , "      </AuthorList>\n"
  , "    </Article>\n"
  , "  </MedlineCitation>\n"
  , "  <PubmedData>\n"
  , "    <History>\n"
  , "    </History>\n"
  , "  </PubmedData>\n"
  , "</PubmedArticle>\n"
  , "</PubmedArticleSet>\n"
  ]
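-- Quick check of the parser against the fixture above (illustrative sketch,
-- output abbreviated and reformatted):
--
--   >>> pubMedParser pubMedData
--   [PubMed { pubmed_article = PubMedArticle { pubmed_title    = Just "Title of the Article"
--                                            , pubmed_journal  = Just "European journal of pediatrics"
--                                            , pubmed_abstract = Just ["Abstract Text."] }
--           , pubmed_date = PubMedData {pubmedData_year = 1, pubmedData_month = 1, ...} }]
--
-- The 1/1/1 date is the fallback taken in parsePubMedArticle', because the
-- fixture's <History> element contains no <PubMedPubDate>.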
@@ -37,7 +37,10 @@ packages:
# Dependency packages to be pulled from upstream that are not in the resolver
# using the same syntax as the packages field.
# (e.g., acme-missiles-0.3)
# extra-deps: []
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git
  commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
# Override default flag values for local packages and extra-deps
# flags: {}
......