Commit c0a74bb9 authored by Alexandre Delanoë

Merge branch 'abstract-parser-fix' of ssh://gitlab.iscpif.fr:20022/cgenie/pubmed

parents c6b5ab1f bfbdc993
@@ -4,4 +4,4 @@ import PUBMED (getMetadataWith)
 main :: IO ()
-main = getMetadataWith "bisphenol" (Just 100) >>= print
+main = getMetadataWith "bisphenol" Nothing (Just 100) >>= print
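Note: a minimal sketch of calling the updated entry point with an explicit offset, assuming the new second argument is the eSearch retstart offset introduced by this merge (passing Nothing keeps the previous behaviour):

{-# LANGUAGE OverloadedStrings #-}
import PUBMED (getMetadataWith)

-- Illustrative usage, not part of this merge: fetch at most 100
-- abstracts matching "bisphenol", starting at offset 0.
main :: IO ()
main = getMetadataWith "bisphenol" (Just 0) (Just 100) >>= print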
@@ -28,6 +28,7 @@ library
       PUBMED
       PUBMED.Client
       PUBMED.Parser
+      PUBMED.Test
   other-modules:
       Paths_crawlerPubMed
   hs-source-dirs:
@@ -46,6 +47,7 @@ library
     , data-time-segment
     , either
     , exceptions
+    , ghc
     , http-client
     , http-client-tls
     , http-media
@@ -78,6 +80,7 @@ executable crawlerPubMed-exe
     , data-time-segment
     , either
     , exceptions
+    , ghc
     , http-client
     , http-client-tls
     , http-media
@@ -113,6 +116,7 @@ test-suite crawlerPubMed-test
     , data-time-segment
     , either
     , exceptions
+    , ghc
     , http-client
     , http-client-tls
     , http-media
@@ -33,6 +33,7 @@ dependencies:
 - data-time-segment
 - either
 - exceptions
+- ghc
 - http-client
 - http-client-tls
 - http-media
 module PUBMED where
+import Conduit
 import Control.Applicative
 import Data.Attoparsec.ByteString
 import Data.Attoparsec.ByteString.Char8 (anyChar)
 import Data.ByteString.Char8 (pack)
 import Data.ByteString.Lazy (ByteString)
+import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Network.HTTP.Client (newManager)
 import Network.HTTP.Client.TLS (tlsManagerSettings)
 import PUBMED.Client
 import PUBMED.Parser
 import Prelude hiding (takeWhile)
-import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
+import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), ClientEnv, ClientError, Scheme(..))
 import qualified Data.ByteString.Lazy as LBS
 import qualified Data.List as List
 import qualified Data.Text as T
@@ -25,14 +27,43 @@ pmSearchPath = "entrez/eutils"
 pmPort :: Int
 pmPort = 443
+batchSize :: Int
+batchSize = 2000
+defaultEnv :: IO ClientEnv
 defaultEnv = do
   manager' <- newManager tlsManagerSettings
   pure $ mkClientEnv manager' $ BaseUrl Https pmHost pmPort pmSearchPath
 -- | API main function
-getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
+getMetadataWith :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed])
 getMetadataWith = runSimpleFindPubmedAbstractRequest
+getMetadataWithC :: Text -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () PubMed IO ()))
+getMetadataWithC query limit = do
+  env <- defaultEnv
+  -- First, estimate the total number of documents
+  eRes <- runClientM (search (Just query) Nothing (Just 1)) env
+  pure $ get' env query limit batchSize <$> eRes
+  where
+    get' :: ClientEnv -> Text -> Maybe Limit -> Int -> BsXml -> (Maybe Integer, ConduitT () PubMed IO ())
+    get' env q l perPage (BsXml res) =
+      ( Just numResults
+      , yieldMany [0..]
+        .| takeC (fromInteger numPages)
+        .| concatMapMC (getPage env q perPage))
+      where
+        numResults = fromMaybe 0 $ parseDocCount $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict res
+        numPages = numResults `div` (fromIntegral perPage) + 1
+    getPage :: ClientEnv -> Text -> Int -> Int -> IO [PubMed]
+    getPage env q perPage pageNum = do
+      let offset = fromIntegral $ pageNum * perPage
+      eDocs <- runSimpleFindPubmedAbstractRequest q (Just offset) (Just $ fromIntegral pageNum)
+      pure $ case eDocs of
+        Left err -> []
+        Right docs -> docs
 -- | TODO this parser need at least one subs at the end
 -- (use endOfInput)
 removeSub :: Parser ByteString
@@ -57,6 +88,7 @@ type Limit = Integer
 runMultipleFPAR :: [Integer]
                 -> IO (Either Text [PubMed])
+runMultipleFPAR [] = pure $ Right []
 runMultipleFPAR ids = List.foldl1' concat'
   <$> mapM runSimpleFetchPubmedAbstractRequest (by 300 ids)
   where
@@ -90,18 +122,18 @@ runSimpleFetchPubmedAbstractRequest ids = do
 --             pure []) :: XmlException -> IO [PubMed])
 --  Right <$> pure parsed
-runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
-runSimpleFindPubmedAbstractRequest query limit = do
-  eDocIds <- searchDocIds query limit
+runSimpleFindPubmedAbstractRequest :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [PubMed])
+runSimpleFindPubmedAbstractRequest query offset limit = do
+  eDocIds <- searchDocIds query offset limit
   case eDocIds of
     Left err -> pure $ Left err
     Right docIds -> runMultipleFPAR docIds
-searchDocIds :: Text -> Maybe Limit -> IO (Either Text [Integer])
-searchDocIds query limit = do
+searchDocIds :: Text -> Maybe Integer -> Maybe Limit -> IO (Either Text [Integer])
+searchDocIds query offset limit = do
   env <- defaultEnv
   res <- runClientM
-    (search (Just query) limit)
+    (search (Just query) offset limit)
     env
   case res of
     (Left err) -> pure (Left $ T.pack $ show err)
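Note: a sketch of how the new streaming variant getMetadataWithC could be consumed, assuming the conduit combinators re-exported by the Conduit module (runConduit, sinkList, (.|)); the program below is illustrative and not part of this merge:

{-# LANGUAGE OverloadedStrings #-}
import Conduit (runConduit, sinkList, (.|))
import PUBMED (getMetadataWithC)

main :: IO ()
main = do
  eRes <- getMetadataWithC "bisphenol" (Just 100)
  case eRes of
    Left err -> print err
    Right (mCount, source) -> do
      -- mCount is the Count reported by eSearch, when it could be parsed
      print mCount
      -- stream the pages and collect all PubMed records in memory
      docs <- runConduit $ source .| sinkList
      print $ length docs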
@@ -23,6 +23,7 @@ type PUBMEDAPI =
      -- :> QueryParam "db" DB
      -- not mandatory since the base db is pubmed
      :> QueryParam "term" T.Text
+     :> QueryParam "retstart" Integer
      :> QueryParam "retmax" Integer
      :> Get '[BsXml] BsXml
   :<|>
@@ -35,7 +36,7 @@ type PUBMEDAPI =
 pubmedApi :: Proxy PUBMEDAPI
 pubmedApi = Proxy
-search :: Maybe T.Text -> Maybe Integer -> ClientM BsXml
+search :: Maybe T.Text -> Maybe Integer -> Maybe Integer -> ClientM BsXml
 fetch :: Maybe T.Text
       -> Maybe T.Text
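Note: the extra QueryParam becomes the new middle argument of search (order follows the API type: term, retstart, retmax). A sketch of calling it directly, assuming defaultEnv from PUBMED and that BsXml is exported by PUBMED.Client; the helper name is hypothetical:

{-# LANGUAGE OverloadedStrings #-}
import PUBMED (defaultEnv)
import PUBMED.Client (search, BsXml)
import Servant.Client (runClientM, ClientError)

-- One-off eSearch request: the 50 results starting at offset 100
-- for the term "bisphenol".
searchPage :: IO (Either ClientError BsXml)
searchPage = do
  env <- defaultEnv
  runClientM (search (Just "bisphenol") (Just 100) (Just 50)) env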
@@ -2,8 +2,8 @@
 module PUBMED.Parser where
-import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism')
-import Data.Maybe (catMaybes)
+import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism', Traversal')
+import Data.Maybe (catMaybes, fromMaybe)
 import Data.Time (UTCTime(..))
 import Data.Time.Segment (jour)
 import Prelude hiding (head)
@@ -12,6 +12,8 @@ import qualified Data.ByteString.Lazy as DBL
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.IO as TLIO
+import Panic (panic)
+import qualified Text.Read as TR
 import qualified Text.Taggy.Lens as TTL
 namedEl name = TTL.elements . TTL.named (only name)
@@ -23,12 +25,19 @@ contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
 deepContent = TTL.children . traverse . contentWithChildren
 parseDocIds :: TL.Text -> [Integer]
-parseDocIds txt = map (\s -> read (T.unpack s) :: Integer) parsed
+parseDocIds txt = map parseId parsed
   where
     parsed = txt ^.. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "IdList" . namedEl "Id" . TTL.contents
+    parseId s = case (TR.readMaybe (T.unpack s) :: Maybe Integer) of
+      Nothing  -> panic $ "Can't read doc id from: " <> (T.unpack s)
+      Just cnt -> cnt
+parseDocCount :: TL.Text -> Maybe Integer
+parseDocCount txt = TR.readMaybe $ T.unpack $ txt ^. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "Count" . TTL.contents
 data PubMed =
-  PubMed { pubmed_article :: PubMedArticle
+  PubMed { pubmed_id :: Int
+         , pubmed_article :: PubMedArticle
          , pubmed_date :: PubMedDate
          } deriving Show
@@ -63,19 +72,26 @@ parsePubMed :: TL.Text -> [PubMed]
 parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
   where
     pubmedArticle = TTL.html . TTL.allNamed (only "PubmedArticleSet") . TTL.allNamed (only "PubmedArticle")
-    pubMed el = PubMed <$> el ^? article . to pubMedArticle
+    pubMed el = PubMed <$> el ^? articleId . to pubMedId
+                       <*> el ^? article . to pubMedArticle
                        <*> el ^? pubDate . to pubMedDate
       where
-        article = namedEl "MedlineCitation" . namedEl "Article"
+        medline :: Traversal' TTL.Element TTL.Element
+        medline = namedEl "MedlineCitation"
+        article = medline . namedEl "Article"
+        articleId = medline . namedEl "PMID"
         pubDate = namedEl "PubmedData" . namedEl "History" . namedEl "PubMedPubDate" . TTL.attributed (ix "PubStatus" . only "pubmed")
+        pubMedId el = case (TR.readMaybe $ T.unpack $ el ^. TTL.contents) of
+          Nothing -> panic $ "Cannot parse id: " <> (T.unpack $ el ^. TTL.contents)
+          Just id -> id
         pubMedDate el = PubMedDate { pubmedDate_date = jour y m d
                                    , pubmedDate_year = y
                                    , pubmedDate_month = m
                                    , pubmedDate_day = d }
          where
-            y = read $ T.unpack $ el ^. namedEl "Year" . TTL.contents
-            m = read $ T.unpack $ el ^. namedEl "Month" . TTL.contents
-            d = read $ T.unpack $ el ^. namedEl "Day" . TTL.contents
+            y = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Year" . TTL.contents
+            m = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Month" . TTL.contents
+            d = fromMaybe 1 $ TR.readMaybe $ T.unpack $ el ^. namedEl "Day" . TTL.contents
         pubMedArticle el = PubMedArticle { pubmed_title = Just $ el ^. title
                                          , pubmed_journal = el ^? journalTitle
                                          , pubmed_abstract = el ^.. abstract
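Note: a small sketch of the new Count and Id extraction on an inline eSearch fragment (illustrative XML, not a real PubMed response); assuming the taggy-lens traversals behave on this fragment as on a full response, exampleCount should evaluate to Just 2319 and exampleIds to [31216414]:

{-# LANGUAGE OverloadedStrings #-}
import PUBMED.Parser (parseDocCount, parseDocIds)

exampleCount :: Maybe Integer
exampleCount = parseDocCount
  "<eSearchResult><Count>2319</Count><IdList><Id>31216414</Id></IdList></eSearchResult>"

exampleIds :: [Integer]
exampleIds = parseDocIds
  "<eSearchResult><Count>2319</Count><IdList><Id>31216414</Id></IdList></eSearchResult>"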