Commit 23a3e2cd authored by Mael NICOLAS's avatar Mael NICOLAS

correct parsing bug, and permit huge query (tested with 100 000)

parent 62ecec55
......@@ -6,4 +6,4 @@ import PUBMED (crawler)
main :: IO ()
main = crawler "bisphenol" (Just 50) >>= print
main = crawler "bisphenol" (Just 1000000) >>= print
......@@ -11,10 +11,11 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
......@@ -47,6 +48,32 @@ removeSub = do
type Query = Text
type Limit = Integer
runMultipleFPAR ::
[Integer]
-> IO (Either Text [PubMed])
runMultipleFPAR ids
| length ids < 300 = runSimpleFetchPubmedAbstractRequest ids
| otherwise = do
runSimpleFetchPubmedAbstractRequest (Prelude.take 300 ids)
<> runMultipleFPAR (drop 300 ids)
runSimpleFetchPubmedAbstractRequest ::
[Integer]
-> IO (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(fetch (Just "pubmed") (Just "abstract") ids)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) ->
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> pure []) :: XmlException -> IO [PubMed])
Right <$> pure parsed
crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed])
crawler = runSimpleFindPubmedAbstractRequest
......@@ -60,15 +87,5 @@ runSimpleFindPubmedAbstractRequest query limit = do
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs
res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err') -> pure (Left $ T.pack $ show err')
(Right (BsXml abstracts)) -> do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> Right <$> pubMedParser v
runMultipleFPAR docIds
......@@ -137,19 +137,18 @@ parseArticle = do
authors <- manyTagsUntil "AuthorList" . many $
tagIgnoreAttrs "Author" $ do
ln <- tagIgnoreAttrs "LastName" content
fn <- tagIgnoreAttrs "ForeName" content
affi <- manyTagsUntil "AffiliationInfo" $
tagIgnoreAttrs "Affiliation" content
ln <- manyTagsUntil "LastName" content
fn <- manyTagsUntil "ForeName" content
affi <- manyTagsUntil "AffiliationInfo" $ do
aff <- manyTagsUntil "Affiliation" content
_ <- many ignoreAnyTreeContent
return aff
_ <- many ignoreAnyTreeContent
return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
abstracts <-
manyTagsUntil "Abstract" . many $ do
txt <- tagIgnoreAttrs "AbstractT.Text" $ do
c <- content
_ <- many ignoreAnyTreeContent
return c
txt <- tagIgnoreAttrs "AbstractText" content
_ <- many ignoreAnyTreeContent
return txt
-- TODO add authos
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment