Commit 23a3e2cd authored by Mael NICOLAS's avatar Mael NICOLAS

correct parsing bug, and permit huge query (tested with 100 000)

parent 62ecec55
...@@ -6,4 +6,4 @@ import PUBMED (crawler) ...@@ -6,4 +6,4 @@ import PUBMED (crawler)
main :: IO () main :: IO ()
main = crawler "bisphenol" (Just 50) >>= print main = crawler "bisphenol" (Just 1000000) >>= print
...@@ -11,10 +11,11 @@ import Network.HTTP.Client.TLS (tlsManagerSettings) ...@@ -11,10 +11,11 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..)) import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def) import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor) import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT) import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString) import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (MonadThrow) import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString
...@@ -47,6 +48,32 @@ removeSub = do ...@@ -47,6 +48,32 @@ removeSub = do
type Query = Text type Query = Text
type Limit = Integer type Limit = Integer
runMultipleFPAR ::
[Integer]
-> IO (Either Text [PubMed])
runMultipleFPAR ids
| length ids < 300 = runSimpleFetchPubmedAbstractRequest ids
| otherwise = do
runSimpleFetchPubmedAbstractRequest (Prelude.take 300 ids)
<> runMultipleFPAR (drop 300 ids)
runSimpleFetchPubmedAbstractRequest ::
[Integer]
-> IO (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(fetch (Just "pubmed") (Just "abstract") ids)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) ->
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> pure []) :: XmlException -> IO [PubMed])
Right <$> pure parsed
crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed]) crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed])
crawler = runSimpleFindPubmedAbstractRequest crawler = runSimpleFindPubmedAbstractRequest
...@@ -60,15 +87,5 @@ runSimpleFindPubmedAbstractRequest query limit = do ...@@ -60,15 +87,5 @@ runSimpleFindPubmedAbstractRequest query limit = do
(Left err) -> pure (Left $ T.pack $ show err) (Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do (Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs let docIds = runParser parseDocId docs
res' <- runClientM runMultipleFPAR docIds
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err') -> pure (Left $ T.pack $ show err')
(Right (BsXml abstracts)) -> do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> Right <$> pubMedParser v
...@@ -137,19 +137,18 @@ parseArticle = do ...@@ -137,19 +137,18 @@ parseArticle = do
authors <- manyTagsUntil "AuthorList" . many $ authors <- manyTagsUntil "AuthorList" . many $
tagIgnoreAttrs "Author" $ do tagIgnoreAttrs "Author" $ do
ln <- tagIgnoreAttrs "LastName" content ln <- manyTagsUntil "LastName" content
fn <- tagIgnoreAttrs "ForeName" content fn <- manyTagsUntil "ForeName" content
affi <- manyTagsUntil "AffiliationInfo" $ affi <- manyTagsUntil "AffiliationInfo" $ do
tagIgnoreAttrs "Affiliation" content aff <- manyTagsUntil "Affiliation" content
_ <- many ignoreAnyTreeContent
return aff
_ <- many ignoreAnyTreeContent _ <- many ignoreAnyTreeContent
return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi} return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
abstracts <- abstracts <-
manyTagsUntil "Abstract" . many $ do manyTagsUntil "Abstract" . many $ do
txt <- tagIgnoreAttrs "AbstractT.Text" $ do txt <- tagIgnoreAttrs "AbstractText" content
c <- content
_ <- many ignoreAnyTreeContent
return c
_ <- many ignoreAnyTreeContent _ <- many ignoreAnyTreeContent
return txt return txt
-- TODO add authos -- TODO add authos
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment