correct parsing bug, and permit huge query (tested with 100 000)

23a3e2cd · Mael NICOLAS · 62ecec55 · 23a3e2cd · 23a3e2cd · 23a3e2cd
Commit 23a3e2cd authored Jul 08, 2019 by Mael NICOLAS
Show whitespace changes
Inline Side-by-side

Showing with 37 additions and 21 deletions

Main.hs app/Main.hs +1 -1

PUBMED.hs src/PUBMED.hs +29 -12

Parser.hs src/PUBMED/Parser.hs +7 -8

No files found.
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -6,4 +6,4 @@ import PUBMED (crawler)
 main :: IO ()
-main = crawler "bisphenol" (Just 50) >>= print
+main = crawler "bisphenol" (Just 1000000) >>= print
--- a/src/PUBMED.hs
+++ b/src/PUBMED.hs
@@ -11,10 +11,11 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
 import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
 import Text.XML (parseLBS_, def)
 import Text.XML.Cursor (fromDocument, Cursor)
+import Text.XML.Stream.Parse (XmlException)
 import Data.Conduit (ConduitT)
 import Data.ByteString.Lazy (ByteString)
 import Data.ByteString.Char8 (pack)
-import Control.Monad.Catch (MonadThrow)
+import Control.Monad.Catch (catch, MonadThrow, Exception)
 import Control.Applicative
 import Data.Attoparsec.ByteString
@@ -47,6 +48,32 @@ removeSub = do
 type Query = Text
 type Limit = Integer
+runMultipleFPAR ::
+  [Integer]
+  -> IO (Either Text [PubMed])
+runMultipleFPAR ids
+  | length ids < 300 = runSimpleFetchPubmedAbstractRequest ids
+  | otherwise = do
+      runSimpleFetchPubmedAbstractRequest (Prelude.take 300 ids)
+        <> runMultipleFPAR (drop 300 ids)
+runSimpleFetchPubmedAbstractRequest ::
+  [Integer]
+  -> IO (Either Text [PubMed])
+runSimpleFetchPubmedAbstractRequest ids = do
+  manager' <- newManager tlsManagerSettings
+  res <- runClientM
+    (fetch (Just "pubmed") (Just "abstract") ids)
+    (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
+  case res of
+    (Left err) -> pure (Left . T.pack $ show err)
+    (Right (BsXml abs)) ->
+      case parseOnly removeSub $ LBS.toStrict abs of
+        (Left err'') -> pure (Left $ T.pack err'')
+        (Right v) -> do
+          parsed <- catch (pubMedParser v) ((\e -> pure []) :: XmlException -> IO [PubMed])
+          Right <$> pure parsed
 crawler :: Text -> Maybe Limit -> IO (Either Text [PubMed])
 crawler = runSimpleFindPubmedAbstractRequest
@@ -60,15 +87,5 @@ runSimpleFindPubmedAbstractRequest query limit =  do
    (Left err) -> pure (Left $ T.pack $ show err)
    (Right (BsXml docs)) -> do
      let docIds = runParser parseDocId docs
-      res' <- runClientM
+      runMultipleFPAR docIds
-        (fetch (Just "pubmed") (Just "abstract") docIds)
-        (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
-      case res' of
-        (Left err') -> pure (Left $ T.pack $ show err')
-        (Right (BsXml abstracts)) -> do
-          -- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
-          case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
-            (Left err'') -> pure (Left $ T.pack err'')
-            (Right v) -> Right <$> pubMedParser v
--- a/src/PUBMED/Parser.hs
+++ b/src/PUBMED/Parser.hs
@@ -137,19 +137,18 @@ parseArticle = do
  authors <-  manyTagsUntil "AuthorList" . many $
     tagIgnoreAttrs "Author" $ do
-        ln <- tagIgnoreAttrs "LastName" content
+        ln <- manyTagsUntil "LastName" content
-        fn <- tagIgnoreAttrs "ForeName" content
+        fn <- manyTagsUntil "ForeName" content
-        affi <- manyTagsUntil "AffiliationInfo" $
+        affi <- manyTagsUntil "AffiliationInfo" $ do
-          tagIgnoreAttrs "Affiliation" content
+          aff <- manyTagsUntil "Affiliation" content
+          _ <- many ignoreAnyTreeContent
+          return aff
        _ <- many ignoreAnyTreeContent
        return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
  abstracts <-
    manyTagsUntil "Abstract" . many $ do
-      txt <- tagIgnoreAttrs "AbstractT.Text" $ do
+      txt <- tagIgnoreAttrs "AbstractText" content
-        c <- content
-        _ <- many ignoreAnyTreeContent
-        return c
      _ <- many ignoreAnyTreeContent
      return txt
  -- TODO add authos