Commit 62ecec55 authored by Mael NICOLAS's avatar Mael NICOLAS

correct removeSub

parent 06476735
...@@ -6,4 +6,4 @@ import PUBMED (crawler) ...@@ -6,4 +6,4 @@ import PUBMED (crawler)
main :: IO () main :: IO ()
main = crawler "bisphenol" (Just 3) >>= print main = crawler "bisphenol" (Just 50) >>= print
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
module PUBMED where module PUBMED where
import Prelude hiding (takeWhile)
import Data.Text (Text) import Data.Text (Text)
import PUBMED.Client import PUBMED.Client
import PUBMED.Parser import PUBMED.Parser
...@@ -31,12 +32,17 @@ runParser parser = parser . fromDocument . parseLBS_ def ...@@ -31,12 +32,17 @@ runParser parser = parser . fromDocument . parseLBS_ def
removeSub :: Parser ByteString removeSub :: Parser ByteString
removeSub = do removeSub = do
dt <- many textWithBalise dt <- many textWithBalise
pure $ LBS.fromStrict $ pack $ concat dt eo <- manyTill anyChar endOfInput
pure $ LBS.fromStrict $ pack $ concat dt <> eo
where where
textWithBalise = manyTill anyChar (try subs) textWithBalise =
subs = sub <|> asub -- <|> isEndOfInput manyTill anyChar (sub <|> asub)
sub = string "<sub>" sub = string "<sub>"
<|> string "<sup>"
<|> string "<i>"
asub = string "</sub>" asub = string "</sub>"
<|> string "</sup>"
<|> string "</i>"
type Query = Text type Query = Text
type Limit = Integer type Limit = Integer
...@@ -51,18 +57,18 @@ runSimpleFindPubmedAbstractRequest query limit = do ...@@ -51,18 +57,18 @@ runSimpleFindPubmedAbstractRequest query limit = do
(search (Just query) limit) (search (Just query) limit)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils") (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of case res of
(Left _) -> pure (Left "Error: PubMed Internet connection") (Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do (Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs let docIds = runParser parseDocId docs
res' <- runClientM res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds) (fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils") (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of case res' of
(Left _) -> pure (Left "Error: PubMed API connection") (Left err') -> pure (Left $ T.pack $ show err')
(Right (BsXml abstracts)) -> do (Right (BsXml abstracts)) -> do
-- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput -- TODO remove "</sub>" maybe there is a cleaner way with isEndOfInput
case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of case (parseOnly removeSub $ LBS.toStrict abstracts <> "</sub>") of
(Left err) -> pure (Left "Error: PubMed parser") (Left err'') -> pure (Left $ T.pack err'')
(Right v) -> Right <$> pubMedParser v (Right v) -> Right <$> pubMedParser v
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment