Commit 1b5abcee authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[taggy] use taggy to parse XML

This fixes a bug where article title had bold text inside (which threw
an error).
parent 233fb199
......@@ -38,6 +38,7 @@ library
MultiParamTypeClasses
OverloadedStrings
TypeOperators
ghc-options: -Wunused-imports
build-depends:
attoparsec
, base >=4.7 && <5
......@@ -49,13 +50,14 @@ library
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
......@@ -67,7 +69,7 @@ executable crawlerPubMed-exe
MultiParamTypeClasses
OverloadedStrings
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -Wunused-imports -with-rtsopts=-N
build-depends:
attoparsec
, base >=4.7 && <5
......@@ -80,13 +82,14 @@ executable crawlerPubMed-exe
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
......@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
......@@ -26,26 +26,29 @@ default-extensions:
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies:
- attoparsec
- base >= 4.7 && < 5
- servant
- servant-client
- text
- bytestring
- xml-conduit
- conduit
- data-time-segment
- either
- exceptions
- http-client
- http-client-tls
- http-media
- exceptions
- conduit
- xml-types
- time
- data-time-segment
- lens
- protolude
- attoparsec
- either
- servant
- servant-client
- taggy
- taggy-lens
- text
- time
library:
source-dirs: src
ghc-options:
- -Wunused-imports
executables:
crawlerPubMed-exe:
......@@ -54,6 +57,7 @@ executables:
ghc-options:
- -threaded
- -rtsopts
- -Wunused-imports
- -with-rtsopts=-N
dependencies:
- crawlerPubMed
......
module PUBMED where
import Prelude hiding (takeWhile)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
import Data.ByteString.Lazy (ByteString)
import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import PUBMED.Client
import PUBMED.Parser
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov"
......@@ -39,9 +33,6 @@ defaultEnv = do
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
......@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do
env
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) ->
(Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> do
_ <- print e
pure []) :: XmlException -> IO [PubMed])
Right <$> pure parsed
let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
pure $ Right parsed
-- parsed <- catch (pubMedParser v) ((\e -> do
-- _ <- print e
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do
......@@ -112,4 +106,8 @@ searchDocIds query limit = do
case res of
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
pure $ Right $ runParser parseDocId docs
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
module PUBMED.Client where
import Data.ByteString.Lazy (ByteString)
import Data.Proxy (Proxy(..))
import Servant.API
import Servant.Client
import Data.Proxy (Proxy(..))
import Data.ByteString.Lazy (ByteString)
import qualified Network.HTTP.Media as M
import qualified Data.Text as T
import qualified Network.HTTP.Media as M
data DB = PUBMED
......
module PUBMED.Parser where
{-# LANGUAGE LambdaCase #-}
import Text.XML.Stream.Parse
import qualified Text.XML.Cursor as C -- ((&/), (&//), Cursor, content, element)
import Text.XML (Name)
module PUBMED.Parser where
import Data.Either (rights)
import Data.Maybe (Maybe, fromJust, fromMaybe)
import Data.Monoid (mconcat)
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism')
import Data.Maybe (catMaybes)
import Data.Time (UTCTime(..))
import Data.Time.Segment (jour)
import Prelude hiding (head)
import Text.Taggy (Node(..))
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import qualified Text.Taggy.Lens as TTL
import GHC.IO (FilePath)
namedEl name = TTL.elements . TTL.named (only name)
import Protolude (head)
import Prelude hiding (head)
import Control.Monad.Catch (MonadThrow)
import Control.Monad (join)
contentWithChildren :: Prism' Node T.Text
contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
; NodeElement e -> Just $ e ^. TTL.children . traverse . contentWithChildren }
import qualified Data.Text as T
import qualified Data.Text.Read as T
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Conduit.List as CL
parseDocId :: C.Cursor -> [Integer]
parseDocId cursor = fst <$>
rights (T.decimal
<$> filter notNullOrEOL (rawElement cursor)
)
where rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
notNullOrEOL t = not (T.null t) && t /= "\n"
identity :: a -> a
identity x = x
manyTagsUntil :: MonadThrow m =>
Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that consumes everything but the tag given
-- usefull because we have to consume every data.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
parseDocIds :: TL.Text -> [Integer]
parseDocIds txt = map (\s -> read (T.unpack s) :: Integer) parsed
where
parsed = txt ^.. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "IdList" . namedEl "Id" . TTL.contents
data PubMed =
PubMed { pubmed_article :: PubMedArticle
......@@ -64,17 +33,16 @@ data PubMed =
data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe T.Text
, pubmed_journal :: Maybe T.Text
, pubmed_abstract :: Maybe [T.Text]
, pubmed_authors :: Maybe [Author]
, pubmed_abstract :: Maybe [T.Text] -- TODO No Maybe?
, pubmed_authors :: Maybe [Author] -- TODO No Maybe?
}
deriving (Show)
data Author =
Author {
lastName :: Maybe T.Text,
foreName :: Maybe T.Text,
affiliation :: Maybe T.Text
}
Author { lastName :: Maybe T.Text
, foreName :: Maybe T.Text
, affiliation :: Maybe T.Text
}
deriving (Show)
data PubMedDate =
......@@ -84,76 +52,146 @@ data PubMedDate =
, pubmedDate_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do
input <- DBL.readFile fp
pubMedParser input
pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
.| parseArticleSet
.| CL.consume
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet =
force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle =
manyTagsUntil "PubmedArticle" parsePubMedArticle'
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
d' <- force "Day" $ tagIgnoreAttrs "Day" content
_ <- many ignoreAnyTreeContent
return (read $ unpack y', read $ unpack m', read $ unpack d')
_ <- many ignoreAnyTreeContent
return dates'
_ <- many ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates)
return $ PubMed article (PubMedDate (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
a <- force "article" $ manyTagsUntil "Article" parseArticle
_ <- many ignoreAnyTreeContent
return a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
journal <- force "journal" $
manyTagsUntil "Journal" $ do
j <- manyTagsUntil "Title" content
_ <- many ignoreAnyTreeContent
return j
title <- manyTagsUntil "ArticleTitle" content
abstracts <- manyTagsUntil "Abstract" . many $ do
txt <- tagIgnoreAttrs "AbstractText" content
_ <- many ignoreAnyTreeContent
return txt
-- TODO add authos
authors <- manyTagsUntil "AuthorList" . many $
tagIgnoreAttrs "Author" $ do
ln <- manyTagsUntil "LastName" content
fn <- manyTagsUntil "ForeName" content
affi <- manyTagsUntil "AffiliationInfo" $ do
aff <- manyTagsUntil "Affiliation" content
_ <- many ignoreAnyTreeContent
return aff
_ <- many ignoreAnyTreeContent
return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
_ <- many ignoreAnyTreeContent
return $ PubMedArticle title journal abstracts authors
parsePubMedFile fp = do
input <- TLIO.readFile fp
pure $ parsePubMed input
parsePubMed :: TL.Text -> [PubMed]
parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
where
pubmedArticle = TTL.html . TTL.allNamed (only "PubmedArticleSet") . TTL.allNamed (only "PubmedArticle")
pubMed el = PubMed <$> el ^? article . to pubMedArticle
<*> el ^? pubDate . to pubMedDate
where
article = namedEl "MedlineCitation" . namedEl "Article"
pubDate = namedEl "PubmedData" . namedEl "History" . namedEl "PubMedPubDate" . TTL.attributed (ix "PubStatus" . only "pubmed")
pubMedDate el = PubMedDate { pubmedDate_date = jour y m d
, pubmedDate_year = y
, pubmedDate_month = m
, pubmedDate_day = d }
where
y = read $ T.unpack $ el ^. namedEl "Year" . TTL.contents
m = read $ T.unpack $ el ^. namedEl "Month" . TTL.contents
d = read $ T.unpack $ el ^. namedEl "Day" . TTL.contents
pubMedArticle el = PubMedArticle { pubmed_title = Just $ el ^. title
, pubmed_journal = el ^? journalTitle
, pubmed_abstract = Just $ el ^.. abstract
, pubmed_authors = Just $ el ^.. authors . to author }
where
journalTitle = namedEl "Journal" . namedEl "Title" . TTL.contents
title = namedEl "ArticleTitle" . TTL.children . traverse . contentWithChildren
abstract = namedEl "Abstract" . namedEl "AbstractText" . TTL.contents
authors = namedEl "AuthorList" . TTL.allNamed (only "Author")
author el = Author { lastName = el ^? lastName
, foreName = el ^? firstName
, affiliation = el ^? affiliation }
where
firstName = namedEl "ForeName" . TTL.contents
lastName = namedEl "LastName" . TTL.contents
affiliation = namedEl "AffiliationInfo" . namedEl "Affiliation" . TTL.contents
--identity :: a -> a
--identity x = x
--
--manyTagsUntil :: MonadThrow m =>
-- Name
-- -> ConduitT Event o m b
-- -> ConduitT Event o m (Maybe b)
--manyTagsUntil name f = do
-- _ <- manyTagsUntil_ name
-- tagIgnoreAttrs (matching (== name)) f
--
---- | Utility function that matches everything but the tag given
--tagUntil :: Name -> NameMatcher Name
--tagUntil name = matching (/= name)
--
---- | Utility function that consumes everything but the tag given
---- usefull because we have to consume every data.
--manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
--manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
--
--manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
--manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
--parseDocId :: C.Cursor -> [Integer]
--parseDocId cursor = fst <$>
-- rights (T.decimal
-- <$> filter notNullOrEOL (rawElement cursor)
-- )
-- where rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
-- notNullOrEOL t = not (T.null t) && t /= "\n"
--readPubMedFile :: FilePath -> IO [PubMed]
--readPubMedFile fp = do
-- input <- DBL.readFile fp
-- pubMedParser input
--pubMedParser :: DBL.ByteString -> IO [PubMed]
--pubMedParser bstring = runConduit $ parseLBS def bstring
-- .| parseArticleSet
-- .| CL.consume
--
--parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
--parseArticleSet =
-- force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
--
--parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
--parsePubMedArticle =
-- manyTagsUntil "PubmedArticle" parsePubMedArticle'
--
--parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
--parsePubMedArticle' = do
-- article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
-- dates <- tagIgnoreAttrs "PubmedData" $ do
-- dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
-- y' <- force "Year" $ tagIgnoreAttrs "Year" content
-- m' <- force "Month" $ tagIgnoreAttrs "Month" content
-- d' <- force "Day" $ tagIgnoreAttrs "Day" content
-- _ <- many ignoreAnyTreeContent
-- return (read $ unpack y', read $ unpack m', read $ unpack d')
-- _ <- many ignoreAnyTreeContent
-- return dates'
-- _ <- many ignoreAnyTreeContent
-- let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates)
-- return $ PubMed article (PubMedDate (jour y m d) y m d)
--
--parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
--parseMedlineCitation = do
-- a <- force "article" $ manyTagsUntil "Article" parseArticle
-- _ <- many ignoreAnyTreeContent
-- return a
--
--parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
--parseArticle = do
-- journal <- force "journal" $
-- manyTagsUntil "Journal" $ do
-- j <- manyTagsUntil "Title" content
-- _ <- many ignoreAnyTreeContent
-- return j
--
-- title <- manyTagsUntil "ArticleTitle" content
--
-- abstracts <- manyTagsUntil "Abstract" . many $ do
-- txt <- tagIgnoreAttrs "AbstractText" content
-- _ <- many ignoreAnyTreeContent
-- return txt
-- -- TODO add authos
--
-- authors <- manyTagsUntil "AuthorList" . many $
-- tagIgnoreAttrs "Author" $ do
-- ln <- manyTagsUntil "LastName" content
-- fn <- manyTagsUntil "ForeName" content
-- affi <- manyTagsUntil "AffiliationInfo" $ do
-- aff <- manyTagsUntil "Affiliation" content
-- _ <- many ignoreAnyTreeContent
-- return aff
-- _ <- many ignoreAnyTreeContent
-- return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
--
--
-- _ <- many ignoreAnyTreeContent
-- return $ PubMedArticle title journal abstracts authors
pubMedData :: DBL.ByteString
......
......@@ -40,7 +40,8 @@ packages:
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
# Override default flag values for local packages and extra-deps
# flags: {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment