Commit 1b5abcee authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[taggy] use taggy to parse XML

This fixes a bug where article title had bold text inside (which threw
an error).
parent 233fb199
...@@ -38,6 +38,7 @@ library ...@@ -38,6 +38,7 @@ library
MultiParamTypeClasses MultiParamTypeClasses
OverloadedStrings OverloadedStrings
TypeOperators TypeOperators
ghc-options: -Wunused-imports
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
...@@ -49,13 +50,14 @@ library ...@@ -49,13 +50,14 @@ library
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
executable crawlerPubMed-exe executable crawlerPubMed-exe
...@@ -67,7 +69,7 @@ executable crawlerPubMed-exe ...@@ -67,7 +69,7 @@ executable crawlerPubMed-exe
MultiParamTypeClasses MultiParamTypeClasses
OverloadedStrings OverloadedStrings
TypeOperators TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -Wunused-imports -with-rtsopts=-N
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
...@@ -80,13 +82,14 @@ executable crawlerPubMed-exe ...@@ -80,13 +82,14 @@ executable crawlerPubMed-exe
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
test-suite crawlerPubMed-test test-suite crawlerPubMed-test
...@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test ...@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
...@@ -26,26 +26,29 @@ default-extensions: ...@@ -26,26 +26,29 @@ default-extensions:
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md> description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies: dependencies:
- attoparsec
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- servant
- servant-client
- text
- bytestring - bytestring
- xml-conduit - conduit
- data-time-segment
- either
- exceptions
- http-client - http-client
- http-client-tls - http-client-tls
- http-media - http-media
- exceptions - lens
- conduit
- xml-types
- time
- data-time-segment
- protolude - protolude
- attoparsec - servant
- either - servant-client
- taggy
- taggy-lens
- text
- time
library: library:
source-dirs: src source-dirs: src
ghc-options:
- -Wunused-imports
executables: executables:
crawlerPubMed-exe: crawlerPubMed-exe:
...@@ -54,6 +57,7 @@ executables: ...@@ -54,6 +57,7 @@ executables:
ghc-options: ghc-options:
- -threaded - -threaded
- -rtsopts - -rtsopts
- -Wunused-imports
- -with-rtsopts=-N - -with-rtsopts=-N
dependencies: dependencies:
- crawlerPubMed - crawlerPubMed
......
module PUBMED where module PUBMED where
import Prelude hiding (takeWhile) import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
import Data.ByteString.Lazy (ByteString)
import Data.Text (Text) import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import PUBMED.Client
import PUBMED.Parser
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..)) import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
pmHost :: String pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov" pmHost = "eutils.ncbi.nlm.nih.gov"
...@@ -39,9 +33,6 @@ defaultEnv = do ...@@ -39,9 +33,6 @@ defaultEnv = do
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed]) getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest getMetadataWith = runSimpleFindPubmedAbstractRequest
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end -- | TODO this parser need at least one subs at the end
-- (use endOfInput) -- (use endOfInput)
removeSub :: Parser ByteString removeSub :: Parser ByteString
...@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do ...@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do
env env
case res of case res of
(Left err) -> pure (Left . T.pack $ show err) (Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) -> (Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'') (Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do (Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> do let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
_ <- print e pure $ Right parsed
pure []) :: XmlException -> IO [PubMed]) -- parsed <- catch (pubMedParser v) ((\e -> do
Right <$> pure parsed -- _ <- print e
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed]) runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do runSimpleFindPubmedAbstractRequest query limit = do
...@@ -112,4 +106,8 @@ searchDocIds query limit = do ...@@ -112,4 +106,8 @@ searchDocIds query limit = do
case res of case res of
(Left err) -> pure (Left $ T.pack $ show err) (Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do (Right (BsXml docs)) -> do
pure $ Right $ runParser parseDocId docs pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
module PUBMED.Client where module PUBMED.Client where
import Data.ByteString.Lazy (ByteString)
import Data.Proxy (Proxy(..))
import Servant.API import Servant.API
import Servant.Client import Servant.Client
import Data.Proxy (Proxy(..))
import Data.ByteString.Lazy (ByteString)
import qualified Network.HTTP.Media as M
import qualified Data.Text as T import qualified Data.Text as T
import qualified Network.HTTP.Media as M
data DB = PUBMED data DB = PUBMED
......
module PUBMED.Parser where {-# LANGUAGE LambdaCase #-}
import Text.XML.Stream.Parse module PUBMED.Parser where
import qualified Text.XML.Cursor as C -- ((&/), (&//), Cursor, content, element)
import Text.XML (Name)
import Data.Either (rights) import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism')
import Data.Maybe (Maybe, fromJust, fromMaybe) import Data.Maybe (catMaybes)
import Data.Monoid (mconcat)
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..)) import Data.Time (UTCTime(..))
import Data.Time.Segment (jour)
import Prelude hiding (head)
import Text.Taggy (Node(..))
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import qualified Text.Taggy.Lens as TTL
import GHC.IO (FilePath) namedEl name = TTL.elements . TTL.named (only name)
import Protolude (head) contentWithChildren :: Prism' Node T.Text
import Prelude hiding (head) contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
import Control.Monad.Catch (MonadThrow) ; NodeElement e -> Just $ e ^. TTL.children . traverse . contentWithChildren }
import Control.Monad (join)
import qualified Data.Text as T parseDocIds :: TL.Text -> [Integer]
import qualified Data.Text.Read as T parseDocIds txt = map (\s -> read (T.unpack s) :: Integer) parsed
import qualified Data.ByteString.Lazy as DBL where
import qualified Data.Conduit.List as CL parsed = txt ^.. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "IdList" . namedEl "Id" . TTL.contents
parseDocId :: C.Cursor -> [Integer]
parseDocId cursor = fst <$>
rights (T.decimal
<$> filter notNullOrEOL (rawElement cursor)
)
where rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
notNullOrEOL t = not (T.null t) && t /= "\n"
identity :: a -> a
identity x = x
manyTagsUntil :: MonadThrow m =>
Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that consumes everything but the tag given
-- usefull because we have to consume every data.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
data PubMed = data PubMed =
PubMed { pubmed_article :: PubMedArticle PubMed { pubmed_article :: PubMedArticle
...@@ -64,17 +33,16 @@ data PubMed = ...@@ -64,17 +33,16 @@ data PubMed =
data PubMedArticle = data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe T.Text PubMedArticle { pubmed_title :: Maybe T.Text
, pubmed_journal :: Maybe T.Text , pubmed_journal :: Maybe T.Text
, pubmed_abstract :: Maybe [T.Text] , pubmed_abstract :: Maybe [T.Text] -- TODO No Maybe?
, pubmed_authors :: Maybe [Author] , pubmed_authors :: Maybe [Author] -- TODO No Maybe?
} }
deriving (Show) deriving (Show)
data Author = data Author =
Author { Author { lastName :: Maybe T.Text
lastName :: Maybe T.Text, , foreName :: Maybe T.Text
foreName :: Maybe T.Text, , affiliation :: Maybe T.Text
affiliation :: Maybe T.Text }
}
deriving (Show) deriving (Show)
data PubMedDate = data PubMedDate =
...@@ -84,76 +52,146 @@ data PubMedDate = ...@@ -84,76 +52,146 @@ data PubMedDate =
, pubmedDate_day :: Int , pubmedDate_day :: Int
} deriving (Show) } deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do parsePubMedFile fp = do
input <- DBL.readFile fp input <- TLIO.readFile fp
pubMedParser input pure $ parsePubMed input
pubMedParser :: DBL.ByteString -> IO [PubMed] parsePubMed :: TL.Text -> [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
.| parseArticleSet where
.| CL.consume pubmedArticle = TTL.html . TTL.allNamed (only "PubmedArticleSet") . TTL.allNamed (only "PubmedArticle")
pubMed el = PubMed <$> el ^? article . to pubMedArticle
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m () <*> el ^? pubDate . to pubMedDate
parseArticleSet = where
force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle article = namedEl "MedlineCitation" . namedEl "Article"
pubDate = namedEl "PubmedData" . namedEl "History" . namedEl "PubMedPubDate" . TTL.attributed (ix "PubStatus" . only "pubmed")
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed) pubMedDate el = PubMedDate { pubmedDate_date = jour y m d
parsePubMedArticle = , pubmedDate_year = y
manyTagsUntil "PubmedArticle" parsePubMedArticle' , pubmedDate_month = m
, pubmedDate_day = d }
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed where
parsePubMedArticle' = do y = read $ T.unpack $ el ^. namedEl "Year" . TTL.contents
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation m = read $ T.unpack $ el ^. namedEl "Month" . TTL.contents
dates <- tagIgnoreAttrs "PubmedData" $ do d = read $ T.unpack $ el ^. namedEl "Day" . TTL.contents
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do pubMedArticle el = PubMedArticle { pubmed_title = Just $ el ^. title
y' <- force "Year" $ tagIgnoreAttrs "Year" content , pubmed_journal = el ^? journalTitle
m' <- force "Month" $ tagIgnoreAttrs "Month" content , pubmed_abstract = Just $ el ^.. abstract
d' <- force "Day" $ tagIgnoreAttrs "Day" content , pubmed_authors = Just $ el ^.. authors . to author }
_ <- many ignoreAnyTreeContent where
return (read $ unpack y', read $ unpack m', read $ unpack d') journalTitle = namedEl "Journal" . namedEl "Title" . TTL.contents
_ <- many ignoreAnyTreeContent title = namedEl "ArticleTitle" . TTL.children . traverse . contentWithChildren
return dates' abstract = namedEl "Abstract" . namedEl "AbstractText" . TTL.contents
_ <- many ignoreAnyTreeContent authors = namedEl "AuthorList" . TTL.allNamed (only "Author")
let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates) author el = Author { lastName = el ^? lastName
return $ PubMed article (PubMedDate (jour y m d) y m d) , foreName = el ^? firstName
, affiliation = el ^? affiliation }
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle where
parseMedlineCitation = do firstName = namedEl "ForeName" . TTL.contents
a <- force "article" $ manyTagsUntil "Article" parseArticle lastName = namedEl "LastName" . TTL.contents
_ <- many ignoreAnyTreeContent affiliation = namedEl "AffiliationInfo" . namedEl "Affiliation" . TTL.contents
return a
--identity :: a -> a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle --identity x = x
parseArticle = do --
journal <- force "journal" $ --manyTagsUntil :: MonadThrow m =>
manyTagsUntil "Journal" $ do -- Name
j <- manyTagsUntil "Title" content -- -> ConduitT Event o m b
_ <- many ignoreAnyTreeContent -- -> ConduitT Event o m (Maybe b)
return j --manyTagsUntil name f = do
-- _ <- manyTagsUntil_ name
title <- manyTagsUntil "ArticleTitle" content -- tagIgnoreAttrs (matching (== name)) f
--
abstracts <- manyTagsUntil "Abstract" . many $ do ---- | Utility function that matches everything but the tag given
txt <- tagIgnoreAttrs "AbstractText" content --tagUntil :: Name -> NameMatcher Name
_ <- many ignoreAnyTreeContent --tagUntil name = matching (/= name)
return txt --
-- TODO add authos ---- | Utility function that consumes everything but the tag given
---- usefull because we have to consume every data.
authors <- manyTagsUntil "AuthorList" . many $ --manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
tagIgnoreAttrs "Author" $ do --manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
ln <- manyTagsUntil "LastName" content --
fn <- manyTagsUntil "ForeName" content --manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
affi <- manyTagsUntil "AffiliationInfo" $ do --manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
aff <- manyTagsUntil "Affiliation" content
_ <- many ignoreAnyTreeContent --parseDocId :: C.Cursor -> [Integer]
return aff --parseDocId cursor = fst <$>
_ <- many ignoreAnyTreeContent -- rights (T.decimal
return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi} -- <$> filter notNullOrEOL (rawElement cursor)
-- )
-- where rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
_ <- many ignoreAnyTreeContent -- notNullOrEOL t = not (T.null t) && t /= "\n"
return $ PubMedArticle title journal abstracts authors
--readPubMedFile :: FilePath -> IO [PubMed]
--readPubMedFile fp = do
-- input <- DBL.readFile fp
-- pubMedParser input
--pubMedParser :: DBL.ByteString -> IO [PubMed]
--pubMedParser bstring = runConduit $ parseLBS def bstring
-- .| parseArticleSet
-- .| CL.consume
--
--parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
--parseArticleSet =
-- force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
--
--parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
--parsePubMedArticle =
-- manyTagsUntil "PubmedArticle" parsePubMedArticle'
--
--parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
--parsePubMedArticle' = do
-- article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
-- dates <- tagIgnoreAttrs "PubmedData" $ do
-- dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
-- y' <- force "Year" $ tagIgnoreAttrs "Year" content
-- m' <- force "Month" $ tagIgnoreAttrs "Month" content
-- d' <- force "Day" $ tagIgnoreAttrs "Day" content
-- _ <- many ignoreAnyTreeContent
-- return (read $ unpack y', read $ unpack m', read $ unpack d')
-- _ <- many ignoreAnyTreeContent
-- return dates'
-- _ <- many ignoreAnyTreeContent
-- let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates)
-- return $ PubMed article (PubMedDate (jour y m d) y m d)
--
--parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
--parseMedlineCitation = do
-- a <- force "article" $ manyTagsUntil "Article" parseArticle
-- _ <- many ignoreAnyTreeContent
-- return a
--
--parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
--parseArticle = do
-- journal <- force "journal" $
-- manyTagsUntil "Journal" $ do
-- j <- manyTagsUntil "Title" content
-- _ <- many ignoreAnyTreeContent
-- return j
--
-- title <- manyTagsUntil "ArticleTitle" content
--
-- abstracts <- manyTagsUntil "Abstract" . many $ do
-- txt <- tagIgnoreAttrs "AbstractText" content
-- _ <- many ignoreAnyTreeContent
-- return txt
-- -- TODO add authos
--
-- authors <- manyTagsUntil "AuthorList" . many $
-- tagIgnoreAttrs "Author" $ do
-- ln <- manyTagsUntil "LastName" content
-- fn <- manyTagsUntil "ForeName" content
-- affi <- manyTagsUntil "AffiliationInfo" $ do
-- aff <- manyTagsUntil "Affiliation" content
-- _ <- many ignoreAnyTreeContent
-- return aff
-- _ <- many ignoreAnyTreeContent
-- return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
--
--
-- _ <- many ignoreAnyTreeContent
-- return $ PubMedArticle title journal abstracts authors
pubMedData :: DBL.ByteString pubMedData :: DBL.ByteString
......
...@@ -40,7 +40,8 @@ packages: ...@@ -40,7 +40,8 @@ packages:
extra-deps: extra-deps:
- git: https://github.com/delanoe/data-time-segment.git - git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723 commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
# Override default flag values for local packages and extra-deps # Override default flag values for local packages and extra-deps
# flags: {} # flags: {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment