[taggy] use taggy to parse XML

This fixes a bug where article title had bold text inside (which threw an error).

[taggy] use taggy to parse XML
This fixes a bug where article title had bold text inside (which threw an error).
1b5abcee · Przemyslaw Kaminski · 233fb199 · 1b5abcee · 1b5abcee · 1b5abcee
Commit 1b5abcee authored Jan 19, 2022 by Przemyslaw Kaminski
6 changed files
--- a/crawlerPubMed.cabal
+++ b/crawlerPubMed.cabal
@@ -38,6 +38,7 @@ library
      MultiParamTypeClasses
      OverloadedStrings
      TypeOperators
+  ghc-options: -Wunused-imports
  build-depends:
      attoparsec
    , base >=4.7 && <5
@@ -49,13 +50,14 @@ library
    , http-client
    , http-client-tls
    , http-media
+    , lens
    , protolude
    , servant
    , servant-client
+    , taggy
+    , taggy-lens
    , text
    , time
-    , xml-conduit
-    , xml-types
  default-language: Haskell2010

 executable crawlerPubMed-exe
@@ -67,7 +69,7 @@ executable crawlerPubMed-exe
      MultiParamTypeClasses
      OverloadedStrings
      TypeOperators
-  ghc-options: -threaded -rtsopts -with-rtsopts=-N
+  ghc-options: -threaded -rtsopts -Wunused-imports -with-rtsopts=-N
  build-depends:
      attoparsec
    , base >=4.7 && <5
@@ -80,13 +82,14 @@ executable crawlerPubMed-exe
    , http-client
    , http-client-tls
    , http-media
+    , lens
    , protolude
    , servant
    , servant-client
+    , taggy
+    , taggy-lens
    , text
    , time
-    , xml-conduit
-    , xml-types
  default-language: Haskell2010

 test-suite crawlerPubMed-test
@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test
    , http-client
    , http-client-tls
    , http-media
+    , lens
    , protolude
    , servant
    , servant-client
+    , taggy
+    , taggy-lens
    , text
    , time
-    , xml-conduit
-    , xml-types
  default-language: Haskell2010
--- a/package.yaml
+++ b/package.yaml
@@ -26,26 +26,29 @@ default-extensions:
 description:         Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>

 dependencies:
+- attoparsec
 - base >= 4.7 && < 5
- servant
- servant-client
- text
 - bytestring
- xml-conduit
+- conduit
+- data-time-segment
+- either
+- exceptions
 - http-client
 - http-client-tls
 - http-media
- exceptions
- conduit
- xml-types
- time
- data-time-segment
+- lens
 - protolude
- attoparsec
- either
+- servant
+- servant-client
+- taggy
+- taggy-lens
+- text
+- time

 library:
  source-dirs: src
+  ghc-options:
+  - -Wunused-imports

 executables:
  crawlerPubMed-exe:
@@ -54,6 +57,7 @@ executables:
    ghc-options:
    - -threaded
    - -rtsopts
+    - -Wunused-imports
    - -with-rtsopts=-N
    dependencies:
    - crawlerPubMed

--- a/src/PUBMED.hs
+++ b/src/PUBMED.hs
 module PUBMED where

-import Prelude hiding (takeWhile)
+import Control.Applicative
+import Data.Attoparsec.ByteString
+import Data.Attoparsec.ByteString.Char8 (anyChar)
+import Data.ByteString.Char8 (pack)
+import Data.ByteString.Lazy (ByteString)
 import Data.Text (Text)
-import PUBMED.Client
-import PUBMED.Parser
 import Network.HTTP.Client (newManager)
 import Network.HTTP.Client.TLS (tlsManagerSettings)
+import PUBMED.Client
+import PUBMED.Parser
+import Prelude hiding (takeWhile)
 import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
-import Text.XML (parseLBS_, def)
-import Text.XML.Cursor (fromDocument, Cursor)
-import Text.XML.Stream.Parse (XmlException)
-import Data.Conduit (ConduitT)
-import Data.ByteString.Lazy (ByteString)
-import Data.ByteString.Char8 (pack)
-import Control.Monad.Catch (catch, MonadThrow, Exception)
-
-import Control.Applicative
-import Data.Attoparsec.ByteString
-import Data.Attoparsec.ByteString.Char8 (anyChar)
-
 import qualified Data.ByteString.Lazy as LBS
-import qualified Data.ByteString      as DB
-import qualified Data.Text as T
 import qualified Data.List as List
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Lazy as TL

 pmHost :: String
 pmHost = "eutils.ncbi.nlm.nih.gov"
@@ -39,9 +33,6 @@ defaultEnv = do
 getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
 getMetadataWith = runSimpleFindPubmedAbstractRequest

-runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
-runParser parser = parser . fromDocument . parseLBS_ def
-
 -- | TODO this parser need at least one subs at the end
 -- (use endOfInput)
 removeSub :: Parser ByteString
@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do
    env
  case res of
    (Left err) -> pure (Left . T.pack $ show err)
-    (Right (BsXml abs)) ->
+    (Right (BsXml abs)) -> do
+      --putStrLn $ show abs
      case parseOnly removeSub $ LBS.toStrict abs of
        (Left err'') -> pure (Left $ T.pack err'')
        (Right v) -> do
-          parsed <- catch (pubMedParser v) ((\e -> do
-                                                _ <- print e
-                                                pure []) :: XmlException -> IO [PubMed])
-          Right <$> pure parsed
+          let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
+          pure $ Right parsed
+--          parsed <- catch (pubMedParser v) ((\e -> do
+--                                                _ <- print e
+--                                                pure []) :: XmlException -> IO [PubMed])
+--          Right <$> pure parsed

 runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
 runSimpleFindPubmedAbstractRequest query limit =  do
@@ -112,4 +106,8 @@ searchDocIds query limit = do
  case res of
    (Left err) -> pure (Left $ T.pack $ show err)
    (Right (BsXml docs)) -> do
-      pure $ Right $ runParser parseDocId docs
+      pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
+      --pure $ Right $ runParser parseDocId docs
+
+--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
+--runParser parser = parser . fromDocument . parseLBS_ def
--- a/src/PUBMED/Client.hs
+++ b/src/PUBMED/Client.hs
 module PUBMED.Client where

+import Data.ByteString.Lazy (ByteString)
+import Data.Proxy (Proxy(..))
 import Servant.API
 import Servant.Client
-
-import Data.Proxy (Proxy(..))
-import Data.ByteString.Lazy (ByteString)
-
-import qualified Network.HTTP.Media as M
 import qualified Data.Text as T
+import qualified Network.HTTP.Media as M

 data DB = PUBMED


--- a/src/PUBMED/Parser.hs
+++ b/src/PUBMED/Parser.hs
-module PUBMED.Parser where
+{-# LANGUAGE LambdaCase #-}

-import Text.XML.Stream.Parse
-import qualified Text.XML.Cursor as C -- ((&/), (&//), Cursor, content, element)
-import Text.XML (Name)
+module PUBMED.Parser where

-import Data.Either (rights)
-import Data.Maybe (Maybe, fromJust, fromMaybe)
-import Data.Monoid (mconcat)
-import Data.Conduit (runConduit, (.|), ConduitT)
-import Data.Text (Text, unpack)
-import Data.XML.Types (Event)
-import Data.Time.Segment (jour)
+import Control.Lens ((^?), (^.), (^..), only, to, ix, prism', Prism')
+import Data.Maybe (catMaybes)
 import Data.Time (UTCTime(..))
+import Data.Time.Segment (jour)
+import Prelude hiding (head)
+import Text.Taggy (Node(..))
+import qualified Data.ByteString.Lazy as DBL
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TLIO
+import qualified Text.Taggy.Lens as TTL

-import GHC.IO (FilePath)
+namedEl name = TTL.elements . TTL.named (only name) 

-import Protolude (head)
-import Prelude hiding (head)
-import Control.Monad.Catch (MonadThrow)
-import Control.Monad (join)
+contentWithChildren :: Prism' Node T.Text
+contentWithChildren = prism' NodeContent $ \case { NodeContent c -> Just c
+                                                 ; NodeElement e -> Just $ e ^. TTL.children . traverse . contentWithChildren }

-import qualified Data.Text as T
-import qualified Data.Text.Read as T
-import qualified Data.ByteString.Lazy as DBL
-import qualified Data.Conduit.List as CL
-
-parseDocId :: C.Cursor -> [Integer]
-parseDocId cursor = fst <$>
-  rights (T.decimal
-          <$> filter notNullOrEOL (rawElement cursor)
-         )
-  where rawElement = C.element "eSearchResult" C.&/ C.element "IdList"  C.&// C.content
-        notNullOrEOL t = not (T.null t) && t /= "\n"
-
-identity :: a -> a
-identity x = x
-
-manyTagsUntil :: MonadThrow m =>
-     Name
-  -> ConduitT Event o m b
-  -> ConduitT Event o m (Maybe b)
-manyTagsUntil name f = do
-  _ <- manyTagsUntil_ name
-  tagIgnoreAttrs (matching (== name)) f
-
-- | Utility function that matches everything but the tag given
-tagUntil :: Name -> NameMatcher Name
-tagUntil name = matching (/= name)
-
-- | Utility function that consumes everything but the tag given
-- usefull because we have to consume every data.
-manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
-manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
-
-manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
-manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
+parseDocIds :: TL.Text -> [Integer]
+parseDocIds txt = map (\s -> read (T.unpack s) :: Integer) parsed
+  where
+    parsed = txt ^.. TTL.html . TTL.allNamed (only "eSearchResult") . namedEl "IdList" . namedEl "Id" . TTL.contents

 data PubMed =
     PubMed { pubmed_article :: PubMedArticle
@@ -64,17 +33,16 @@ data PubMed =
 data PubMedArticle =
     PubMedArticle { pubmed_title   :: Maybe T.Text
                   , pubmed_journal :: Maybe T.Text
-                   , pubmed_abstract :: Maybe [T.Text]
-                   , pubmed_authors :: Maybe [Author]
+                   , pubmed_abstract :: Maybe [T.Text]  -- TODO No Maybe?
+                   , pubmed_authors :: Maybe [Author]  -- TODO No Maybe?
                   }
                   deriving (Show)

 data Author =
-  Author {
-  lastName :: Maybe T.Text,
-  foreName :: Maybe T.Text,
-  affiliation :: Maybe T.Text
-  }
+  Author { lastName :: Maybe T.Text
+         , foreName :: Maybe T.Text
+         , affiliation :: Maybe T.Text
+         }
  deriving (Show)

 data PubMedDate =
@@ -84,76 +52,146 @@ data PubMedDate =
                , pubmedDate_day      :: Int
                } deriving (Show)

-readPubMedFile :: FilePath -> IO [PubMed]
-readPubMedFile fp = do
-  input <- DBL.readFile fp
-  pubMedParser input
-
-pubMedParser :: DBL.ByteString -> IO [PubMed]
-pubMedParser bstring = runConduit $ parseLBS def bstring
-                                 .| parseArticleSet
-                                 .| CL.consume
-
-parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
-parseArticleSet =
-  force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
-
-parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
-parsePubMedArticle =
-  manyTagsUntil "PubmedArticle" parsePubMedArticle'
-
-parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
-parsePubMedArticle' = do
-  article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
-  dates <- tagIgnoreAttrs "PubmedData" $ do
-    dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
-      y' <- force "Year"  $ tagIgnoreAttrs "Year"  content
-      m' <- force "Month" $ tagIgnoreAttrs "Month" content
-      d' <- force "Day"   $ tagIgnoreAttrs "Day"   content
-      _ <- many ignoreAnyTreeContent
-      return (read $ unpack y', read $ unpack m', read $ unpack d')
-    _ <- many ignoreAnyTreeContent
-    return dates'
-  _ <- many ignoreAnyTreeContent
-  let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates)
-  return $ PubMed article (PubMedDate (jour y m d) y m d)
-
-parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
-parseMedlineCitation = do
-  a <- force "article" $ manyTagsUntil "Article" parseArticle
-  _ <- many ignoreAnyTreeContent
-  return a
-
-parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
-parseArticle = do
-  journal  <- force "journal" $
-    manyTagsUntil "Journal" $ do
-    j <- manyTagsUntil "Title" content
-    _ <- many ignoreAnyTreeContent
-    return j
-
-  title <- manyTagsUntil "ArticleTitle" content
-
-  abstracts <- manyTagsUntil "Abstract" . many $ do
-      txt <- tagIgnoreAttrs "AbstractText" content
-      _ <- many ignoreAnyTreeContent
-      return txt
-      -- TODO add authos
-
-  authors <-  manyTagsUntil "AuthorList" . many $
-     tagIgnoreAttrs "Author" $ do
-        ln <- manyTagsUntil "LastName" content
-        fn <- manyTagsUntil "ForeName" content
-        affi <- manyTagsUntil "AffiliationInfo" $ do
-          aff <- manyTagsUntil "Affiliation" content
-          _ <- many ignoreAnyTreeContent
-          return aff
-        _ <- many ignoreAnyTreeContent
-        return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
-
-
-  _ <- many ignoreAnyTreeContent
-  return $ PubMedArticle title journal abstracts authors
+
+parsePubMedFile fp = do
+  input <- TLIO.readFile fp
+  pure $ parsePubMed input
+
+parsePubMed :: TL.Text -> [PubMed]
+parsePubMed txt = catMaybes $ txt ^.. pubmedArticle . to pubMed
+  where
+    pubmedArticle = TTL.html . TTL.allNamed (only "PubmedArticleSet") . TTL.allNamed (only "PubmedArticle")
+    pubMed el = PubMed <$> el ^? article . to pubMedArticle
+                       <*> el ^? pubDate . to pubMedDate
+      where
+        article = namedEl "MedlineCitation" . namedEl "Article"
+        pubDate = namedEl "PubmedData" . namedEl "History" . namedEl "PubMedPubDate" . TTL.attributed (ix "PubStatus" . only "pubmed")
+    pubMedDate el = PubMedDate { pubmedDate_date = jour y m d
+                               , pubmedDate_year = y
+                               , pubmedDate_month = m
+                               , pubmedDate_day = d }
+      where
+        y = read $ T.unpack $ el ^. namedEl "Year" . TTL.contents
+        m = read $ T.unpack $ el ^. namedEl "Month" . TTL.contents
+        d = read $ T.unpack $ el ^. namedEl "Day" . TTL.contents
+    pubMedArticle el = PubMedArticle { pubmed_title = Just $ el ^. title
+                                     , pubmed_journal = el ^? journalTitle
+                                     , pubmed_abstract = Just $ el ^.. abstract
+                                     , pubmed_authors = Just $ el ^.. authors . to author }
+      where
+        journalTitle = namedEl "Journal" . namedEl "Title" . TTL.contents
+        title = namedEl "ArticleTitle" . TTL.children . traverse . contentWithChildren
+        abstract = namedEl "Abstract" . namedEl "AbstractText" . TTL.contents
+        authors = namedEl "AuthorList" . TTL.allNamed (only "Author")
+    author el = Author { lastName = el ^? lastName
+                       , foreName = el ^? firstName
+                       , affiliation = el ^? affiliation }
+      where
+        firstName = namedEl "ForeName" . TTL.contents
+        lastName = namedEl "LastName" . TTL.contents
+        affiliation = namedEl "AffiliationInfo" . namedEl "Affiliation" . TTL.contents
+
+--identity :: a -> a
+--identity x = x
+--
+--manyTagsUntil :: MonadThrow m =>
+--     Name
+--  -> ConduitT Event o m b
+--  -> ConduitT Event o m (Maybe b)
+--manyTagsUntil name f = do
+--  _ <- manyTagsUntil_ name
+--  tagIgnoreAttrs (matching (== name)) f
+--
+---- | Utility function that matches everything but the tag given
+--tagUntil :: Name -> NameMatcher Name
+--tagUntil name = matching (/= name)
+--
+---- | Utility function that consumes everything but the tag given
+---- usefull because we have to consume every data.
+--manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
+--manyTagsUntil_ n = many_ ( ignoreTree (tagUntil n) ignoreAttrs)
+--
+--manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
+--manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
+
+--parseDocId :: C.Cursor -> [Integer]
+--parseDocId cursor = fst <$>
+--  rights (T.decimal
+--          <$> filter notNullOrEOL (rawElement cursor)
+--         )
+--  where rawElement = C.element "eSearchResult" C.&/ C.element "IdList"  C.&// C.content
+--        notNullOrEOL t = not (T.null t) && t /= "\n"
+
+--readPubMedFile :: FilePath -> IO [PubMed]
+--readPubMedFile fp = do
+--  input <- DBL.readFile fp
+--  pubMedParser input
+
+--pubMedParser :: DBL.ByteString -> IO [PubMed]
+--pubMedParser bstring = runConduit $ parseLBS def bstring
+--                                 .| parseArticleSet
+--                                 .| CL.consume
+--
+--parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
+--parseArticleSet =
+--  force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
+--
+--parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
+--parsePubMedArticle =
+--  manyTagsUntil "PubmedArticle" parsePubMedArticle'
+--
+--parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
+--parsePubMedArticle' = do
+--  article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
+--  dates <- tagIgnoreAttrs "PubmedData" $ do
+--    dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
+--      y' <- force "Year"  $ tagIgnoreAttrs "Year"  content
+--      m' <- force "Month" $ tagIgnoreAttrs "Month" content
+--      d' <- force "Day"   $ tagIgnoreAttrs "Day"   content
+--      _ <- many ignoreAnyTreeContent
+--      return (read $ unpack y', read $ unpack m', read $ unpack d')
+--    _ <- many ignoreAnyTreeContent
+--    return dates'
+--  _ <- many ignoreAnyTreeContent
+--  let (y,m,d) = maybe (1,1,1) (fromJust . head) (reverse <$> join dates)
+--  return $ PubMed article (PubMedDate (jour y m d) y m d)
+--
+--parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
+--parseMedlineCitation = do
+--  a <- force "article" $ manyTagsUntil "Article" parseArticle
+--  _ <- many ignoreAnyTreeContent
+--  return a
+--
+--parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
+--parseArticle = do
+--  journal  <- force "journal" $
+--    manyTagsUntil "Journal" $ do
+--    j <- manyTagsUntil "Title" content
+--    _ <- many ignoreAnyTreeContent
+--    return j
+--
+--  title <- manyTagsUntil "ArticleTitle" content
+--
+--  abstracts <- manyTagsUntil "Abstract" . many $ do
+--      txt <- tagIgnoreAttrs "AbstractText" content
+--      _ <- many ignoreAnyTreeContent
+--      return txt
+--      -- TODO add authos
+--
+--  authors <-  manyTagsUntil "AuthorList" . many $
+--     tagIgnoreAttrs "Author" $ do
+--        ln <- manyTagsUntil "LastName" content
+--        fn <- manyTagsUntil "ForeName" content
+--        affi <- manyTagsUntil "AffiliationInfo" $ do
+--          aff <- manyTagsUntil "Affiliation" content
+--          _ <- many ignoreAnyTreeContent
+--          return aff
+--        _ <- many ignoreAnyTreeContent
+--        return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
+--
+--
+--  _ <- many ignoreAnyTreeContent
+--  return $ PubMedArticle title journal abstracts authors


 pubMedData :: DBL.ByteString

--- a/stack.yaml
+++ b/stack.yaml
@@ -40,7 +40,8 @@ packages:
 extra-deps:
 - git: https://github.com/delanoe/data-time-segment.git
  commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
-
+- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
+- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009

 # Override default flag values for local packages and extra-deps
 # flags: {}