Commit 1b5abcee authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[taggy] use taggy to parse XML

This fixes a bug where article title had bold text inside (which threw
an error).
parent 233fb199
...@@ -38,6 +38,7 @@ library ...@@ -38,6 +38,7 @@ library
MultiParamTypeClasses MultiParamTypeClasses
OverloadedStrings OverloadedStrings
TypeOperators TypeOperators
ghc-options: -Wunused-imports
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
...@@ -49,13 +50,14 @@ library ...@@ -49,13 +50,14 @@ library
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
executable crawlerPubMed-exe executable crawlerPubMed-exe
...@@ -67,7 +69,7 @@ executable crawlerPubMed-exe ...@@ -67,7 +69,7 @@ executable crawlerPubMed-exe
MultiParamTypeClasses MultiParamTypeClasses
OverloadedStrings OverloadedStrings
TypeOperators TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -Wunused-imports -with-rtsopts=-N
build-depends: build-depends:
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
...@@ -80,13 +82,14 @@ executable crawlerPubMed-exe ...@@ -80,13 +82,14 @@ executable crawlerPubMed-exe
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
test-suite crawlerPubMed-test test-suite crawlerPubMed-test
...@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test ...@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test
, http-client , http-client
, http-client-tls , http-client-tls
, http-media , http-media
, lens
, protolude , protolude
, servant , servant
, servant-client , servant-client
, taggy
, taggy-lens
, text , text
, time , time
, xml-conduit
, xml-types
default-language: Haskell2010 default-language: Haskell2010
...@@ -26,26 +26,29 @@ default-extensions: ...@@ -26,26 +26,29 @@ default-extensions:
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md> description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies: dependencies:
- attoparsec
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- servant
- servant-client
- text
- bytestring - bytestring
- xml-conduit - conduit
- data-time-segment
- either
- exceptions
- http-client - http-client
- http-client-tls - http-client-tls
- http-media - http-media
- exceptions - lens
- conduit
- xml-types
- time
- data-time-segment
- protolude - protolude
- attoparsec - servant
- either - servant-client
- taggy
- taggy-lens
- text
- time
library: library:
source-dirs: src source-dirs: src
ghc-options:
- -Wunused-imports
executables: executables:
crawlerPubMed-exe: crawlerPubMed-exe:
...@@ -54,6 +57,7 @@ executables: ...@@ -54,6 +57,7 @@ executables:
ghc-options: ghc-options:
- -threaded - -threaded
- -rtsopts - -rtsopts
- -Wunused-imports
- -with-rtsopts=-N - -with-rtsopts=-N
dependencies: dependencies:
- crawlerPubMed - crawlerPubMed
......
module PUBMED where module PUBMED where
import Prelude hiding (takeWhile) import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
import Data.ByteString.Lazy (ByteString)
import Data.Text (Text) import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import PUBMED.Client
import PUBMED.Parser
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..)) import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
pmHost :: String pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov" pmHost = "eutils.ncbi.nlm.nih.gov"
...@@ -39,9 +33,6 @@ defaultEnv = do ...@@ -39,9 +33,6 @@ defaultEnv = do
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed]) getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest getMetadataWith = runSimpleFindPubmedAbstractRequest
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end -- | TODO this parser need at least one subs at the end
-- (use endOfInput) -- (use endOfInput)
removeSub :: Parser ByteString removeSub :: Parser ByteString
...@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do ...@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do
env env
case res of case res of
(Left err) -> pure (Left . T.pack $ show err) (Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) -> (Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'') (Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do (Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> do let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
_ <- print e pure $ Right parsed
pure []) :: XmlException -> IO [PubMed]) -- parsed <- catch (pubMedParser v) ((\e -> do
Right <$> pure parsed -- _ <- print e
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed]) runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do runSimpleFindPubmedAbstractRequest query limit = do
...@@ -112,4 +106,8 @@ searchDocIds query limit = do ...@@ -112,4 +106,8 @@ searchDocIds query limit = do
case res of case res of
(Left err) -> pure (Left $ T.pack $ show err) (Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do (Right (BsXml docs)) -> do
pure $ Right $ runParser parseDocId docs pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
module PUBMED.Client where module PUBMED.Client where
import Data.ByteString.Lazy (ByteString)
import Data.Proxy (Proxy(..))
import Servant.API import Servant.API
import Servant.Client import Servant.Client
import Data.Proxy (Proxy(..))
import Data.ByteString.Lazy (ByteString)
import qualified Network.HTTP.Media as M
import qualified Data.Text as T import qualified Data.Text as T
import qualified Network.HTTP.Media as M
data DB = PUBMED data DB = PUBMED
......
This diff is collapsed.
...@@ -40,7 +40,8 @@ packages: ...@@ -40,7 +40,8 @@ packages:
extra-deps: extra-deps:
- git: https://github.com/delanoe/data-time-segment.git - git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723 commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
# Override default flag values for local packages and extra-deps # Override default flag values for local packages and extra-deps
# flags: {} # flags: {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment