Commit 1b5abcee authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[taggy] use taggy to parse XML

This fixes a bug where article title had bold text inside (which threw
an error).
parent 233fb199
......@@ -38,6 +38,7 @@ library
MultiParamTypeClasses
OverloadedStrings
TypeOperators
ghc-options: -Wunused-imports
build-depends:
attoparsec
, base >=4.7 && <5
......@@ -49,13 +50,14 @@ library
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
......@@ -67,7 +69,7 @@ executable crawlerPubMed-exe
MultiParamTypeClasses
OverloadedStrings
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -Wunused-imports -with-rtsopts=-N
build-depends:
attoparsec
, base >=4.7 && <5
......@@ -80,13 +82,14 @@ executable crawlerPubMed-exe
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
......@@ -114,11 +117,12 @@ test-suite crawlerPubMed-test
, http-client
, http-client-tls
, http-media
, lens
, protolude
, servant
, servant-client
, taggy
, taggy-lens
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
......@@ -26,26 +26,29 @@ default-extensions:
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies:
- attoparsec
- base >= 4.7 && < 5
- servant
- servant-client
- text
- bytestring
- xml-conduit
- conduit
- data-time-segment
- either
- exceptions
- http-client
- http-client-tls
- http-media
- exceptions
- conduit
- xml-types
- time
- data-time-segment
- lens
- protolude
- attoparsec
- either
- servant
- servant-client
- taggy
- taggy-lens
- text
- time
library:
source-dirs: src
ghc-options:
- -Wunused-imports
executables:
crawlerPubMed-exe:
......@@ -54,6 +57,7 @@ executables:
ghc-options:
- -threaded
- -rtsopts
- -Wunused-imports
- -with-rtsopts=-N
dependencies:
- crawlerPubMed
......
module PUBMED where
import Prelude hiding (takeWhile)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import Data.ByteString.Char8 (pack)
import Data.ByteString.Lazy (ByteString)
import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import PUBMED.Client
import PUBMED.Parser
import Prelude hiding (takeWhile)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
pmHost :: String
pmHost = "eutils.ncbi.nlm.nih.gov"
......@@ -39,9 +33,6 @@ defaultEnv = do
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith = runSimpleFindPubmedAbstractRequest
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | TODO this parser need at least one subs at the end
-- (use endOfInput)
removeSub :: Parser ByteString
......@@ -87,14 +78,17 @@ runSimpleFetchPubmedAbstractRequest ids = do
env
case res of
(Left err) -> pure (Left . T.pack $ show err)
(Right (BsXml abs)) ->
(Right (BsXml abs)) -> do
--putStrLn $ show abs
case parseOnly removeSub $ LBS.toStrict abs of
(Left err'') -> pure (Left $ T.pack err'')
(Right v) -> do
parsed <- catch (pubMedParser v) ((\e -> do
_ <- print e
pure []) :: XmlException -> IO [PubMed])
Right <$> pure parsed
let parsed = parsePubMed $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict v
pure $ Right parsed
-- parsed <- catch (pubMedParser v) ((\e -> do
-- _ <- print e
-- pure []) :: XmlException -> IO [PubMed])
-- Right <$> pure parsed
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do
......@@ -112,4 +106,8 @@ searchDocIds query limit = do
case res of
(Left err) -> pure (Left $ T.pack $ show err)
(Right (BsXml docs)) -> do
pure $ Right $ runParser parseDocId docs
pure $ Right $ parseDocIds $ TL.fromStrict $ TE.decodeUtf8 $ LBS.toStrict docs
--pure $ Right $ runParser parseDocId docs
--runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
--runParser parser = parser . fromDocument . parseLBS_ def
module PUBMED.Client where
import Data.ByteString.Lazy (ByteString)
import Data.Proxy (Proxy(..))
import Servant.API
import Servant.Client
import Data.Proxy (Proxy(..))
import Data.ByteString.Lazy (ByteString)
import qualified Network.HTTP.Media as M
import qualified Data.Text as T
import qualified Network.HTTP.Media as M
data DB = PUBMED
......
This diff is collapsed.
......@@ -40,7 +40,8 @@ packages:
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
# Override default flag values for local packages and extra-deps
# flags: {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment