Commit a7a556b6 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

conduit get works now, reporting total docs coutn

parent 6f05441f
module Main where
import Arxiv.Wrapper
import Conduit
main :: IO ()
main = undefined
main = do
(cnt, resC) <- searchAxv' $ simpleQuery ["banach space"]
putStrLn $ "Total count: " <> show cnt
runConduitRes $ resC .| outSnk
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.34.4.
--
-- see: https://github.com/sol/hpack
name: crawlerArxiv
version: 0.1.0.0
description: Please see the README on GitHub at <https://github.com/githubuser/arxiv#readme>
homepage: https://github.com/delanoe/arxiv-api#readme
bug-reports: https://github.com/delanoe/arxiv-api/issues
author: Alexandre Delanoë
maintainer: alexandre+dev@delanoe.org
copyright: 2021 CNRS/A. Delanoë
license: GPLv3
license-file: LICENSE
build-type: Simple
extra-source-files:
README.md
ChangeLog.md
source-repository head
type: git
location: https://github.com/delanoe/arxiv-api
library
exposed-modules:
Arxiv.Wrapper
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
src
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
executable arxiv-exe
main-is: Main.hs
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, crawlerArxiv
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
test-suite arxiv-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
......@@ -14,6 +14,7 @@ import Network.HTTP.Conduit (parseRequest)
import Network.HTTP.Simple as HT
import Network.HTTP.Types.Status
import Text.HTML.TagSoup
import qualified Conduit as C
import qualified Data.ByteString as B hiding (unpack)
import qualified Data.ByteString.Char8 as B (unpack)
import qualified Data.Conduit as C
......@@ -71,37 +72,59 @@ execQuery (Just l) q = C.runConduitRes (searchAxv q .| CL.take l)
----------------------------------------------------------------------
-- Execute query and start a source
----------------------------------------------------------------------
searchAxv :: MonadResource m => Ax.Query -> (Int, C.ConduitT () Result m ())
searchAxvBody :: Ax.Query -> IO B.ByteString
searchAxvBody q =
let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of
(Status 200 _) -> pure $ getResponseBody rsp
st -> error $ "Error:" ++ show st
searchAxv' :: MonadResource m => Ax.Query -> IO (Int, C.ConduitT () Result m ())
searchAxv' q =
let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of
(Status 200 _) -> case getSoup (getResponseBody rsp) of
[] -> pure (0, C.yieldMany [])
soups@(s:_) -> pure (Ax.totalResults soups, results q soups)
st -> error $ "Error:" ++ show st
searchAxv :: MonadResource m => Ax.Query -> C.ConduitT () Result m ()
searchAxv q =
let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of
(Status 200 _) -> getSoup (getResponseBody rsp) >>= results q
(Status 200 _) -> getSoupC (getResponseBody rsp) >>= results q
st -> error $ "Error:" ++ show st
----------------------------------------------------------------------
-- Consume page by page
----------------------------------------------------------------------
getSoup :: MonadResource m =>
B.ByteString -> C.ConduitT () Result m [Soup]
getSoup b = concat <$> (C.yield b .| toSoup .| CL.consume)
getSoup :: B.ByteString -> [Soup]
getSoup b = parseTags $ B.unpack b
--getSoupC :: MonadResource m =>
-- B.ByteString -> C.ConduitT () a m [Soup]
getSoupC :: Monad m => B.ByteString -> C.ConduitT () a m [Soup]
getSoupC b = concat <$> (C.yield b .| toSoup .| CL.consume)
----------------------------------------------------------------------
-- Receive a ByteString and yield Soup
----------------------------------------------------------------------
toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m ()
--toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m ()
toSoup :: Monad m => C.ConduitT B.ByteString [Soup] m ()
toSoup = C.awaitForever (C.yield . parseTags . B.unpack)
----------------------------------------------------------------------
-- Yield all entries and fetch next page
----------------------------------------------------------------------
results :: MonadResource m =>
Ax.Query -> [Soup] -> (Int, C.ConduitT () Result m ())
results q sp =
( Ax.totalResults sp
, Ax.forEachEntryM sp (C.yield . mkResult)
>> (snd $ searchAxv (Ax.nextPage q)))
Ax.Query -> [Soup] -> C.ConduitT () Result m ()
results q sp = Ax.forEachEntryM sp (C.yield . mkResult)
>> searchAxv (Ax.nextPage q)
----------------------------------------------------------------------
-- Get data and format
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment