Commit a7a556b6 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

conduit get works now, reporting total docs coutn

parent 6f05441f
module Main where module Main where
import Arxiv.Wrapper import Arxiv.Wrapper
import Conduit
main :: IO () main :: IO ()
main = undefined main = do
(cnt, resC) <- searchAxv' $ simpleQuery ["banach space"]
putStrLn $ "Total count: " <> show cnt
runConduitRes $ resC .| outSnk
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.34.4.
--
-- see: https://github.com/sol/hpack
name: crawlerArxiv
version: 0.1.0.0
description: Please see the README on GitHub at <https://github.com/githubuser/arxiv#readme>
homepage: https://github.com/delanoe/arxiv-api#readme
bug-reports: https://github.com/delanoe/arxiv-api/issues
author: Alexandre Delanoë
maintainer: alexandre+dev@delanoe.org
copyright: 2021 CNRS/A. Delanoë
license: GPLv3
license-file: LICENSE
build-type: Simple
extra-source-files:
README.md
ChangeLog.md
source-repository head
type: git
location: https://github.com/delanoe/arxiv-api
library
exposed-modules:
Arxiv.Wrapper
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
src
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
executable arxiv-exe
main-is: Main.hs
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, crawlerArxiv
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
test-suite arxiv-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerArxiv
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
arxiv
, base >=4.7 && <5
, bytestring
, conduit
, http-conduit
, http-types
, mtl
, network
, resourcet
, tagsoup
, text
default-language: Haskell2010
...@@ -14,6 +14,7 @@ import Network.HTTP.Conduit (parseRequest) ...@@ -14,6 +14,7 @@ import Network.HTTP.Conduit (parseRequest)
import Network.HTTP.Simple as HT import Network.HTTP.Simple as HT
import Network.HTTP.Types.Status import Network.HTTP.Types.Status
import Text.HTML.TagSoup import Text.HTML.TagSoup
import qualified Conduit as C
import qualified Data.ByteString as B hiding (unpack) import qualified Data.ByteString as B hiding (unpack)
import qualified Data.ByteString.Char8 as B (unpack) import qualified Data.ByteString.Char8 as B (unpack)
import qualified Data.Conduit as C import qualified Data.Conduit as C
...@@ -71,37 +72,59 @@ execQuery (Just l) q = C.runConduitRes (searchAxv q .| CL.take l) ...@@ -71,37 +72,59 @@ execQuery (Just l) q = C.runConduitRes (searchAxv q .| CL.take l)
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- Execute query and start a source -- Execute query and start a source
---------------------------------------------------------------------- ----------------------------------------------------------------------
searchAxv :: MonadResource m => Ax.Query -> (Int, C.ConduitT () Result m ()) searchAxvBody :: Ax.Query -> IO B.ByteString
searchAxvBody q =
let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of
(Status 200 _) -> pure $ getResponseBody rsp
st -> error $ "Error:" ++ show st
searchAxv' :: MonadResource m => Ax.Query -> IO (Int, C.ConduitT () Result m ())
searchAxv' q =
let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of
(Status 200 _) -> case getSoup (getResponseBody rsp) of
[] -> pure (0, C.yieldMany [])
soups@(s:_) -> pure (Ax.totalResults soups, results q soups)
st -> error $ "Error:" ++ show st
searchAxv :: MonadResource m => Ax.Query -> C.ConduitT () Result m ()
searchAxv q = searchAxv q =
let s = Ax.mkQuery q let s = Ax.mkQuery q
in do rsp <- HT.httpBS =<< liftIO (parseRequest s) in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
case getResponseStatus rsp of case getResponseStatus rsp of
(Status 200 _) -> getSoup (getResponseBody rsp) >>= results q (Status 200 _) -> getSoupC (getResponseBody rsp) >>= results q
st -> error $ "Error:" ++ show st st -> error $ "Error:" ++ show st
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- Consume page by page -- Consume page by page
---------------------------------------------------------------------- ----------------------------------------------------------------------
getSoup :: MonadResource m => getSoup :: B.ByteString -> [Soup]
B.ByteString -> C.ConduitT () Result m [Soup] getSoup b = parseTags $ B.unpack b
getSoup b = concat <$> (C.yield b .| toSoup .| CL.consume)
--getSoupC :: MonadResource m =>
-- B.ByteString -> C.ConduitT () a m [Soup]
getSoupC :: Monad m => B.ByteString -> C.ConduitT () a m [Soup]
getSoupC b = concat <$> (C.yield b .| toSoup .| CL.consume)
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- Receive a ByteString and yield Soup -- Receive a ByteString and yield Soup
---------------------------------------------------------------------- ----------------------------------------------------------------------
toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m () --toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m ()
toSoup :: Monad m => C.ConduitT B.ByteString [Soup] m ()
toSoup = C.awaitForever (C.yield . parseTags . B.unpack) toSoup = C.awaitForever (C.yield . parseTags . B.unpack)
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- Yield all entries and fetch next page -- Yield all entries and fetch next page
---------------------------------------------------------------------- ----------------------------------------------------------------------
results :: MonadResource m => results :: MonadResource m =>
Ax.Query -> [Soup] -> (Int, C.ConduitT () Result m ()) Ax.Query -> [Soup] -> C.ConduitT () Result m ()
results q sp = results q sp = Ax.forEachEntryM sp (C.yield . mkResult)
( Ax.totalResults sp >> searchAxv (Ax.nextPage q)
, Ax.forEachEntryM sp (C.yield . mkResult)
>> (snd $ searchAxv (Ax.nextPage q)))
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- Get data and format -- Get data and format
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment