Commit 06eedee4 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Experiment with request logging

parent fe4e032c
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0. -- This file has been generated from package.yaml by hpack version 0.35.2.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
-- --
-- hash: d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8 -- hash: 8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL name: crawlerHAL
version: 0.1.0.0 version: 0.1.0.0
...@@ -53,6 +53,7 @@ library ...@@ -53,6 +53,7 @@ library
, http-client , http-client
, http-client-tls , http-client-tls
, lens , lens
, mtl
, neat-interpolation , neat-interpolation
, optparse-applicative , optparse-applicative
, scientific , scientific
...@@ -89,6 +90,7 @@ executable crawlerHAL-exe ...@@ -89,6 +90,7 @@ executable crawlerHAL-exe
, http-client , http-client
, http-client-tls , http-client-tls
, lens , lens
, mtl
, neat-interpolation , neat-interpolation
, optparse-applicative , optparse-applicative
, scientific , scientific
...@@ -126,6 +128,7 @@ test-suite halCrawler-test ...@@ -126,6 +128,7 @@ test-suite halCrawler-test
, http-client , http-client
, http-client-tls , http-client-tls
, lens , lens
, mtl
, neat-interpolation , neat-interpolation
, optparse-applicative , optparse-applicative
, scientific , scientific
......
...@@ -30,6 +30,7 @@ dependencies: ...@@ -30,6 +30,7 @@ dependencies:
- http-client-tls - http-client-tls
- lens - lens
- neat-interpolation - neat-interpolation
- mtl
- optparse-applicative - optparse-applicative
- scientific - scientific
- servant - servant
......
...@@ -4,24 +4,21 @@ module HAL where ...@@ -4,24 +4,21 @@ module HAL where
import Conduit import Conduit
import Control.Monad import Control.Monad
import Control.Monad.Reader
import Data.Aeson
import Data.Default (def) import Data.Default (def)
import Data.Maybe (fromMaybe)
import Data.Either (fromRight) import Data.Either (fromRight)
import Data.Maybe (fromMaybe)
import Data.Text import Data.Text
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv)
import HAL.Client import HAL.Client
import HAL.Doc.Corpus import HAL.Doc.Corpus
import HAL.Doc.Struct import HAL.Doc.Struct
import Network.HTTP.Client (newManager, Request)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Prelude import Prelude
import Servant.API import Servant.API
import Data.Aeson import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv, ClientEnv (makeClientRequest))
import System.IO.Unsafe (unsafePerformIO)
data HalCrawlerOptions data HalCrawlerOptions
= HalCrawlerOptions = HalCrawlerOptions
...@@ -64,23 +61,22 @@ getMetadataWithOptionsC opts@HalCrawlerOptions{..} q mb_offset mb_limit = do ...@@ -64,23 +61,22 @@ getMetadataWithOptionsC opts@HalCrawlerOptions{..} q mb_offset mb_limit = do
numResults = limit - offset numResults = limit - offset
numPages = numResults `div` _hco_batchSize + 1 numPages = numResults `div` _hco_batchSize + 1
debugLog :: String -> IO ()
debugLog msg = when _hco_debugLogs $ putStrLn msg
getPage :: Text -> Int -> Int -> IO [Corpus] getPage :: Text -> Int -> Int -> IO [Corpus]
getPage q start pageNum = do getPage q start pageNum = do
let offset = start + pageNum * _hco_batchSize let offset = start + pageNum * _hco_batchSize
debugLog $ "[getMetadataWithC] getPage: " <> show offset debugLog opts $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just _hco_batchSize) eRes <- runHalAPIClient opts $ search (Just requestedFields) [q] Nothing (Just offset) (Just _hco_batchSize)
pure $ case eRes of pure $ case eRes of
Left _ -> [] Left _ -> []
Right (Response { _docs }) -> _docs Right (Response { _docs }) -> _docs
printDoc :: Corpus -> IO Corpus printDoc :: Corpus -> IO Corpus
printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do
debugLog $ show _corpus_title debugLog opts $ show _corpus_title
pure c pure c
debugLog :: HalCrawlerOptions -> String -> IO ()
debugLog HalCrawlerOptions{..} msg = when _hco_debugLogs $ putStrLn msg
getMetadataWith :: Text getMetadataWith :: Text
-- ^ The query, as a text. -- ^ The query, as a text.
...@@ -91,7 +87,7 @@ getMetadataWith :: Text ...@@ -91,7 +87,7 @@ getMetadataWith :: Text
-> IO (Either ClientError (Response Corpus)) -> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows runHalAPIClient defaultHalOptions $ search (Just requestedFields) [q] Nothing start rows
getMetadataWithC :: Text getMetadataWithC :: Text
-- ^ The textual query -- ^ The textual query
...@@ -107,7 +103,7 @@ countResults :: Text -> IO (Either ClientError Int) ...@@ -107,7 +103,7 @@ countResults :: Text -> IO (Either ClientError Int)
countResults q = do countResults q = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus)) eRes <- runHalAPIClient defaultHalOptions $ search (Just requestedFields) [q] Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
pure $ _numFound <$> eRes pure $ _numFound <$> eRes
requestedFields :: Text requestedFields :: Text
...@@ -116,19 +112,31 @@ requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullNam ...@@ -116,19 +112,31 @@ requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullNam
structFields :: Text structFields :: Text
structFields = "docid,label_s,parentDocid_i" structFields = "docid,label_s,parentDocid_i"
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc) => requestLog :: HalCrawlerOptions -> Request -> Request
ClientM (Response doc) -> IO (Either ClientError (Response doc)) requestLog opts rq = unsafePerformIO $ do
runHalAPIClient cmd = do debugLog opts $ "[HAL.makeClientRequestLog] " <> show rq
pure rq
{-# NOINLINE requestLog #-}
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc)
=> HalCrawlerOptions
-> ClientM (Response doc)
-> IO (Either ClientError (Response doc))
runHalAPIClient opts cmd = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
runClientM cmd (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "") runClientM cmd' (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
where
cmd' = local (\r -> r {
makeClientRequest = \bUrl servantRq -> requestLog opts ((makeClientRequest r) bUrl servantRq)
}) cmd
runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct)) runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
runStructureRequest rq = runStructureRequest rq =
runHalAPIClient $ structure (Just structFields) rq (Just 10000) runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000)
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus)) runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest rq = runSearchRequest rq =
runHalAPIClient $ search (Just requestedFields) rq Nothing Nothing Nothing runHalAPIClient defaultHalOptions $ search (Just requestedFields) rq Nothing Nothing Nothing
generateRequestByStructID :: Text -> [Text] -> Text generateRequestByStructID :: Text -> [Text] -> Text
generateRequestByStructID rq struct_ids = generateRequestByStructID rq struct_ids =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment