Commit fe4e032c authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Initial cleanup

parent 7f209ce7
module HAL where module HAL where
{-# LANGUAGE BangPatterns #-}
import Conduit import Conduit
import Control.Monad
import Data.Default (def) import Data.Default (def)
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Either (fromRight) import Data.Either (fromRight)
...@@ -15,50 +18,92 @@ import HAL.Client ...@@ -15,50 +18,92 @@ import HAL.Client
import HAL.Doc.Corpus import HAL.Doc.Corpus
import HAL.Doc.Struct import HAL.Doc.Struct
import Prelude
import Servant.API import Servant.API
import Data.Aeson import Data.Aeson
batchSize :: Int data HalCrawlerOptions
batchSize = 1000 = HalCrawlerOptions
{ -- | If 'True', enable the debug logs to stdout.
getMetadataWith :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Response Corpus)) _hco_debugLogs :: !Bool
getMetadataWith q start rows = do , _hco_batchSize :: !Int
manager' <- newManager tlsManagerSettings }
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
defaultHalOptions :: HalCrawlerOptions
getMetadataWithC :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Maybe Integer, ConduitT () Corpus IO ())) defaultHalOptions = HalCrawlerOptions
getMetadataWithC q start rows = do { _hco_debugLogs = False
, _hco_batchSize = 1000
}
getMetadataWithOptionsC :: HalCrawlerOptions
-- ^ The options for the crawler
-> Text
-- ^ The textual query
-> Maybe Int
-- ^ An optional offset
-> Maybe Int
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> IO (Either ClientError (Maybe Int, ConduitT () Corpus IO ()))
getMetadataWithOptionsC opts@HalCrawlerOptions{..} q mb_offset mb_limit = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents -- First, estimate the total number of documents
eCount <- countResults q eCount <- countResults q
pure $ get' q start rows <$> eCount pure $ get' <$> eCount
where where
get' :: Text -> Maybe Int -> Maybe Integer -> Integer -> (Maybe Integer, ConduitT () Corpus IO ()) get' :: Int -> (Maybe Int, ConduitT () Corpus IO ())
get' q start rows numFound = get' numFound =
( Just numResults ( Just numResults
, yieldMany [0..] , yieldMany [0..]
.| takeC (fromInteger numPages) .| takeC numPages
.| concatMapMC (getPage q start')) .| concatMapMC (getPage q offset))
where where
start' = fromMaybe 0 start offset = fromMaybe 0 mb_offset
rows' = min numFound $ fromMaybe numFound rows limit = min numFound $ fromMaybe numFound mb_limit
numResults = rows' - (fromIntegral start') numResults = limit - offset
numPages = numResults `div` (fromIntegral batchSize) + 1 numPages = numResults `div` _hco_batchSize + 1
debugLog :: String -> IO ()
debugLog msg = when _hco_debugLogs $ putStrLn msg
getPage :: Text -> Int -> Int -> IO [Corpus] getPage :: Text -> Int -> Int -> IO [Corpus]
getPage q start pageNum = do getPage q start pageNum = do
let offset = start + pageNum * batchSize let offset = start + pageNum * _hco_batchSize
putStrLn $ "[getMetadataWithC] getPage: " <> show offset debugLog $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just $ fromIntegral batchSize) eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just _hco_batchSize)
pure $ case eRes of pure $ case eRes of
Left _ -> [] Left _ -> []
Right (Response { _docs }) -> _docs Right (Response { _docs }) -> _docs
printDoc :: Corpus -> IO Corpus printDoc :: Corpus -> IO Corpus
printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do
print $ show _corpus_title debugLog $ show _corpus_title
pure c pure c
countResults :: Text -> IO (Either ClientError Integer)
getMetadataWith :: Text
-- ^ The query, as a text.
-> Maybe Int
-- ^ The offset, it influences where the search starts.
-> Maybe Int
-- ^ The offset, it influences how many rows are returned.
-> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
getMetadataWithC :: Text
-- ^ The textual query
-> Maybe Int
-- ^ An optional offset for the search, it influences where to start.
-> Maybe Int
-- ^ An optional limit for the search, it influences how many rows are
-- returned.
-> IO (Either ClientError (Maybe Int, ConduitT () Corpus IO ()))
getMetadataWithC q start = getMetadataWithOptionsC defaultHalOptions q start
countResults :: Text -> IO (Either ClientError Int)
countResults q = do countResults q = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents -- First, estimate the total number of documents
......
...@@ -11,6 +11,7 @@ import Data.Text ...@@ -11,6 +11,7 @@ import Data.Text
import Data.Map import Data.Map
import Data.Aeson import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import qualified Data.ByteString.Lazy as BSL import qualified Data.ByteString.Lazy as BSL
import qualified Codec.Binary.UTF8.String as UTF import qualified Codec.Binary.UTF8.String as UTF
...@@ -30,7 +31,7 @@ type Search doc = "search" ...@@ -30,7 +31,7 @@ type Search doc = "search"
-- permit to start at the x result -- permit to start at the x result
:> QueryParam "start" Int :> QueryParam "start" Int
-- use rows to make the request only return the x number of result -- use rows to make the request only return the x number of result
:> QueryParam "rows" Integer :> QueryParam "rows" Int
:> Get '[JSON] (Response doc) :> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure" type Structure doc = "ref" :> "structure"
...@@ -55,7 +56,7 @@ desc = Just . Desc ...@@ -55,7 +56,7 @@ desc = Just . Desc
-- Response type -- Response type
data Response doc = Response data Response doc = Response
{ _numFound :: Integer { _numFound :: Int
, _start :: Int , _start :: Int
, _docs :: [doc] , _docs :: [doc]
} deriving (Show, Generic) } deriving (Show, Generic)
...@@ -66,6 +67,7 @@ instance FromJSON doc => FromJSON (Response doc) where ...@@ -66,6 +67,7 @@ instance FromJSON doc => FromJSON (Response doc) where
((o .: "response") >>= (.: "numFound")) ((o .: "response") >>= (.: "numFound"))
<*> ((o .: "response") >>= (.: "start")) <*> ((o .: "response") >>= (.: "start"))
<*> ((o .: "response") >>= (.: "docs")) <*> ((o .: "response") >>= (.: "docs"))
parseJSON ty = typeMismatch "Hal Response" ty
halAPI :: Proxy (HALAPI doc) halAPI :: Proxy (HALAPI doc)
halAPI = Proxy halAPI = Proxy
...@@ -76,7 +78,7 @@ search :: (FromJSON doc, ToHttpApiData doc) => ...@@ -76,7 +78,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq -> [Text] -- fq
-> Maybe SortField -- sort -> Maybe SortField -- sort
-> Maybe Int -- start -> Maybe Int -- start
-> Maybe Integer -- rows -> Maybe Int -- rows
-> ClientM (Response doc) -> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) => structure :: (FromJSON doc, ToHttpApiData doc) =>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment