Commit fe4e032c authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Initial cleanup

parent 7f209ce7
module HAL where
{-# LANGUAGE BangPatterns #-}
import Conduit
import Control.Monad
import Data.Default (def)
import Data.Maybe (fromMaybe)
import Data.Either (fromRight)
......@@ -15,50 +18,92 @@ import HAL.Client
import HAL.Doc.Corpus
import HAL.Doc.Struct
import Prelude
import Servant.API
import Data.Aeson
batchSize :: Int
batchSize = 1000
getMetadataWith :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
getMetadataWithC :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Maybe Integer, ConduitT () Corpus IO ()))
getMetadataWithC q start rows = do
data HalCrawlerOptions
= HalCrawlerOptions
{ -- | If 'True', enable the debug logs to stdout.
_hco_debugLogs :: !Bool
, _hco_batchSize :: !Int
}
defaultHalOptions :: HalCrawlerOptions
defaultHalOptions = HalCrawlerOptions
{ _hco_debugLogs = False
, _hco_batchSize = 1000
}
getMetadataWithOptionsC :: HalCrawlerOptions
-- ^ The options for the crawler
-> Text
-- ^ The textual query
-> Maybe Int
-- ^ An optional offset
-> Maybe Int
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> IO (Either ClientError (Maybe Int, ConduitT () Corpus IO ()))
getMetadataWithOptionsC opts@HalCrawlerOptions{..} q mb_offset mb_limit = do
manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents
eCount <- countResults q
pure $ get' q start rows <$> eCount
pure $ get' <$> eCount
where
get' :: Text -> Maybe Int -> Maybe Integer -> Integer -> (Maybe Integer, ConduitT () Corpus IO ())
get' q start rows numFound =
get' :: Int -> (Maybe Int, ConduitT () Corpus IO ())
get' numFound =
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage q start'))
.| takeC numPages
.| concatMapMC (getPage q offset))
where
start' = fromMaybe 0 start
rows' = min numFound $ fromMaybe numFound rows
numResults = rows' - (fromIntegral start')
numPages = numResults `div` (fromIntegral batchSize) + 1
offset = fromMaybe 0 mb_offset
limit = min numFound $ fromMaybe numFound mb_limit
numResults = limit - offset
numPages = numResults `div` _hco_batchSize + 1
debugLog :: String -> IO ()
debugLog msg = when _hco_debugLogs $ putStrLn msg
getPage :: Text -> Int -> Int -> IO [Corpus]
getPage q start pageNum = do
let offset = start + pageNum * batchSize
putStrLn $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just $ fromIntegral batchSize)
let offset = start + pageNum * _hco_batchSize
debugLog $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just _hco_batchSize)
pure $ case eRes of
Left _ -> []
Right (Response { _docs }) -> _docs
printDoc :: Corpus -> IO Corpus
printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do
print $ show _corpus_title
debugLog $ show _corpus_title
pure c
countResults :: Text -> IO (Either ClientError Integer)
getMetadataWith :: Text
-- ^ The query, as a text.
-> Maybe Int
-- ^ The offset, it influences where the search starts.
-> Maybe Int
-- ^ The offset, it influences how many rows are returned.
-> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
getMetadataWithC :: Text
-- ^ The textual query
-> Maybe Int
-- ^ An optional offset for the search, it influences where to start.
-> Maybe Int
-- ^ An optional limit for the search, it influences how many rows are
-- returned.
-> IO (Either ClientError (Maybe Int, ConduitT () Corpus IO ()))
getMetadataWithC q start = getMetadataWithOptionsC defaultHalOptions q start
countResults :: Text -> IO (Either ClientError Int)
countResults q = do
manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents
......
......@@ -11,6 +11,7 @@ import Data.Text
import Data.Map
import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import qualified Data.ByteString.Lazy as BSL
import qualified Codec.Binary.UTF8.String as UTF
......@@ -30,7 +31,7 @@ type Search doc = "search"
-- permit to start at the x result
:> QueryParam "start" Int
-- use rows to make the request only return the x number of result
:> QueryParam "rows" Integer
:> QueryParam "rows" Int
:> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure"
......@@ -55,7 +56,7 @@ desc = Just . Desc
-- Response type
data Response doc = Response
{ _numFound :: Integer
{ _numFound :: Int
, _start :: Int
, _docs :: [doc]
} deriving (Show, Generic)
......@@ -66,6 +67,7 @@ instance FromJSON doc => FromJSON (Response doc) where
((o .: "response") >>= (.: "numFound"))
<*> ((o .: "response") >>= (.: "start"))
<*> ((o .: "response") >>= (.: "docs"))
parseJSON ty = typeMismatch "Hal Response" ty
halAPI :: Proxy (HALAPI doc)
halAPI = Proxy
......@@ -76,7 +78,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- start
-> Maybe Integer -- rows
-> Maybe Int -- rows
-> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) =>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment