Fetch using cursors

The HAL API strongly recommends to fetch paginated results using
cursors.

https://api.archives-ouvertes.fr/docs/search#cursors

A new function 'getMetadataWithCursorC' is thus added to facilitate
this.
parent bfa9069b
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Main where
import Conduit
import Conduit ( sinkList, mapM_C, (.|), runConduit )
import Data.LanguageCodes (ISO639_1(..))
import Data.Text qualified as T
import HAL
import HAL (getMetadataWith)
import HAL.Client
import HAL (getMetadataWith, getMetadataWithC, getMetadataWithCursorC)
import HAL.Doc
import HAL.Doc.Corpus (Corpus(..))
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Options.Applicative
import Protolude
import Servant.Client
import Tree
data CountParams = CountParams
{ cp_query :: [ T.Text ] }
data FetchParams = FetchParams
{ fp_query :: [ T.Text ] }
{ fp_query :: [ T.Text ]
, fp_limit :: Integer }
data Command =
Count CountParams
......@@ -37,7 +33,8 @@ countParams = Count <$>
fetchParams :: Parser Command
fetchParams = Fetch <$>
(FetchParams
<$> many (strArgument (metavar "query")))
<$> many (strArgument (metavar "query"))
<*> option auto (long "limit"))
params :: Parser Command
params = subparser
......@@ -63,8 +60,8 @@ run (Count (CountParams { cp_query })) = do
case res of
Left err -> putText $ show err
Right (cnt, _docsC) -> putText $ show cnt
run (Fetch (FetchParams { fp_query })) = do
res <- getMetadataWithC fp_query (Just 0) Nothing Nothing
run (Fetch (FetchParams { fp_query, fp_limit })) = do
res <- getMetadataWithCursorC fp_query (Just fp_limit) Nothing
case res of
Left err -> putText $ show err
Right (_cnt, docsC) -> do
......
-- Generated by stack2cabal
with-compiler: ghc-8.10.7
packages:
./
tests: True
-- allow-older: *
allow-newer: base:*
with-compiler: ghc-9.4.7
packages: .
tests: True
-- Generated by stack2cabal
with-compiler: ghc-8.10.7
packages:
./
with-compiler: ghc-9.4.7
packages: .
tests: True
-- allow-older: *
allow-newer: base:*
......@@ -32,6 +32,7 @@ library
HAL.Doc.Corpus
HAL.Doc.EntityTree
HAL.Doc.Struct
HAL.Types
HAL.Utils
Tree
other-modules:
......
{-# LANGUAGE BangPatterns #-}
module HAL where
import Conduit
import Control.Monad
import Control.Monad.Reader
import Data.Aeson
import Conduit ( yieldMany, concatMapMC, takeC, (.|), ConduitT )
import Control.Monad.Fail (fail)
import Data.Aeson ( FromJSON )
import Data.LanguageCodes (ISO639_1(..))
import Data.Text
import HAL.Client
import HAL.Doc.Corpus
import HAL.Doc.Struct
import Data.Text qualified as T
import HAL.Client ( SortField(Asc), search, structure, searchCursor )
import HAL.Doc.Corpus ( Corpus )
import HAL.Doc.Struct ( Struct )
import HAL.Types (Response(..))
import HAL.Utils (langAbstractS, toText)
import Network.HTTP.Client (newManager, Request)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
import Servant.API
import Servant.API ( ToHttpApiData )
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv, ClientEnv (makeClientRequest))
import System.IO.Unsafe (unsafePerformIO)
data HalCrawlerOptions
= HalCrawlerOptions
{ -- | If 'True', enable the debug logs to stdout.
......@@ -29,7 +28,7 @@ data HalCrawlerOptions
defaultHalOptions :: HalCrawlerOptions
defaultHalOptions = HalCrawlerOptions
{ _hco_debugLogs = False
, _hco_batchSize = 1000
, _hco_batchSize = 300
}
type Query = Text
......@@ -52,8 +51,12 @@ getMetadataWith :: [Query]
-- ^ An optional language for the search.
-> IO (Either ClientError (Response Corpus))
getMetadataWith qs start_ limit lang = do
runHalAPIClient defaultHalOptions $ search (Just $ requestedFields lang) (queryWithLang lang qs) Nothing start_ (fromIntegral <$> limit)
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields lang) [] Nothing start_ (fromIntegral <$> limit)
where
q = joinQueries $ queryWithLang lang qs
-- | Fetch results, returning a Conduit stream.
-- NOTE: Prefer fetching with `getMetadataWithCursorC` instead of this function.
getMetadataWithC :: [Query]
-- ^ The textual query
-> Maybe Start
......@@ -89,7 +92,10 @@ getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
( Just numResults
, yieldMany [0..]
.| takeC (fromIntegral numPages)
.| concatMapMC (getPage offset))
.| concatMapMC (getPage offset)
-- | we need takeC again, because getPage could give too many results
.| takeC (fromIntegral numResults)
)
where
offset = fromMaybe 0 mb_offset
limit = min numFound' $ fromMaybe numFound' mb_limit
......@@ -99,26 +105,108 @@ getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
getPage :: Start -> Int -> IO [Corpus]
getPage start' pageNum = do
let offset = start' + pageNum * _hco_batchSize
debugLog opts $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient opts $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral _hco_batchSize)
debugLog opts $ "[getMetadataWithLangC] getPage: " <> show offset
eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields lang) [] Nothing (Just offset) (Just $ fromIntegral _hco_batchSize)
pure $ case eRes of
Left _ -> []
Right Response { _docs } -> _docs
where
q = joinQueries qs
getMetadataWithCursorC :: [Query]
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorC qs limit lang = getMetadataWithCursorLangC defaultHalOptions (queryWithLang lang qs) limit lang
-- | Fetch metadata using cursors
-- https://api.archives-ouvertes.fr/docs/search#cursors
getMetadataWithCursorLangC :: HalCrawlerOptions
-- ^ The options for the crawler
-> [Query]
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorLangC opts@HalCrawlerOptions { .. } qs mb_limit lang = do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
-- - get next cursor from the results
-- - feed the cursor to get next page
-- - when previous and current cursors are equal, there are no more results
-- First, estimate the total number of documents
eCount <- countResults qs
pure $ get' <$> eCount
where
q = joinQueries qs
sort_ = Just $ Asc "docid"
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
get' numFound' =
( Just numResults
, producer "*"
-- | we need takeC again, because getPage could give too many results
.| takeC (fromIntegral numResults)
)
where
limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit
producer :: Text -> ConduitT () Corpus IO ()
producer cursor = do
let endpoint = searchCursor (Just q) (Just $ requestedFields lang) [] sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor)
liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor
eRes <- liftIO $ runHalAPIClient opts endpoint
case eRes of
Left err -> fail $ "error: " <> show err
Right (Response { _docs, _nextCursorMark }) -> do
yieldMany _docs
case _nextCursorMark of
Nothing -> fail "Expected next cursor mark, but got nothing"
Just nextCursor -> do
if cursor == nextCursor then
pure ()
else do
producer nextCursor
debugLog :: HalCrawlerOptions -> Text -> IO ()
debugLog HalCrawlerOptions{..} msg = when _hco_debugLogs $ putStrLn msg
countResults :: [Query] -> IO (Either ClientError Count)
countResults qs = do
-- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes <- runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus))
pure $ fromIntegral <$> _numFound <$> eRes
where
q = joinQueries qs
requestedFields :: Maybe ISO639_1 -> Text
requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields (Just lang) = "docid,title_s,en_abstract_s," <> langAbstractS lang <> ",submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields (Just EN) = T.intercalate "," baseFields
requestedFields (Just lang) = T.intercalate "," $ baseFields <> [langAbstractS lang]
-- "docid,title_s,en_abstract_s," <> langAbstractS lang <> ",submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields _ = requestedFields (Just EN)
baseFields :: [Text]
baseFields = [ "docid"
, "title_s"
, "en_abstract_s"
, "submittedDate_s"
, "source_s"
, "authFullName_s"
, "authOrganism_s" ]
structFields :: Text
structFields = "docid,label_s,parentDocid_i"
......@@ -146,7 +234,9 @@ runStructureRequest rq =
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest rq =
runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) rq Nothing Nothing Nothing
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing Nothing Nothing
where
q = joinQueries rq
generateRequestByStructID :: Text -> [Text] -> Text
generateRequestByStructID rq struct_ids =
......@@ -161,3 +251,7 @@ flattenPipe :: [Text] -> Text
flattenPipe [] = ""
flattenPipe (x:[]) = x
flattenPipe (x:xs) = x <> " || " <> flattenPipe xs
joinQueries :: [Text] -> Text
joinQueries = T.intercalate " AND "
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoMonomorphismRestriction#-}
module HAL.Client where
import Control.Lens as L (makeLenses)
import Data.Aeson
import Data.Proxy
import GHC.Generics
import Data.Aeson (FromJSON)
import HAL.Types (Response)
import Protolude
import Servant.API
import Servant.Client hiding (Response)
import Servant.Client (ClientM, client)
type HALAPI doc = Search doc
:<|> SearchCursor doc
:<|> Structure doc
type Search doc = "search"
-- q is the main query
:> QueryParam "q" Text
-- fl determine which fields will be returned it can be a list of fields or *
:> QueryParam "fl" Text -- doc
-- TODO: type this monster
......@@ -29,6 +29,23 @@ type Search doc = "search"
:> QueryParam "rows" Int
:> Get '[JSON] (Response doc)
type SearchCursor doc = "search"
-- q is the main query
:> QueryParam "q" Text
-- fl determine which fields will be returned it can be a list of fields or *
:> QueryParam "fl" Text -- doc
-- TODO: type this monster
-- fq is to filter request
:> QueryParams "fq" Text
-- pretty much clear, (Asc || Desc) + field you want to sort by
-- is required when using cursor
:> QueryParam "sort" SortField
-- permit to start at the x result
:> QueryParam "rows" Int
-- cursor
:> QueryParam "cursorMark" Text
:> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure"
:> QueryParam "fl" Text
:> QueryParam "fq" Text
......@@ -49,37 +66,33 @@ asc = Just . Asc
desc :: Text -> Maybe SortField
desc = Just . Desc
-- Response type
data Response doc = Response
{ _numFound :: Int
, _start :: Int
, _docs :: [doc]
} deriving (Show, Generic)
L.makeLenses ''Response
instance FromJSON doc => FromJSON (Response doc) where
parseJSON = withObject "Response" $
\o -> Response
<$> ((o .: "response") >>= (.: "numFound"))
<*> ((o .: "response") >>= (.: "start"))
<*> ((o .: "response") >>= (.: "docs"))
halAPI :: Proxy (HALAPI doc)
halAPI = Proxy
-- search should always have at least `docid` and `label_s` in his fl params
search :: (FromJSON doc, ToHttpApiData doc) =>
Maybe Text -- fl
Maybe Text -- q
-> Maybe Text -- fl
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- start
-> Maybe Int -- rows
-> ClientM (Response doc)
-- search should always have at least `docid` and `label_s` in his fl params
searchCursor :: (FromJSON doc, ToHttpApiData doc) =>
Maybe Text -- q
-> Maybe Text -- fl
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- rows
-> Maybe Text -- cursor
-> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) =>
Maybe Text
-> Maybe Text -- fq
-> Maybe Int -- rows
-> ClientM (Response doc)
(search :<|> structure) = client halAPI
(search :<|> searchCursor :<|> structure) = client halAPI
......@@ -3,11 +3,10 @@
module HAL.Doc.Corpus where
import Control.Lens qualified as L
import Data.Aeson
import Data.Default
import Data.Aeson ( (.:), (.:?), withObject, FromJSON(parseJSON), Object )
import Data.Default (Default(..))
import Data.Map.Strict qualified as Map
import Data.String
import GHC.Generics
import Data.String (IsString(fromString))
import HAL.Utils (allLangs, langAbstractS)
import Data.LanguageCodes (ISO639_1(..))
import Protolude
......@@ -44,7 +43,7 @@ instance FromJSON Corpus where
abstracts <-
mapM (\lang -> do
ma <- o .:? (fromString $ T.unpack $ langAbstractS lang)
ma <- o .:? fromString (T.unpack $ langAbstractS lang)
pure $ (\a -> (lang, a)) <$> ma) allLangs
let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts
......
module HAL.Doc.EntityTree where
import Data.Aeson ((.:), (.:?), (.!=), FromJSON(..), withObject)
import Data.Default
import GHC.Generics
import Data.Default (Default(..))
import Protolude hiding (show)
import Protolude.Base (Show(..))
import Servant.API (ToHttpApiData(..))
......
......@@ -2,9 +2,8 @@
module HAL.Doc.Struct where
import Data.Aeson
import Data.Default
import GHC.Generics
import Data.Aeson (FromJSON(..), (.:), withObject)
import Data.Default (Default(..))
import Protolude
import Servant.API (ToHttpApiData(..))
import qualified Control.Lens as L
......
{-# LANGUAGE TemplateHaskell #-}
module HAL.Types where
import Control.Lens as L (makeLenses)
import Data.Aeson ((.:), (.:?), FromJSON(..), withObject)
import Protolude
-- Response type
data Response doc = Response
{ _numFound :: Int
, _start :: Int
, _docs :: [doc]
, _nextCursorMark :: Maybe Text
} deriving (Show, Generic)
L.makeLenses ''Response
instance FromJSON doc => FromJSON (Response doc) where
parseJSON = withObject "Response" $
\o -> Response
<$> ((o .: "response") >>= (.: "numFound"))
<*> ((o .: "response") >>= (.: "start"))
<*> ((o .: "response") >>= (.: "docs"))
<*> (o .:? "nextCursorMark")
......@@ -13,4 +13,4 @@ toText lang = T.pack [l1, l2]
(l1, l2) = toChars lang
langAbstractS :: ISO639_1 -> Text
langAbstractS lang = (toText lang) <> "_abstract_s"
langAbstractS lang = toText lang <> "_abstract_s"
......@@ -8,26 +8,26 @@ import Data.Map.Strict.Internal (merge, preserveMissing, zipWithMatched)
import Data.Text qualified as T
import Data.Text.Format (format)
import Data.Text.Lazy qualified as TL
import GHC.Generics
import HAL
import HAL.Client
import HAL.Doc.Struct
import HAL ( runStructureRequest )
import HAL.Types (docs)
import HAL.Doc.Struct ( Struct(Struct, _struct_docid) )
import Protolude
--import Text.Printf
formatParentIdRequest :: [Struct] -> Maybe Text
formatParentIdRequest [] = Nothing
formatParentIdRequest (x:[]) = Just . T.pack . show $ _struct_docid x
formatParentIdRequest (x:xs) =
(Just . T.pack . show $ _struct_docid x)
<> (Just " || ")
<> Just " || "
<> formatParentIdRequest xs
ds2Child :: [Struct] -> IO [Struct]
ds2Child ds = do
rs <- sequence $ runStructureRequest <$> formatedRequests ds
return . concat $ (^. docs) <$> rights rs
rs <- mapM runStructureRequest (formatedRequests ds)
return (concatMap (^. docs) (rights rs))
where formatedRequest docs' = Just "parentDocid_i:(" <> formatParentIdRequest docs' <> Just ")"
formatedRequests docs'' = formatedRequest <$> chunksOf 100 docs''
......@@ -37,7 +37,7 @@ fetchChildren ds = (ds <>) <$> (fetchChildren =<< ds2Child ds)
isChildOf :: Struct -> Struct -> Bool
isChildOf (Struct i _ _) (Struct _ _ p') =
not . null $ filter (\id -> id == (T.pack $ show i)) p'
any (\id -> id == T.pack (show i)) p'
data DocTree = DocTree Struct Int [DocTree]
deriving (Show, Generic)
......@@ -55,19 +55,24 @@ formatTree (DocTree doc depth children) =
where
addSpace :: Text
addSpace
| depth > 0 = "├" <> (T.replicate (depth * depth + depth) "─")
| depth > 0 = "├" <> T.replicate (depth * depth + depth) "─"
| otherwise = "🌲"
findDeepest :: Map Int Int -> DocTree -> Map Int Int
findDeepest m (DocTree (Struct docid _ _) depth children) =
mergeMap map1 maps
where map1 = insert docid depth m
maps = foldl mergeMap Map.empty $ findDeepest m <$> children
mergeMap m1 m2 = merge preserveMissing preserveMissing (zipWithMatched whenMatch) m1 m2
whenMatch _k v v' = if v > v' then v else v'
where
map1 :: Map Int Int
map1 = insert docid depth m
maps :: Map Int Int
maps = foldl' mergeMap Map.empty $ findDeepest m <$> children
whenMatch :: w -> Int -> Int -> Int
whenMatch _k = max
mergeMap :: Map Int Int -> Map Int Int -> Map Int Int
mergeMap = merge preserveMissing preserveMissing (zipWithMatched whenMatch)
isDeep :: Map Int Int -> DocTree -> Bool
isDeep m (DocTree (Struct id _ _) depth _) = depth >= (fromMaybe 0 $ Map.lookup id m)
isDeep m (DocTree (Struct id _ _) depth _) = depth >= fromMaybe 0 (Map.lookup id m)
removeDuplicate :: Map Int Int -> DocTree -> DocTree
removeDuplicate deepMap (DocTree doc depth children) =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment