Use fq for lang filter

Also, some refactoring for app (add debug etc).
parent 5edaca5c
......@@ -5,21 +5,24 @@ module Main where
import Conduit ( sinkList, mapM_C, (.|), runConduit )
import Data.LanguageCodes (ISO639_1(..))
import Data.Text qualified as T
import HAL (getMetadataWith, getMetadataWithC, getMetadataWithCursorC)
import HAL (getMetadataWithCursorOptsC, countResultsOpts', HalCrawlerOptions(..), defaultHalOptions)
import HAL.Doc
import HAL.Doc.Corpus (Corpus(..))
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Options.Applicative
import Prelude qualified
import Protolude
data CountParams = CountParams
{ cp_query :: [ T.Text ] }
{ cp_query :: T.Text
, cp_lang :: Maybe ISO639_1 }
data FetchParams = FetchParams
{ fp_query :: [ T.Text ]
, fp_limit :: Integer }
{ fp_query :: T.Text
, fp_limit :: Integer
, fp_lang :: Maybe ISO639_1 }
data Command =
Count CountParams
......@@ -28,14 +31,19 @@ data Command =
countParams :: Parser Command
countParams = Count <$>
(CountParams
<$> many (strArgument (metavar "query")))
<$> strArgument (metavar "query")
<*> optional (option (maybeReader readLang) (long "lang")))
fetchParams :: Parser Command
fetchParams = Fetch <$>
(FetchParams
<$> many (strArgument (metavar "query"))
<*> option auto (long "limit"))
<$> strArgument (metavar "query")
<*> option auto (long "limit")
<*> optional(option (maybeReader readLang) (long "lang")))
readLang :: Prelude.String -> Maybe ISO639_1
readLang = readMaybe
params :: Parser Command
params = subparser
(command "count" (info countParams (progDesc "Count number of docs for a given query"))
......@@ -55,13 +63,15 @@ main = run =<< execParser opts
-- (Right val) -> print $ _docs val
run :: Command -> IO ()
run (Count (CountParams { cp_query })) = do
res <- getMetadataWithC cp_query (Just 0) Nothing Nothing
run (Count (CountParams { cp_query, cp_lang })) = do
res <- countResultsOpts' opts cp_query cp_lang
case res of
Left err -> putText $ show err
Right (cnt, _docsC) -> putText $ show cnt
run (Fetch (FetchParams { fp_query, fp_limit })) = do
res <- getMetadataWithCursorC fp_query (Just fp_limit) Nothing
Right cnt -> putText $ show cnt
where
opts = defaultHalOptions { _hco_debugLogs = True }
run (Fetch (FetchParams { fp_query, fp_limit, fp_lang })) = do
res <- getMetadataWithCursorOptsC opts fp_query (Just fp_limit) fp_lang
case res of
Left err -> putText $ show err
Right (_cnt, docsC) -> do
......@@ -70,13 +80,14 @@ run (Fetch (FetchParams { fp_query, fp_limit })) = do
.| mapM_C printCorpus
.| sinkList
pure ()
where
printCorpus Corpus { .. } = do
putText $ "docid: " <> _corpus_docid <> " [" <> (T.intercalate " " _corpus_title) <> "]"
putText $ " " <> (T.intercalate " " _corpus_abstract)
putText $ " " <> show _corpus_abstract_lang_map
putText $ " " <> show _corpus_original
putText "------------"
where
opts = defaultHalOptions { _hco_debugLogs = True }
printCorpus Corpus { .. } = do
putText $ "docid: " <> _corpus_docid <> " [" <> (T.intercalate " " _corpus_title) <> "]"
putText $ " " <> (T.intercalate " " _corpus_abstract)
putText $ " " <> show _corpus_abstract_lang_map
putText $ " " <> show _corpus_original
putText "------------"
-- data
......
......@@ -36,9 +36,95 @@ type Start = Int
type Limit = Integer
type Count = Integer
queryWithLang :: Maybe ISO639_1 -> [Query] -> [Query]
queryWithLang Nothing qs = qs
queryWithLang (Just lang) qs = qs <> ["language_s:" <> toText lang]
queryWithLang :: Maybe ISO639_1 -> [Query]
queryWithLang Nothing = []
queryWithLang (Just lang) = ["language_s:" <> toText lang]
getMetadataWithCursorC :: Query
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorC = getMetadataWithCursorOptsC defaultHalOptions
-- | Fetch metadata using cursors
-- https://api.archives-ouvertes.fr/docs/search#cursors
getMetadataWithCursorOptsC :: HalCrawlerOptions
-- ^ The options for the crawler
-> Query
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
-- - get next cursor from the results
-- - feed the cursor to get next page
-- - when previous and current cursors are equal, there are no more results
-- First, estimate the total number of documents
eCount <- countResultsOpts' opts q lang
pure $ get' <$> eCount
where
sort_ = Just $ Asc "docid"
fq = queryWithLang lang
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
get' numFound' =
( Just numResults
, producer "*"
-- | we need takeC again, because getPage could give too many results
.| takeC (fromIntegral numResults)
)
where
limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit
producer :: Text -> ConduitT () Corpus IO ()
producer cursor = do
let endpoint = searchCursor (Just q) (Just $ requestedFields lang) fq sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor)
liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor
eRes <- liftIO $ runHalAPIClient opts endpoint
case eRes of
Left err -> fail $ "error: " <> show err
Right (Response { _docs, _nextCursorMark }) -> do
yieldMany _docs
case _nextCursorMark of
Nothing -> fail "Expected next cursor mark, but got nothing"
Just nextCursor -> do
if cursor == nextCursor then
pure ()
else do
producer nextCursor
countResults' :: Query -> Maybe ISO639_1 -> IO (Either ClientError Count)
countResults' q lang = do
countResultsOpts' defaultHalOptions q lang
countResultsOpts' :: HalCrawlerOptions -> Query -> Maybe ISO639_1 -> IO (Either ClientError Count)
countResultsOpts' opts q lang = do
-- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields Nothing) fq Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus))
pure (fromIntegral . _numFound <$> eRes)
where
fq = queryWithLang lang
--- SOME OTHER FETCH FUNCTIONS
getMetadataWith :: [Query]
-- ^ The textual query
......@@ -53,7 +139,7 @@ getMetadataWith :: [Query]
getMetadataWith qs start_ limit lang = do
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields lang) [] Nothing start_ (fromIntegral <$> limit)
where
q = joinQueries $ queryWithLang lang qs
q = joinQueries $ qs <> queryWithLang lang
-- | Fetch results, returning a Conduit stream.
-- NOTE: Prefer fetching with `getMetadataWithCursorC` instead of this function.
......@@ -67,9 +153,9 @@ getMetadataWithC :: [Query]
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithC qs start_ limit lang = getMetadataWithLangC defaultHalOptions (queryWithLang lang qs) start_ limit lang
getMetadataWithC qs start_ limit lang = getMetadataWithOptsC defaultHalOptions (qs <> queryWithLang lang) start_ limit lang
getMetadataWithLangC :: HalCrawlerOptions
getMetadataWithOptsC :: HalCrawlerOptions
-- ^ The options for the crawler
-> [Query]
-- ^ The textual query
......@@ -81,7 +167,7 @@ getMetadataWithLangC :: HalCrawlerOptions
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
-- First, estimate the total number of documents
eCount <- countResults qs
pure $ get' <$> eCount
......@@ -113,72 +199,7 @@ getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
where
q = joinQueries qs
getMetadataWithCursorC :: [Query]
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorC qs limit lang = getMetadataWithCursorLangC defaultHalOptions (queryWithLang lang qs) limit lang
-- | Fetch metadata using cursors
-- https://api.archives-ouvertes.fr/docs/search#cursors
getMetadataWithCursorLangC :: HalCrawlerOptions
-- ^ The options for the crawler
-> [Query]
-- ^ The textual query
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithCursorLangC opts@HalCrawlerOptions { .. } qs mb_limit lang = do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
-- - get next cursor from the results
-- - feed the cursor to get next page
-- - when previous and current cursors are equal, there are no more results
-- First, estimate the total number of documents
eCount <- countResults qs
pure $ get' <$> eCount
where
q = joinQueries qs
sort_ = Just $ Asc "docid"
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
get' numFound' =
( Just numResults
, producer "*"
-- | we need takeC again, because getPage could give too many results
.| takeC (fromIntegral numResults)
)
where
limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit
producer :: Text -> ConduitT () Corpus IO ()
producer cursor = do
let endpoint = searchCursor (Just q) (Just $ requestedFields lang) [] sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor)
liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor
eRes <- liftIO $ runHalAPIClient opts endpoint
case eRes of
Left err -> fail $ "error: " <> show err
Right (Response { _docs, _nextCursorMark }) -> do
yieldMany _docs
case _nextCursorMark of
Nothing -> fail "Expected next cursor mark, but got nothing"
Just nextCursor -> do
if cursor == nextCursor then
pure ()
else do
producer nextCursor
debugLog :: HalCrawlerOptions -> Text -> IO ()
debugLog HalCrawlerOptions{..} msg = when _hco_debugLogs $ putStrLn msg
......@@ -188,10 +209,14 @@ countResults qs = do
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus))
pure $ fromIntegral <$> _numFound <$> eRes
pure (fromIntegral . _numFound <$> eRes)
where
q = joinQueries qs
requestedFields :: Maybe ISO639_1 -> Text
requestedFields (Just EN) = T.intercalate "," baseFields
requestedFields (Just lang) = T.intercalate "," $ baseFields <> [langAbstractS lang]
......@@ -225,7 +250,7 @@ runHalAPIClient opts cmd = do
runClientM cmd' (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
where
cmd' = local (\r -> r {
makeClientRequest = \bUrl servantRq -> requestLog opts ((makeClientRequest r) bUrl servantRq)
makeClientRequest = \bUrl servantRq -> requestLog opts (makeClientRequest r bUrl servantRq)
}) cmd
runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
......@@ -244,12 +269,12 @@ generateRequestByStructID rq struct_ids =
rq
<> " AND "
<> "structId_i:("
<> (flattenPipe struct_ids)
<> flattenPipe struct_ids
<> ")"
flattenPipe :: [Text] -> Text
flattenPipe [] = ""
flattenPipe (x:[]) = x
flattenPipe [x] = x
flattenPipe (x:xs) = x <> " || " <> flattenPipe xs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment