Verified Commit 25a1e955 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'master' into refactoring

Tests added as well
parents f7b928da 06eedee4
...@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7 ...@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7
packages: packages:
./ ./
tests: True
-- allow-older: * -- allow-older: *
allow-newer: base:* allow-newer: base:*
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0. -- This file has been generated from package.yaml by hpack version 0.35.2.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
-- --
-- hash: d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8 -- hash: 8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL name: crawlerHAL
version: 0.1.0.0 version: 0.1.0.0
...@@ -58,6 +58,7 @@ library ...@@ -58,6 +58,7 @@ library
, http-client-tls >= 0.3.6.1 && < 0.4 , http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2 , iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3 , lens >= 5.1.1 && < 5.3
, mtl >= 2.2.2 && < 2.3
, optparse-applicative >= 0.17 && < 0.19 , optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4 , protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4 , scientific >= 0.3.7.0 && < 0.4
...@@ -87,8 +88,7 @@ executable crawlerHAL-exe ...@@ -87,8 +88,7 @@ executable crawlerHAL-exe
TypeOperators TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
aeson >= 1.5.6 && < 1.6 base >=4.7 && <5
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13 , bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4 , conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7 , containers >= 0.6.7 && < 0.7
...@@ -97,21 +97,16 @@ executable crawlerHAL-exe ...@@ -97,21 +97,16 @@ executable crawlerHAL-exe
, http-client >= 0.7.13.1 && < 0.8 , http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4 , http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2 , iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19 , optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4 , protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21 , servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3 , split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1 , text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010 default-language: Haskell2010
test-suite halCrawler-test test-suite halCrawler-test
type: exitcode-stdio-1.0 type: exitcode-stdio-1.0
main-is: Spec.hs main-is: Main.hs
other-modules: other-modules:
Paths_crawlerHAL Paths_crawlerHAL
hs-source-dirs: hs-source-dirs:
...@@ -127,24 +122,11 @@ test-suite halCrawler-test ...@@ -127,24 +122,11 @@ test-suite halCrawler-test
TypeOperators TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
aeson >= 1.5.6 && < 1.6 base >=4.7 && <5
, base >=4.7 && <5 , crawlerHAL
, bytestring >= 0.11.0 && < 0.13 , HUnit >= 1.6.2.0 && < 1.7
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
, data-default >= 0.7.1.1 && < 0.8
, halCrawler
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2 , iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4 , protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4 , tasty >= 1.4.3 && < 1.5
, servant >= 0.19 && < 0.21 , tasty-hunit >= 0.10.0.3 && < 0.11
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010 default-language: Haskell2010
module HAL where module HAL where
{-# LANGUAGE BangPatterns #-}
import Conduit import Conduit
import Control.Monad
import Control.Monad.Reader
import Data.Aeson import Data.Aeson
import Data.LanguageCodes (ISO639_1(..)) import Data.LanguageCodes (ISO639_1(..))
import Data.Text import Data.Text
...@@ -8,14 +12,25 @@ import HAL.Client ...@@ -8,14 +12,25 @@ import HAL.Client
import HAL.Doc.Corpus import HAL.Doc.Corpus
import HAL.Doc.Struct import HAL.Doc.Struct
import HAL.Utils (langAbstractS, toText) import HAL.Utils (langAbstractS, toText)
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager, Request)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude import Protolude
import Servant.API import Servant.API
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv) import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv, ClientEnv (makeClientRequest))
import System.IO.Unsafe (unsafePerformIO)
batchSize :: Int
batchSize = 1000 data HalCrawlerOptions
= HalCrawlerOptions
{ -- | If 'True', enable the debug logs to stdout.
_hco_debugLogs :: !Bool
, _hco_batchSize :: !Int
}
defaultHalOptions :: HalCrawlerOptions
defaultHalOptions = HalCrawlerOptions
{ _hco_debugLogs = False
, _hco_batchSize = 1000
}
type Query = Text type Query = Text
type Start = Int type Start = Int
...@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs ...@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs
queryWithLang (Just lang) qs = qs <> ["language_s:" <> toText lang] queryWithLang (Just lang) qs = qs <> ["language_s:" <> toText lang]
getMetadataWith :: [Query] getMetadataWith :: [Query]
-- ^ The textual query
-> Maybe Start -> Maybe Start
-- ^ An optional offset
-> Maybe Limit -> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Response Corpus)) -> IO (Either ClientError (Response Corpus))
getMetadataWith qs start_ limit lang = do getMetadataWith qs start_ limit lang = do
runHalAPIClient $ search (Just $ requestedFields lang) (queryWithLang lang qs) Nothing start_ limit runHalAPIClient defaultHalOptions $ search (Just $ requestedFields lang) (queryWithLang lang qs) Nothing start_ (fromIntegral <$> limit)
getMetadataWithC :: [Query] getMetadataWithC :: [Query]
-- ^ The textual query
-> Maybe Start -> Maybe Start
-- ^ An optional offset
-> Maybe Limit -> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithC qs start_ limit lang = getMetadataWithLangC (queryWithLang lang qs) start_ limit lang getMetadataWithC qs start_ limit lang = getMetadataWithLangC defaultHalOptions (queryWithLang lang qs) start_ limit lang
getMetadataWithLangC :: [Query] getMetadataWithLangC :: HalCrawlerOptions
-- ^ The options for the crawler
-> [Query]
-- ^ The textual query
-> Maybe Start -> Maybe Start
-- ^ An optional offset
-> Maybe Limit -> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithLangC qs start_ limit lang = do getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
-- First, estimate the total number of documents -- First, estimate the total number of documents
eCount <- countResults qs eCount <- countResults qs
pure $ get' <$> eCount pure $ get' <$> eCount
where where
get' :: Count get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ()) -> (Maybe Count, ConduitT () Corpus IO ())
get' numFound_ = get' numFound' =
( Just numResults ( Just numResults
, yieldMany [0..] , yieldMany [0..]
.| takeC (fromInteger numPages) .| takeC (fromIntegral numPages)
.| concatMapMC (getPage start')) .| concatMapMC (getPage offset))
where where
start' = fromMaybe 0 start_ offset = fromMaybe 0 mb_offset
rows' = min numFound_ $ fromMaybe numFound_ limit limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = rows' - (fromIntegral start') numResults = limit - fromIntegral offset
numPages = numResults `div` (fromIntegral batchSize) + 1 numPages = numResults `div` fromIntegral _hco_batchSize + 1
getPage :: Start -> Int -> IO [Corpus] getPage :: Start -> Int -> IO [Corpus]
getPage start' pageNum = do getPage start' pageNum = do
-- putText $ "requestedFields: " <> (show $ requestedFields lang) let offset = start' + pageNum * _hco_batchSize
let offset = start' + pageNum * batchSize debugLog opts $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral batchSize) eRes <- runHalAPIClient opts $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral _hco_batchSize)
pure $ case eRes of pure $ case eRes of
Left _ -> [] Left _ -> []
Right (Response { _docs }) -> _docs Right Response { _docs } -> _docs
-- printDoc :: Corpus -> IO Corpus
-- printDoc c@(Corpus { .. }) = do debugLog :: HalCrawlerOptions -> Text -> IO ()
-- putText $ show _corpus_title debugLog HalCrawlerOptions{..} msg = when _hco_debugLogs $ putStrLn msg
-- pure c
countResults :: [Query] -> IO (Either ClientError Count) countResults :: [Query] -> IO (Either ClientError Count)
countResults qs = do countResults qs = do
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus)) eRes <- runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
pure $ _numFound <$> eRes pure $ fromIntegral <$> _numFound <$> eRes
requestedFields :: Maybe ISO639_1 -> Text requestedFields :: Maybe ISO639_1 -> Text
requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s" requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
...@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN) ...@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN)
structFields :: Text structFields :: Text
structFields = "docid,label_s,parentDocid_i" structFields = "docid,label_s,parentDocid_i"
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc) => requestLog :: HalCrawlerOptions -> Request -> Request
ClientM (Response doc) -> IO (Either ClientError (Response doc)) requestLog opts rq = unsafePerformIO $ do
runHalAPIClient cmd = do debugLog opts $ "[HAL.makeClientRequestLog] " <> show rq
pure rq
{-# NOINLINE requestLog #-}
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc)
=> HalCrawlerOptions
-> ClientM (Response doc)
-> IO (Either ClientError (Response doc))
runHalAPIClient opts cmd = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
runClientM cmd (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "") runClientM cmd' (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
where
cmd' = local (\r -> r {
makeClientRequest = \bUrl servantRq -> requestLog opts ((makeClientRequest r) bUrl servantRq)
}) cmd
runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct)) runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
runStructureRequest rq = runStructureRequest rq =
runHalAPIClient $ structure (Just structFields) rq (Just 10000) runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000)
runSearchRequest :: [Text] -> Maybe ISO639_1 -> IO (Either ClientError (Response Corpus)) runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest rq lang = runSearchRequest rq =
runHalAPIClient $ search (Just $ requestedFields lang) rq Nothing Nothing Nothing runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) rq Nothing Nothing Nothing
generateRequestByStructID :: Text -> [Text] -> Text generateRequestByStructID :: Text -> [Text] -> Text
generateRequestByStructID rq struct_ids = generateRequestByStructID rq struct_ids =
......
...@@ -11,6 +11,7 @@ import Protolude ...@@ -11,6 +11,7 @@ import Protolude
import Servant.API import Servant.API
import Servant.Client hiding (Response) import Servant.Client hiding (Response)
type HALAPI doc = Search doc type HALAPI doc = Search doc
:<|> Structure doc :<|> Structure doc
...@@ -25,7 +26,7 @@ type Search doc = "search" ...@@ -25,7 +26,7 @@ type Search doc = "search"
-- permit to start at the x result -- permit to start at the x result
:> QueryParam "start" Int :> QueryParam "start" Int
-- use rows to make the request only return the x number of result -- use rows to make the request only return the x number of result
:> QueryParam "rows" Integer :> QueryParam "rows" Int
:> Get '[JSON] (Response doc) :> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure" type Structure doc = "ref" :> "structure"
...@@ -50,7 +51,7 @@ desc = Just . Desc ...@@ -50,7 +51,7 @@ desc = Just . Desc
-- Response type -- Response type
data Response doc = Response data Response doc = Response
{ _numFound :: Integer { _numFound :: Int
, _start :: Int , _start :: Int
, _docs :: [doc] , _docs :: [doc]
} deriving (Show, Generic) } deriving (Show, Generic)
...@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) => ...@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq -> [Text] -- fq
-> Maybe SortField -- sort -> Maybe SortField -- sort
-> Maybe Int -- start -> Maybe Int -- start
-> Maybe Integer -- rows -> Maybe Int -- rows
-> ClientM (Response doc) -> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) => structure :: (FromJSON doc, ToHttpApiData doc) =>
......
module Main where
import Data.LanguageCodes (ISO639_1(..))
import HAL qualified
import HAL.Utils qualified as HAL
import Protolude
import Test.Tasty
import Test.Tasty.HUnit
main :: IO ()
main = defaultMain unitTests
unitTests = testGroup "unit tests"
[ testCase "Check langAbstractS" $ do
"pl_abstract_s" @?= (HAL.langAbstractS PL)
"en_abstract_s" @?= (HAL.langAbstractS EN)
"fr_abstract_s" @?= (HAL.langAbstractS FR) ]
main :: IO ()
main = putStrLn "Test suite not yet implemented"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment