Verified Commit 25a1e955 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'master' into refactoring

Tests added as well
parents f7b928da 06eedee4
......@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7
packages:
./
tests: True
-- allow-older: *
allow-newer: base:*
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.0.
-- This file has been generated from package.yaml by hpack version 0.35.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8
-- hash: 8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL
version: 0.1.0.0
......@@ -58,6 +58,7 @@ library
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, mtl >= 2.2.2 && < 2.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
......@@ -87,8 +88,7 @@ executable crawlerHAL-exe
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
......@@ -97,21 +97,16 @@ executable crawlerHAL-exe
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
test-suite halCrawler-test
type: exitcode-stdio-1.0
main-is: Spec.hs
main-is: Main.hs
other-modules:
Paths_crawlerHAL
hs-source-dirs:
......@@ -127,24 +122,11 @@ test-suite halCrawler-test
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
, data-default >= 0.7.1.1 && < 0.8
, halCrawler
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
base >=4.7 && <5
, crawlerHAL
, HUnit >= 1.6.2.0 && < 1.7
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
, tasty >= 1.4.3 && < 1.5
, tasty-hunit >= 0.10.0.3 && < 0.11
default-language: Haskell2010
module HAL where
{-# LANGUAGE BangPatterns #-}
import Conduit
import Control.Monad
import Control.Monad.Reader
import Data.Aeson
import Data.LanguageCodes (ISO639_1(..))
import Data.Text
......@@ -8,14 +12,25 @@ import HAL.Client
import HAL.Doc.Corpus
import HAL.Doc.Struct
import HAL.Utils (langAbstractS, toText)
import Network.HTTP.Client (newManager)
import Network.HTTP.Client (newManager, Request)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
import Servant.API
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv)
batchSize :: Int
batchSize = 1000
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv, ClientEnv (makeClientRequest))
import System.IO.Unsafe (unsafePerformIO)
data HalCrawlerOptions
= HalCrawlerOptions
{ -- | If 'True', enable the debug logs to stdout.
_hco_debugLogs :: !Bool
, _hco_batchSize :: !Int
}
defaultHalOptions :: HalCrawlerOptions
defaultHalOptions = HalCrawlerOptions
{ _hco_debugLogs = False
, _hco_batchSize = 1000
}
type Query = Text
type Start = Int
......@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs
queryWithLang (Just lang) qs = qs <> ["language_s:" <> toText lang]
getMetadataWith :: [Query]
-- ^ The textual query
-> Maybe Start
-- ^ An optional offset
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Response Corpus))
getMetadataWith qs start_ limit lang = do
runHalAPIClient $ search (Just $ requestedFields lang) (queryWithLang lang qs) Nothing start_ limit
runHalAPIClient defaultHalOptions $ search (Just $ requestedFields lang) (queryWithLang lang qs) Nothing start_ (fromIntegral <$> limit)
getMetadataWithC :: [Query]
-- ^ The textual query
-> Maybe Start
-- ^ An optional offset
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithC qs start_ limit lang = getMetadataWithLangC (queryWithLang lang qs) start_ limit lang
getMetadataWithC qs start_ limit lang = getMetadataWithLangC defaultHalOptions (queryWithLang lang qs) start_ limit lang
getMetadataWithLangC :: [Query]
getMetadataWithLangC :: HalCrawlerOptions
-- ^ The options for the crawler
-> [Query]
-- ^ The textual query
-> Maybe Start
-- ^ An optional offset
-> Maybe Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithLangC qs start_ limit lang = do
getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
-- First, estimate the total number of documents
eCount <- countResults qs
pure $ get' <$> eCount
where
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
get' numFound_ =
get' numFound' =
( Just numResults
, yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage start'))
.| takeC (fromIntegral numPages)
.| concatMapMC (getPage offset))
where
start' = fromMaybe 0 start_
rows' = min numFound_ $ fromMaybe numFound_ limit
numResults = rows' - (fromIntegral start')
numPages = numResults `div` (fromIntegral batchSize) + 1
offset = fromMaybe 0 mb_offset
limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit - fromIntegral offset
numPages = numResults `div` fromIntegral _hco_batchSize + 1
getPage :: Start -> Int -> IO [Corpus]
getPage start' pageNum = do
-- putText $ "requestedFields: " <> (show $ requestedFields lang)
let offset = start' + pageNum * batchSize
eRes <- runHalAPIClient $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral batchSize)
let offset = start' + pageNum * _hco_batchSize
debugLog opts $ "[getMetadataWithC] getPage: " <> show offset
eRes <- runHalAPIClient opts $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral _hco_batchSize)
pure $ case eRes of
Left _ -> []
Right (Response { _docs }) -> _docs
-- printDoc :: Corpus -> IO Corpus
-- printDoc c@(Corpus { .. }) = do
-- putText $ show _corpus_title
-- pure c
Right Response { _docs } -> _docs
debugLog :: HalCrawlerOptions -> Text -> IO ()
debugLog HalCrawlerOptions{..} msg = when _hco_debugLogs $ putStrLn msg
countResults :: [Query] -> IO (Either ClientError Count)
countResults qs = do
-- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
pure $ _numFound <$> eRes
eRes <- runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
pure $ fromIntegral <$> _numFound <$> eRes
requestedFields :: Maybe ISO639_1 -> Text
requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
......@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN)
structFields :: Text
structFields = "docid,label_s,parentDocid_i"
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc) =>
ClientM (Response doc) -> IO (Either ClientError (Response doc))
runHalAPIClient cmd = do
requestLog :: HalCrawlerOptions -> Request -> Request
requestLog opts rq = unsafePerformIO $ do
debugLog opts $ "[HAL.makeClientRequestLog] " <> show rq
pure rq
{-# NOINLINE requestLog #-}
runHalAPIClient :: (FromJSON doc, ToHttpApiData doc)
=> HalCrawlerOptions
-> ClientM (Response doc)
-> IO (Either ClientError (Response doc))
runHalAPIClient opts cmd = do
manager' <- newManager tlsManagerSettings
runClientM cmd (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
runClientM cmd' (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
where
cmd' = local (\r -> r {
makeClientRequest = \bUrl servantRq -> requestLog opts ((makeClientRequest r) bUrl servantRq)
}) cmd
runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
runStructureRequest rq =
runHalAPIClient $ structure (Just structFields) rq (Just 10000)
runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000)
runSearchRequest :: [Text] -> Maybe ISO639_1 -> IO (Either ClientError (Response Corpus))
runSearchRequest rq lang =
runHalAPIClient $ search (Just $ requestedFields lang) rq Nothing Nothing Nothing
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest rq =
runHalAPIClient defaultHalOptions $ search (Just $ requestedFields Nothing) rq Nothing Nothing Nothing
generateRequestByStructID :: Text -> [Text] -> Text
generateRequestByStructID rq struct_ids =
......
......@@ -11,6 +11,7 @@ import Protolude
import Servant.API
import Servant.Client hiding (Response)
type HALAPI doc = Search doc
:<|> Structure doc
......@@ -25,7 +26,7 @@ type Search doc = "search"
-- permit to start at the x result
:> QueryParam "start" Int
-- use rows to make the request only return the x number of result
:> QueryParam "rows" Integer
:> QueryParam "rows" Int
:> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure"
......@@ -50,7 +51,7 @@ desc = Just . Desc
-- Response type
data Response doc = Response
{ _numFound :: Integer
{ _numFound :: Int
, _start :: Int
, _docs :: [doc]
} deriving (Show, Generic)
......@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- start
-> Maybe Integer -- rows
-> Maybe Int -- rows
-> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) =>
......
module Main where
import Data.LanguageCodes (ISO639_1(..))
import HAL qualified
import HAL.Utils qualified as HAL
import Protolude
import Test.Tasty
import Test.Tasty.HUnit
main :: IO ()
main = defaultMain unitTests
unitTests = testGroup "unit tests"
[ testCase "Check langAbstractS" $ do
"pl_abstract_s" @?= (HAL.langAbstractS PL)
"en_abstract_s" @?= (HAL.langAbstractS EN)
"fr_abstract_s" @?= (HAL.langAbstractS FR) ]
main :: IO ()
main = putStrLn "Test suite not yet implemented"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment