Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
hal
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
hal
Commits
06eedee4
Commit
06eedee4
authored
Jun 13, 2023
by
Alfredo Di Napoli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Experiment with request logging
parent
fe4e032c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
38 additions
and
26 deletions
+38
-26
crawlerHAL.cabal
crawlerHAL.cabal
+5
-2
package.yaml
package.yaml
+1
-0
HAL.hs
src/HAL.hs
+32
-24
No files found.
crawlerHAL.cabal
View file @
06eedee4
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.
0
.
-- This file has been generated from package.yaml by hpack version 0.35.
2
.
--
-- see: https://github.com/sol/hpack
--
-- hash:
d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8
-- hash:
8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL
version: 0.1.0.0
...
...
@@ -53,6 +53,7 @@ library
, http-client
, http-client-tls
, lens
, mtl
, neat-interpolation
, optparse-applicative
, scientific
...
...
@@ -89,6 +90,7 @@ executable crawlerHAL-exe
, http-client
, http-client-tls
, lens
, mtl
, neat-interpolation
, optparse-applicative
, scientific
...
...
@@ -126,6 +128,7 @@ test-suite halCrawler-test
, http-client
, http-client-tls
, lens
, mtl
, neat-interpolation
, optparse-applicative
, scientific
...
...
package.yaml
View file @
06eedee4
...
...
@@ -30,6 +30,7 @@ dependencies:
-
http-client-tls
-
lens
-
neat-interpolation
-
mtl
-
optparse-applicative
-
scientific
-
servant
...
...
src/HAL.hs
View file @
06eedee4
...
...
@@ -4,24 +4,21 @@ module HAL where
import
Conduit
import
Control.Monad
import
Control.Monad.Reader
import
Data.Aeson
import
Data.Default
(
def
)
import
Data.Maybe
(
fromMaybe
)
import
Data.Either
(
fromRight
)
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
)
import
HAL.Client
import
HAL.Doc.Corpus
import
HAL.Doc.Struct
import
Network.HTTP.Client
(
newManager
,
Request
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Prelude
import
Servant.API
import
Data.Aeson
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
,
ClientEnv
(
makeClientRequest
))
import
System.IO.Unsafe
(
unsafePerformIO
)
data
HalCrawlerOptions
=
HalCrawlerOptions
...
...
@@ -64,23 +61,22 @@ getMetadataWithOptionsC opts@HalCrawlerOptions{..} q mb_offset mb_limit = do
numResults
=
limit
-
offset
numPages
=
numResults
`
div
`
_hco_batchSize
+
1
debugLog
::
String
->
IO
()
debugLog
msg
=
when
_hco_debugLogs
$
putStrLn
msg
getPage
::
Text
->
Int
->
Int
->
IO
[
Corpus
]
getPage
q
start
pageNum
=
do
let
offset
=
start
+
pageNum
*
_hco_batchSize
debugLog
$
"[getMetadataWithC] getPage: "
<>
show
offset
eRes
<-
runHalAPIClient
$
search
(
Just
requestedFields
)
[
q
]
Nothing
(
Just
offset
)
(
Just
_hco_batchSize
)
debugLog
opts
$
"[getMetadataWithC] getPage: "
<>
show
offset
eRes
<-
runHalAPIClient
opts
$
search
(
Just
requestedFields
)
[
q
]
Nothing
(
Just
offset
)
(
Just
_hco_batchSize
)
pure
$
case
eRes
of
Left
_
->
[]
Right
(
Response
{
_docs
})
->
_docs
printDoc
::
Corpus
->
IO
Corpus
printDoc
c
@
(
Corpus
{
_corpus_docid
,
_corpus_title
})
=
do
debugLog
$
show
_corpus_title
debugLog
opts
$
show
_corpus_title
pure
c
debugLog
::
HalCrawlerOptions
->
String
->
IO
()
debugLog
HalCrawlerOptions
{
..
}
msg
=
when
_hco_debugLogs
$
putStrLn
msg
getMetadataWith
::
Text
-- ^ The query, as a text.
...
...
@@ -91,7 +87,7 @@ getMetadataWith :: Text
->
IO
(
Either
ClientError
(
Response
Corpus
))
getMetadataWith
q
start
rows
=
do
manager'
<-
newManager
tlsManagerSettings
runHalAPIClient
$
search
(
Just
requestedFields
)
[
q
]
Nothing
start
rows
runHalAPIClient
defaultHalOptions
$
search
(
Just
requestedFields
)
[
q
]
Nothing
start
rows
getMetadataWithC
::
Text
-- ^ The textual query
...
...
@@ -107,7 +103,7 @@ countResults :: Text -> IO (Either ClientError Int)
countResults
q
=
do
manager'
<-
newManager
tlsManagerSettings
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
$
search
(
Just
requestedFields
)
[
q
]
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
eRes
<-
runHalAPIClient
defaultHalOptions
$
search
(
Just
requestedFields
)
[
q
]
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
_numFound
<$>
eRes
requestedFields
::
Text
...
...
@@ -116,19 +112,31 @@ requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullNam
structFields
::
Text
structFields
=
"docid,label_s,parentDocid_i"
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
runHalAPIClient
cmd
=
do
requestLog
::
HalCrawlerOptions
->
Request
->
Request
requestLog
opts
rq
=
unsafePerformIO
$
do
debugLog
opts
$
"[HAL.makeClientRequestLog] "
<>
show
rq
pure
rq
{-# NOINLINE requestLog #-}
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
HalCrawlerOptions
->
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
runHalAPIClient
opts
cmd
=
do
manager'
<-
newManager
tlsManagerSettings
runClientM
cmd
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
runClientM
cmd'
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
where
cmd'
=
local
(
\
r
->
r
{
makeClientRequest
=
\
bUrl
servantRq
->
requestLog
opts
((
makeClientRequest
r
)
bUrl
servantRq
)
})
cmd
runStructureRequest
::
Maybe
Text
->
IO
(
Either
ClientError
(
Response
Struct
))
runStructureRequest
rq
=
runHalAPIClient
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runHalAPIClient
defaultHalOptions
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runSearchRequest
::
[
Text
]
->
IO
(
Either
ClientError
(
Response
Corpus
))
runSearchRequest
rq
=
runHalAPIClient
$
search
(
Just
requestedFields
)
rq
Nothing
Nothing
Nothing
runHalAPIClient
defaultHalOptions
$
search
(
Just
requestedFields
)
rq
Nothing
Nothing
Nothing
generateRequestByStructID
::
Text
->
[
Text
]
->
Text
generateRequestByStructID
rq
struct_ids
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment