Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
hal
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
hal
Commits
25a1e955
Verified
Commit
25a1e955
authored
Jul 26, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' into refactoring
Tests added as well
parents
f7b928da
06eedee4
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
109 additions
and
67 deletions
+109
-67
cabal.project
cabal.project
+1
-0
crawlerHAL.cabal
crawlerHAL.cabal
+10
-28
HAL.hs
src/HAL.hs
+77
-34
Client.hs
src/HAL/Client.hs
+4
-3
Main.hs
test/Main.hs
+17
-0
Spec.hs
test/Spec.hs
+0
-2
No files found.
cabal.project
View file @
25a1e955
...
...
@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7
packages:
./
tests: True
-- allow-older: *
allow-newer: base:*
crawlerHAL.cabal
View file @
25a1e955
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.
0
.
-- This file has been generated from package.yaml by hpack version 0.35.
2
.
--
-- see: https://github.com/sol/hpack
--
-- hash:
d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8
-- hash:
8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL
version: 0.1.0.0
...
...
@@ -58,6 +58,7 @@ library
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, mtl >= 2.2.2 && < 2.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
...
...
@@ -87,8 +88,7 @@ executable crawlerHAL-exe
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
...
...
@@ -97,21 +97,16 @@ executable crawlerHAL-exe
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
test-suite halCrawler-test
type: exitcode-stdio-1.0
main-is:
Spec
.hs
main-is:
Main
.hs
other-modules:
Paths_crawlerHAL
hs-source-dirs:
...
...
@@ -127,24 +122,11 @@ test-suite halCrawler-test
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
, data-default >= 0.7.1.1 && < 0.8
, halCrawler
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
base >=4.7 && <5
, crawlerHAL
, HUnit >= 1.6.2.0 && < 1.7
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
, tasty >= 1.4.3 && < 1.5
, tasty-hunit >= 0.10.0.3 && < 0.11
default-language: Haskell2010
src/HAL.hs
View file @
25a1e955
module
HAL
where
{-# LANGUAGE BangPatterns #-}
import
Conduit
import
Control.Monad
import
Control.Monad.Reader
import
Data.Aeson
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
Data.Text
...
...
@@ -8,14 +12,25 @@ import HAL.Client
import
HAL.Doc.Corpus
import
HAL.Doc.Struct
import
HAL.Utils
(
langAbstractS
,
toText
)
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client
(
newManager
,
Request
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Protolude
import
Servant.API
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
)
batchSize
::
Int
batchSize
=
1000
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
,
ClientEnv
(
makeClientRequest
))
import
System.IO.Unsafe
(
unsafePerformIO
)
data
HalCrawlerOptions
=
HalCrawlerOptions
{
-- | If 'True', enable the debug logs to stdout.
_hco_debugLogs
::
!
Bool
,
_hco_batchSize
::
!
Int
}
defaultHalOptions
::
HalCrawlerOptions
defaultHalOptions
=
HalCrawlerOptions
{
_hco_debugLogs
=
False
,
_hco_batchSize
=
1000
}
type
Query
=
Text
type
Start
=
Int
...
...
@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs
queryWithLang
(
Just
lang
)
qs
=
qs
<>
[
"language_s:"
<>
toText
lang
]
getMetadataWith
::
[
Query
]
-- ^ The textual query
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Response
Corpus
))
getMetadataWith
qs
start_
limit
lang
=
do
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
(
queryWithLang
lang
qs
)
Nothing
start_
limit
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
lang
)
(
queryWithLang
lang
qs
)
Nothing
start_
(
fromIntegral
<$>
limit
)
getMetadataWithC
::
[
Query
]
-- ^ The textual query
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWithLangC
(
queryWithLang
lang
qs
)
start_
limit
lang
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWithLangC
defaultHalOptions
(
queryWithLang
lang
qs
)
start_
limit
lang
getMetadataWithLangC
::
[
Query
]
getMetadataWithLangC
::
HalCrawlerOptions
-- ^ The options for the crawler
->
[
Query
]
-- ^ The textual query
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithLangC
qs
start_
limit
lang
=
do
getMetadataWithLangC
opts
@
HalCrawlerOptions
{
..
}
qs
mb_offset
mb_
limit
lang
=
do
-- First, estimate the total number of documents
eCount
<-
countResults
qs
pure
$
get'
<$>
eCount
where
get'
::
Count
->
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
)
get'
numFound
_
=
get'
numFound
'
=
(
Just
numResults
,
yieldMany
[
0
..
]
.|
takeC
(
fromInteg
er
numPages
)
.|
concatMapMC
(
getPage
start'
))
.|
takeC
(
fromInteg
ral
numPages
)
.|
concatMapMC
(
getPage
offset
))
where
start'
=
fromMaybe
0
start_
rows'
=
min
numFound_
$
fromMaybe
numFound_
limit
numResults
=
rows'
-
(
fromIntegral
start'
)
numPages
=
numResults
`
div
`
(
fromIntegral
batchSize
)
+
1
offset
=
fromMaybe
0
mb_offset
limit
=
min
numFound'
$
fromMaybe
numFound'
mb_
limit
numResults
=
limit
-
fromIntegral
offset
numPages
=
numResults
`
div
`
fromIntegral
_hco_batchSize
+
1
getPage
::
Start
->
Int
->
IO
[
Corpus
]
getPage
start'
pageNum
=
do
-- putText $ "requestedFields: " <> (show $ requestedFields lang)
let
offset
=
start'
+
pageNum
*
batchSize
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
qs
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
batchSize
)
let
offset
=
start'
+
pageNum
*
_hco_batchSize
debugLog
opts
$
"[getMetadataWithC] getPage: "
<>
show
offset
eRes
<-
runHalAPIClient
opts
$
search
(
Just
$
requestedFields
lang
)
qs
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
_hco_
batchSize
)
pure
$
case
eRes
of
Left
_
->
[]
Right
(
Response
{
_docs
})
->
_docs
-- printDoc :: Corpus -> IO Corpus
-- printDoc c@(Corpus { .. }) = do
-- putText $ show _corpus_title
-- pure c
Right
Response
{
_docs
}
->
_docs
debugLog
::
HalCrawlerOptions
->
Text
->
IO
()
debugLog
HalCrawlerOptions
{
..
}
msg
=
when
_hco_debugLogs
$
putStrLn
msg
countResults
::
[
Query
]
->
IO
(
Either
ClientError
Count
)
countResults
qs
=
do
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
Nothing
)
qs
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
_numFound
<$>
eRes
eRes
<-
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
Nothing
)
qs
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
fromIntegral
<$>
_numFound
<$>
eRes
requestedFields
::
Maybe
ISO639_1
->
Text
requestedFields
(
Just
EN
)
=
"docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
...
...
@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN)
structFields
::
Text
structFields
=
"docid,label_s,parentDocid_i"
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
runHalAPIClient
cmd
=
do
requestLog
::
HalCrawlerOptions
->
Request
->
Request
requestLog
opts
rq
=
unsafePerformIO
$
do
debugLog
opts
$
"[HAL.makeClientRequestLog] "
<>
show
rq
pure
rq
{-# NOINLINE requestLog #-}
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
HalCrawlerOptions
->
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
runHalAPIClient
opts
cmd
=
do
manager'
<-
newManager
tlsManagerSettings
runClientM
cmd
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
runClientM
cmd'
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
where
cmd'
=
local
(
\
r
->
r
{
makeClientRequest
=
\
bUrl
servantRq
->
requestLog
opts
((
makeClientRequest
r
)
bUrl
servantRq
)
})
cmd
runStructureRequest
::
Maybe
Text
->
IO
(
Either
ClientError
(
Response
Struct
))
runStructureRequest
rq
=
runHalAPIClient
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runHalAPIClient
defaultHalOptions
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runSearchRequest
::
[
Text
]
->
Maybe
ISO639_1
->
IO
(
Either
ClientError
(
Response
Corpus
))
runSearchRequest
rq
lang
=
runHalAPIClient
$
search
(
Just
$
requestedFields
la
ng
)
rq
Nothing
Nothing
Nothing
runSearchRequest
::
[
Text
]
->
IO
(
Either
ClientError
(
Response
Corpus
))
runSearchRequest
rq
=
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
Nothi
ng
)
rq
Nothing
Nothing
Nothing
generateRequestByStructID
::
Text
->
[
Text
]
->
Text
generateRequestByStructID
rq
struct_ids
=
...
...
src/HAL/Client.hs
View file @
25a1e955
...
...
@@ -11,6 +11,7 @@ import Protolude
import
Servant.API
import
Servant.Client
hiding
(
Response
)
type
HALAPI
doc
=
Search
doc
:<|>
Structure
doc
...
...
@@ -25,7 +26,7 @@ type Search doc = "search"
-- permit to start at the x result
:>
QueryParam
"start"
Int
-- use rows to make the request only return the x number of result
:>
QueryParam
"rows"
Int
eger
:>
QueryParam
"rows"
Int
:>
Get
'[
J
SON
]
(
Response
doc
)
type
Structure
doc
=
"ref"
:>
"structure"
...
...
@@ -50,7 +51,7 @@ desc = Just . Desc
-- Response type
data
Response
doc
=
Response
{
_numFound
::
Int
eger
{
_numFound
::
Int
,
_start
::
Int
,
_docs
::
[
doc
]
}
deriving
(
Show
,
Generic
)
...
...
@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
->
[
Text
]
-- fq
->
Maybe
SortField
-- sort
->
Maybe
Int
-- start
->
Maybe
Int
eger
-- rows
->
Maybe
Int
-- rows
->
ClientM
(
Response
doc
)
structure
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
...
...
test/Main.hs
0 → 100644
View file @
25a1e955
module
Main
where
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
HAL
qualified
import
HAL.Utils
qualified
as
HAL
import
Protolude
import
Test.Tasty
import
Test.Tasty.HUnit
main
::
IO
()
main
=
defaultMain
unitTests
unitTests
=
testGroup
"unit tests"
[
testCase
"Check langAbstractS"
$
do
"pl_abstract_s"
@?=
(
HAL
.
langAbstractS
PL
)
"en_abstract_s"
@?=
(
HAL
.
langAbstractS
EN
)
"fr_abstract_s"
@?=
(
HAL
.
langAbstractS
FR
)
]
test/Spec.hs
deleted
100644 → 0
View file @
f7b928da
main
::
IO
()
main
=
putStrLn
"Test suite not yet implemented"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment