Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
hal
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
hal
Commits
25a1e955
Verified
Commit
25a1e955
authored
Jul 26, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' into refactoring
Tests added as well
parents
f7b928da
06eedee4
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
109 additions
and
67 deletions
+109
-67
cabal.project
cabal.project
+1
-0
crawlerHAL.cabal
crawlerHAL.cabal
+10
-28
HAL.hs
src/HAL.hs
+77
-34
Client.hs
src/HAL/Client.hs
+4
-3
Main.hs
test/Main.hs
+17
-0
Spec.hs
test/Spec.hs
+0
-2
No files found.
cabal.project
View file @
25a1e955
...
@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7
...
@@ -5,6 +5,7 @@ with-compiler: ghc-8.10.7
packages:
packages:
./
./
tests: True
-- allow-older: *
-- allow-older: *
allow-newer: base:*
allow-newer: base:*
crawlerHAL.cabal
View file @
25a1e955
cabal-version: 1.12
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.
0
.
-- This file has been generated from package.yaml by hpack version 0.35.
2
.
--
--
-- see: https://github.com/sol/hpack
-- see: https://github.com/sol/hpack
--
--
-- hash:
d5f6b9788433a3873efd55684dfd55f4d85380f24209a6a8eb8ba80c82a330f8
-- hash:
8e1f14ffe78d88b3a3011e489d4b883c7c81039249d2334d78d7dbcc155816f6
name: crawlerHAL
name: crawlerHAL
version: 0.1.0.0
version: 0.1.0.0
...
@@ -58,6 +58,7 @@ library
...
@@ -58,6 +58,7 @@ library
, http-client-tls >= 0.3.6.1 && < 0.4
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, lens >= 5.1.1 && < 5.3
, mtl >= 2.2.2 && < 2.3
, optparse-applicative >= 0.17 && < 0.19
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
...
@@ -87,8 +88,7 @@ executable crawlerHAL-exe
...
@@ -87,8 +88,7 @@ executable crawlerHAL-exe
TypeOperators
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
build-depends:
aeson >= 1.5.6 && < 1.6
base >=4.7 && <5
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
, containers >= 0.6.7 && < 0.7
...
@@ -97,21 +97,16 @@ executable crawlerHAL-exe
...
@@ -97,21 +97,16 @@ executable crawlerHAL-exe
, http-client >= 0.7.13.1 && < 0.8
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
default-language: Haskell2010
test-suite halCrawler-test
test-suite halCrawler-test
type: exitcode-stdio-1.0
type: exitcode-stdio-1.0
main-is:
Spec
.hs
main-is:
Main
.hs
other-modules:
other-modules:
Paths_crawlerHAL
Paths_crawlerHAL
hs-source-dirs:
hs-source-dirs:
...
@@ -127,24 +122,11 @@ test-suite halCrawler-test
...
@@ -127,24 +122,11 @@ test-suite halCrawler-test
TypeOperators
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
build-depends:
aeson >= 1.5.6 && < 1.6
base >=4.7 && <5
, base >=4.7 && <5
, crawlerHAL
, bytestring >= 0.11.0 && < 0.13
, HUnit >= 1.6.2.0 && < 1.7
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.7 && < 0.7
, data-default >= 0.7.1.1 && < 0.8
, halCrawler
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.1 && < 0.4
, iso639 >= 0.1.0.3 && < 0.2
, iso639 >= 0.1.0.3 && < 0.2
, lens >= 5.1.1 && < 5.3
, optparse-applicative >= 0.17 && < 0.19
, protolude >= 0.3.3 && < 0.4
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7.0 && < 0.4
, tasty >= 1.4.3 && < 1.5
, servant >= 0.19 && < 0.21
, tasty-hunit >= 0.10.0.3 && < 0.11
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
default-language: Haskell2010
src/HAL.hs
View file @
25a1e955
module
HAL
where
module
HAL
where
{-# LANGUAGE BangPatterns #-}
import
Conduit
import
Conduit
import
Control.Monad
import
Control.Monad.Reader
import
Data.Aeson
import
Data.Aeson
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
Data.Text
import
Data.Text
...
@@ -8,14 +12,25 @@ import HAL.Client
...
@@ -8,14 +12,25 @@ import HAL.Client
import
HAL.Doc.Corpus
import
HAL.Doc.Corpus
import
HAL.Doc.Struct
import
HAL.Doc.Struct
import
HAL.Utils
(
langAbstractS
,
toText
)
import
HAL.Utils
(
langAbstractS
,
toText
)
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client
(
newManager
,
Request
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Protolude
import
Protolude
import
Servant.API
import
Servant.API
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
)
import
Servant.Client
(
BaseUrl
(
..
),
Scheme
(
..
),
ClientM
,
ClientError
,
runClientM
,
mkClientEnv
,
ClientEnv
(
makeClientRequest
))
import
System.IO.Unsafe
(
unsafePerformIO
)
batchSize
::
Int
batchSize
=
1000
data
HalCrawlerOptions
=
HalCrawlerOptions
{
-- | If 'True', enable the debug logs to stdout.
_hco_debugLogs
::
!
Bool
,
_hco_batchSize
::
!
Int
}
defaultHalOptions
::
HalCrawlerOptions
defaultHalOptions
=
HalCrawlerOptions
{
_hco_debugLogs
=
False
,
_hco_batchSize
=
1000
}
type
Query
=
Text
type
Query
=
Text
type
Start
=
Int
type
Start
=
Int
...
@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs
...
@@ -27,61 +42,77 @@ queryWithLang Nothing qs = qs
queryWithLang
(
Just
lang
)
qs
=
qs
<>
[
"language_s:"
<>
toText
lang
]
queryWithLang
(
Just
lang
)
qs
=
qs
<>
[
"language_s:"
<>
toText
lang
]
getMetadataWith
::
[
Query
]
getMetadataWith
::
[
Query
]
-- ^ The textual query
->
Maybe
Start
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Response
Corpus
))
->
IO
(
Either
ClientError
(
Response
Corpus
))
getMetadataWith
qs
start_
limit
lang
=
do
getMetadataWith
qs
start_
limit
lang
=
do
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
(
queryWithLang
lang
qs
)
Nothing
start_
limit
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
lang
)
(
queryWithLang
lang
qs
)
Nothing
start_
(
fromIntegral
<$>
limit
)
getMetadataWithC
::
[
Query
]
getMetadataWithC
::
[
Query
]
-- ^ The textual query
->
Maybe
Start
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWithLangC
(
queryWithLang
lang
qs
)
start_
limit
lang
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWithLangC
defaultHalOptions
(
queryWithLang
lang
qs
)
start_
limit
lang
getMetadataWithLangC
::
[
Query
]
getMetadataWithLangC
::
HalCrawlerOptions
-- ^ The options for the crawler
->
[
Query
]
-- ^ The textual query
->
Maybe
Start
->
Maybe
Start
-- ^ An optional offset
->
Maybe
Limit
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithLangC
qs
start_
limit
lang
=
do
getMetadataWithLangC
opts
@
HalCrawlerOptions
{
..
}
qs
mb_offset
mb_
limit
lang
=
do
-- First, estimate the total number of documents
-- First, estimate the total number of documents
eCount
<-
countResults
qs
eCount
<-
countResults
qs
pure
$
get'
<$>
eCount
pure
$
get'
<$>
eCount
where
where
get'
::
Count
get'
::
Count
->
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
)
->
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
)
get'
numFound
_
=
get'
numFound
'
=
(
Just
numResults
(
Just
numResults
,
yieldMany
[
0
..
]
,
yieldMany
[
0
..
]
.|
takeC
(
fromInteg
er
numPages
)
.|
takeC
(
fromInteg
ral
numPages
)
.|
concatMapMC
(
getPage
start'
))
.|
concatMapMC
(
getPage
offset
))
where
where
start'
=
fromMaybe
0
start_
offset
=
fromMaybe
0
mb_offset
rows'
=
min
numFound_
$
fromMaybe
numFound_
limit
limit
=
min
numFound'
$
fromMaybe
numFound'
mb_
limit
numResults
=
rows'
-
(
fromIntegral
start'
)
numResults
=
limit
-
fromIntegral
offset
numPages
=
numResults
`
div
`
(
fromIntegral
batchSize
)
+
1
numPages
=
numResults
`
div
`
fromIntegral
_hco_batchSize
+
1
getPage
::
Start
->
Int
->
IO
[
Corpus
]
getPage
::
Start
->
Int
->
IO
[
Corpus
]
getPage
start'
pageNum
=
do
getPage
start'
pageNum
=
do
-- putText $ "requestedFields: " <> (show $ requestedFields lang)
let
offset
=
start'
+
pageNum
*
_hco_batchSize
let
offset
=
start'
+
pageNum
*
batchSize
debugLog
opts
$
"[getMetadataWithC] getPage: "
<>
show
offset
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
qs
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
batchSize
)
eRes
<-
runHalAPIClient
opts
$
search
(
Just
$
requestedFields
lang
)
qs
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
_hco_
batchSize
)
pure
$
case
eRes
of
pure
$
case
eRes
of
Left
_
->
[]
Left
_
->
[]
Right
(
Response
{
_docs
})
->
_docs
Right
Response
{
_docs
}
->
_docs
-- printDoc :: Corpus -> IO Corpus
-- printDoc c@(Corpus { .. }) = do
debugLog
::
HalCrawlerOptions
->
Text
->
IO
()
-- putText $ show _corpus_title
debugLog
HalCrawlerOptions
{
..
}
msg
=
when
_hco_debugLogs
$
putStrLn
msg
-- pure c
countResults
::
[
Query
]
->
IO
(
Either
ClientError
Count
)
countResults
::
[
Query
]
->
IO
(
Either
ClientError
Count
)
countResults
qs
=
do
countResults
qs
=
do
-- First, estimate the total number of documents
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
Nothing
)
qs
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
eRes
<-
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
Nothing
)
qs
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
_numFound
<$>
eRes
pure
$
fromIntegral
<$>
_numFound
<$>
eRes
requestedFields
::
Maybe
ISO639_1
->
Text
requestedFields
::
Maybe
ISO639_1
->
Text
requestedFields
(
Just
EN
)
=
"docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields
(
Just
EN
)
=
"docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
...
@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN)
...
@@ -91,19 +122,31 @@ requestedFields _ = requestedFields (Just EN)
structFields
::
Text
structFields
::
Text
structFields
=
"docid,label_s,parentDocid_i"
structFields
=
"docid,label_s,parentDocid_i"
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
requestLog
::
HalCrawlerOptions
->
Request
->
Request
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
requestLog
opts
rq
=
unsafePerformIO
$
do
runHalAPIClient
cmd
=
do
debugLog
opts
$
"[HAL.makeClientRequestLog] "
<>
show
rq
pure
rq
{-# NOINLINE requestLog #-}
runHalAPIClient
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
HalCrawlerOptions
->
ClientM
(
Response
doc
)
->
IO
(
Either
ClientError
(
Response
doc
))
runHalAPIClient
opts
cmd
=
do
manager'
<-
newManager
tlsManagerSettings
manager'
<-
newManager
tlsManagerSettings
runClientM
cmd
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
runClientM
cmd'
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
where
cmd'
=
local
(
\
r
->
r
{
makeClientRequest
=
\
bUrl
servantRq
->
requestLog
opts
((
makeClientRequest
r
)
bUrl
servantRq
)
})
cmd
runStructureRequest
::
Maybe
Text
->
IO
(
Either
ClientError
(
Response
Struct
))
runStructureRequest
::
Maybe
Text
->
IO
(
Either
ClientError
(
Response
Struct
))
runStructureRequest
rq
=
runStructureRequest
rq
=
runHalAPIClient
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runHalAPIClient
defaultHalOptions
$
structure
(
Just
structFields
)
rq
(
Just
10000
)
runSearchRequest
::
[
Text
]
->
Maybe
ISO639_1
->
IO
(
Either
ClientError
(
Response
Corpus
))
runSearchRequest
::
[
Text
]
->
IO
(
Either
ClientError
(
Response
Corpus
))
runSearchRequest
rq
lang
=
runSearchRequest
rq
=
runHalAPIClient
$
search
(
Just
$
requestedFields
la
ng
)
rq
Nothing
Nothing
Nothing
runHalAPIClient
defaultHalOptions
$
search
(
Just
$
requestedFields
Nothi
ng
)
rq
Nothing
Nothing
Nothing
generateRequestByStructID
::
Text
->
[
Text
]
->
Text
generateRequestByStructID
::
Text
->
[
Text
]
->
Text
generateRequestByStructID
rq
struct_ids
=
generateRequestByStructID
rq
struct_ids
=
...
...
src/HAL/Client.hs
View file @
25a1e955
...
@@ -11,6 +11,7 @@ import Protolude
...
@@ -11,6 +11,7 @@ import Protolude
import
Servant.API
import
Servant.API
import
Servant.Client
hiding
(
Response
)
import
Servant.Client
hiding
(
Response
)
type
HALAPI
doc
=
Search
doc
type
HALAPI
doc
=
Search
doc
:<|>
Structure
doc
:<|>
Structure
doc
...
@@ -25,7 +26,7 @@ type Search doc = "search"
...
@@ -25,7 +26,7 @@ type Search doc = "search"
-- permit to start at the x result
-- permit to start at the x result
:>
QueryParam
"start"
Int
:>
QueryParam
"start"
Int
-- use rows to make the request only return the x number of result
-- use rows to make the request only return the x number of result
:>
QueryParam
"rows"
Int
eger
:>
QueryParam
"rows"
Int
:>
Get
'[
J
SON
]
(
Response
doc
)
:>
Get
'[
J
SON
]
(
Response
doc
)
type
Structure
doc
=
"ref"
:>
"structure"
type
Structure
doc
=
"ref"
:>
"structure"
...
@@ -50,7 +51,7 @@ desc = Just . Desc
...
@@ -50,7 +51,7 @@ desc = Just . Desc
-- Response type
-- Response type
data
Response
doc
=
Response
data
Response
doc
=
Response
{
_numFound
::
Int
eger
{
_numFound
::
Int
,
_start
::
Int
,
_start
::
Int
,
_docs
::
[
doc
]
,
_docs
::
[
doc
]
}
deriving
(
Show
,
Generic
)
}
deriving
(
Show
,
Generic
)
...
@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
...
@@ -72,7 +73,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
->
[
Text
]
-- fq
->
[
Text
]
-- fq
->
Maybe
SortField
-- sort
->
Maybe
SortField
-- sort
->
Maybe
Int
-- start
->
Maybe
Int
-- start
->
Maybe
Int
eger
-- rows
->
Maybe
Int
-- rows
->
ClientM
(
Response
doc
)
->
ClientM
(
Response
doc
)
structure
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
structure
::
(
FromJSON
doc
,
ToHttpApiData
doc
)
=>
...
...
test/Main.hs
0 → 100644
View file @
25a1e955
module
Main
where
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
HAL
qualified
import
HAL.Utils
qualified
as
HAL
import
Protolude
import
Test.Tasty
import
Test.Tasty.HUnit
main
::
IO
()
main
=
defaultMain
unitTests
unitTests
=
testGroup
"unit tests"
[
testCase
"Check langAbstractS"
$
do
"pl_abstract_s"
@?=
(
HAL
.
langAbstractS
PL
)
"en_abstract_s"
@?=
(
HAL
.
langAbstractS
EN
)
"fr_abstract_s"
@?=
(
HAL
.
langAbstractS
FR
)
]
test/Spec.hs
deleted
100644 → 0
View file @
f7b928da
main
::
IO
()
main
=
putStrLn
"Test suite not yet implemented"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment