Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
hal
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
hal
Commits
b99b9e56
Verified
Commit
b99b9e56
authored
Mar 18, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Use fq for lang filter
Also, some refactoring for app (add debug etc).
parent
5edaca5c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
131 additions
and
95 deletions
+131
-95
Main.hs
app/Main.hs
+30
-19
HAL.hs
src/HAL.hs
+101
-76
No files found.
app/Main.hs
View file @
b99b9e56
...
...
@@ -5,21 +5,24 @@ module Main where
import
Conduit
(
sinkList
,
mapM_C
,
(
.|
),
runConduit
)
import
Data.LanguageCodes
(
ISO639_1
(
..
))
import
Data.Text
qualified
as
T
import
HAL
(
getMetadataWith
,
getMetadataWithC
,
getMetadataWithCursorC
)
import
HAL
(
getMetadataWith
CursorOptsC
,
countResultsOpts'
,
HalCrawlerOptions
(
..
),
defaultHalOptions
)
import
HAL.Doc
import
HAL.Doc.Corpus
(
Corpus
(
..
))
import
Network.HTTP.Client
(
newManager
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Options.Applicative
import
Prelude
qualified
import
Protolude
data
CountParams
=
CountParams
{
cp_query
::
[
T
.
Text
]
}
{
cp_query
::
T
.
Text
,
cp_lang
::
Maybe
ISO639_1
}
data
FetchParams
=
FetchParams
{
fp_query
::
[
T
.
Text
]
,
fp_limit
::
Integer
}
{
fp_query
::
T
.
Text
,
fp_limit
::
Integer
,
fp_lang
::
Maybe
ISO639_1
}
data
Command
=
Count
CountParams
...
...
@@ -28,14 +31,19 @@ data Command =
countParams
::
Parser
Command
countParams
=
Count
<$>
(
CountParams
<$>
many
(
strArgument
(
metavar
"query"
)))
<$>
strArgument
(
metavar
"query"
)
<*>
optional
(
option
(
maybeReader
readLang
)
(
long
"lang"
)))
fetchParams
::
Parser
Command
fetchParams
=
Fetch
<$>
(
FetchParams
<$>
many
(
strArgument
(
metavar
"query"
))
<*>
option
auto
(
long
"limit"
))
<$>
strArgument
(
metavar
"query"
)
<*>
option
auto
(
long
"limit"
)
<*>
optional
(
option
(
maybeReader
readLang
)
(
long
"lang"
)))
readLang
::
Prelude
.
String
->
Maybe
ISO639_1
readLang
=
readMaybe
params
::
Parser
Command
params
=
subparser
(
command
"count"
(
info
countParams
(
progDesc
"Count number of docs for a given query"
))
...
...
@@ -55,13 +63,15 @@ main = run =<< execParser opts
-- (Right val) -> print $ _docs val
run
::
Command
->
IO
()
run
(
Count
(
CountParams
{
cp_query
}))
=
do
res
<-
getMetadataWithC
cp_query
(
Just
0
)
Nothing
Nothi
ng
run
(
Count
(
CountParams
{
cp_query
,
cp_lang
}))
=
do
res
<-
countResultsOpts'
opts
cp_query
cp_la
ng
case
res
of
Left
err
->
putText
$
show
err
Right
(
cnt
,
_docsC
)
->
putText
$
show
cnt
run
(
Fetch
(
FetchParams
{
fp_query
,
fp_limit
}))
=
do
res
<-
getMetadataWithCursorC
fp_query
(
Just
fp_limit
)
Nothing
Right
cnt
->
putText
$
show
cnt
where
opts
=
defaultHalOptions
{
_hco_debugLogs
=
True
}
run
(
Fetch
(
FetchParams
{
fp_query
,
fp_limit
,
fp_lang
}))
=
do
res
<-
getMetadataWithCursorOptsC
opts
fp_query
(
Just
fp_limit
)
fp_lang
case
res
of
Left
err
->
putText
$
show
err
Right
(
_cnt
,
docsC
)
->
do
...
...
@@ -70,13 +80,14 @@ run (Fetch (FetchParams { fp_query, fp_limit })) = do
.|
mapM_C
printCorpus
.|
sinkList
pure
()
where
printCorpus
Corpus
{
..
}
=
do
putText
$
"docid: "
<>
_corpus_docid
<>
" ["
<>
(
T
.
intercalate
" "
_corpus_title
)
<>
"]"
putText
$
" "
<>
(
T
.
intercalate
" "
_corpus_abstract
)
putText
$
" "
<>
show
_corpus_abstract_lang_map
putText
$
" "
<>
show
_corpus_original
putText
"------------"
where
opts
=
defaultHalOptions
{
_hco_debugLogs
=
True
}
printCorpus
Corpus
{
..
}
=
do
putText
$
"docid: "
<>
_corpus_docid
<>
" ["
<>
(
T
.
intercalate
" "
_corpus_title
)
<>
"]"
putText
$
" "
<>
(
T
.
intercalate
" "
_corpus_abstract
)
putText
$
" "
<>
show
_corpus_abstract_lang_map
putText
$
" "
<>
show
_corpus_original
putText
"------------"
-- data
...
...
src/HAL.hs
View file @
b99b9e56
...
...
@@ -36,9 +36,95 @@ type Start = Int
type
Limit
=
Integer
type
Count
=
Integer
queryWithLang
::
Maybe
ISO639_1
->
[
Query
]
->
[
Query
]
queryWithLang
Nothing
qs
=
qs
queryWithLang
(
Just
lang
)
qs
=
qs
<>
[
"language_s:"
<>
toText
lang
]
queryWithLang
::
Maybe
ISO639_1
->
[
Query
]
queryWithLang
Nothing
=
[]
queryWithLang
(
Just
lang
)
=
[
"language_s:"
<>
toText
lang
]
getMetadataWithCursorC
::
Query
-- ^ The textual query
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithCursorC
=
getMetadataWithCursorOptsC
defaultHalOptions
-- | Fetch metadata using cursors
-- https://api.archives-ouvertes.fr/docs/search#cursors
getMetadataWithCursorOptsC
::
HalCrawlerOptions
-- ^ The options for the crawler
->
Query
-- ^ The textual query
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithCursorOptsC
opts
@
HalCrawlerOptions
{
..
}
q
mb_limit
lang
=
do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
-- - get next cursor from the results
-- - feed the cursor to get next page
-- - when previous and current cursors are equal, there are no more results
-- First, estimate the total number of documents
eCount
<-
countResultsOpts'
opts
q
lang
pure
$
get'
<$>
eCount
where
sort_
=
Just
$
Asc
"docid"
fq
=
queryWithLang
lang
get'
::
Count
->
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
)
get'
numFound'
=
(
Just
numResults
,
producer
"*"
-- | we need takeC again, because getPage could give too many results
.|
takeC
(
fromIntegral
numResults
)
)
where
limit
=
min
numFound'
$
fromMaybe
numFound'
mb_limit
numResults
=
limit
producer
::
Text
->
ConduitT
()
Corpus
IO
()
producer
cursor
=
do
let
endpoint
=
searchCursor
(
Just
q
)
(
Just
$
requestedFields
lang
)
fq
sort_
(
Just
$
fromIntegral
_hco_batchSize
)
(
Just
cursor
)
liftIO
$
debugLog
opts
$
"[getMetadataWithCursorLangC] producer: "
<>
show
cursor
eRes
<-
liftIO
$
runHalAPIClient
opts
endpoint
case
eRes
of
Left
err
->
fail
$
"error: "
<>
show
err
Right
(
Response
{
_docs
,
_nextCursorMark
})
->
do
yieldMany
_docs
case
_nextCursorMark
of
Nothing
->
fail
"Expected next cursor mark, but got nothing"
Just
nextCursor
->
do
if
cursor
==
nextCursor
then
pure
()
else
do
producer
nextCursor
countResults'
::
Query
->
Maybe
ISO639_1
->
IO
(
Either
ClientError
Count
)
countResults'
q
lang
=
do
countResultsOpts'
defaultHalOptions
q
lang
countResultsOpts'
::
HalCrawlerOptions
->
Query
->
Maybe
ISO639_1
->
IO
(
Either
ClientError
Count
)
countResultsOpts'
opts
q
lang
=
do
-- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
opts
$
search
(
Just
q
)
(
Just
$
requestedFields
Nothing
)
fq
Nothing
(
Just
0
)
(
Just
0
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
(
fromIntegral
.
_numFound
<$>
eRes
)
where
fq
=
queryWithLang
lang
--- SOME OTHER FETCH FUNCTIONS
getMetadataWith
::
[
Query
]
-- ^ The textual query
...
...
@@ -53,7 +139,7 @@ getMetadataWith :: [Query]
getMetadataWith
qs
start_
limit
lang
=
do
runHalAPIClient
defaultHalOptions
$
search
(
Just
q
)
(
Just
$
requestedFields
lang
)
[]
Nothing
start_
(
fromIntegral
<$>
limit
)
where
q
=
joinQueries
$
q
ueryWithLang
lang
qs
q
=
joinQueries
$
q
s
<>
queryWithLang
lang
-- | Fetch results, returning a Conduit stream.
-- NOTE: Prefer fetching with `getMetadataWithCursorC` instead of this function.
...
...
@@ -67,9 +153,9 @@ getMetadataWithC :: [Query]
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWith
LangC
defaultHalOptions
(
queryWithLang
lang
qs
)
start_
limit
lang
getMetadataWithC
qs
start_
limit
lang
=
getMetadataWith
OptsC
defaultHalOptions
(
qs
<>
queryWithLang
lang
)
start_
limit
lang
getMetadataWith
Lang
C
::
HalCrawlerOptions
getMetadataWith
Opts
C
::
HalCrawlerOptions
-- ^ The options for the crawler
->
[
Query
]
-- ^ The textual query
...
...
@@ -81,7 +167,7 @@ getMetadataWithLangC :: HalCrawlerOptions
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWith
Lang
C
opts
@
HalCrawlerOptions
{
..
}
qs
mb_offset
mb_limit
lang
=
do
getMetadataWith
Opts
C
opts
@
HalCrawlerOptions
{
..
}
qs
mb_offset
mb_limit
lang
=
do
-- First, estimate the total number of documents
eCount
<-
countResults
qs
pure
$
get'
<$>
eCount
...
...
@@ -113,72 +199,7 @@ getMetadataWithLangC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
where
q
=
joinQueries
qs
getMetadataWithCursorC
::
[
Query
]
-- ^ The textual query
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithCursorC
qs
limit
lang
=
getMetadataWithCursorLangC
defaultHalOptions
(
queryWithLang
lang
qs
)
limit
lang
-- | Fetch metadata using cursors
-- https://api.archives-ouvertes.fr/docs/search#cursors
getMetadataWithCursorLangC
::
HalCrawlerOptions
-- ^ The options for the crawler
->
[
Query
]
-- ^ The textual query
->
Maybe
Limit
-- ^ An optional limit for the search, it influences the number of
-- rows returned.
->
Maybe
ISO639_1
-- ^ An optional language for the search.
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithCursorLangC
opts
@
HalCrawlerOptions
{
..
}
qs
mb_limit
lang
=
do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
-- - get next cursor from the results
-- - feed the cursor to get next page
-- - when previous and current cursors are equal, there are no more results
-- First, estimate the total number of documents
eCount
<-
countResults
qs
pure
$
get'
<$>
eCount
where
q
=
joinQueries
qs
sort_
=
Just
$
Asc
"docid"
get'
::
Count
->
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
)
get'
numFound'
=
(
Just
numResults
,
producer
"*"
-- | we need takeC again, because getPage could give too many results
.|
takeC
(
fromIntegral
numResults
)
)
where
limit
=
min
numFound'
$
fromMaybe
numFound'
mb_limit
numResults
=
limit
producer
::
Text
->
ConduitT
()
Corpus
IO
()
producer
cursor
=
do
let
endpoint
=
searchCursor
(
Just
q
)
(
Just
$
requestedFields
lang
)
[]
sort_
(
Just
$
fromIntegral
_hco_batchSize
)
(
Just
cursor
)
liftIO
$
debugLog
opts
$
"[getMetadataWithCursorLangC] producer: "
<>
show
cursor
eRes
<-
liftIO
$
runHalAPIClient
opts
endpoint
case
eRes
of
Left
err
->
fail
$
"error: "
<>
show
err
Right
(
Response
{
_docs
,
_nextCursorMark
})
->
do
yieldMany
_docs
case
_nextCursorMark
of
Nothing
->
fail
"Expected next cursor mark, but got nothing"
Just
nextCursor
->
do
if
cursor
==
nextCursor
then
pure
()
else
do
producer
nextCursor
debugLog
::
HalCrawlerOptions
->
Text
->
IO
()
debugLog
HalCrawlerOptions
{
..
}
msg
=
when
_hco_debugLogs
$
putStrLn
msg
...
...
@@ -188,10 +209,14 @@ countResults qs = do
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
defaultHalOptions
$
search
(
Just
q
)
(
Just
$
requestedFields
Nothing
)
[]
Nothing
(
Just
0
)
(
Just
0
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
fromIntegral
<$>
_numFound
<$>
eRes
pure
(
fromIntegral
.
_numFound
<$>
eRes
)
where
q
=
joinQueries
qs
requestedFields
::
Maybe
ISO639_1
->
Text
requestedFields
(
Just
EN
)
=
T
.
intercalate
","
baseFields
requestedFields
(
Just
lang
)
=
T
.
intercalate
","
$
baseFields
<>
[
langAbstractS
lang
]
...
...
@@ -225,7 +250,7 @@ runHalAPIClient opts cmd = do
runClientM
cmd'
(
mkClientEnv
manager'
$
BaseUrl
Https
"api.archives-ouvertes.fr"
443
""
)
where
cmd'
=
local
(
\
r
->
r
{
makeClientRequest
=
\
bUrl
servantRq
->
requestLog
opts
(
(
makeClientRequest
r
)
bUrl
servantRq
)
makeClientRequest
=
\
bUrl
servantRq
->
requestLog
opts
(
makeClientRequest
r
bUrl
servantRq
)
})
cmd
runStructureRequest
::
Maybe
Text
->
IO
(
Either
ClientError
(
Response
Struct
))
...
...
@@ -244,12 +269,12 @@ generateRequestByStructID rq struct_ids =
rq
<>
" AND "
<>
"structId_i:("
<>
(
flattenPipe
struct_ids
)
<>
flattenPipe
struct_ids
<>
")"
flattenPipe
::
[
Text
]
->
Text
flattenPipe
[]
=
""
flattenPipe
(
x
:
[]
)
=
x
flattenPipe
[
x
]
=
x
flattenPipe
(
x
:
xs
)
=
x
<>
" || "
<>
flattenPipe
xs
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment