Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
hal
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
hal
Commits
8782d81e
Verified
Commit
8782d81e
authored
Jul 26, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[lang] fixes to requestedFields, Main updated, query is an array now
parent
240a13e7
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
48 additions
and
35 deletions
+48
-35
Main.hs
app/Main.hs
+14
-7
cabal.project
cabal.project
+1
-1
crawlerHAL.cabal
crawlerHAL.cabal
+9
-9
HAL.hs
src/HAL.hs
+8
-7
Corpus.hs
src/HAL/Doc/Corpus.hs
+12
-9
Utils.hs
src/HAL/Utils.hs
+4
-2
No files found.
app/Main.hs
View file @
8782d81e
...
...
@@ -21,10 +21,10 @@ import Tree
data
CountParams
=
CountParams
{
cp_query
::
T
.
Text
}
{
cp_query
::
[
T
.
Text
]
}
data
FetchParams
=
FetchParams
{
fp_query
::
T
.
Text
}
{
fp_query
::
[
T
.
Text
]
}
data
Command
=
Count
CountParams
...
...
@@ -33,12 +33,12 @@ data Command =
countParams
::
Parser
Command
countParams
=
Count
<$>
(
CountParams
<$>
strArgument
(
metavar
"query"
))
<$>
many
(
strArgument
(
metavar
"query"
)
))
fetchParams
::
Parser
Command
fetchParams
=
Fetch
<$>
(
FetchParams
<$>
strArgument
(
metavar
"query"
))
<$>
many
(
strArgument
(
metavar
"query"
)
))
params
::
Parser
Command
params
=
subparser
...
...
@@ -60,20 +60,27 @@ main = run =<< execParser opts
run
::
Command
->
IO
()
run
(
Count
(
CountParams
{
cp_query
}))
=
do
res
<-
getMetadataWithC
(
cp_query
)
(
Just
0
)
Nothing
Nothing
res
<-
getMetadataWithC
cp_query
(
Just
0
)
Nothing
Nothing
case
res
of
Left
err
->
putText
$
show
err
Right
(
cnt
,
_docsC
)
->
putText
$
show
cnt
run
(
Fetch
(
FetchParams
{
fp_query
}))
=
do
res
<-
getMetadataWithC
(
fp_query
)
(
Just
0
)
Nothing
Nothing
res
<-
getMetadataWithC
fp_query
(
Just
0
)
Nothing
Nothing
case
res
of
Left
err
->
putText
$
show
err
Right
(
_cnt
,
docsC
)
->
do
_
<-
runConduit
$
docsC
.|
mapM_C
(
\
(
Corpus
{
..
})
->
putText
$
"docid: "
<>
show
_corpus_docid
)
.|
mapM_C
printCorpus
.|
sinkList
pure
()
where
printCorpus
Corpus
{
..
}
=
do
putText
$
"docid: "
<>
_corpus_docid
<>
" ["
<>
(
T
.
intercalate
" "
_corpus_title
)
<>
"]"
putText
$
" "
<>
(
T
.
intercalate
" "
_corpus_abstract
)
putText
$
" "
<>
show
_corpus_abstract_lang_map
putText
$
" "
<>
show
_corpus_original
putText
"------------"
-- data
...
...
cabal.project
View file @
8782d81e
-- Generated by stack2cabal
with-compiler: ghc-
9.2.8
with-compiler: ghc-
8.10.7
packages:
./
...
...
crawlerHAL.cabal
View file @
8782d81e
...
...
@@ -48,7 +48,7 @@ library
RecordWildCards
TypeOperators
build-depends:
aeson >=
2.1.0 && < 2.3
aeson >=
1.5.6.0 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
...
...
@@ -65,10 +65,10 @@ library
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >=
2.0.2
&& < 2.1
, text >=
1.2.3.0
&& < 2.1
, text-format >= 0.3.2.1 && < 0.4
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.1
3
.0 && < 0.14
, vector >= 0.1
2
.0 && < 0.14
default-language: Haskell2010
executable crawlerHAL-exe
...
...
@@ -88,7 +88,7 @@ executable crawlerHAL-exe
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >=
2.1.0 && < 2.3
aeson >=
1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
...
...
@@ -106,9 +106,9 @@ executable crawlerHAL-exe
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >=
2.0.2
&& < 2.1
, text >=
1.2.3.0
&& < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.1
3
.0 && < 0.14
, vector >= 0.1
2
.0 && < 0.14
default-language: Haskell2010
test-suite halCrawler-test
...
...
@@ -129,7 +129,7 @@ test-suite halCrawler-test
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >=
2.1.0 && < 2.3
aeson >=
1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
...
...
@@ -147,7 +147,7 @@ test-suite halCrawler-test
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >=
2.0.2
&& < 2.1
, text >=
1.2.3.0
&& < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.1
3
.0 && < 0.14
, vector >= 0.1
2
.0 && < 0.14
default-language: Haskell2010
src/HAL.hs
View file @
8782d81e
...
...
@@ -30,14 +30,14 @@ getMetadataWith :: Query
getMetadataWith
q
start_
limit
lang
=
do
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
[
q
]
Nothing
start_
limit
getMetadataWithC
::
Query
getMetadataWithC
::
[
Query
]
->
Maybe
Start
->
Maybe
Limit
->
Maybe
ISO639_1
->
IO
(
Either
ClientError
(
Maybe
Count
,
ConduitT
()
Corpus
IO
()
))
getMetadataWithC
q
start_
limit
lang
=
do
getMetadataWithC
q
s
start_
limit
lang
=
do
-- First, estimate the total number of documents
eCount
<-
countResults
q
eCount
<-
countResults
q
s
pure
$
get'
<$>
eCount
where
get'
::
Count
...
...
@@ -55,8 +55,9 @@ getMetadataWithC q start_ limit lang = do
getPage
::
Start
->
Int
->
IO
[
Corpus
]
getPage
start'
pageNum
=
do
-- putText $ "requestedFields: " <> (show $ requestedFields lang)
let
offset
=
start'
+
pageNum
*
batchSize
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
[
q
]
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
batchSize
)
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
lang
)
qs
Nothing
(
Just
offset
)
(
Just
$
fromIntegral
batchSize
)
pure
$
case
eRes
of
Left
_
->
[]
Right
(
Response
{
_docs
})
->
_docs
...
...
@@ -65,10 +66,10 @@ getMetadataWithC q start_ limit lang = do
-- putText $ show _corpus_title
-- pure c
countResults
::
Query
->
IO
(
Either
ClientError
Count
)
countResults
q
=
do
countResults
::
[
Query
]
->
IO
(
Either
ClientError
Count
)
countResults
q
s
=
do
-- First, estimate the total number of documents
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
Nothing
)
[
q
]
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
eRes
<-
runHalAPIClient
$
search
(
Just
$
requestedFields
Nothing
)
qs
Nothing
(
Just
0
)
(
Just
1
)
::
IO
(
Either
ClientError
(
Response
Corpus
))
pure
$
_numFound
<$>
eRes
requestedFields
::
Maybe
ISO639_1
->
Text
...
...
src/HAL/Doc/Corpus.hs
View file @
8782d81e
...
...
@@ -23,22 +23,23 @@ data Corpus = Corpus
,
_corpus_authors_names
::
[
Text
]
,
_corpus_authors_affiliations
::
[
Text
]
,
_corpus_struct_id
::
[
Int
]
,
_corpus_original
::
Object
}
deriving
(
Show
,
Generic
)
L
.
makeLenses
''
C
orpus
instance
Default
Corpus
where
def
=
Corpus
"default Id"
def
def
def
def
def
def
def
def
def
=
Corpus
"default Id"
def
def
def
def
def
def
def
def
mempty
instance
FromJSON
Corpus
where
parseJSON
=
withObject
"Corpus"
$
\
o
->
do
_corpus_docid
<-
(
o
.:
"docid"
)
_corpus_title
<-
(
o
.:
"title_s"
<|>
return
[]
)
_corpus_abstract
<-
(
o
.:
"en_abstract_s"
<|>
return
[]
)
_corpus_date
<-
(
o
.:?
"submittedDate_s"
)
_corpus_source
<-
(
o
.:?
"source_s"
)
_corpus_authors_names
<-
(
o
.:
"authFullName_s"
<|>
return
[]
)
_corpus_authors_affiliations
<-
(
o
.:
"authOrganism_s"
<|>
return
[]
)
_corpus_struct_id
<-
(
o
.:
"structId_i"
<|>
return
[]
)
_corpus_docid
<-
o
.:
"docid"
_corpus_title
<-
o
.:
"title_s"
<|>
return
[]
_corpus_abstract
<-
o
.:
"en_abstract_s"
<|>
return
[]
_corpus_date
<-
o
.:?
"submittedDate_s"
_corpus_source
<-
o
.:?
"source_s"
_corpus_authors_names
<-
o
.:
"authFullName_s"
<|>
return
[]
_corpus_authors_affiliations
<-
o
.:
"authOrganism_s"
<|>
return
[]
_corpus_struct_id
<-
o
.:
"structId_i"
<|>
return
[]
abstracts
<-
mapM
(
\
lang
->
do
...
...
@@ -46,6 +47,8 @@ instance FromJSON Corpus where
pure
$
(
\
a
->
(
lang
,
a
))
<$>
ma
)
allLangs
let
_corpus_abstract_lang_map
=
Map
.
fromList
$
catMaybes
abstracts
let
_corpus_original
=
o
pure
$
Corpus
{
..
}
instance
ToHttpApiData
Corpus
where
...
...
src/HAL/Utils.hs
View file @
8782d81e
module
HAL.Utils
where
import
Data.LanguageCodes
(
ISO639_1
(
..
),
language
)
import
Data.LanguageCodes
(
ISO639_1
(
..
),
toChars
)
import
Data.Text
qualified
as
T
import
Protolude
...
...
@@ -8,4 +8,6 @@ allLangs :: [ISO639_1]
allLangs
=
enumFrom
(
toEnum
0
)
::
[
ISO639_1
]
langAbstractS
::
ISO639_1
->
Text
langAbstractS
lang
=
(
T
.
pack
$
language
lang
)
<>
"_abstract_s"
langAbstractS
lang
=
(
T
.
pack
[
l1
,
l2
])
<>
"_abstract_s"
where
(
l1
,
l2
)
=
toChars
lang
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment