Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
31af4e4b
Verified
Commit
31af4e4b
authored
Mar 18, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[hal] rewrite crawler, some small fixes to our code
parent
b71d4a5f
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
25 additions
and
26 deletions
+25
-26
update-project-dependencies
bin/update-project-dependencies
+1
-1
cabal.project
cabal.project
+1
-1
API.hs
src/Gargantext/Core/Text/Corpus/API.hs
+4
-3
Hal.hs
src/Gargantext/Core/Text/Corpus/API/Hal.hs
+13
-15
Istex.hs
src/Gargantext/Core/Text/Corpus/API/Istex.hs
+5
-5
stack.yaml
stack.yaml
+1
-1
No files found.
bin/update-project-dependencies
View file @
31af4e4b
...
...
@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash
=
"1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766"
expected_cabal_project_hash
=
"3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b"
expected_cabal_project_freeze_hash
=
"2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal
--store-dir
=
$STORE_DIR
v2-build
--dry-run
...
...
cabal.project
View file @
31af4e4b
...
...
@@ -106,7 +106,7 @@ source-repository-package
source
-
repository
-
package
type
:
git
location
:
https
://
gitlab
.
iscpif
.
fr
/
gargantext
/
crawlers
/
hal
.
git
tag
:
bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
tag
:
b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source
-
repository
-
package
type
:
git
...
...
src/Gargantext/Core/Text/Corpus/API.hs
View file @
31af4e4b
...
...
@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
,
externalAPIs
)
where
import
Conduit
import
Control.Monad.Except
import
Conduit
(
ConduitT
,
yieldMany
)
import
Data.Text
qualified
as
T
import
EPO.API.Client.Types
qualified
as
EPO
import
Gargantext.API.Admin.Orchestrator.Types
(
ExternalAPIs
(
..
),
externalAPIs
)
...
...
@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import
Gargantext.Core.Text.Corpus.API.OpenAlex
qualified
as
OpenAlex
import
Gargantext.Core.Text.Corpus.API.Pubmed
qualified
as
PUBMED
import
Gargantext.Core.Text.Corpus.Query
qualified
as
Corpus
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
get
)
import
PUBMED.Types
qualified
as
PUBMED
import
Servant.Client
(
ClientError
)
...
...
@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first
ExternalAPIError
<$>
EPO
.
get
epoAuthKey
epoAPIUrl
q
(
toISO639
lang
)
limit
where
parse_query
=
first
(
InvalidInputQuery
q
.
T
.
pack
)
$
Corpus
.
parseQuery
q
src/Gargantext/Core/Text/Corpus/API/Hal.hs
View file @
31af4e4b
...
...
@@ -12,29 +12,27 @@ Portability : POSIX
module
Gargantext.Core.Text.Corpus.API.Hal
where
import
Conduit
import
Data.Either
import
Conduit
(
(
.|
),
ConduitT
,
mapMC
)
import
Data.LanguageCodes
qualified
as
ISO639
import
Data.Map.Strict
qualified
as
Map
import
Data.Maybe
import
Data.Text
(
pack
,
intercalate
)
import
Data.Text
(
pack
)
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Prelude
hiding
(
intercalate
)
import
HAL
qualified
as
HAL
import
HAL.Client
qualified
as
HAL
import
HAL
qualified
import
HAL.Doc.Corpus
qualified
as
HAL
import
HAL.Types
qualified
as
HAL
import
Servant.Client
(
ClientError
)
get
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
[
HyperdataDocument
]
get
la
q
ml
=
do
eDocs
<-
HAL
.
getMetadataWith
[
q
]
(
Just
0
)
(
fromIntegral
<$>
ml
)
la
either
(
panicTrace
.
pack
.
show
)
(
\
d
->
mapM
(
toDoc'
la
)
$
HAL
.
_docs
d
)
eDocs
either
(
panicTrace
.
pack
.
show
)
(
mapM
(
toDoc'
la
)
.
HAL
.
_docs
)
eDocs
getC
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
getC
la
q
ml
=
do
eRes
<-
HAL
.
getMetadataWithC
[
q
]
(
Just
0
)
(
fromIntegral
<$>
ml
)
la
eRes
<-
HAL
.
getMetadataWithCursorC
q
(
fromIntegral
<$>
ml
)
la
pure
$
(
\
(
len
,
docsC
)
->
(
len
,
docsC
.|
mapMC
(
toDoc'
la
)))
<$>
eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
...
...
@@ -43,21 +41,21 @@ getC la q ml = do
toDoc'
::
Maybe
ISO639
.
ISO639_1
->
HAL
.
Corpus
->
IO
HyperdataDocument
toDoc'
la
(
HAL
.
Corpus
{
..
})
=
do
-- printDebug "[toDoc corpus] h" h
let
mDateS
=
maybe
(
Just
$
pack
$
show
Defaults
.
year
)
Just
_corpus_date
let
mDateS
=
_corpus_date
<|>
Just
(
pack
$
show
Defaults
.
year
)
let
(
utctime
,
(
pub_year
,
pub_month
,
pub_day
))
=
Date
.
mDateSplit
mDateS
let
abstractDefault
=
intercalate
" "
_corpus_abstract
let
abstractDefault
=
unwords
_corpus_abstract
let
abstract
=
case
la
of
Nothing
->
abstractDefault
Just
l
->
fromMaybe
abstractDefault
(
intercalate
" "
<$>
Map
.
lookup
l
_corpus_abstract_lang_map
)
Just
l
->
maybe
abstractDefault
unwords
(
Map
.
lookup
l
_corpus_abstract_lang_map
)
pure
HyperdataDocument
{
_hd_bdd
=
Just
"Hal"
,
_hd_doi
=
Just
$
pack
$
show
_corpus_docid
,
_hd_url
=
Nothing
,
_hd_uniqId
=
Nothing
,
_hd_uniqIdBdd
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
Just
$
intercalate
" "
_corpus_title
,
_hd_authors
=
Just
$
foldl
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
_corpus_authors_names
,
_hd_institutes
=
Just
$
foldl
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
$
_corpus_authors_affiliations
<>
map
show
_corpus_struct_id
,
_hd_title
=
Just
$
unwords
_corpus_title
,
_hd_authors
=
Just
$
foldl'
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
_corpus_authors_names
,
_hd_institutes
=
Just
$
foldl'
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
$
_corpus_authors_affiliations
<>
map
show
_corpus_struct_id
,
_hd_source
=
Just
$
maybe
"Nothing"
identity
_corpus_source
,
_hd_abstract
=
Just
abstract
,
_hd_publication_date
=
fmap
show
utctime
...
...
src/Gargantext/Core/Text/Corpus/API/Istex.hs
View file @
31af4e4b
...
...
@@ -20,9 +20,9 @@ import Data.List qualified as List
import
Data.Text
qualified
as
Text
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
(
toDoc
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
get
)
import
ISTEX
qualified
as
ISTEX
import
ISTEX
qualified
import
ISTEX.Client
qualified
as
ISTEX
type
Query
=
Text
...
...
@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let
query
=
case
(
List
.
length
$
Text
.
splitOn
":"
query'
)
==
1
of
let
query
=
if
List
.
length
(
Text
.
splitOn
":"
query'
)
==
1
then
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True
->
(
"language:"
<>
toISTEXLanguageCode
la
)
<>
" AND abstract:"
<>
query'
(
"language:"
<>
toISTEXLanguageCode
la
)
<>
" AND abstract:"
<>
query'
False
->
query'
else
query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
...
...
stack.yaml
View file @
31af4e4b
...
...
@@ -135,7 +135,7 @@
git
:
"https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs
:
-
.
-
commit
:
bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
-
commit
:
b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git
:
"https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs
:
-
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment