Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
155
Issues
155
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
31af4e4b
Verified
Commit
31af4e4b
authored
Mar 18, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[hal] rewrite crawler, some small fixes to our code
parent
b71d4a5f
Pipeline
#5783
canceled with stages
Changes
6
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
25 additions
and
26 deletions
+25
-26
update-project-dependencies
bin/update-project-dependencies
+1
-1
cabal.project
cabal.project
+1
-1
API.hs
src/Gargantext/Core/Text/Corpus/API.hs
+4
-3
Hal.hs
src/Gargantext/Core/Text/Corpus/API/Hal.hs
+13
-15
Istex.hs
src/Gargantext/Core/Text/Corpus/API/Istex.hs
+5
-5
stack.yaml
stack.yaml
+1
-1
No files found.
bin/update-project-dependencies
View file @
31af4e4b
...
@@ -18,7 +18,7 @@ fi
...
@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
# cache can kick in.
expected_cabal_project_hash
=
"
1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766
"
expected_cabal_project_hash
=
"
3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b
"
expected_cabal_project_freeze_hash
=
"2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
expected_cabal_project_freeze_hash
=
"2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal
--store-dir
=
$STORE_DIR
v2-build
--dry-run
cabal
--store-dir
=
$STORE_DIR
v2-build
--dry-run
...
...
cabal.project
View file @
31af4e4b
...
@@ -106,7 +106,7 @@ source-repository-package
...
@@ -106,7 +106,7 @@ source-repository-package
source
-
repository
-
package
source
-
repository
-
package
type
:
git
type
:
git
location
:
https
://
gitlab
.
iscpif
.
fr
/
gargantext
/
crawlers
/
hal
.
git
location
:
https
://
gitlab
.
iscpif
.
fr
/
gargantext
/
crawlers
/
hal
.
git
tag
:
b
fa9069b4ff70f341ca3244e8aff9e83eb4b8b73
tag
:
b
99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source
-
repository
-
package
source
-
repository
-
package
type
:
git
type
:
git
...
...
src/Gargantext/Core/Text/Corpus/API.hs
View file @
31af4e4b
...
@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
...
@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
,
externalAPIs
,
externalAPIs
)
where
)
where
import
Conduit
import
Conduit
(
ConduitT
,
yieldMany
)
import
Control.Monad.Except
import
Data.Text
qualified
as
T
import
Data.Text
qualified
as
T
import
EPO.API.Client.Types
qualified
as
EPO
import
EPO.API.Client.Types
qualified
as
EPO
import
Gargantext.API.Admin.Orchestrator.Types
(
ExternalAPIs
(
..
),
externalAPIs
)
import
Gargantext.API.Admin.Orchestrator.Types
(
ExternalAPIs
(
..
),
externalAPIs
)
...
@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
...
@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import
Gargantext.Core.Text.Corpus.API.OpenAlex
qualified
as
OpenAlex
import
Gargantext.Core.Text.Corpus.API.OpenAlex
qualified
as
OpenAlex
import
Gargantext.Core.Text.Corpus.API.Pubmed
qualified
as
PUBMED
import
Gargantext.Core.Text.Corpus.API.Pubmed
qualified
as
PUBMED
import
Gargantext.Core.Text.Corpus.Query
qualified
as
Corpus
import
Gargantext.Core.Text.Corpus.Query
qualified
as
Corpus
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
get
)
import
Gargantext.Prelude
hiding
(
get
)
import
PUBMED.Types
qualified
as
PUBMED
import
PUBMED.Types
qualified
as
PUBMED
import
Servant.Client
(
ClientError
)
import
Servant.Client
(
ClientError
)
...
@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
...
@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first
ExternalAPIError
<$>
EPO
.
get
epoAuthKey
epoAPIUrl
q
(
toISO639
lang
)
limit
first
ExternalAPIError
<$>
EPO
.
get
epoAuthKey
epoAPIUrl
q
(
toISO639
lang
)
limit
where
where
parse_query
=
first
(
InvalidInputQuery
q
.
T
.
pack
)
$
Corpus
.
parseQuery
q
parse_query
=
first
(
InvalidInputQuery
q
.
T
.
pack
)
$
Corpus
.
parseQuery
q
src/Gargantext/Core/Text/Corpus/API/Hal.hs
View file @
31af4e4b
...
@@ -12,29 +12,27 @@ Portability : POSIX
...
@@ -12,29 +12,27 @@ Portability : POSIX
module
Gargantext.Core.Text.Corpus.API.Hal
module
Gargantext.Core.Text.Corpus.API.Hal
where
where
import
Conduit
import
Conduit
(
(
.|
),
ConduitT
,
mapMC
)
import
Data.Either
import
Data.LanguageCodes
qualified
as
ISO639
import
Data.LanguageCodes
qualified
as
ISO639
import
Data.Map.Strict
qualified
as
Map
import
Data.Map.Strict
qualified
as
Map
import
Data.Maybe
import
Data.Text
(
pack
)
import
Data.Text
(
pack
,
intercalate
)
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Core.Text.Corpus.Parsers.Date
qualified
as
Date
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Defaults
qualified
as
Defaults
import
Gargantext.Prelude
hiding
(
intercalate
)
import
Gargantext.Prelude
hiding
(
intercalate
)
import
HAL
qualified
as
HAL
import
HAL
qualified
import
HAL.Client
qualified
as
HAL
import
HAL.Doc.Corpus
qualified
as
HAL
import
HAL.Doc.Corpus
qualified
as
HAL
import
HAL.Types
qualified
as
HAL
import
Servant.Client
(
ClientError
)
import
Servant.Client
(
ClientError
)
get
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
[
HyperdataDocument
]
get
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
[
HyperdataDocument
]
get
la
q
ml
=
do
get
la
q
ml
=
do
eDocs
<-
HAL
.
getMetadataWith
[
q
]
(
Just
0
)
(
fromIntegral
<$>
ml
)
la
eDocs
<-
HAL
.
getMetadataWith
[
q
]
(
Just
0
)
(
fromIntegral
<$>
ml
)
la
either
(
panicTrace
.
pack
.
show
)
(
\
d
->
mapM
(
toDoc'
la
)
$
HAL
.
_docs
d
)
eDocs
either
(
panicTrace
.
pack
.
show
)
(
mapM
(
toDoc'
la
)
.
HAL
.
_docs
)
eDocs
getC
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
getC
::
Maybe
ISO639
.
ISO639_1
->
Text
->
Maybe
Int
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
getC
la
q
ml
=
do
getC
la
q
ml
=
do
eRes
<-
HAL
.
getMetadataWithC
[
q
]
(
Just
0
)
(
fromIntegral
<$>
ml
)
la
eRes
<-
HAL
.
getMetadataWithC
ursorC
q
(
fromIntegral
<$>
ml
)
la
pure
$
(
\
(
len
,
docsC
)
->
(
len
,
docsC
.|
mapMC
(
toDoc'
la
)))
<$>
eRes
pure
$
(
\
(
len
,
docsC
)
->
(
len
,
docsC
.|
mapMC
(
toDoc'
la
)))
<$>
eRes
-- case eRes of
-- case eRes of
-- Left err -> panic $ pack $ show err
-- Left err -> panic $ pack $ show err
...
@@ -43,21 +41,21 @@ getC la q ml = do
...
@@ -43,21 +41,21 @@ getC la q ml = do
toDoc'
::
Maybe
ISO639
.
ISO639_1
->
HAL
.
Corpus
->
IO
HyperdataDocument
toDoc'
::
Maybe
ISO639
.
ISO639_1
->
HAL
.
Corpus
->
IO
HyperdataDocument
toDoc'
la
(
HAL
.
Corpus
{
..
})
=
do
toDoc'
la
(
HAL
.
Corpus
{
..
})
=
do
-- printDebug "[toDoc corpus] h" h
-- printDebug "[toDoc corpus] h" h
let
mDateS
=
maybe
(
Just
$
pack
$
show
Defaults
.
year
)
Just
_corpus_date
let
mDateS
=
_corpus_date
<|>
Just
(
pack
$
show
Defaults
.
year
)
let
(
utctime
,
(
pub_year
,
pub_month
,
pub_day
))
=
Date
.
mDateSplit
mDateS
let
(
utctime
,
(
pub_year
,
pub_month
,
pub_day
))
=
Date
.
mDateSplit
mDateS
let
abstractDefault
=
intercalate
" "
_corpus_abstract
let
abstractDefault
=
unwords
_corpus_abstract
let
abstract
=
case
la
of
let
abstract
=
case
la
of
Nothing
->
abstractDefault
Nothing
->
abstractDefault
Just
l
->
fromMaybe
abstractDefault
(
intercalate
" "
<$>
Map
.
lookup
l
_corpus_abstract_lang_map
)
Just
l
->
maybe
abstractDefault
unwords
(
Map
.
lookup
l
_corpus_abstract_lang_map
)
pure
HyperdataDocument
{
_hd_bdd
=
Just
"Hal"
pure
HyperdataDocument
{
_hd_bdd
=
Just
"Hal"
,
_hd_doi
=
Just
$
pack
$
show
_corpus_docid
,
_hd_doi
=
Just
$
pack
$
show
_corpus_docid
,
_hd_url
=
Nothing
,
_hd_url
=
Nothing
,
_hd_uniqId
=
Nothing
,
_hd_uniqId
=
Nothing
,
_hd_uniqIdBdd
=
Nothing
,
_hd_uniqIdBdd
=
Nothing
,
_hd_page
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
Just
$
intercalate
" "
_corpus_title
,
_hd_title
=
Just
$
unwords
_corpus_title
,
_hd_authors
=
Just
$
foldl
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
_corpus_authors_names
,
_hd_authors
=
Just
$
foldl
'
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
_corpus_authors_names
,
_hd_institutes
=
Just
$
foldl
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
$
_corpus_authors_affiliations
<>
map
show
_corpus_struct_id
,
_hd_institutes
=
Just
$
foldl
'
(
\
x
y
->
if
x
==
""
then
y
else
x
<>
", "
<>
y
)
""
$
_corpus_authors_affiliations
<>
map
show
_corpus_struct_id
,
_hd_source
=
Just
$
maybe
"Nothing"
identity
_corpus_source
,
_hd_source
=
Just
$
maybe
"Nothing"
identity
_corpus_source
,
_hd_abstract
=
Just
abstract
,
_hd_abstract
=
Just
abstract
,
_hd_publication_date
=
fmap
show
utctime
,
_hd_publication_date
=
fmap
show
utctime
...
...
src/Gargantext/Core/Text/Corpus/API/Istex.hs
View file @
31af4e4b
...
@@ -20,9 +20,9 @@ import Data.List qualified as List
...
@@ -20,9 +20,9 @@ import Data.List qualified as List
import
Data.Text
qualified
as
Text
import
Data.Text
qualified
as
Text
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
(
toDoc
)
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
(
toDoc
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
get
)
import
Gargantext.Prelude
hiding
(
get
)
import
ISTEX
qualified
as
ISTEX
import
ISTEX
qualified
import
ISTEX.Client
qualified
as
ISTEX
import
ISTEX.Client
qualified
as
ISTEX
type
Query
=
Text
type
Query
=
Text
...
@@ -40,14 +40,14 @@ get la query' maxResults = do
...
@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let
query
=
case
(
List
.
length
$
Text
.
splitOn
":"
query'
)
==
1
of
let
query
=
if
List
.
length
(
Text
.
splitOn
":"
query'
)
==
1
then
-- True case means users is entering default search of IsTex
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
-- Second : query in abstract
True
->
(
"language:"
<>
toISTEXLanguageCode
la
)
<>
" AND abstract:"
<>
query'
(
"language:"
<>
toISTEXLanguageCode
la
)
<>
" AND abstract:"
<>
query'
False
->
query'
else
query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
-- in that case we suppose user is knowing what s.he is doing
...
...
stack.yaml
View file @
31af4e4b
...
@@ -135,7 +135,7 @@
...
@@ -135,7 +135,7 @@
git
:
"
https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
git
:
"
https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs
:
subdirs
:
-
.
-
.
-
commit
:
b
fa9069b4ff70f341ca3244e8aff9e83eb4b8b73
-
commit
:
b
99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git
:
"
https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
git
:
"
https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs
:
subdirs
:
-
.
-
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment