Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
150
Issues
150
List
Board
Labels
Milestones
Merge Requests
5
Merge Requests
5
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
d8028550
Commit
d8028550
authored
Jun 05, 2023
by
Alfredo Di Napoli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
WIP - start porting Pubmed queries
parent
d8612190
Pipeline
#4178
failed with stages
in 40 minutes and 41 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
107 additions
and
11 deletions
+107
-11
gargantext.cabal
gargantext.cabal
+1
-1
package.yaml
package.yaml
+1
-0
Query.hs
src-test/Core/Text/Corpus/Query.hs
+35
-0
API.hs
src/Gargantext/Core/Text/Corpus/API.hs
+2
-2
Arxiv.hs
src/Gargantext/Core/Text/Corpus/API/Arxiv.hs
+2
-2
Pubmed.hs
src/Gargantext/Core/Text/Corpus/API/Pubmed.hs
+66
-6
No files found.
gargantext.cabal
View file @
d8028550
...
@@ -54,6 +54,7 @@ library
...
@@ -54,6 +54,7 @@ library
Gargantext.Core.Text.Context
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.CSV
...
@@ -190,7 +191,6 @@ library
...
@@ -190,7 +191,6 @@ library
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
...
...
package.yaml
View file @
d8028550
...
@@ -82,6 +82,7 @@ library:
...
@@ -82,6 +82,7 @@ library:
-
Gargantext.Core.Text.Context
-
Gargantext.Core.Text.Context
-
Gargantext.Core.Text.Corpus.API
-
Gargantext.Core.Text.Corpus.API
-
Gargantext.Core.Text.Corpus.API.Arxiv
-
Gargantext.Core.Text.Corpus.API.Arxiv
-
Gargantext.Core.Text.Corpus.API.Pubmed
-
Gargantext.Core.Text.Corpus.Query
-
Gargantext.Core.Text.Corpus.Query
-
Gargantext.Core.Text.Corpus.Parsers
-
Gargantext.Core.Text.Corpus.Parsers
-
Gargantext.Core.Text.Corpus.Parsers.CSV
-
Gargantext.Core.Text.Corpus.Parsers.CSV
...
...
src-test/Core/Text/Corpus/Query.hs
View file @
d8028550
...
@@ -9,6 +9,7 @@ import Gargantext.Core.Types
...
@@ -9,6 +9,7 @@ import Gargantext.Core.Types
import
Prelude
import
Prelude
import
qualified
Gargantext.Core.Text.Corpus.API.Arxiv
as
Arxiv
import
qualified
Gargantext.Core.Text.Corpus.API.Arxiv
as
Arxiv
import
qualified
Network.Api.Arxiv
as
Arxiv
import
qualified
Network.Api.Arxiv
as
Arxiv
import
qualified
Gargantext.Core.Text.Corpus.API.Pubmed
as
Pubmed
import
Test.Tasty
import
Test.Tasty
import
Test.Tasty.HUnit
import
Test.Tasty.HUnit
...
@@ -38,6 +39,14 @@ tests = testGroup "Boolean Query Engine" [
...
@@ -38,6 +39,14 @@ tests = testGroup "Boolean Query Engine" [
,
testCase
"It supports 'A OR NOT B'"
testArxiv05
,
testCase
"It supports 'A OR NOT B'"
testArxiv05
,
testCase
"It supports '-A'"
testArxiv06
,
testCase
"It supports '-A'"
testArxiv06
]
]
,
testGroup
"PUBMED expression converter"
[
testCase
"It supports 'A'"
testPubMed01
,
testCase
"It supports '-A'"
testPubMed02_01
,
testCase
"It supports 'NOT A'"
testPubMed02_02
,
testCase
"It supports 'NOT (NOT A)'"
testPubMed02_03
,
testCase
"It supports '
\"
Haskell
\"
AND
\"
Idris
\"
'"
testPubMed03
,
testCase
"It supports 'A OR B'"
testPubMed04
]
]
]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
...
@@ -158,3 +167,29 @@ testArxiv06 = withValidQuery "-A" $ \q ->
...
@@ -158,3 +167,29 @@ testArxiv06 = withValidQuery "-A" $ \q ->
Arxiv
.
AndNot
(
Arxiv
.
Exp
$
Arxiv
.
Abs
[
"A"
])
(
Arxiv
.
Exp
$
Arxiv
.
Abs
[
"A"
])
Arxiv
.
AndNot
(
Arxiv
.
Exp
$
Arxiv
.
Abs
[
"A"
])
(
Arxiv
.
Exp
$
Arxiv
.
Abs
[
"A"
])
)
)
)
)
--
-- PUBMED tests
--
testPubMed01
::
Assertion
testPubMed01
=
withValidQuery
"A"
$
\
q
->
assertBool
(
"Query not converted into expression: "
<>
show
@
(
BoolExpr
Term
)
(
fromCNF
$
getQuery
q
))
(
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
==
"%23A"
)
testPubMed02_01
::
Assertion
testPubMed02_01
=
withValidQuery
"-A"
$
\
q
->
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
@?=
"%23NOT+A"
testPubMed02_02
::
Assertion
testPubMed02_02
=
withValidQuery
"NOT A"
$
\
q
->
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
@?=
"%23NOT+A"
testPubMed02_03
::
Assertion
testPubMed02_03
=
withValidQuery
"NOT (NOT A)"
$
\
q
->
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
@?=
"%23A"
testPubMed03
::
Assertion
testPubMed03
=
withValidQuery
"
\"
Haskell
\"
AND
\"
Idris
\"
"
$
\
q
->
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
@?=
"%23Haskell+AND+Idris"
testPubMed04
::
Assertion
testPubMed04
=
withValidQuery
"A OR B"
$
\
q
->
Pubmed
.
getESearch
(
Pubmed
.
convertQuery
q
)
@?=
"%23A+OR+B"
src/Gargantext/Core/Text/Corpus/API.hs
View file @
d8028550
...
@@ -57,10 +57,10 @@ get cfg externalAPI la q limit = do
...
@@ -57,10 +57,10 @@ get cfg externalAPI la q limit = do
Left
err
->
pure
$
Left
$
InvalidInputQuery
q
(
T
.
pack
err
)
Left
err
->
pure
$
Left
$
InvalidInputQuery
q
(
T
.
pack
err
)
Right
corpusQuery
->
case
externalAPI
of
Right
corpusQuery
->
case
externalAPI
of
PubMed
->
first
ExternalAPIError
<$>
PubMed
->
first
ExternalAPIError
<$>
PUBMED
.
get
(
cfg
^.
gc_pubmed_api_key
)
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
PUBMED
.
get
(
cfg
^.
gc_pubmed_api_key
)
corpusQuery
limit
--docs <- PUBMED.get q default_limit -- EN only by default
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv
->
Right
<$>
Arxiv
.
get
la
corpusQuery
(
Corpus
.
getLimit
<$>
limit
)
Arxiv
->
Right
<$>
Arxiv
.
get
la
corpusQuery
limit
HAL
->
first
ExternalAPIError
<$>
HAL
.
getC
la
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
HAL
->
first
ExternalAPIError
<$>
HAL
.
getC
la
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
IsTex
->
do
docs
<-
ISTEX
.
get
la
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
IsTex
->
do
docs
<-
ISTEX
.
get
la
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
pure
$
Right
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
)
pure
$
Right
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
)
...
...
src/Gargantext/Core/Text/Corpus/API/Arxiv.hs
View file @
d8028550
...
@@ -77,9 +77,9 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
...
@@ -77,9 +77,9 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
-- by default: 10K docs
-- by default: 10K docs
get
::
Lang
get
::
Lang
->
Corpus
.
Query
->
Corpus
.
Query
->
Maybe
Arxiv
.
Limit
->
Maybe
Corpus
.
Limit
->
IO
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
)
->
IO
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
)
get
la
(
convertQuery
->
query
)
limit
=
do
get
la
(
convertQuery
->
query
)
(
fmap
getLimit
->
limit
)
=
do
(
cnt
,
resC
)
<-
case
limit
of
(
cnt
,
resC
)
<-
case
limit
of
Nothing
->
Arxiv
.
searchAxv'
query
Nothing
->
Arxiv
.
searchAxv'
query
(
Just
l
)
->
do
(
cnt
,
res
)
<-
Arxiv
.
searchAxv'
query
(
Just
l
)
->
do
(
cnt
,
res
)
<-
Arxiv
.
searchAxv'
query
...
...
src/Gargantext/Core/Text/Corpus/API/Pubmed.hs
View file @
d8028550
...
@@ -9,21 +9,33 @@ Portability : POSIX
...
@@ -9,21 +9,33 @@ Portability : POSIX
-}
-}
{-# LANGUAGE DerivingStrategies #-}
module
Gargantext.Core.Text.Corpus.API.Pubmed
module
Gargantext.Core.Text.Corpus.API.Pubmed
(
get
)
(
get
-- * Internals for testing
,
ESearch
(
..
)
,
convertQuery
,
getESearch
)
where
where
import
Conduit
import
Conduit
import
Control.Monad.Reader
(
runReaderT
)
import
Control.Monad.Reader
(
runReaderT
)
import
Data.Either
(
Either
)
import
Data.Either
(
Either
)
import
Data.Maybe
import
Data.Maybe
import
Data.Semigroup
import
Data.Monoid
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
qualified
Data.Text
as
Text
import
qualified
Data.Text
as
Text
import
qualified
Data.Text.Encoding
as
TE
import
Network.HTTP.Types.URI
(
EscapeItem
(
..
),
renderQueryPartialEscape
)
import
Servant.Client
(
ClientError
)
import
Servant.Client
(
ClientError
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Query
as
Corpus
import
Gargantext.Core.Types
(
Term
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
qualified
PUBMED
as
PubMed
import
qualified
PUBMED
as
PubMed
...
@@ -31,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc
...
@@ -31,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc
import
PUBMED.Types
(
Config
(
..
))
import
PUBMED.Types
(
Config
(
..
))
type
Query
=
Text
-- | A pubmed query.
type
Limit
=
Int
-- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
newtype
ESearch
=
ESearch
{
_ESearch
::
[
EscapeItem
]
}
deriving
stock
(
Show
,
Eq
)
deriving
newtype
(
Semigroup
,
Monoid
)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
getESearch
::
ESearch
->
Text
getESearch
(
ESearch
items
)
=
Text
.
replace
"term="
""
.
TE
.
decodeUtf8
.
renderQueryPartialEscape
False
$
[
(
"term"
,
QE
"#"
:
items
)
]
convertQuery
::
Corpus
.
Query
->
ESearch
convertQuery
q
=
ESearch
(
interpretQuery
q
transformAST
)
where
transformAST
::
BoolExpr
Term
->
[
EscapeItem
]
transformAST
ast
=
case
ast
of
BAnd
sub
(
BConst
(
Negative
term
))
-- The second term become positive, so that it can be translated.
->
(
transformAST
sub
)
<>
[
QN
"+AND+NOT+"
]
<>
transformAST
(
BConst
(
Positive
term
))
BAnd
term1
(
BNot
term2
)
->
transformAST
term1
<>
[
QN
"+AND+NOT+"
]
<>
transformAST
term2
BAnd
sub1
sub2
->
transformAST
sub1
<>
[
QN
"+AND+"
]
<>
transformAST
sub2
BOr
sub1
sub2
->
transformAST
sub1
<>
[
QN
"+OR+"
]
<>
transformAST
sub2
BNot
(
BConst
(
Negative
term
))
->
transformAST
(
BConst
(
Positive
term
))
-- double negation
BNot
sub
->
[
QN
"NOT+"
]
<>
transformAST
sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
->
mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
->
mempty
BConst
(
Positive
(
Term
term
))
->
[
QE
(
TE
.
encodeUtf8
term
)]
-- We can handle negatives via `ANDNOT` with itself.
BConst
(
Negative
(
Term
term
))
->
[
QN
"NOT+"
,
QE
(
TE
.
encodeUtf8
term
)]
-- | TODO put default pubmed query in gargantext.ini
-- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs
-- by default: 10K docs
get
::
Text
get
::
Text
->
Query
->
Corpus
.
Query
->
Maybe
Limit
->
Maybe
Limit
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
get
apiKey
q
l
=
do
get
apiKey
q
l
=
do
-- The documentation for PUBMED says:
-- Values for query keys may also be provided in term if they are preceeded by a
-- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
-- any number of query keys can be combined in term. Also, if query keys are provided in term,
-- they can be combined with OR or NOT in addition to AND.
-- Example:
-- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
--
-- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
eRes
<-
runReaderT
PubMed
.
getMetadataWithC
(
Config
{
apiKey
=
Just
apiKey
eRes
<-
runReaderT
PubMed
.
getMetadataWithC
(
Config
{
apiKey
=
Just
apiKey
,
query
=
q
,
query
=
getESearch
$
convertQuery
q
,
perPage
=
Just
200
,
perPage
=
Just
200
,
mWebEnv
=
Nothing
})
,
mWebEnv
=
Nothing
})
let
takeLimit
=
case
l
of
let
takeLimit
=
case
l
of
Nothing
->
mapC
identity
Nothing
->
mapC
identity
Just
l'
->
takeC
$
fromIntegral
l'
Just
l'
->
takeC
$
getLimit
l'
pure
$
(
\
(
len
,
docsC
)
->
(
len
,
docsC
.|
takeLimit
.|
mapC
(
toDoc
EN
)))
<$>
eRes
pure
$
(
\
(
len
,
docsC
)
->
(
len
,
docsC
.|
takeLimit
.|
mapC
(
toDoc
EN
)))
<$>
eRes
--either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
--either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
-- <$> PubMed.getMetadataWithC q l
-- <$> PubMed.getMetadataWithC q l
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment