Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
cfc1aec2
Commit
cfc1aec2
authored
Jun 12, 2023
by
Alfredo Di Napoli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Support Istex
parent
9483b472
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
189 additions
and
50 deletions
+189
-50
gargantext.cabal
gargantext.cabal
+2
-2
API.hs
src/Gargantext/Core/Text/Corpus/API.hs
+8
-7
Arxiv.hs
src/Gargantext/Core/Text/Corpus/API/Arxiv.hs
+5
-3
Isidore.hs
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
+1
-1
Istex.hs
src/Gargantext/Core/Text/Corpus/API/Istex.hs
+82
-27
Query.hs
test/Test/Core/Text/Corpus/Query.hs
+91
-10
No files found.
gargantext.cabal
View file @
cfc1aec2
...
...
@@ -85,8 +85,9 @@ library
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.
Pubmed
Gargantext.Core.Text.Corpus.API.
Istex
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
...
...
@@ -230,7 +231,6 @@ library
Gargantext.Core.Statistics
Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
...
...
src/Gargantext/Core/Text/Corpus/API.hs
View file @
cfc1aec2
...
...
@@ -63,17 +63,18 @@ get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
OpenAlex
->
first
ExternalAPIError
<$>
OpenAlex
.
get
(
fromMaybe
""
Nothing
{- email -}
)
q
(
toISO639
la
)
limit
Arxiv
->
runExceptT
$
do
corpusQuery
<-
ExceptT
(
pure
parse_query
)
ExceptT
$
f
map
Right
(
Arxiv
.
get
la
corpusQuery
limit
)
corpusQuery
<-
parse_query
ExceptT
$
f
irst
ExternalAPIError
<$>
(
Arxiv
.
get
la
corpusQuery
limit
)
HAL
->
first
ExternalAPIError
<$>
HAL
.
getC
(
toISO639
la
)
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
IsTex
->
do
docs
<-
ISTEX
.
get
la
(
Corpus
.
getRawQuery
q
)
(
Corpus
.
getLimit
<$>
limit
)
pure
$
Right
(
Just
$
fromIntegral
$
length
docs
,
yieldMany
docs
)
IsTex
->
runExceptT
$
do
corpusQuery
<-
parse_query
ExceptT
$
first
ExternalAPIError
<$>
ISTEX
.
get
la
corpusQuery
limit
Isidore
->
runExceptT
$
do
corpusQuery
<-
ExceptT
(
pure
parse_query
)
corpusQuery
<-
parse_query
ExceptT
$
first
ExternalAPIError
<$>
ISIDORE
.
get
la
limit
corpusQuery
Nothing
EPO
->
do
first
ExternalAPIError
<$>
EPO
.
get
epoAuthKey
epoAPIUrl
q
(
toISO639EN
la
)
limit
where
parse_query
=
first
(
InvalidInputQuery
q
.
T
.
pack
)
$
Corpus
.
parseQuery
q
parse_query
::
ExceptT
GetCorpusError
IO
Corpus
.
Query
parse_query
=
ExceptT
$
pure
$
first
(
InvalidInputQuery
q
.
T
.
pack
)
$
Corpus
.
parseQuery
q
src/Gargantext/Core/Text/Corpus/API/Arxiv.hs
View file @
cfc1aec2
...
...
@@ -20,6 +20,8 @@ module Gargantext.Core.Text.Corpus.API.Arxiv
import
Arxiv
qualified
as
Arxiv
import
Conduit
import
Data.Either
import
Data.Maybe
import
Data.Text
(
unpack
)
import
Data.Text
qualified
as
Text
import
Gargantext.Core
(
Lang
(
..
))
...
...
@@ -28,7 +30,7 @@ import Gargantext.Core.Types (Term(..))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
get
)
import
Network.Api.Arxiv
qualified
as
Ax
import
Servant.Client
(
ClientError
)
-- | Converts a Gargantext's generic boolean query into an Arxiv Query.
convertQuery
::
Corpus
.
Query
->
Ax
.
Query
...
...
@@ -75,13 +77,13 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
get
::
Lang
->
Corpus
.
Query
->
Maybe
Corpus
.
Limit
->
IO
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
(
)
)
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
get
la
(
convertQuery
->
query
)
(
fmap
getLimit
->
limit
)
=
do
(
cnt
,
resC
)
<-
case
limit
of
Nothing
->
Arxiv
.
searchAxv'
query
(
Just
l
)
->
do
(
cnt
,
res
)
<-
Arxiv
.
searchAxv'
query
pure
(
cnt
,
res
.|
takeC
l
)
pure
$
(
Just
$
fromIntegral
cnt
,
resC
.|
mapC
(
toDoc
la
))
pure
$
Right
$
(
Just
$
fromIntegral
cnt
,
resC
.|
mapC
(
toDoc
la
))
toDoc
::
Lang
->
Arxiv
.
Result
->
HyperdataDocument
toDoc
l
(
Arxiv
.
Result
{
abstract
...
...
src/Gargantext/Core/Text/Corpus/API/Isidore.hs
View file @
cfc1aec2
...
...
@@ -46,7 +46,7 @@ newtype IsidoreQuery = IsidoreQuery { _IsidoreQuery :: [EscapeItem] }
deriving
stock
(
Show
,
Eq
)
deriving
newtype
(
Semigroup
,
Monoid
)
-- | Returns an /url encoded/ query ready to be sent to
pubmed
.
-- | Returns an /url encoded/ query ready to be sent to
Isidore
.
getIsidoreQuery
::
IsidoreQuery
->
Text
getIsidoreQuery
(
IsidoreQuery
items
)
=
Text
.
replace
"q="
""
.
TE
.
decodeUtf8
.
renderQueryPartialEscape
False
$
[
...
...
src/Gargantext/Core/Text/Corpus/API/Istex.hs
View file @
cfc1aec2
...
...
@@ -9,24 +9,94 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE ViewPatterns #-}
module
Gargantext.Core.Text.Corpus.API.Istex
(
get
-- * Internal API for testing
,
getIstexQuery
,
IstexQuery
(
..
)
,
convertQuery
)
where
import
Data.List
qualified
as
List
import
Data.ByteString.Char8
qualified
as
C8
import
Data.Conduit
import
Data.Conduit.Combinators
(
yieldMany
)
import
Data.Maybe
import
Data.Monoid
import
Data.Semigroup
import
Data.Text
qualified
as
Text
import
Data.Text.Encoding
qualified
as
TE
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
(
toDoc
)
import
Gargantext.Core.Text.Corpus.Query
as
Corpus
import
Gargantext.Core.Types
(
Term
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Prelude
hiding
(
get
)
import
ISTEX
qualified
as
ISTEX
import
ISTEX.Client
qualified
as
ISTEX
import
Network.HTTP.Types.URI
(
EscapeItem
(
..
),
renderQueryPartialEscape
)
import
Servant.Client
languageToQuery
::
Lang
->
C8
.
ByteString
languageToQuery
la
=
"language:"
<>
case
la
of
FR
->
"fre"
_
->
"eng"
-- FIXME -- we should support all the languages.
newtype
IstexQuery
=
IstexQuery
{
_IstexQuery
::
[
EscapeItem
]
}
deriving
stock
(
Show
,
Eq
)
deriving
newtype
(
Semigroup
,
Monoid
)
-- | Returns an /url encoded/ query ready to be sent to Istex.
getIstexQuery
::
Lang
->
IstexQuery
->
Text
getIstexQuery
lang
(
IstexQuery
items
)
=
Text
.
replace
"q="
""
.
TE
.
decodeUtf8
.
renderQueryPartialEscape
False
$
[
(
"q"
,
langItems
<>
items
)
]
where
langItems
::
[
EscapeItem
]
langItems
=
[
QN
(
languageToQuery
lang
),
QN
"+AND+"
]
type
Query
=
Text
type
MaxResults
=
Maybe
Int
convertQuery
::
Corpus
.
Query
->
IstexQuery
convertQuery
q
=
IstexQuery
(
interpretQuery
q
transformAST
)
where
transformAST
::
BoolExpr
Term
->
[
EscapeItem
]
transformAST
ast
=
case
ast
of
BAnd
sub
(
BConst
(
Negative
term
))
-- The second term become positive, so that it can be translated.
->
(
transformAST
sub
)
<>
[
QN
"+AND+NOT+"
]
<>
transformAST
(
BConst
(
Positive
term
))
BAnd
term1
(
BNot
term2
)
->
transformAST
term1
<>
[
QN
"+AND+NOT+"
]
<>
transformAST
term2
BAnd
sub1
sub2
->
transformAST
sub1
<>
[
QN
"+AND+"
]
<>
transformAST
sub2
BOr
sub1
sub2
->
transformAST
sub1
<>
[
QN
"+OR+"
]
<>
transformAST
sub2
BNot
(
BConst
(
Negative
term
))
->
transformAST
(
BConst
(
Positive
term
))
-- double negation
BNot
sub
->
[
QN
"NOT+"
]
<>
transformAST
sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
->
mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
->
mempty
-- Maps the positive terms to contain 'abstract:'
BConst
(
Positive
(
Term
term
))
->
[
QN
"abstract:"
,
QE
(
TE
.
encodeUtf8
term
)]
BConst
(
Negative
sub
)
->
[
QN
"NOT+"
]
<>
transformAST
(
BConst
(
Positive
sub
))
get
::
Lang
->
Query
->
MaxResults
->
IO
[
HyperdataDocument
]
get
la
query'
maxResults
=
do
get
::
Lang
->
Corpus
.
Query
->
Maybe
Corpus
.
Limit
->
IO
(
Either
ClientError
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
))
get
la
(
convertQuery
->
query
)
maxResults
=
do
--printDebug "[Istex.get] calling getMetadataScrollProgress for la" la
--printDebug "[Istex.get] calling getMetadataScrollProgress for q" q
--printDebug "[Istex.get] calling getMetadataScrollProgress for ml" ml
...
...
@@ -36,30 +106,15 @@ get la query' maxResults = do
-- TODO check if abstract is in query already if not add like below
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let
query
=
case
(
List
.
length
$
Text
.
splitOn
":"
query'
)
==
1
of
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True
->
(
"language:"
<>
lang
la
)
<>
" AND abstract:"
<>
query'
where
lang
FR
=
"fre"
lang
_
=
"eng"
False
->
query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
eDocs
<-
ISTEX
.
getMetadataWith
query
maxResults
eDocs
<-
ISTEX
.
getMetadataWith
(
getIstexQuery
la
query
)
(
getLimit
<$>
maxResults
)
-- printDebug "[Istex.get] will print length" (0 :: Int)
case
eDocs
of
Left
_
->
pure
()
Right
(
ISTEX
.
Documents
{
_documents_hits
})
->
printDebug
"[Istex.get] length docs"
$
length
_documents_hits
Left
err
->
pure
$
Left
err
Right
docs
@
(
ISTEX
.
Documents
{
_documents_hits
})
->
do
printDebug
"[Istex.get] length docs"
$
length
_documents_hits
--ISTEX.getMetadataScrollProgress q ((\_ -> pack $ "1m") <$> ml) Nothing progress errorHandler
case
eDocs
of
Left
err
->
panic
.
Text
.
pack
.
show
$
err
Right
docs
->
toDoc'
la
docs
docs'
<-
toDoc'
la
docs
pure
$
Right
(
Just
$
fromIntegral
$
length
docs'
,
yieldMany
docs'
)
--pure $ either (panic . pack . show) (toDoc' la) eDocs
-- where
-- progress (ISTEX.ScrollResponse { _scroll_documents = ISTEX.Documents { _documents_hits }}) =
...
...
test/Test/Core/Text/Corpus/Query.hs
View file @
cfc1aec2
...
...
@@ -18,6 +18,7 @@ import qualified Data.Text as T
import
qualified
Gargantext.Core.Text.Corpus.API.Arxiv
as
Arxiv
import
qualified
Gargantext.Core.Text.Corpus.API.Pubmed
as
Pubmed
import
qualified
Gargantext.Core.Text.Corpus.API.Isidore
as
Isidore
import
qualified
Gargantext.Core.Text.Corpus.API.Istex
as
Istex
import
qualified
Network.Api.Arxiv
as
Arxiv
import
Test.Tasty
...
...
@@ -77,6 +78,14 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
,
testCase
"It supports '
\"
Haskell
\"
AND
\"
Idris
\"
'"
testIsidore03
,
testCase
"It supports 'A OR B'"
testIsidore04
]
,
testGroup
"Istex expression converter"
[
testCase
"It supports 'A'"
testIstex01
,
testCase
"It supports '-A'"
testIstex02_01
,
testCase
"It supports 'NOT A'"
testIstex02_02
,
testCase
"It supports 'NOT (NOT A)'"
testIstex02_03
,
testCase
"It supports '
\"
Haskell
\"
AND
\"
Idris
\"
'"
testIstex03
,
testCase
"It supports 'A OR B'"
testIstex04
]
,
testGroup
"PUBMED real queries (skipped if PUBMED_API_KEY env var not set)"
[
testCase
"It searches for
\"
Covid
\"
"
(
testPubMedCovid_01
getPubmedKey
)
,
testCase
"It searches for
\"
Covid
\"
AND
\"
Alzheimer
\"
"
(
testPubMedCovid_02
getPubmedKey
)
...
...
@@ -92,6 +101,11 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
testCase
"It searches for
\"
chicken pox
\"
"
(
testIsidoreRealWorld_01
getPubmedKey
)
,
testCase
"It searches for
\"
Dante
\"
AND
\"
Petrarca
\"
"
(
testIsidoreRealWorld_02
getPubmedKey
)
]
-- .. ditto for Istex
,
testGroup
"Istex real queries (skipped if PUBMED_API_KEY env var not set)"
[
testCase
"It searches for
\"
brain
\"
"
(
testIstexRealWorld_01
getPubmedKey
)
,
testCase
"It searches for
\"
brain
\"
AND NOT
\"
neural
\"
"
(
testIstexRealWorld_02
getPubmedKey
)
]
]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
...
...
@@ -286,6 +300,32 @@ testIsidore04 :: Assertion
testIsidore04
=
withValidQuery
"A OR B"
$
\
q
->
Isidore
.
getIsidoreQuery
(
Isidore
.
convertQuery
q
)
@?=
"A+OR+B"
--
-- Istex tests
--
testIstex01
::
Assertion
testIstex01
=
withValidQuery
"A"
$
\
q
->
assertBool
(
"Query not converted into expression: "
<>
show
@
(
BoolExpr
Term
)
(
fromCNF
$
getQuery
q
))
(
Istex
.
getIstexQuery
EN
(
Istex
.
convertQuery
q
)
==
"language:eng+AND+abstract:A"
)
testIstex02_01
::
Assertion
testIstex02_01
=
withValidQuery
"-A"
$
\
q
->
Istex
.
getIstexQuery
EN
(
Istex
.
convertQuery
q
)
@?=
"language:eng+AND+NOT+abstract:A"
testIstex02_02
::
Assertion
testIstex02_02
=
withValidQuery
"NOT A"
$
\
q
->
Istex
.
getIstexQuery
EN
(
Istex
.
convertQuery
q
)
@?=
"language:eng+AND+NOT+abstract:A"
testIstex02_03
::
Assertion
testIstex02_03
=
withValidQuery
"NOT (NOT A)"
$
\
q
->
Istex
.
getIstexQuery
EN
(
Istex
.
convertQuery
q
)
@?=
"language:eng+AND+abstract:A"
testIstex03
::
Assertion
testIstex03
=
withValidQuery
"
\"
Haskell
\"
AND
\"
Idris
\"
"
$
\
q
->
Istex
.
getIstexQuery
FR
(
Istex
.
convertQuery
q
)
@?=
"language:fre+AND+abstract:Haskell+AND+abstract:Idris"
testIstex04
::
Assertion
testIstex04
=
withValidQuery
"A OR B"
$
\
q
->
Istex
.
getIstexQuery
EN
(
Istex
.
convertQuery
q
)
@?=
"language:eng+AND+abstract:A+OR+abstract:B"
--
-- Integration tests against the real services
--
...
...
@@ -327,7 +367,10 @@ testArxivRealWorld_01 getPubmedKey = do
case
mb_key
of
Nothing
->
pure
()
Just
_
->
withValidQuery
"
\"
Haskell
\"
"
$
\
query
->
do
(
_
,
cnd
)
<-
Arxiv
.
get
EN
query
(
Just
1
)
res
<-
Arxiv
.
get
EN
query
(
Just
1
)
case
res
of
Left
err
->
fail
(
show
err
)
Right
(
_
,
cnd
)
->
do
hyperDocs
<-
sourceToList
cnd
case
hyperDocs
of
[]
->
fail
"No documents found."
...
...
@@ -339,7 +382,10 @@ testArxivRealWorld_02 getPubmedKey = do
case
mb_key
of
Nothing
->
pure
()
Just
_
->
withValidQuery
"
\"
Haskell
\"
AND
\"
Agda
\"
"
$
\
query
->
do
(
_
,
cnd
)
<-
Arxiv
.
get
EN
query
(
Just
1
)
res
<-
Arxiv
.
get
EN
query
(
Just
1
)
case
res
of
Left
err
->
fail
(
show
err
)
Right
(
_
,
cnd
)
->
do
hyperDocs
<-
sourceToList
cnd
case
hyperDocs
of
[]
->
fail
"No documents found."
...
...
@@ -379,3 +425,38 @@ testIsidoreRealWorld_02 getPubmedKey = do
case
hyperDocs
of
[]
->
fail
"No documents found."
(
x
:
_
)
->
isJust
(
_hd_title
x
)
@?=
True
--
-- Istex integration tests
--
testIstexRealWorld_01
::
IO
(
Maybe
PubmedApiKey
)
->
Assertion
testIstexRealWorld_01
getPubmedKey
=
do
mb_key
<-
getPubmedKey
case
mb_key
of
Nothing
->
pure
()
Just
_
->
withValidQuery
"
\"
brain
\"
"
$
\
query
->
do
res
<-
Istex
.
get
EN
query
(
Just
1
)
case
res
of
Left
err
->
fail
(
show
err
)
Right
(
_
,
cnd
)
->
do
hyperDocs
<-
sourceToList
cnd
case
hyperDocs
of
[]
->
fail
"No documents found."
(
x
:
_
)
->
assertBool
(
"found: "
<>
show
(
_hd_title
x
))
(
maybe
False
(
T
.
isInfixOf
"brain"
)
(
_hd_title
x
))
testIstexRealWorld_02
::
IO
(
Maybe
PubmedApiKey
)
->
Assertion
testIstexRealWorld_02
getPubmedKey
=
do
mb_key
<-
getPubmedKey
case
mb_key
of
Nothing
->
pure
()
Just
_
->
withValidQuery
"
\"
brain
\"
AND NOT
\"
neural
\"
"
$
\
query
->
do
res
<-
Istex
.
get
EN
query
(
Just
1
)
case
res
of
Left
err
->
fail
(
show
err
)
Right
(
_
,
cnd
)
->
do
hyperDocs
<-
sourceToList
cnd
case
hyperDocs
of
[]
->
fail
"No documents found."
(
x
:
_
)
->
isJust
(
_hd_title
x
)
@?=
True
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment