Commit cfc1aec2 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Support Istex

parent 9483b472
......@@ -85,8 +85,9 @@ library
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
......@@ -230,7 +231,6 @@ library
Gargantext.Core.Statistics
Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
......
......@@ -63,17 +63,18 @@ get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
OpenAlex ->
first ExternalAPIError <$> OpenAlex.get (fromMaybe "" Nothing {- email -}) q (toISO639 la) limit
Arxiv -> runExceptT $ do
corpusQuery <- ExceptT (pure parse_query)
ExceptT $ fmap Right (Arxiv.get la corpusQuery limit)
corpusQuery <- parse_query
ExceptT $ first ExternalAPIError <$> (Arxiv.get la corpusQuery limit)
HAL ->
first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do
docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
IsTex -> runExceptT $ do
corpusQuery <- parse_query
ExceptT $ first ExternalAPIError <$> ISTEX.get la corpusQuery limit
Isidore -> runExceptT $ do
corpusQuery <- ExceptT (pure parse_query)
corpusQuery <- parse_query
ExceptT $ first ExternalAPIError <$> ISIDORE.get la limit corpusQuery Nothing
EPO -> do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit
where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
parse_query :: ExceptT GetCorpusError IO Corpus.Query
parse_query = ExceptT $ pure $ first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
......@@ -20,6 +20,8 @@ module Gargantext.Core.Text.Corpus.API.Arxiv
import Arxiv qualified as Arxiv
import Conduit
import Data.Either
import Data.Maybe
import Data.Text (unpack)
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
......@@ -28,7 +30,7 @@ import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (get)
import Network.Api.Arxiv qualified as Ax
import Servant.Client (ClientError)
-- | Converts a Gargantext's generic boolean query into an Arxiv Query.
convertQuery :: Corpus.Query -> Ax.Query
......@@ -75,13 +77,13 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
get :: Lang
-> Corpus.Query
-> Maybe Corpus.Limit
-> IO (Maybe Integer, ConduitT () HyperdataDocument IO ())
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la (convertQuery -> query) (fmap getLimit -> limit) = do
(cnt, resC) <- case limit of
Nothing -> Arxiv.searchAxv' query
(Just l) -> do (cnt, res) <- Arxiv.searchAxv' query
pure (cnt, res .| takeC l)
pure $ (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
pure $ Right $ (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
toDoc :: Lang -> Arxiv.Result -> HyperdataDocument
toDoc l (Arxiv.Result { abstract
......
......@@ -46,7 +46,7 @@ newtype IsidoreQuery = IsidoreQuery { _IsidoreQuery :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
-- | Returns an /url encoded/ query ready to be sent to Isidore.
getIsidoreQuery :: IsidoreQuery -> Text
getIsidoreQuery (IsidoreQuery items) =
Text.replace "q=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
......
......@@ -9,24 +9,94 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Corpus.API.Istex
( get
-- * Internal API for testing
, getIstexQuery
, IstexQuery(..)
, convertQuery
)
where
import Data.List qualified as List
import Data.ByteString.Char8 qualified as C8
import Data.Conduit
import Data.Conduit.Combinators (yieldMany)
import Data.Maybe
import Data.Monoid
import Data.Semigroup
import Data.Text qualified as Text
import Data.Text.Encoding qualified as TE
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX
import ISTEX.Client qualified as ISTEX
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client
languageToQuery :: Lang -> C8.ByteString
languageToQuery la =
"language:" <> case la of
FR -> "fre"
_ -> "eng" -- FIXME -- we should support all the languages.
newtype IstexQuery = IstexQuery { _IstexQuery :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to Istex.
getIstexQuery :: Lang -> IstexQuery -> Text
getIstexQuery lang (IstexQuery items) =
Text.replace "q=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("q", langItems <> items)
]
where
langItems :: [EscapeItem]
langItems = [QN (languageToQuery lang), QN "+AND+"]
type Query = Text
type MaxResults = Maybe Int
convertQuery :: Corpus.Query -> IstexQuery
convertQuery q = IstexQuery (interpretQuery q transformAST)
where
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
-- Maps the positive terms to contain 'abstract:'
BConst (Positive (Term term))
-> [QN "abstract:", QE (TE.encodeUtf8 term)]
BConst (Negative sub)
-> [QN "NOT+"] <> transformAST (BConst (Positive sub))
get :: Lang -> Query -> MaxResults -> IO [HyperdataDocument]
get la query' maxResults = do
get :: Lang
-> Corpus.Query
-> Maybe Corpus.Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la (convertQuery -> query) maxResults = do
--printDebug "[Istex.get] calling getMetadataScrollProgress for la" la
--printDebug "[Istex.get] calling getMetadataScrollProgress for q" q
--printDebug "[Istex.get] calling getMetadataScrollProgress for ml" ml
......@@ -36,30 +106,15 @@ get la query' maxResults = do
-- TODO check if abstract is in query already if not add like below
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let query = case (List.length $ Text.splitOn ":" query') == 1 of
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True -> ("language:"<> lang la) <> " AND abstract:"<>query'
where
lang FR = "fre"
lang _ = "eng"
False -> query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
eDocs <- ISTEX.getMetadataWith query maxResults
eDocs <- ISTEX.getMetadataWith (getIstexQuery la query) (getLimit <$> maxResults)
-- printDebug "[Istex.get] will print length" (0 :: Int)
case eDocs of
Left _ -> pure ()
Right (ISTEX.Documents { _documents_hits }) -> printDebug "[Istex.get] length docs" $ length _documents_hits
Left err -> pure $ Left err
Right docs@(ISTEX.Documents { _documents_hits }) -> do
printDebug "[Istex.get] length docs" $ length _documents_hits
--ISTEX.getMetadataScrollProgress q ((\_ -> pack $ "1m") <$> ml) Nothing progress errorHandler
case eDocs of
Left err -> panic . Text.pack . show $ err
Right docs -> toDoc' la docs
docs' <- toDoc' la docs
pure $ Right (Just $ fromIntegral $ length docs', yieldMany docs')
--pure $ either (panic . pack . show) (toDoc' la) eDocs
-- where
-- progress (ISTEX.ScrollResponse { _scroll_documents = ISTEX.Documents { _documents_hits }}) =
......
......@@ -18,6 +18,7 @@ import qualified Data.Text as T
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed
import qualified Gargantext.Core.Text.Corpus.API.Isidore as Isidore
import qualified Gargantext.Core.Text.Corpus.API.Istex as Istex
import qualified Network.Api.Arxiv as Arxiv
import Test.Tasty
......@@ -77,6 +78,14 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testIsidore03
, testCase "It supports 'A OR B'" testIsidore04
]
, testGroup "Istex expression converter" [
testCase "It supports 'A'" testIstex01
, testCase "It supports '-A'" testIstex02_01
, testCase "It supports 'NOT A'" testIstex02_02
, testCase "It supports 'NOT (NOT A)'" testIstex02_03
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testIstex03
, testCase "It supports 'A OR B'" testIstex04
]
, testGroup "PUBMED real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"Covid\"" (testPubMedCovid_01 getPubmedKey)
, testCase "It searches for \"Covid\" AND \"Alzheimer\"" (testPubMedCovid_02 getPubmedKey)
......@@ -92,6 +101,11 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
testCase "It searches for \"chicken pox\"" (testIsidoreRealWorld_01 getPubmedKey)
, testCase "It searches for \"Dante\" AND \"Petrarca\"" (testIsidoreRealWorld_02 getPubmedKey)
]
-- .. ditto for Istex
, testGroup "Istex real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"brain\"" (testIstexRealWorld_01 getPubmedKey)
, testCase "It searches for \"brain\" AND NOT \"neural\"" (testIstexRealWorld_02 getPubmedKey)
]
]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
......@@ -286,6 +300,32 @@ testIsidore04 :: Assertion
testIsidore04 = withValidQuery "A OR B" $ \q ->
Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "A+OR+B"
--
-- Istex tests
--
testIstex01 :: Assertion
testIstex01 = withValidQuery "A" $ \q ->
assertBool ("Query not converted into expression: " <> show @(BoolExpr Term) (fromCNF $ getQuery q))
(Istex.getIstexQuery EN (Istex.convertQuery q) == "language:eng+AND+abstract:A")
testIstex02_01 :: Assertion
testIstex02_01 = withValidQuery "-A" $ \q -> Istex.getIstexQuery EN (Istex.convertQuery q) @?= "language:eng+AND+NOT+abstract:A"
testIstex02_02 :: Assertion
testIstex02_02 = withValidQuery "NOT A" $ \q -> Istex.getIstexQuery EN (Istex.convertQuery q) @?= "language:eng+AND+NOT+abstract:A"
testIstex02_03 :: Assertion
testIstex02_03 = withValidQuery "NOT (NOT A)" $ \q -> Istex.getIstexQuery EN (Istex.convertQuery q) @?= "language:eng+AND+abstract:A"
testIstex03 :: Assertion
testIstex03 = withValidQuery "\"Haskell\" AND \"Idris\"" $ \q ->
Istex.getIstexQuery FR (Istex.convertQuery q) @?= "language:fre+AND+abstract:Haskell+AND+abstract:Idris"
testIstex04 :: Assertion
testIstex04 = withValidQuery "A OR B" $ \q ->
Istex.getIstexQuery EN (Istex.convertQuery q) @?= "language:eng+AND+abstract:A+OR+abstract:B"
--
-- Integration tests against the real services
--
......@@ -327,7 +367,10 @@ testArxivRealWorld_01 getPubmedKey = do
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"Haskell\"" $ \query -> do
(_, cnd) <- Arxiv.get EN query (Just 1)
res <- Arxiv.get EN query (Just 1)
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
......@@ -339,7 +382,10 @@ testArxivRealWorld_02 getPubmedKey = do
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"Haskell\" AND \"Agda\"" $ \query -> do
(_, cnd) <- Arxiv.get EN query (Just 1)
res <- Arxiv.get EN query (Just 1)
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
......@@ -379,3 +425,38 @@ testIsidoreRealWorld_02 getPubmedKey = do
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> isJust (_hd_title x) @?= True
--
-- Istex integration tests
--
testIstexRealWorld_01 :: IO (Maybe PubmedApiKey) -> Assertion
testIstexRealWorld_01 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"brain\"" $ \query -> do
res <- Istex.get EN query (Just 1)
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> assertBool ("found: " <> show (_hd_title x))
(maybe False (T.isInfixOf "brain") (_hd_title x))
testIstexRealWorld_02 :: IO (Maybe PubmedApiKey) -> Assertion
testIstexRealWorld_02 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"brain\" AND NOT \"neural\"" $ \query -> do
res <- Istex.get EN query (Just 1)
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> isJust (_hd_title x) @?= True
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment