Commit 9483b472 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Isidore conversion to bool query API

parent 5a5185cb
......@@ -84,6 +84,7 @@ library
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query
......@@ -229,7 +230,6 @@ library
Gargantext.Core.Statistics
Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
......
......@@ -70,9 +70,9 @@ get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
IsTex -> do
docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do
docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> runExceptT $ do
corpusQuery <- ExceptT (pure parse_query)
ExceptT $ first ExternalAPIError <$> ISIDORE.get la limit corpusQuery Nothing
EPO -> do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit
where
......
......@@ -9,45 +9,111 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Corpus.API.Isidore where
module Gargantext.Core.Text.Corpus.API.Isidore (
get
, isidore2csvFile
-- * Internals for testing
, IsidoreQuery(..)
, getIsidoreQuery
, convertQuery
) where
import Data.Conduit
import Data.Conduit.Combinators (yieldMany)
import Data.Monoid
import Data.Semigroup
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore
import Isidore.Client
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client
import qualified Data.Text.Encoding as TE
-- | TODO work with the ServantErr
get :: Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO [HyperdataDocument]
get la l q a = do
newtype IsidoreQuery = IsidoreQuery { _IsidoreQuery :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
getIsidoreQuery :: IsidoreQuery -> Text
getIsidoreQuery (IsidoreQuery items) =
Text.replace "q=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("q", items)
]
convertQuery :: Corpus.Query -> IsidoreQuery
convertQuery q = IsidoreQuery (interpretQuery q transformAST)
where
-- It seems like Isidore supports a similar query language to Pubmed.
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
get :: Lang
-> Maybe Corpus.Limit
-> Corpus.Query
-> Maybe Isidore.AuthorQuery
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la l (convertQuery -> q) a = do
let
printErr (DecodeFailure e _) = panic e
printErr e = panic (show e)
toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs
mb_iDocs <- Isidore.get (getLimit <$> l) (Just $ getIsidoreQuery q) a
case mb_iDocs of
Left err -> pure $ Left err
Right iDocs -> do
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs $ _content iDocs)
pure $ Right (Just $ fromIntegral $ length hDocs, yieldMany hDocs)
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
isidore2csvFile :: FilePath
-> Lang
-> Maybe Corpus.Limit
-> Corpus.Query
-> Maybe Isidore.AuthorQuery
-> IO ()
isidore2csvFile fp la li tq aq = do
hdocs <- get la li tq aq
mb_hdocs <- get la li tq aq
case mb_hdocs of
Left err -> throwIO err
Right (_, hdocsC) -> do
hdocs <- sourceToList hdocsC
writeDocs2Csv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
......
......@@ -83,7 +83,6 @@ convertQuery q = ESearch (interpretQuery q transformAST)
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
......
......@@ -6,6 +6,7 @@ module Test.Core.Text.Corpus.Query (tests) where
import Data.BoolExpr
import Data.Conduit
import Data.Maybe
import Data.String
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Query
......@@ -16,6 +17,7 @@ import System.Environment
import qualified Data.Text as T
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed
import qualified Gargantext.Core.Text.Corpus.API.Isidore as Isidore
import qualified Network.Api.Arxiv as Arxiv
import Test.Tasty
......@@ -67,6 +69,14 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testPubMed03
, testCase "It supports 'A OR B'" testPubMed04
]
, testGroup "Isidore expression converter" [
testCase "It supports 'A'" testIsidore01
, testCase "It supports '-A'" testIsidore02_01
, testCase "It supports 'NOT A'" testIsidore02_02
, testCase "It supports 'NOT (NOT A)'" testIsidore02_03
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testIsidore03
, testCase "It supports 'A OR B'" testIsidore04
]
, testGroup "PUBMED real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"Covid\"" (testPubMedCovid_01 getPubmedKey)
, testCase "It searches for \"Covid\" AND \"Alzheimer\"" (testPubMedCovid_02 getPubmedKey)
......@@ -77,6 +87,11 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
testCase "It searches for \"Haskell\"" (testArxivRealWorld_01 getPubmedKey)
, testCase "It searches for \"Haskell\" AND \"Agda\"" (testArxivRealWorld_02 getPubmedKey)
]
-- .. ditto for Isidore
, testGroup "Isidore real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"chicken pox\"" (testIsidoreRealWorld_01 getPubmedKey)
, testCase "It searches for \"Dante\" AND \"Petrarca\"" (testIsidoreRealWorld_02 getPubmedKey)
]
]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
......@@ -245,6 +260,36 @@ testPubMed04 :: Assertion
testPubMed04 = withValidQuery "A OR B" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "A+OR+B"
--
-- Isidore tests
--
testIsidore01 :: Assertion
testIsidore01 = withValidQuery "A" $ \q ->
assertBool ("Query not converted into expression: " <> show @(BoolExpr Term) (fromCNF $ getQuery q))
(Isidore.getIsidoreQuery (Isidore.convertQuery q) == "A")
testIsidore02_01 :: Assertion
testIsidore02_01 = withValidQuery "-A" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "NOT+A"
testIsidore02_02 :: Assertion
testIsidore02_02 = withValidQuery "NOT A" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "NOT+A"
testIsidore02_03 :: Assertion
testIsidore02_03 = withValidQuery "NOT (NOT A)" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "A"
testIsidore03 :: Assertion
testIsidore03 = withValidQuery "\"Haskell\" AND \"Idris\"" $ \q ->
Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "Haskell+AND+Idris"
testIsidore04 :: Assertion
testIsidore04 = withValidQuery "A OR B" $ \q ->
Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "A+OR+B"
--
-- Integration tests against the real services
--
testPubMedCovid_01 :: IO (Maybe PubmedApiKey) -> Assertion
testPubMedCovid_01 getPubmedKey = do
mb_key <- getPubmedKey
......@@ -258,7 +303,8 @@ testPubMedCovid_01 getPubmedKey = do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> _hd_title x @?= Just "Being a Hospice Nurse in Times of the COVID-19 Pandemic: A Phenomenological Study of Providing End-of-Life Care."
(x:_) -> assertBool ("Found: " <> show (_hd_title x))
(maybe False (T.isInfixOf "COVID") (_hd_title x))
testPubMedCovid_02 :: IO (Maybe PubmedApiKey) -> Assertion
testPubMedCovid_02 getPubmedKey = do
......@@ -298,3 +344,38 @@ testArxivRealWorld_02 getPubmedKey = do
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> _hd_title x @?= Just "Toward Hole-Driven Development with Liquid Haskell"
--
-- Isidore integration tests
--
testIsidoreRealWorld_01 :: IO (Maybe PubmedApiKey) -> Assertion
testIsidoreRealWorld_01 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"Chicken Pox\"" $ \query -> do
res <- Isidore.get EN (Just 1) query Nothing
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> assertBool ("found: " <> show (_hd_title x))
(maybe False (T.isInfixOf "chicken pox") (_hd_title x))
testIsidoreRealWorld_02 :: IO (Maybe PubmedApiKey) -> Assertion
testIsidoreRealWorld_02 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"dante\" AND \"petrarca\"" $ \query -> do
res <- Isidore.get EN (Just 1) query Nothing
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> isJust (_hd_title x) @?= True
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment