Commit 9483b472 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Isidore conversion to bool query API

parent 5a5185cb
...@@ -84,6 +84,7 @@ library ...@@ -84,6 +84,7 @@ library
Gargantext.Core.Text.Corpus.API Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.EPO Gargantext.Core.Text.Corpus.API.EPO
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Pubmed Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
...@@ -229,7 +230,6 @@ library ...@@ -229,7 +230,6 @@ library
Gargantext.Core.Statistics Gargantext.Core.Statistics
Gargantext.Core.Text.Convert Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Hal Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.Parsers.Book Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
......
...@@ -67,12 +67,12 @@ get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do ...@@ -67,12 +67,12 @@ get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
ExceptT $ fmap Right (Arxiv.get la corpusQuery limit) ExceptT $ fmap Right (Arxiv.get la corpusQuery limit)
HAL -> HAL ->
first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do IsTex -> do
docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do Isidore -> runExceptT $ do
docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing corpusQuery <- ExceptT (pure parse_query)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) ExceptT $ first ExternalAPIError <$> ISIDORE.get la limit corpusQuery Nothing
EPO -> do EPO -> do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit
where where
......
...@@ -9,46 +9,112 @@ Portability : POSIX ...@@ -9,46 +9,112 @@ Portability : POSIX
-} -}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Corpus.API.Isidore where module Gargantext.Core.Text.Corpus.API.Isidore (
get
, isidore2csvFile
-- * Internals for testing
, IsidoreQuery(..)
, getIsidoreQuery
, convertQuery
) where
import Data.Conduit
import Data.Conduit.Combinators (yieldMany)
import Data.Monoid
import Data.Semigroup
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText) import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv) import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore import Isidore qualified as Isidore
import Isidore.Client import Isidore.Client
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client import Servant.Client
import qualified Data.Text.Encoding as TE
-- | TODO work with the ServantErr newtype IsidoreQuery = IsidoreQuery { _IsidoreQuery :: [EscapeItem] }
get :: Lang -> Maybe Isidore.Limit deriving stock (Show, Eq)
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery deriving newtype (Semigroup, Monoid)
-> IO [HyperdataDocument]
get la l q a = do -- | Returns an /url encoded/ query ready to be sent to pubmed.
getIsidoreQuery :: IsidoreQuery -> Text
getIsidoreQuery (IsidoreQuery items) =
Text.replace "q=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("q", items)
]
convertQuery :: Corpus.Query -> IsidoreQuery
convertQuery q = IsidoreQuery (interpretQuery q transformAST)
where
-- It seems like Isidore supports a similar query language to Pubmed.
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
get :: Lang
-> Maybe Corpus.Limit
-> Corpus.Query
-> Maybe Isidore.AuthorQuery
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la l (convertQuery -> q) a = do
let let
printErr (DecodeFailure e _) = panic e
printErr e = panic (show e)
toIsidoreDocs :: Reply -> [IsidoreDoc] toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r] toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a mb_iDocs <- Isidore.get (getLimit <$> l) (Just $ getIsidoreQuery q) a
case mb_iDocs of
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs) Left err -> pure $ Left err
pure hDocs Right iDocs -> do
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs $ _content iDocs)
pure $ Right (Just $ fromIntegral $ length hDocs, yieldMany hDocs)
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit isidore2csvFile :: FilePath
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery -> Lang
-> IO () -> Maybe Corpus.Limit
-> Corpus.Query
-> Maybe Isidore.AuthorQuery
-> IO ()
isidore2csvFile fp la li tq aq = do isidore2csvFile fp la li tq aq = do
hdocs <- get la li tq aq mb_hdocs <- get la li tq aq
writeDocs2Csv fp hdocs case mb_hdocs of
Left err -> throwIO err
Right (_, hdocsC) -> do
hdocs <- sourceToList hdocsC
writeDocs2Csv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc l (IsidoreDoc t a d u s as) = do isidoreToDoc l (IsidoreDoc t a d u s as) = do
......
...@@ -83,7 +83,6 @@ convertQuery q = ESearch (interpretQuery q transformAST) ...@@ -83,7 +83,6 @@ convertQuery q = ESearch (interpretQuery q transformAST)
-> mempty -> mempty
BConst (Positive (Term term)) BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)] -> [QE (TE.encodeUtf8 term)]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term)) BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)] -> [QN "NOT+", QE (TE.encodeUtf8 term)]
......
...@@ -6,6 +6,7 @@ module Test.Core.Text.Corpus.Query (tests) where ...@@ -6,6 +6,7 @@ module Test.Core.Text.Corpus.Query (tests) where
import Data.BoolExpr import Data.BoolExpr
import Data.Conduit import Data.Conduit
import Data.Maybe
import Data.String import Data.String
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Query import Gargantext.Core.Text.Corpus.Query
...@@ -16,6 +17,7 @@ import System.Environment ...@@ -16,6 +17,7 @@ import System.Environment
import qualified Data.Text as T import qualified Data.Text as T
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed
import qualified Gargantext.Core.Text.Corpus.API.Isidore as Isidore
import qualified Network.Api.Arxiv as Arxiv import qualified Network.Api.Arxiv as Arxiv
import Test.Tasty import Test.Tasty
...@@ -67,6 +69,14 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey -> ...@@ -67,6 +69,14 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testPubMed03 , testCase "It supports '\"Haskell\" AND \"Idris\"'" testPubMed03
, testCase "It supports 'A OR B'" testPubMed04 , testCase "It supports 'A OR B'" testPubMed04
] ]
, testGroup "Isidore expression converter" [
testCase "It supports 'A'" testIsidore01
, testCase "It supports '-A'" testIsidore02_01
, testCase "It supports 'NOT A'" testIsidore02_02
, testCase "It supports 'NOT (NOT A)'" testIsidore02_03
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testIsidore03
, testCase "It supports 'A OR B'" testIsidore04
]
, testGroup "PUBMED real queries (skipped if PUBMED_API_KEY env var not set)" [ , testGroup "PUBMED real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"Covid\"" (testPubMedCovid_01 getPubmedKey) testCase "It searches for \"Covid\"" (testPubMedCovid_01 getPubmedKey)
, testCase "It searches for \"Covid\" AND \"Alzheimer\"" (testPubMedCovid_02 getPubmedKey) , testCase "It searches for \"Covid\" AND \"Alzheimer\"" (testPubMedCovid_02 getPubmedKey)
...@@ -77,6 +87,11 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey -> ...@@ -77,6 +87,11 @@ tests = withResource pubmedSettings (const (pure ())) $ \getPubmedKey ->
testCase "It searches for \"Haskell\"" (testArxivRealWorld_01 getPubmedKey) testCase "It searches for \"Haskell\"" (testArxivRealWorld_01 getPubmedKey)
, testCase "It searches for \"Haskell\" AND \"Agda\"" (testArxivRealWorld_02 getPubmedKey) , testCase "It searches for \"Haskell\" AND \"Agda\"" (testArxivRealWorld_02 getPubmedKey)
] ]
-- .. ditto for Isidore
, testGroup "Isidore real queries (skipped if PUBMED_API_KEY env var not set)" [
testCase "It searches for \"chicken pox\"" (testIsidoreRealWorld_01 getPubmedKey)
, testCase "It searches for \"Dante\" AND \"Petrarca\"" (testIsidoreRealWorld_02 getPubmedKey)
]
] ]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form, -- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
...@@ -245,6 +260,36 @@ testPubMed04 :: Assertion ...@@ -245,6 +260,36 @@ testPubMed04 :: Assertion
testPubMed04 = withValidQuery "A OR B" $ \q -> testPubMed04 = withValidQuery "A OR B" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "A+OR+B" Pubmed.getESearch (Pubmed.convertQuery q) @?= "A+OR+B"
--
-- Isidore tests
--
testIsidore01 :: Assertion
testIsidore01 = withValidQuery "A" $ \q ->
assertBool ("Query not converted into expression: " <> show @(BoolExpr Term) (fromCNF $ getQuery q))
(Isidore.getIsidoreQuery (Isidore.convertQuery q) == "A")
testIsidore02_01 :: Assertion
testIsidore02_01 = withValidQuery "-A" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "NOT+A"
testIsidore02_02 :: Assertion
testIsidore02_02 = withValidQuery "NOT A" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "NOT+A"
testIsidore02_03 :: Assertion
testIsidore02_03 = withValidQuery "NOT (NOT A)" $ \q -> Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "A"
testIsidore03 :: Assertion
testIsidore03 = withValidQuery "\"Haskell\" AND \"Idris\"" $ \q ->
Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "Haskell+AND+Idris"
testIsidore04 :: Assertion
testIsidore04 = withValidQuery "A OR B" $ \q ->
Isidore.getIsidoreQuery (Isidore.convertQuery q) @?= "A+OR+B"
--
-- Integration tests against the real services
--
testPubMedCovid_01 :: IO (Maybe PubmedApiKey) -> Assertion testPubMedCovid_01 :: IO (Maybe PubmedApiKey) -> Assertion
testPubMedCovid_01 getPubmedKey = do testPubMedCovid_01 getPubmedKey = do
mb_key <- getPubmedKey mb_key <- getPubmedKey
...@@ -258,7 +303,8 @@ testPubMedCovid_01 getPubmedKey = do ...@@ -258,7 +303,8 @@ testPubMedCovid_01 getPubmedKey = do
hyperDocs <- sourceToList cnd hyperDocs <- sourceToList cnd
case hyperDocs of case hyperDocs of
[] -> fail "No documents found." [] -> fail "No documents found."
(x:_) -> _hd_title x @?= Just "Being a Hospice Nurse in Times of the COVID-19 Pandemic: A Phenomenological Study of Providing End-of-Life Care." (x:_) -> assertBool ("Found: " <> show (_hd_title x))
(maybe False (T.isInfixOf "COVID") (_hd_title x))
testPubMedCovid_02 :: IO (Maybe PubmedApiKey) -> Assertion testPubMedCovid_02 :: IO (Maybe PubmedApiKey) -> Assertion
testPubMedCovid_02 getPubmedKey = do testPubMedCovid_02 getPubmedKey = do
...@@ -298,3 +344,38 @@ testArxivRealWorld_02 getPubmedKey = do ...@@ -298,3 +344,38 @@ testArxivRealWorld_02 getPubmedKey = do
case hyperDocs of case hyperDocs of
[] -> fail "No documents found." [] -> fail "No documents found."
(x:_) -> _hd_title x @?= Just "Toward Hole-Driven Development with Liquid Haskell" (x:_) -> _hd_title x @?= Just "Toward Hole-Driven Development with Liquid Haskell"
--
-- Isidore integration tests
--
testIsidoreRealWorld_01 :: IO (Maybe PubmedApiKey) -> Assertion
testIsidoreRealWorld_01 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"Chicken Pox\"" $ \query -> do
res <- Isidore.get EN (Just 1) query Nothing
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> assertBool ("found: " <> show (_hd_title x))
(maybe False (T.isInfixOf "chicken pox") (_hd_title x))
testIsidoreRealWorld_02 :: IO (Maybe PubmedApiKey) -> Assertion
testIsidoreRealWorld_02 getPubmedKey = do
mb_key <- getPubmedKey
case mb_key of
Nothing -> pure ()
Just _ -> withValidQuery "\"dante\" AND \"petrarca\"" $ \query -> do
res <- Isidore.get EN (Just 1) query Nothing
case res of
Left err -> fail (show err)
Right (_, cnd) -> do
hyperDocs <- sourceToList cnd
case hyperDocs of
[] -> fail "No documents found."
(x:_) -> isJust (_hd_title x) @?= True
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment