Commit d8028550 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

WIP - start porting Pubmed queries

parent d8612190
Pipeline #4178 failed with stages
in 40 minutes and 41 seconds
......@@ -54,6 +54,7 @@ library
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
......@@ -190,7 +191,6 @@ library
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
......
......@@ -82,6 +82,7 @@ library:
- Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.API
- Gargantext.Core.Text.Corpus.API.Arxiv
- Gargantext.Core.Text.Corpus.API.Pubmed
- Gargantext.Core.Text.Corpus.Query
- Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV
......
......@@ -9,6 +9,7 @@ import Gargantext.Core.Types
import Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Network.Api.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed
import Test.Tasty
import Test.Tasty.HUnit
......@@ -38,6 +39,14 @@ tests = testGroup "Boolean Query Engine" [
, testCase "It supports 'A OR NOT B'" testArxiv05
, testCase "It supports '-A'" testArxiv06
]
, testGroup "PUBMED expression converter" [
testCase "It supports 'A'" testPubMed01
, testCase "It supports '-A'" testPubMed02_01
, testCase "It supports 'NOT A'" testPubMed02_02
, testCase "It supports 'NOT (NOT A)'" testPubMed02_03
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testPubMed03
, testCase "It supports 'A OR B'" testPubMed04
]
]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
......@@ -158,3 +167,29 @@ testArxiv06 = withValidQuery "-A" $ \q ->
Arxiv.AndNot (Arxiv.Exp $ Arxiv.Abs ["A"]) (Arxiv.Exp $ Arxiv.Abs ["A"])
)
)
--
-- PUBMED tests
--
testPubMed01 :: Assertion
testPubMed01 = withValidQuery "A" $ \q ->
assertBool ("Query not converted into expression: " <> show @(BoolExpr Term) (fromCNF $ getQuery q))
(Pubmed.getESearch (Pubmed.convertQuery q) == "%23A")
testPubMed02_01 :: Assertion
testPubMed02_01 = withValidQuery "-A" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23NOT+A"
testPubMed02_02 :: Assertion
testPubMed02_02 = withValidQuery "NOT A" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23NOT+A"
testPubMed02_03 :: Assertion
testPubMed02_03 = withValidQuery "NOT (NOT A)" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23A"
testPubMed03 :: Assertion
testPubMed03 = withValidQuery "\"Haskell\" AND \"Idris\"" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23Haskell+AND+Idris"
testPubMed04 :: Assertion
testPubMed04 = withValidQuery "A OR B" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23A+OR+B"
......@@ -57,10 +57,10 @@ get cfg externalAPI la q limit = do
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of
PubMed -> first ExternalAPIError <$>
PUBMED.get (cfg ^. gc_pubmed_api_key) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
PUBMED.get (cfg ^. gc_pubmed_api_key) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery (Corpus.getLimit <$> limit)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
......
......@@ -77,9 +77,9 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
-- by default: 10K docs
get :: Lang
-> Corpus.Query
-> Maybe Arxiv.Limit
-> Maybe Corpus.Limit
-> IO (Maybe Integer, ConduitT () HyperdataDocument IO ())
get la (convertQuery -> query) limit = do
get la (convertQuery -> query) (fmap getLimit -> limit) = do
(cnt, resC) <- case limit of
Nothing -> Arxiv.searchAxv' query
(Just l) -> do (cnt, res) <- Arxiv.searchAxv' query
......
......@@ -9,21 +9,33 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Text.Corpus.API.Pubmed
( get )
( get
-- * Internals for testing
, ESearch(..)
, convertQuery
, getESearch
)
where
import Conduit
import Control.Monad.Reader (runReaderT)
import Data.Either (Either)
import Data.Maybe
import Data.Semigroup
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client (ClientError)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified PUBMED as PubMed
......@@ -31,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc
import PUBMED.Types (Config(..))
type Query = Text
type Limit = Int
-- | A pubmed query.
-- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
newtype ESearch = ESearch { _ESearch :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
getESearch :: ESearch -> Text
getESearch (ESearch items) =
Text.replace "term=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("term", QE "#" : items)
]
convertQuery :: Corpus.Query -> ESearch
convertQuery q = ESearch (interpretQuery q transformAST)
where
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
-- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs
get :: Text
-> Query
-> Corpus.Query
-> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get apiKey q l = do
-- The documentation for PUBMED says:
-- Values for query keys may also be provided in term if they are preceeded by a
-- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
-- any number of query keys can be combined in term. Also, if query keys are provided in term,
-- they can be combined with OR or NOT in addition to AND.
-- Example:
-- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
--
-- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey
, query = q
, query = getESearch $ convertQuery q
, perPage = Just 200
, mWebEnv = Nothing })
let takeLimit = case l of
Nothing -> mapC identity
Just l' -> takeC $ fromIntegral l'
Just l' -> takeC $ getLimit l'
pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes
--either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
-- <$> PubMed.getMetadataWithC q l
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment