Commit d8028550 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

WIP - start porting Pubmed queries

parent d8612190
Pipeline #4178 failed with stages
in 40 minutes and 41 seconds
...@@ -54,6 +54,7 @@ library ...@@ -54,6 +54,7 @@ library
Gargantext.Core.Text.Context Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV Gargantext.Core.Text.Corpus.Parsers.CSV
...@@ -190,7 +191,6 @@ library ...@@ -190,7 +191,6 @@ library
Gargantext.Core.Text.Corpus.API.Hal Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers.Book Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
......
...@@ -82,6 +82,7 @@ library: ...@@ -82,6 +82,7 @@ library:
- Gargantext.Core.Text.Context - Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.API - Gargantext.Core.Text.Corpus.API
- Gargantext.Core.Text.Corpus.API.Arxiv - Gargantext.Core.Text.Corpus.API.Arxiv
- Gargantext.Core.Text.Corpus.API.Pubmed
- Gargantext.Core.Text.Corpus.Query - Gargantext.Core.Text.Corpus.Query
- Gargantext.Core.Text.Corpus.Parsers - Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV - Gargantext.Core.Text.Corpus.Parsers.CSV
......
...@@ -9,6 +9,7 @@ import Gargantext.Core.Types ...@@ -9,6 +9,7 @@ import Gargantext.Core.Types
import Prelude import Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Network.Api.Arxiv as Arxiv import qualified Network.Api.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as Pubmed
import Test.Tasty import Test.Tasty
import Test.Tasty.HUnit import Test.Tasty.HUnit
...@@ -38,6 +39,14 @@ tests = testGroup "Boolean Query Engine" [ ...@@ -38,6 +39,14 @@ tests = testGroup "Boolean Query Engine" [
, testCase "It supports 'A OR NOT B'" testArxiv05 , testCase "It supports 'A OR NOT B'" testArxiv05
, testCase "It supports '-A'" testArxiv06 , testCase "It supports '-A'" testArxiv06
] ]
, testGroup "PUBMED expression converter" [
testCase "It supports 'A'" testPubMed01
, testCase "It supports '-A'" testPubMed02_01
, testCase "It supports 'NOT A'" testPubMed02_02
, testCase "It supports 'NOT (NOT A)'" testPubMed02_03
, testCase "It supports '\"Haskell\" AND \"Idris\"'" testPubMed03
, testCase "It supports 'A OR B'" testPubMed04
]
] ]
-- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form, -- | Checks that the 'RawQuery' can be translated into the expected 'BoolExpr' form,
...@@ -158,3 +167,29 @@ testArxiv06 = withValidQuery "-A" $ \q -> ...@@ -158,3 +167,29 @@ testArxiv06 = withValidQuery "-A" $ \q ->
Arxiv.AndNot (Arxiv.Exp $ Arxiv.Abs ["A"]) (Arxiv.Exp $ Arxiv.Abs ["A"]) Arxiv.AndNot (Arxiv.Exp $ Arxiv.Abs ["A"]) (Arxiv.Exp $ Arxiv.Abs ["A"])
) )
) )
--
-- PUBMED tests
--
testPubMed01 :: Assertion
testPubMed01 = withValidQuery "A" $ \q ->
assertBool ("Query not converted into expression: " <> show @(BoolExpr Term) (fromCNF $ getQuery q))
(Pubmed.getESearch (Pubmed.convertQuery q) == "%23A")
testPubMed02_01 :: Assertion
testPubMed02_01 = withValidQuery "-A" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23NOT+A"
testPubMed02_02 :: Assertion
testPubMed02_02 = withValidQuery "NOT A" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23NOT+A"
testPubMed02_03 :: Assertion
testPubMed02_03 = withValidQuery "NOT (NOT A)" $ \q -> Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23A"
testPubMed03 :: Assertion
testPubMed03 = withValidQuery "\"Haskell\" AND \"Idris\"" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23Haskell+AND+Idris"
testPubMed04 :: Assertion
testPubMed04 = withValidQuery "A OR B" $ \q ->
Pubmed.getESearch (Pubmed.convertQuery q) @?= "%23A+OR+B"
...@@ -57,10 +57,10 @@ get cfg externalAPI la q limit = do ...@@ -57,10 +57,10 @@ get cfg externalAPI la q limit = do
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err) Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of Right corpusQuery -> case externalAPI of
PubMed -> first ExternalAPIError <$> PubMed -> first ExternalAPIError <$>
PUBMED.get (cfg ^. gc_pubmed_api_key) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) PUBMED.get (cfg ^. gc_pubmed_api_key) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default --docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs) --pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery (Corpus.getLimit <$> limit) Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
......
...@@ -77,9 +77,9 @@ convertQuery q = mkQuery (interpretQuery q transformAST) ...@@ -77,9 +77,9 @@ convertQuery q = mkQuery (interpretQuery q transformAST)
-- by default: 10K docs -- by default: 10K docs
get :: Lang get :: Lang
-> Corpus.Query -> Corpus.Query
-> Maybe Arxiv.Limit -> Maybe Corpus.Limit
-> IO (Maybe Integer, ConduitT () HyperdataDocument IO ()) -> IO (Maybe Integer, ConduitT () HyperdataDocument IO ())
get la (convertQuery -> query) limit = do get la (convertQuery -> query) (fmap getLimit -> limit) = do
(cnt, resC) <- case limit of (cnt, resC) <- case limit of
Nothing -> Arxiv.searchAxv' query Nothing -> Arxiv.searchAxv' query
(Just l) -> do (cnt, res) <- Arxiv.searchAxv' query (Just l) -> do (cnt, res) <- Arxiv.searchAxv' query
......
...@@ -9,21 +9,33 @@ Portability : POSIX ...@@ -9,21 +9,33 @@ Portability : POSIX
-} -}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Text.Corpus.API.Pubmed module Gargantext.Core.Text.Corpus.API.Pubmed
( get ) ( get
-- * Internals for testing
, ESearch(..)
, convertQuery
, getESearch
)
where where
import Conduit import Conduit
import Control.Monad.Reader (runReaderT) import Control.Monad.Reader (runReaderT)
import Data.Either (Either) import Data.Either (Either)
import Data.Maybe import Data.Maybe
import Data.Semigroup
import Data.Monoid
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client (ClientError) import Servant.Client (ClientError)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified PUBMED as PubMed import qualified PUBMED as PubMed
...@@ -31,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc ...@@ -31,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc
import PUBMED.Types (Config(..)) import PUBMED.Types (Config(..))
type Query = Text -- | A pubmed query.
type Limit = Int -- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
newtype ESearch = ESearch { _ESearch :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
getESearch :: ESearch -> Text
getESearch (ESearch items) =
Text.replace "term=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("term", QE "#" : items)
]
convertQuery :: Corpus.Query -> ESearch
convertQuery q = ESearch (interpretQuery q transformAST)
where
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
-- | TODO put default pubmed query in gargantext.ini -- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs -- by default: 10K docs
get :: Text get :: Text
-> Query -> Corpus.Query
-> Maybe Limit -> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get apiKey q l = do get apiKey q l = do
-- The documentation for PUBMED says:
-- Values for query keys may also be provided in term if they are preceeded by a
-- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
-- any number of query keys can be combined in term. Also, if query keys are provided in term,
-- they can be combined with OR or NOT in addition to AND.
-- Example:
-- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
--
-- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey
, query = q , query = getESearch $ convertQuery q
, perPage = Just 200 , perPage = Just 200
, mWebEnv = Nothing }) , mWebEnv = Nothing })
let takeLimit = case l of let takeLimit = case l of
Nothing -> mapC identity Nothing -> mapC identity
Just l' -> takeC $ fromIntegral l' Just l' -> takeC $ getLimit l'
pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes
--either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN)) --either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
-- <$> PubMed.getMetadataWithC q l -- <$> PubMed.getMetadataWithC q l
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment