Commit ecc4fd6b authored by Mael NICOLAS's avatar Mael NICOLAS

Merge branch 'patch' into 'dev'

Patch

See merge request !2
parents 3f2a662a 400374d7
.stack-work/
pubMedCrawler.cabal
*~
\ No newline at end of file
# Changelog for pubMedCrawler
## Unreleased changes
Copyright Author name here (c) 2019
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Author name here nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# PUBMED API Crawler
## API documentation
https://www.ncbi.nlm.nih.gov/books/NBK25501/
## Usage
### Entry function
The basic entry point of this crawler is the function `PUBMED.getMetadataWith`.
This function takes a `Text` representing the query you want to run on PubMed
and a `Maybe Int` representing the maximum number of results you want to get.
### Return Type
The return type is a bit more tricky: it is **either** a `Text` representing an error or a collection of `PUBMED.Parser` `PubMed` values.
`PubMed` is a simple type that contains all the information we need (id, title, abstract, publicationDate, sources).
### Example
Here is a basic main using the entry point of the crawler and printing the first 5 documents.
```hs
{-# LANGUAGE OverloadedStrings #-}
module Main where
import PUBMED (getMetadataWith)
main :: IO ()
main = getMetadataWith "bisphenol" (Just 5) >>= print
```
-- Standard Cabal Setup script: delegate everything to the default
-- Simple build type (see build-type: Simple in the cabal file).
import Distribution.Simple
main = defaultMain
{-# LANGUAGE OverloadedStrings #-}
-- | Demo executable: query PubMed for "bisphenol" with a very large
-- result cap and print whatever comes back.
module Main where

import PUBMED (getMetadataWith)

main :: IO ()
main = getMetadataWith "bisphenol" (Just 1000000) >>= print
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.31.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: 4bb73f43a66d509480c9a672fe457ad8be7cb2d27c8b0892e234b8ac088a2a44
name: crawlerPubMed
version: 0.1.0.0
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
homepage: https://github.com/gitlab/crawlerPubMed#readme
bug-reports: https://github.com/gitlab/crawlerPubMed/issues
author: CNRS Gargantext
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
license: BSD3
license-file: LICENSE
build-type: Simple
extra-source-files:
README.md
ChangeLog.md
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
library
exposed-modules:
PUBMED
PUBMED.Client
PUBMED.Parser
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
src
build-depends:
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
main-is: Main.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
attoparsec
, base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, either
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
name: crawlerPubMed
version: 0.1.0.0
github: "gitlab/crawlerPubMed"
license: BSD3
author: "CNRS Gargantext"
maintainer: "contact@gargantext.org"
copyright: "2019 CNRS/IMT"
extra-source-files:
- README.md
- ChangeLog.md
# Metadata used when publishing your package
# synopsis: Short description of your package
# category: Web
# To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file.
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies:
- base >= 4.7 && < 5
- servant
- servant-client
- text
- bytestring
- xml-conduit
- http-client
- http-client-tls
- http-media
- exceptions
- conduit
- xml-types
- time
- data-time-segment
- protolude
- attoparsec
- either
library:
source-dirs: src
executables:
crawlerPubMed-exe:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
dependencies:
- crawlerPubMed
tests:
crawlerPubMed-test:
main: Spec.hs
source-dirs: test
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
dependencies:
- crawlerPubMed
{-# LANGUAGE OverloadedStrings #-}
module PUBMED where
import Prelude hiding (takeWhile)
import Data.Text (Text)
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import Text.XML.Stream.Parse (XmlException)
import Data.Conduit (ConduitT)
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Monad.Catch (catch, MonadThrow, Exception)
import Control.Applicative
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as DB
import qualified Data.Text as T
-- | API main function: run a PubMed query and fetch the metadata of the
-- matching documents.  The first argument is the search term, the second
-- an optional cap on the number of results.
getMetadataWith :: Text -> Maybe Limit -> IO (Either Text [PubMed])
getMetadataWith query limit = runSimpleFindPubmedAbstractRequest query limit
-- | Parse a lazy XML document and hand its root 'Cursor' to the given
-- extraction function.  Throws on malformed XML (via 'parseLBS_').
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser extract raw = extract (fromDocument (parseLBS_ def raw))
-- | Strip a fixed set of inline markup tags (\<sub\>, \<sup\>, \<i\>, \<b\>
-- and their closing counterparts) from the input, keeping all other text.
--
-- TODO: this parser needs at least one such tag before the end of input
-- for 'textWithBalise' to succeed (consider anchoring with 'endOfInput').
removeSub :: Parser ByteString
removeSub = do
  dt <- many textWithBalise          -- text chunks, each terminated by a dropped tag
  eo <- manyTill anyChar endOfInput  -- trailing text after the last tag
  pure $ LBS.fromStrict $ pack $ concat dt <> eo
  where
    -- Consume characters up to (and including) the next opening/closing tag;
    -- the tag itself is discarded, the preceding characters are kept.
    textWithBalise =
      manyTill anyChar (sub <|> asub)
    -- Opening tags to drop.
    sub = string "<sub>"
      <|> string "<sup>"
      <|> string "<i>"
      <|> string "<b>"
    -- Matching closing tags to drop.
    asub = string "</sub>"
      <|> string "</sup>"
      <|> string "</i>"
      <|> string "</b>"
-- | Free-text search term sent to the esearch endpoint.
type Query = Text
-- | Maximum number of document ids requested (the esearch @retmax@ parameter).
type Limit = Integer
-- | Fetch abstracts for a list of document ids, batching the requests so
-- that a single efetch call never carries more than 300 ids.
--
-- Batch results are concatenated; the first failing batch short-circuits
-- with its error.  NOTE: the previous version combined batches with '<>'
-- on @IO (Either Text [PubMed])@, whose @Either@ Semigroup keeps only the
-- FIRST 'Right' — silently discarding every result past the first 300 ids.
runMultipleFPAR ::
     [Integer]
  -> IO (Either Text [PubMed])
runMultipleFPAR ids
  | length ids < 300 = runSimpleFetchPubmedAbstractRequest ids
  | otherwise = do
      batch <- runSimpleFetchPubmedAbstractRequest (Prelude.take 300 ids)
      rest  <- runMultipleFPAR (drop 300 ids)
      -- Concatenate the two Right payloads; a Left propagates as-is.
      pure $ (<>) <$> batch <*> rest
-- | Fetch the "abstract" rendering of the given PubMed ids through the
-- efetch endpoint and parse the XML payload into 'PubMed' records.
--
-- Servant/network failures and markup-stripping failures are reported as
-- 'Left'; an 'XmlException' from 'pubMedParser' is printed and yields an
-- empty result list rather than a hard failure.
runSimpleFetchPubmedAbstractRequest ::
     [Integer]
  -> IO (Either Text [PubMed])
runSimpleFetchPubmedAbstractRequest ids = do
  manager' <- newManager tlsManagerSettings
  res <- runClientM
           (fetch (Just "pubmed") (Just "abstract") ids)
           (mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
  case res of
    (Left err) -> pure (Left . T.pack $ show err)
    (Right (BsXml body)) ->
      -- Strip inline markup (<sub>, <sup>, <i>, <b>) before XML parsing.
      case parseOnly removeSub $ LBS.toStrict body of
        (Left err'') -> pure (Left $ T.pack err'')
        (Right v) -> do
          parsed <- catch (pubMedParser v) ((\e -> do
                      _ <- print e
                      pure []) :: XmlException -> IO [PubMed])
          -- was: Right <$> pure parsed (redundant fmap over pure)
          pure (Right parsed)
-- | Run an esearch query, extract the matching document ids from the XML
-- answer and fetch their abstracts batch by batch.
runSimpleFindPubmedAbstractRequest :: Text -> Maybe Limit -> IO (Either Text [PubMed])
runSimpleFindPubmedAbstractRequest query limit = do
  tlsManager <- newManager tlsManagerSettings
  let clientEnv = mkClientEnv tlsManager (BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
  searchResult <- runClientM (search (Just query) limit) clientEnv
  case searchResult of
    Left err -> pure . Left . T.pack $ show err
    Right (BsXml docs) -> runMultipleFPAR (runParser parseDocId docs)
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module PUBMED.Client where
import Servant.API
import Servant.Client
import Data.Proxy (Proxy(..))
import Data.ByteString.Lazy (ByteString)
import qualified Network.HTTP.Media as M
import qualified Data.Text as T
-- | Databases we know how to query (only PubMed for now).
data DB = PUBMED

-- | Raw XML payload returned by the NCBI eutils endpoints,
-- carried around without interpretation.
newtype BsXml = BsXml ByteString
  deriving (Show)

-- Accept and decode "text/xml" responses verbatim.
instance Accept BsXml where
  contentType _ = "text" M.// "xml"
instance MimeUnrender BsXml BsXml where
  mimeUnrender _ = Right . BsXml

-- | The two eutils endpoints used by the crawler:
--   * esearch.fcgi — free-text search returning matching document ids;
--   * efetch.fcgi  — fetch documents by id in a given rendering (rettype).
type PUBMEDAPI =
     "esearch.fcgi"
     -- :> QueryParam "db" DB
     -- not mandatory since the base db is pubmed
     :> QueryParam "term" T.Text
     :> QueryParam "retmax" Integer
     :> Get '[BsXml] BsXml
  :<|>
     "efetch.fcgi"
     :> QueryParam "db" T.Text
     :> QueryParam "rettype" T.Text
     :> QueryParams "id" Integer
     :> Get '[BsXml] BsXml

pubmedApi :: Proxy PUBMEDAPI
pubmedApi = Proxy

-- Client functions derived from 'PUBMEDAPI' by servant-client,
-- in the same order as the API alternatives above.
search :: Maybe T.Text -> Maybe Integer -> ClientM BsXml
fetch :: Maybe T.Text
      -> Maybe T.Text
      -> [Integer]
      -> ClientM BsXml
search :<|> fetch = client pubmedApi
{-# LANGUAGE OverloadedStrings #-}
module PUBMED.Parser where
import Text.XML.Stream.Parse
import qualified Text.XML.Cursor as C -- ((&/), (&//), Cursor, content, element)
import Text.XML (Name)
import Data.Either (rights)
import Data.Maybe (Maybe, fromJust, fromMaybe)
import Data.Monoid (mconcat)
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import GHC.IO (FilePath)
import Protolude (head)
import Prelude hiding (head)
import Control.Monad.Catch (MonadThrow)
import Control.Monad (join)
import qualified Data.Text as T
import qualified Data.Text.Read as T
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Conduit.List as CL
parseDocId :: C.Cursor -> [Integer]
parseDocId cursor = fst <$>
rights (T.decimal
<$> filter notNullOrEOL (rawElement cursor)
)
where rawElement = C.element "eSearchResult" C.&/ C.element "IdList" C.&// C.content
notNullOrEOL t = not (T.null t) && t /= "\n"
-- | Same as 'Prelude.id' (local alias kept for callers of this module).
identity :: a -> a
identity = id
-- | Skip (and discard) sibling subtrees until an element named @name@ is
-- reached, then run @f@ on its content.  Returns 'Nothing' when the tag
-- never appears at this level.
manyTagsUntil :: MonadThrow m =>
     Name
  -> ConduitT Event o m b
  -> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
  _ <- manyTagsUntil_ name                 -- drop everything before the wanted tag
  tagIgnoreAttrs (matching (== name)) f    -- then parse it, ignoring attributes
-- | Matcher that accepts any tag name EXCEPT the given one.
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (\candidate -> candidate /= name)
-- | Consume (and discard) every sibling subtree whose tag differs from
-- the given name.  Useful because streaming parsing has to consume every
-- event, wanted or not.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
-- | Like 'manyTagsUntil_' but only skips empty tags (no children),
-- not whole subtrees.
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
-- | One parsed PubMed article: its content plus its publication date.
data PubMed =
  PubMed { pubmed_article :: PubMedArticle
         , pubmed_date    :: PubMedDate
         } deriving Show

-- | Content fields extracted from the MedlineCitation/Article element.
-- Every field is a 'Maybe' because the corresponding tag may be absent.
data PubMedArticle =
  PubMedArticle { pubmed_title    :: Maybe T.Text
                , pubmed_journal  :: Maybe T.Text
                , pubmed_abstract :: Maybe [T.Text]  -- one entry per <AbstractText>
                , pubmed_authors  :: Maybe [Author]
                }
  deriving (Show)

-- | One entry of the AuthorList element.
data Author =
  Author {
    lastName    :: Maybe T.Text,
    foreName    :: Maybe T.Text,
    affiliation :: Maybe T.Text
  }
  deriving (Show)

-- | Publication date, kept both as a 'UTCTime' and as its raw
-- year/month/day components.
data PubMedDate =
  PubMedDate { pubmedDate_date  :: UTCTime
             , pubmedDate_year  :: Integer
             , pubmedDate_month :: Int
             , pubmedDate_day   :: Int
             } deriving (Show)
-- | Parse a PubMed XML export stored on disk into its articles.
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = DBL.readFile fp >>= pubMedParser
-- | Stream-parse a whole PubmedArticleSet document, collecting every
-- article yielded by 'parseArticleSet' into a list.
pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
                                 .| parseArticleSet
                                 .| CL.consume
-- | Parse the document root: a PubmedArticleSet element, yielding each
-- contained article downstream.  Fails if the root element is missing.
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet =
  force "PubmedArticleSet required" $ manyTagsUntil "PubmedArticleSet" $ manyYield parsePubMedArticle
-- | Parse the next PubmedArticle element, if any remain at this level.
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle =
  manyTagsUntil "PubmedArticle" parsePubMedArticle'
-- | Parse one PubmedArticle: its MedlineCitation (article body) and the
-- publication dates listed under PubmedData/History.
--
-- The last date listed in History wins; when PubmedData or History is
-- absent OR History is empty, the date defaults to year 1, month 1,
-- day 1.  NOTE: the previous version used @fromJust . head@ and crashed
-- on an empty \<History\> element (Protolude 'head' returns 'Nothing'
-- for an empty list); the bundled sample 'pubMedData' has exactly that
-- shape.
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
  article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
  dates <- tagIgnoreAttrs "PubmedData" $ do
    dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
      y' <- force "Year" $ tagIgnoreAttrs "Year" content
      m' <- force "Month" $ tagIgnoreAttrs "Month" content
      d' <- force "Day" $ tagIgnoreAttrs "Day" content
      _ <- many ignoreAnyTreeContent
      -- NOTE(review): 'read' is partial — malformed numbers would crash here.
      return (read $ unpack y', read $ unpack m', read $ unpack d')
    _ <- many ignoreAnyTreeContent
    return dates'
  _ <- many ignoreAnyTreeContent
  -- Take the last PubMedPubDate (head of the reversed list); fall back to
  -- the default when PubmedData/History is missing or the list is empty.
  let (y, m, d) = fromMaybe (1, 1, 1) ((head . reverse) =<< join dates)
  return $ PubMed article (PubMedDate (jour y m d) y m d)
-- | Parse the MedlineCitation element, keeping only its Article child and
-- draining every other sibling (PMID, DateRevised, ...).
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
  a <- force "article" $ manyTagsUntil "Article" parseArticle
  _ <- many ignoreAnyTreeContent   -- consume the remaining siblings
  return a
-- | Parse the Article element: journal title, article title, abstract
-- paragraphs and the author list.  All sub-elements except Journal are
-- optional ('manyTagsUntil' yields 'Nothing' when a tag is absent).
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
  journal <- force "journal" $
    manyTagsUntil "Journal" $ do
      j <- manyTagsUntil "Title" content
      _ <- many ignoreAnyTreeContent   -- drain the rest of <Journal>
      return j
  title <- manyTagsUntil "ArticleTitle" content
  -- Each <AbstractText> becomes one element of the abstract list.
  abstracts <- manyTagsUntil "Abstract" . many $ do
    txt <- tagIgnoreAttrs "AbstractText" content
    _ <- many ignoreAnyTreeContent
    return txt
  -- Authors: last name, fore name and (first) affiliation, all optional.
  authors <- manyTagsUntil "AuthorList" . many $
    tagIgnoreAttrs "Author" $ do
      ln <- manyTagsUntil "LastName" content
      fn <- manyTagsUntil "ForeName" content
      affi <- manyTagsUntil "AffiliationInfo" $ do
        aff <- manyTagsUntil "Affiliation" content
        _ <- many ignoreAnyTreeContent
        return aff
      _ <- many ignoreAnyTreeContent
      -- 'affi' is Maybe (Maybe Text): missing AffiliationInfo collapses to Nothing.
      return Author {lastName=ln, foreName=fn, affiliation=fromMaybe Nothing affi}
  _ <- many ignoreAnyTreeContent       -- drain everything after <AuthorList>
  return $ PubMedArticle title journal abstracts authors
-- | Hard-coded sample PubmedArticleSet document, handy for exercising
-- 'pubMedParser' / 'parseArticleSet' without network access.  Note the
-- empty <History> element under <PubmedData>.
pubMedData :: DBL.ByteString
pubMedData = mconcat
  [ "<?xml version=\"1.0\"?>\n"
  , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
  , "<PubmedArticleSet>\n"
  , "<PubmedArticle>\n"
  , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
  , " <PMID Version=\"1\">30357468</PMID>\n"
  , " <DateRevised>\n"
  , " <Year>2018</Year>\n"
  , " </DateRevised>\n"
  , " <Article PubModel=\"Print-Electronic\">\n"
  , " <Journal>\n"
  , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
  , " <Title>European journal of pediatrics</Title>\n"
  , " </Journal>\n"
  , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
  , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
  , " <Abstract>\n"
  , " <AbstractText>Abstract Text.</AbstractText>\n"
  , " </Abstract>\n"
  , " <AuthorList>\n"
  , " </AuthorList>\n"
  , " </Article>\n"
  , " </MedlineCitation>\n"
  , " <PubmedData>\n"
  , " <History>\n"
  , " </History>\n"
  , " </PubmedData>\n"
  , "</PubmedArticle>\n"
  , "</PubmedArticleSet>\n"
  ]
# This file was automatically generated by 'stack init'
#
# Some commonly used options have been documented as comments in this file.
# For advanced use and comprehensive documentation of the format, please see:
# https://docs.haskellstack.org/en/stable/yaml_configuration/
# Resolver to choose a 'specific' stackage snapshot or a compiler version.
# A snapshot resolver dictates the compiler version and the set of packages
# to be used for project dependencies. For example:
#
# resolver: lts-3.5
# resolver: nightly-2015-09-21
# resolver: ghc-7.10.2
#
# The location of a snapshot can be provided as a file or url. Stack assumes
# a snapshot provided as a file might change, whereas a url resource does not.
#
# resolver: ./custom-snapshot.yaml
# resolver: https://example.com/snapshots/2018-01-01.yaml
resolver: lts-13.13
# User packages to be built.
# Various formats can be used as shown in the example below.
#
# packages:
# - some-directory
# - https://example.com/foo/bar/baz-0.0.2.tar.gz
# - location:
# git: https://github.com/commercialhaskell/stack.git
# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# subdirs:
# - auto-update
# - wai
packages:
- .
# Dependency packages to be pulled from upstream that are not in the resolver
# using the same syntax as the packages field.
# (e.g., acme-missiles-0.3)
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
# Override default flag values for local packages and extra-deps
# flags: {}
# Extra package databases containing global packages
# extra-package-dbs: []
# Control whether we use the GHC we find on the path
# system-ghc: true
#
# Require a specific version of stack, using version ranges
# require-stack-version: -any # Default
# require-stack-version: ">=1.9"
#
# Override the architecture used by stack, especially useful on Windows
# arch: i386
# arch: x86_64
#
# Extra directories used by stack for building
# extra-include-dirs: [/path/to/dir]
# extra-lib-dirs: [/path/to/dir]
#
# Allow a newer minor version of GHC than the snapshot specifies
# compiler-check: newer-minor
-- | Placeholder test-suite entry point (crawlerPubMed-test).
main :: IO ()
main = putStrLn "Test suite not yet implemented"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment