Commit dcaa0f5d authored by Alexandre Delanoë

[Master] crawler function (without parameters, TODO).

parent ace45a17
......@@ -2,39 +2,7 @@
module Main where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
-- | Run an XML 'Cursor'-based parser over a raw (lazy) XML document.
--
-- The document is parsed with the default settings; note that 'parseLBS_'
-- is the partial variant and throws on malformed XML.
--
-- The original signature carried a @Show res@ constraint that the body
-- never uses; it is dropped here (a strictly backward-compatible relaxation).
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | Query PubMed (NCBI eutils) for documents matching the request text,
-- then fetch and parse the abstracts of every returned document id.
--
-- Both remote calls are best-effort: any servant client error yields an
-- empty result list instead of propagating the failure.
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
  manager' <- newManager tlsManagerSettings
  -- Both requests target the same eutils endpoint; build the env once
  -- instead of duplicating the BaseUrl construction (as the original did).
  let env = mkClientEnv manager' $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  searchRes <- runClientM (search (Just rq)) env
  case searchRes of
    -- Deliberate best-effort: swallow the error, return no documents.
    Left _err -> return []
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      fetchRes <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case fetchRes of
        Left _err -> return []
        Right (BsXml abstracts) ->
          pubMedParser abstracts
import qualified PUBMED as PubMed
-- | Program entry point: crawl PubMed for the hard-coded query "organ"
-- and print the results.
--
-- NOTE(review): two competing definitions of 'main' appear below — the
-- do-block calling 'runSimpleFindPubmedAbstractRequest' and the one-liner
-- delegating to 'PubMed.crawler'. This looks like the old and new sides of
-- a diff rendered together; a compilable module can keep only one of them.
-- Confirm against the repository which definition is current (the commit
-- message suggests the 'PubMed.crawler' form is the new one).
main :: IO ()
main = do
pubmeds <- runSimpleFindPubmedAbstractRequest "organ"
print pubmeds
main = PubMed.crawler "organ" >>= print
-- This file has been generated from package.yaml by hpack version 0.28.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: 2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
name: crawlerPubMed
version: 0.1.0.0
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
homepage: https://github.com/gitlab/crawlerPubMed#readme
bug-reports: https://github.com/gitlab/crawlerPubMed/issues
author: CNRS Gargantext
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
license: BSD3
license-file: LICENSE
build-type: Simple
cabal-version: >= 1.10
extra-source-files:
ChangeLog.md
README.md
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
library
exposed-modules:
PUBMED
PUBMED.Client
PUBMED.Parser
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
src
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
main-is: Main.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
name: pubMedCrawler
name: crawlerPubMed
version: 0.1.0.0
github: "githubuser/pubMedCrawler"
github: "gitlab/crawlerPubMed"
license: BSD3
author: "Author name here"
maintainer: "example@example.com"
copyright: "2019 Author name here"
author: "CNRS Gargantext"
maintainer: "contact@gargantext.org"
copyright: "2019 CNRS/IMT"
extra-source-files:
- README.md
......@@ -17,7 +17,7 @@ extra-source-files:
# To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file.
description: Please see the README on GitHub at <https://github.com/githubuser/pubMedCrawler#readme>
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies:
- base >= 4.7 && < 5
......@@ -40,7 +40,7 @@ library:
source-dirs: src
executables:
pubMedCrawler-exe:
crawlerPubMed-exe:
main: Main.hs
source-dirs: app
ghc-options:
......@@ -48,10 +48,10 @@ executables:
- -rtsopts
- -with-rtsopts=-N
dependencies:
- pubMedCrawler
- crawlerPubMed
tests:
pubMedCrawler-test:
crawlerPubMed-test:
main: Spec.hs
source-dirs: test
ghc-options:
......@@ -59,4 +59,4 @@ tests:
- -rtsopts
- -with-rtsopts=-N
dependencies:
- pubMedCrawler
- crawlerPubMed
module Lib
    ( someFunc
    ) where

-- | Stack-template placeholder action; writes the literal text
-- @someFunc@ followed by a newline to stdout.
someFunc :: IO ()
someFunc = putStrLn msg
  where
    -- Output text kept byte-identical to the original.
    msg = "someFunc"
{-# LANGUAGE OverloadedStrings #-}
module PUBMED where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
-- | Run an XML 'Cursor'-based parser over a raw (lazy) XML document.
--
-- The document is parsed with the default settings; note that 'parseLBS_'
-- is the partial variant and throws on malformed XML.
--
-- The original signature carried a @Show res@ constraint that the body
-- never uses; it is dropped here (a strictly backward-compatible relaxation).
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | Search PubMed (NCBI eutils) for the given query and download the
-- abstract of every matching document.
--
-- Both remote calls are best-effort: any servant client error is swallowed
-- and an empty list is returned instead of propagating the failure.
crawler :: T.Text -> IO [PubMed]
crawler rq = do
  manager' <- newManager tlsManagerSettings
  -- Both requests target the same eutils endpoint; build the env once
  -- instead of duplicating the BaseUrl construction (as the original did).
  let env = mkClientEnv manager' $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  searchRes <- runClientM (search (Just rq)) env
  case searchRes of
    -- Deliberate best-effort: swallow the error, return no documents.
    Left _err -> return []
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      fetchRes <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case fetchRes of
        Left _err -> return []
        Right (BsXml abstracts) ->
          pubMedParser abstracts
......@@ -59,7 +59,7 @@ manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
data PubMed =
PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData
, pubmed_date :: PubMedDate
} deriving Show
data PubMedArticle =
......@@ -78,11 +78,11 @@ data Author =
}
deriving (Show)
data PubMedData =
PubMedData { pubmedData_date :: UTCTime
, pubmedData_year :: Integer
, pubmedData_month :: Int
, pubmedData_day :: Int
data PubMedDate =
PubMedDate { pubmedDate_date :: UTCTime
, pubmedDate_year :: Integer
, pubmedDate_month :: Int
, pubmedDate_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
......@@ -106,7 +106,7 @@ parsePubMedArticle =
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates <- tagIgnoreAttrs "PubmedDate" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
......@@ -117,7 +117,7 @@ parsePubMedArticle' = do
return dates'
_ <- many ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed article (PubMedData (jour y m d) y m d)
return $ PubMed article (PubMedDate (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment