Commit dcaa0f5d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Master] crawler function (without parameters, TODO).

parent ace45a17
...@@ -2,39 +2,7 @@ ...@@ -2,39 +2,7 @@
module Main where module Main where
import PUBMED.Client import qualified PUBMED as PubMed
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(search (Just rq))
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> return []
(Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs
res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err) -> return []
(Right (BsXml abstracts)) ->
pubMedParser abstracts
main :: IO () main :: IO ()
main = do main = PubMed.crawler "organ" >>= print
pubmeds <- runSimpleFindPubmedAbstractRequest "organ"
print pubmeds
-- This file has been generated from package.yaml by hpack version 0.28.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: 2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
name: crawlerPubMed
version: 0.1.0.0
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
homepage: https://github.com/gitlab/crawlerPubMed#readme
bug-reports: https://github.com/gitlab/crawlerPubMed/issues
author: CNRS Gargantext
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
license: BSD3
license-file: LICENSE
build-type: Simple
cabal-version: >= 1.10
extra-source-files:
ChangeLog.md
README.md
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
library
exposed-modules:
PUBMED
PUBMED.Client
PUBMED.Parser
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
src
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
main-is: Main.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
name: pubMedCrawler name: crawlerPubMed
version: 0.1.0.0 version: 0.1.0.0
github: "githubuser/pubMedCrawler" github: "gitlab/crawlerPubMed"
license: BSD3 license: BSD3
author: "Author name here" author: "CNRS Gargantext"
maintainer: "example@example.com" maintainer: "contact@gargantext.org"
copyright: "2019 Author name here" copyright: "2019 CNRS/IMT"
extra-source-files: extra-source-files:
- README.md - README.md
...@@ -17,7 +17,7 @@ extra-source-files: ...@@ -17,7 +17,7 @@ extra-source-files:
# To avoid duplicated efforts in documentation and dealing with the # To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is # complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file. # common to point users to the README.md file.
description: Please see the README on GitHub at <https://github.com/githubuser/pubMedCrawler#readme> description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies: dependencies:
- base >= 4.7 && < 5 - base >= 4.7 && < 5
...@@ -40,7 +40,7 @@ library: ...@@ -40,7 +40,7 @@ library:
source-dirs: src source-dirs: src
executables: executables:
pubMedCrawler-exe: crawlerPubMed-exe:
main: Main.hs main: Main.hs
source-dirs: app source-dirs: app
ghc-options: ghc-options:
...@@ -48,10 +48,10 @@ executables: ...@@ -48,10 +48,10 @@ executables:
- -rtsopts - -rtsopts
- -with-rtsopts=-N - -with-rtsopts=-N
dependencies: dependencies:
- pubMedCrawler - crawlerPubMed
tests: tests:
pubMedCrawler-test: crawlerPubMed-test:
main: Spec.hs main: Spec.hs
source-dirs: test source-dirs: test
ghc-options: ghc-options:
...@@ -59,4 +59,4 @@ tests: ...@@ -59,4 +59,4 @@ tests:
- -rtsopts - -rtsopts
- -with-rtsopts=-N - -with-rtsopts=-N
dependencies: dependencies:
- pubMedCrawler - crawlerPubMed
module Lib
( someFunc
) where
someFunc :: IO ()
someFunc = putStrLn "someFunc"
{-# LANGUAGE OverloadedStrings #-}
module PUBMED where module PUBMED where
import PUBMED.Client import PUBMED.Client
import PUBMED.Parser import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
runParser :: Show res => (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
crawler :: T.Text -> IO [PubMed]
crawler rq = do
manager' <- newManager tlsManagerSettings
res <- runClientM
(search (Just rq))
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res of
(Left err) -> return []
(Right (BsXml docs)) -> do
let docIds = runParser parseDocId docs
res' <- runClientM
(fetch (Just "pubmed") (Just "abstract") docIds)
(mkClientEnv manager' $ BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils")
case res' of
(Left err) -> return []
(Right (BsXml abstracts)) ->
pubMedParser abstracts
...@@ -59,7 +59,7 @@ manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil ...@@ -59,7 +59,7 @@ manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
data PubMed = data PubMed =
PubMed { pubmed_article :: PubMedArticle PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData , pubmed_date :: PubMedDate
} deriving Show } deriving Show
data PubMedArticle = data PubMedArticle =
...@@ -78,11 +78,11 @@ data Author = ...@@ -78,11 +78,11 @@ data Author =
} }
deriving (Show) deriving (Show)
data PubMedData = data PubMedDate =
PubMedData { pubmedData_date :: UTCTime PubMedDate { pubmedDate_date :: UTCTime
, pubmedData_year :: Integer , pubmedDate_year :: Integer
, pubmedData_month :: Int , pubmedDate_month :: Int
, pubmedData_day :: Int , pubmedDate_day :: Int
} deriving (Show) } deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed] readPubMedFile :: FilePath -> IO [PubMed]
...@@ -106,7 +106,7 @@ parsePubMedArticle = ...@@ -106,7 +106,7 @@ parsePubMedArticle =
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do dates <- tagIgnoreAttrs "PubmedDate" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content m' <- force "Month" $ tagIgnoreAttrs "Month" content
...@@ -117,7 +117,7 @@ parsePubMedArticle' = do ...@@ -117,7 +117,7 @@ parsePubMedArticle' = do
return dates' return dates'
_ <- many ignoreAnyTreeContent _ <- many ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed article (PubMedData (jour y m d) y m d) return $ PubMed article (PubMedDate (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do parseMedlineCitation = do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment