Commit dcaa0f5d authored by Alexandre Delanoë

[Master] crawler function (without parameters, TODO).

parent ace45a17
......@@ -2,39 +2,7 @@
module Main where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
-- | Run an XML 'Cursor'-based parser over a raw (lazy) XML document.
--
-- The document is parsed with the default settings; note that 'parseLBS_'
-- is the partial variant and throws on malformed XML.
--
-- The original signature carried a @Show res@ constraint that the body
-- never uses; it is dropped here (a strictly backward-compatible relaxation).
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | Query PubMed (NCBI eutils) for documents matching the request text,
-- then fetch and parse the abstracts of every returned document id.
--
-- Both remote calls are best-effort: any servant client error yields an
-- empty result list instead of propagating the failure.
runSimpleFindPubmedAbstractRequest :: T.Text -> IO [PubMed]
runSimpleFindPubmedAbstractRequest rq = do
  manager' <- newManager tlsManagerSettings
  -- Both requests target the same eutils endpoint; build the env once
  -- instead of duplicating the BaseUrl construction (as the original did).
  let env = mkClientEnv manager' $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  searchRes <- runClientM (search (Just rq)) env
  case searchRes of
    -- Deliberate best-effort: swallow the error, return no documents.
    Left _err -> return []
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      fetchRes <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case fetchRes of
        Left _err -> return []
        Right (BsXml abstracts) ->
          pubMedParser abstracts
import qualified PUBMED as PubMed
-- | Program entry point: crawl PubMed for the hard-coded query "organ"
-- and print the results.
--
-- NOTE(review): two competing definitions of 'main' appear below — the
-- do-block calling 'runSimpleFindPubmedAbstractRequest' and the one-liner
-- delegating to 'PubMed.crawler'. This looks like the old and new sides of
-- a diff rendered together; a compilable module can keep only one of them.
-- Confirm against the repository which definition is current (the commit
-- message suggests the 'PubMed.crawler' form is the new one).
main :: IO ()
main = do
pubmeds <- runSimpleFindPubmedAbstractRequest "organ"
print pubmeds
main = PubMed.crawler "organ" >>= print
-- This file has been generated from package.yaml by hpack version 0.28.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: 2d262a275f0e9e59092e1b7e005b90c2018a7b9484371aad24175b7a30116e60
name: crawlerPubMed
version: 0.1.0.0
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
homepage: https://github.com/gitlab/crawlerPubMed#readme
bug-reports: https://github.com/gitlab/crawlerPubMed/issues
author: CNRS Gargantext
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
license: BSD3
license-file: LICENSE
build-type: Simple
cabal-version: >= 1.10
extra-source-files:
ChangeLog.md
README.md
source-repository head
type: git
location: https://github.com/gitlab/crawlerPubMed
library
exposed-modules:
PUBMED
PUBMED.Client
PUBMED.Parser
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
src
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
executable crawlerPubMed-exe
main-is: Main.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
test-suite crawlerPubMed-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerPubMed
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
base >=4.7 && <5
, bytestring
, conduit
, crawlerPubMed
, data-time-segment
, exceptions
, http-client
, http-client-tls
, http-media
, protolude
, servant
, servant-client
, text
, time
, xml-conduit
, xml-types
default-language: Haskell2010
name: pubMedCrawler
name: crawlerPubMed
version: 0.1.0.0
github: "githubuser/pubMedCrawler"
github: "gitlab/crawlerPubMed"
license: BSD3
author: "Author name here"
maintainer: "example@example.com"
copyright: "2019 Author name here"
author: "CNRS Gargantext"
maintainer: "contact@gargantext.org"
copyright: "2019 CNRS/IMT"
extra-source-files:
- README.md
......@@ -17,7 +17,7 @@ extra-source-files:
# To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file.
description: Please see the README on GitHub at <https://github.com/githubuser/pubMedCrawler#readme>
description: Please see the README on GitHub at <https://gitlab.iscpif.fr/gargantext/crawlers/pubmed/blob/dev/README.md>
dependencies:
- base >= 4.7 && < 5
......@@ -40,7 +40,7 @@ library:
source-dirs: src
executables:
pubMedCrawler-exe:
crawlerPubMed-exe:
main: Main.hs
source-dirs: app
ghc-options:
......@@ -48,10 +48,10 @@ executables:
- -rtsopts
- -with-rtsopts=-N
dependencies:
- pubMedCrawler
- crawlerPubMed
tests:
pubMedCrawler-test:
crawlerPubMed-test:
main: Spec.hs
source-dirs: test
ghc-options:
......@@ -59,4 +59,4 @@ tests:
- -rtsopts
- -with-rtsopts=-N
dependencies:
- pubMedCrawler
- crawlerPubMed
module Lib
    ( someFunc
    ) where

-- | Stack-template placeholder action; writes the literal text
-- @someFunc@ followed by a newline to stdout.
someFunc :: IO ()
someFunc = putStrLn msg
  where
    -- Output text kept byte-identical to the original.
    msg = "someFunc"
{-# LANGUAGE OverloadedStrings #-}
module PUBMED where
import PUBMED.Client
import PUBMED.Parser
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (runClientM, mkClientEnv, BaseUrl(..), Scheme(..))
import Text.XML (parseLBS_, def)
import Text.XML.Cursor (fromDocument, Cursor)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as T
-- | Run an XML 'Cursor'-based parser over a raw (lazy) XML document.
--
-- The document is parsed with the default settings; note that 'parseLBS_'
-- is the partial variant and throws on malformed XML.
--
-- The original signature carried a @Show res@ constraint that the body
-- never uses; it is dropped here (a strictly backward-compatible relaxation).
runParser :: (Cursor -> res) -> LBS.ByteString -> res
runParser parser = parser . fromDocument . parseLBS_ def
-- | Search PubMed (NCBI eutils) for the given query and download the
-- abstract of every matching document.
--
-- Both remote calls are best-effort: any servant client error is swallowed
-- and an empty list is returned instead of propagating the failure.
crawler :: T.Text -> IO [PubMed]
crawler rq = do
  manager' <- newManager tlsManagerSettings
  -- Both requests target the same eutils endpoint; build the env once
  -- instead of duplicating the BaseUrl construction (as the original did).
  let env = mkClientEnv manager' $
              BaseUrl Https "eutils.ncbi.nlm.nih.gov" 443 "entrez/eutils"
  searchRes <- runClientM (search (Just rq)) env
  case searchRes of
    -- Deliberate best-effort: swallow the error, return no documents.
    Left _err -> return []
    Right (BsXml docs) -> do
      let docIds = runParser parseDocId docs
      fetchRes <- runClientM (fetch (Just "pubmed") (Just "abstract") docIds) env
      case fetchRes of
        Left _err -> return []
        Right (BsXml abstracts) ->
          pubMedParser abstracts
......@@ -59,7 +59,7 @@ manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
data PubMed =
PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData
, pubmed_date :: PubMedDate
} deriving Show
data PubMedArticle =
......@@ -78,11 +78,11 @@ data Author =
}
deriving (Show)
data PubMedData =
PubMedData { pubmedData_date :: UTCTime
, pubmedData_year :: Integer
, pubmedData_month :: Int
, pubmedData_day :: Int
data PubMedDate =
PubMedDate { pubmedDate_date :: UTCTime
, pubmedDate_year :: Integer
, pubmedDate_month :: Int
, pubmedDate_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
......@@ -106,7 +106,7 @@ parsePubMedArticle =
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates <- tagIgnoreAttrs "PubmedDate" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
......@@ -117,7 +117,7 @@ parsePubMedArticle' = do
return dates'
_ <- many ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed article (PubMedData (jour y m d) y m d)
return $ PubMed article (PubMedDate (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment