Commit a51cb46f authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] impement a recursive hal parser, with conduit

This allows to query large sets in batches and send the results
upstream.
parent 020f5f9b
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.33.0. -- This file has been generated from package.yaml by hpack version 0.34.4.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
-- --
-- hash: fde6ab57a1c5f40dafd03a9761b892932ccb8edcc11851d57b3ca736e3d964e5 -- hash: 509e772465870fbf771b5c61338c6ae265f9eb1cd69ff040abb0b8421acb20f2
name: crawlerHAL name: crawlerHAL
version: 0.1.0.0 version: 0.1.0.0
description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal> description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal>
homepage: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal#readme
bug-reports: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal/issues
author: CNRS/IMT author: CNRS/IMT
maintainer: contact@gargantext.org maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT copyright: 2019 CNRS/IMT
...@@ -23,7 +21,7 @@ extra-source-files: ...@@ -23,7 +21,7 @@ extra-source-files:
source-repository head source-repository head
type: git type: git
location: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal location: https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal
library library
exposed-modules: exposed-modules:
...@@ -38,10 +36,18 @@ library ...@@ -38,10 +36,18 @@ library
Paths_crawlerHAL Paths_crawlerHAL
hs-source-dirs: hs-source-dirs:
src src
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
build-depends: build-depends:
aeson aeson
, base >=4.7 && <5 , base >=4.7 && <5
, bytestring , bytestring
, conduit
, containers , containers
, data-default , data-default
, http-client , http-client
...@@ -62,11 +68,19 @@ executable crawlerHAL-exe ...@@ -62,11 +68,19 @@ executable crawlerHAL-exe
Paths_crawlerHAL Paths_crawlerHAL
hs-source-dirs: hs-source-dirs:
app app
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
aeson aeson
, base >=4.7 && <5 , base >=4.7 && <5
, bytestring , bytestring
, conduit
, containers , containers
, crawlerHAL , crawlerHAL
, data-default , data-default
...@@ -89,11 +103,19 @@ test-suite halCrawler-test ...@@ -89,11 +103,19 @@ test-suite halCrawler-test
Paths_crawlerHAL Paths_crawlerHAL
hs-source-dirs: hs-source-dirs:
test test
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
aeson aeson
, base >=4.7 && <5 , base >=4.7 && <5
, bytestring , bytestring
, conduit
, containers , containers
, data-default , data-default
, halCrawler , halCrawler
......
name: crawlerHAL name: crawlerHAL
version: 0.1.0.0 version: 0.1.0.0
github: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal" git: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal"
license: BSD3 license: BSD3
author: "CNRS/IMT" author: "CNRS/IMT"
maintainer: "contact@gargantext.org" maintainer: "contact@gargantext.org"
...@@ -20,25 +20,34 @@ extra-source-files: ...@@ -20,25 +20,34 @@ extra-source-files:
description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal> description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal>
dependencies: dependencies:
- aeson
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- bytestring
- conduit
- containers
- data-default
- http-client
- http-client-tls
- lens
- scientific
- servant - servant
- servant-client - servant-client
- aeson - split
- http-client-tls
- http-client
- text - text
- containers
- lens
- bytestring
- utf8-string - utf8-string
- split
- scientific
- vector - vector
- data-default
library: library:
source-dirs: src source-dirs: src
default-extensions:
- DataKinds
- DeriveGeneric
- NamedFieldPuns
- OverloadedStrings
- RecordWildCards
- TypeOperators
executables: executables:
crawlerHAL-exe: crawlerHAL-exe:
main: Main.hs main: Main.hs
......
{-# LANGUAGE OverloadedStrings #-}
module HAL where module HAL where
import Conduit
import Data.Default (def) import Data.Default (def)
import Data.Maybe (fromMaybe)
import Data.Either (fromRight)
import Data.Text import Data.Text
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
...@@ -17,11 +18,44 @@ import HAL.Doc.Struct ...@@ -17,11 +18,44 @@ import HAL.Doc.Struct
import Servant.API import Servant.API
import Data.Aeson import Data.Aeson
getMetadataWith :: Text -> Maybe Int -> Maybe Int -> IO (Either ClientError (Response Corpus)) batchSize :: Int
batchSize = 1000
getMetadataWith :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
getMetadataRecursively :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (ConduitT () Corpus IO ()))
getMetadataRecursively q start rows = do
manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just 0) (Just 1)
pure $ get' q start rows <$> eRes
where
get' :: Text -> Maybe Int -> Maybe Integer -> Response Corpus -> ConduitT () Corpus IO ()
get' q start rows (Response { _numFound }) = do
let start' = fromMaybe 0 start
let rows' = fromMaybe _numFound rows
let numResults = rows' - (fromIntegral start')
let numPages = numResults `div` (fromIntegral batchSize) + 1
yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage q start')
-- .| mapMC printDoc
getPage :: Text -> Int -> Int -> IO [Corpus]
getPage q start pageNum = do
let offset = start + pageNum * batchSize
print $ show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just $ fromIntegral batchSize)
pure $ case eRes of
Left _ -> []
Right (Response { _docs }) -> _docs
printDoc :: Corpus -> IO Corpus
printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do
print $ show _corpus_title
pure c
requestedFields :: Text requestedFields :: Text
requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s" requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
......
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoMonomorphismRestriction#-} {-# LANGUAGE NoMonomorphismRestriction#-}
...@@ -34,7 +30,7 @@ type Search doc = "search" ...@@ -34,7 +30,7 @@ type Search doc = "search"
-- permit to start a the x result -- permit to start a the x result
:> QueryParam "start" Int :> QueryParam "start" Int
-- use rows to make the request only return the x number of result -- use rows to make the request only return the x number of result
:> QueryParam "rows" Int :> QueryParam "rows" Integer
:> Get '[JSON] (Response doc) :> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure" type Structure doc = "ref" :> "structure"
...@@ -80,7 +76,7 @@ search :: (FromJSON doc, ToHttpApiData doc) => ...@@ -80,7 +76,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq -> [Text] -- fq
-> Maybe SortField -- sort -> Maybe SortField -- sort
-> Maybe Int -- start -> Maybe Int -- start
-> Maybe Int -- rows -> Maybe Integer -- rows
-> ClientM (Response doc) -> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) => structure :: (FromJSON doc, ToHttpApiData doc) =>
......
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc where module HAL.Doc where
import HAL.Doc.EntityTree import HAL.Doc.EntityTree
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
module HAL.Doc.Corpus where module HAL.Doc.Corpus where
......
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc.EntityTree where module HAL.Doc.EntityTree where
import GHC.Generics import GHC.Generics
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
module HAL.Doc.Struct where module HAL.Doc.Struct where
......
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Tree where module Tree where
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment