Commit a51cb46f authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] impement a recursive hal parser, with conduit

This allows to query large sets in batches and send the results
upstream.
parent 020f5f9b
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.33.0.
-- This file has been generated from package.yaml by hpack version 0.34.4.
--
-- see: https://github.com/sol/hpack
--
-- hash: fde6ab57a1c5f40dafd03a9761b892932ccb8edcc11851d57b3ca736e3d964e5
-- hash: 509e772465870fbf771b5c61338c6ae265f9eb1cd69ff040abb0b8421acb20f2
name: crawlerHAL
version: 0.1.0.0
description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal>
homepage: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal#readme
bug-reports: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal/issues
author: CNRS/IMT
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
......@@ -23,7 +21,7 @@ extra-source-files:
source-repository head
type: git
location: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal
location: https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal
library
exposed-modules:
......@@ -38,10 +36,18 @@ library
Paths_crawlerHAL
hs-source-dirs:
src
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, conduit
, containers
, data-default
, http-client
......@@ -62,11 +68,19 @@ executable crawlerHAL-exe
Paths_crawlerHAL
hs-source-dirs:
app
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, conduit
, containers
, crawlerHAL
, data-default
......@@ -89,11 +103,19 @@ test-suite halCrawler-test
Paths_crawlerHAL
hs-source-dirs:
test
default-extensions:
DataKinds
DeriveGeneric
NamedFieldPuns
OverloadedStrings
RecordWildCards
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, conduit
, containers
, data-default
, halCrawler
......
name: crawlerHAL
version: 0.1.0.0
github: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal"
git: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal"
license: BSD3
author: "CNRS/IMT"
maintainer: "contact@gargantext.org"
......@@ -20,25 +20,34 @@ extra-source-files:
description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal>
dependencies:
- aeson
- base >= 4.7 && < 5
- bytestring
- conduit
- containers
- data-default
- http-client
- http-client-tls
- lens
- scientific
- servant
- servant-client
- aeson
- http-client-tls
- http-client
- split
- text
- containers
- lens
- bytestring
- utf8-string
- split
- scientific
- vector
- data-default
library:
source-dirs: src
default-extensions:
- DataKinds
- DeriveGeneric
- NamedFieldPuns
- OverloadedStrings
- RecordWildCards
- TypeOperators
executables:
crawlerHAL-exe:
main: Main.hs
......
{-# LANGUAGE OverloadedStrings #-}
module HAL where
import Conduit
import Data.Default (def)
import Data.Maybe (fromMaybe)
import Data.Either (fromRight)
import Data.Text
import Network.HTTP.Client (newManager)
......@@ -17,11 +18,44 @@ import HAL.Doc.Struct
import Servant.API
import Data.Aeson
getMetadataWith :: Text -> Maybe Int -> Maybe Int -> IO (Either ClientError (Response Corpus))
batchSize :: Int
batchSize = 1000
getMetadataWith :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (Response Corpus))
getMetadataWith q start rows = do
manager' <- newManager tlsManagerSettings
runHalAPIClient $ search (Just requestedFields) [q] Nothing start rows
getMetadataRecursively :: Text -> Maybe Int -> Maybe Integer -> IO (Either ClientError (ConduitT () Corpus IO ()))
getMetadataRecursively q start rows = do
manager' <- newManager tlsManagerSettings
-- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just 0) (Just 1)
pure $ get' q start rows <$> eRes
where
get' :: Text -> Maybe Int -> Maybe Integer -> Response Corpus -> ConduitT () Corpus IO ()
get' q start rows (Response { _numFound }) = do
let start' = fromMaybe 0 start
let rows' = fromMaybe _numFound rows
let numResults = rows' - (fromIntegral start')
let numPages = numResults `div` (fromIntegral batchSize) + 1
yieldMany [0..]
.| takeC (fromInteger numPages)
.| concatMapMC (getPage q start')
-- .| mapMC printDoc
getPage :: Text -> Int -> Int -> IO [Corpus]
getPage q start pageNum = do
let offset = start + pageNum * batchSize
print $ show offset
eRes <- runHalAPIClient $ search (Just requestedFields) [q] Nothing (Just offset) (Just $ fromIntegral batchSize)
pure $ case eRes of
Left _ -> []
Right (Response { _docs }) -> _docs
printDoc :: Corpus -> IO Corpus
printDoc c@(Corpus { _corpus_docid, _corpus_title }) = do
print $ show _corpus_title
pure c
requestedFields :: Text
requestedFields = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
......
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoMonomorphismRestriction#-}
......@@ -34,7 +30,7 @@ type Search doc = "search"
-- permit to start a the x result
:> QueryParam "start" Int
-- use rows to make the request only return the x number of result
:> QueryParam "rows" Int
:> QueryParam "rows" Integer
:> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure"
......@@ -80,7 +76,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- start
-> Maybe Int -- rows
-> Maybe Integer -- rows
-> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) =>
......
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc where
import HAL.Doc.EntityTree
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-}
module HAL.Doc.Corpus where
......
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc.EntityTree where
import GHC.Generics
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-}
module HAL.Doc.Struct where
......
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Tree where
import Network.HTTP.Client (newManager)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment