Commit 6db3aee2 authored by Alexandre Delanoë's avatar Alexandre Delanoë
parents 7221dafc 39f9a367
.stack-work/
halCrawler.cabal
*~
crawlerHAL.cabal
\ No newline at end of file
......@@ -6,7 +6,7 @@ import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client
import HAL (runStructureRequest)
import HAL (runSearchRequest)
import HAL.Client
import HAL.Doc
import Tree
......@@ -14,28 +14,7 @@ import Tree
main :: IO ()
main = do
manager' <- newManager tlsManagerSettings
res <- runStructureRequest $ Just "parentDocid_i:302102"
res <- runSearchRequest $ ["ia"]
case res of
(Left err) -> print err
(Right val) -> print val
{-
main :: IO ()
main = do
rootRes <- runHalAPIClient $ structure (Just $ fqRootDoc) (Just "docid, parentDocid_i, label_s") (Just 10000)
case rootRes of
(Left err) -> print err
(Right val) -> do
childrenRes <- runHalAPIClient $ structure (Just $ fqParentDoc) (Just "docid, parentDocid_i, label_s") (Just 10000)
case childrenRes of
(Left err2) -> print err2
(Right val2) -> do
children <- fetchChildren $ val2 ^. docs
let trees = buildTree 0 children <$> (val ^. docs)
let noDuplicateTrees = noDuplicateTree <$> trees
mapM_ putStrLn $ formatTree <$> noDuplicateTrees
where fqParentDoc =
"parentDocid_i:(302102 || 469216 || 6279 || 224096 || 144103 || 497330 || 84538 || 301262 || 481355 || 29212 || 301442 || 542824 || 300362 || 352124 || 300104 || 421532 || 301492)"
fqRootDoc =
"docid:(302102 || 469216 || 6279 || 224096 || 144103 || 497330 || 84538 || 301262 || 481355 || 29212 || 301442 || 542824 || 300362 || 352124 || 300104 || 421532 || 301492)"
-}
(Right val) -> print $ _docs val
-- This file has been generated from package.yaml by hpack version 0.28.2.
--
-- see: https://github.com/sol/hpack
--
-- hash: 9634b793447cd493f4caaa2fa593379ae858856de45a2b71bb012af23e3c40f1
name: crawlerHAL
version: 0.1.0.0
description: Please see the README at <https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal>
homepage: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal#readme
bug-reports: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal/issues
author: CNRS/IMT
maintainer: contact@gargantext.org
copyright: 2019 CNRS/IMT
license: BSD3
license-file: LICENSE
build-type: Simple
cabal-version: >= 1.10
extra-source-files:
ChangeLog.md
README.md
source-repository head
type: git
location: https://github.com/https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/hal
library
exposed-modules:
HAL
HAL.Client
HAL.Doc
Tree
other-modules:
Paths_crawlerHAL
hs-source-dirs:
src
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, containers
, data-default
, http-client
, http-client-tls
, lens
, scientific
, servant
, servant-client
, split
, text
, utf8-string
, vector
default-language: Haskell2010
executable crawlerHAL-exe
main-is: Main.hs
other-modules:
Paths_crawlerHAL
hs-source-dirs:
app
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, containers
, crawlerHAL
, data-default
, http-client
, http-client-tls
, lens
, scientific
, servant
, servant-client
, split
, text
, utf8-string
, vector
default-language: Haskell2010
test-suite halCrawler-test
type: exitcode-stdio-1.0
main-is: Spec.hs
other-modules:
Paths_crawlerHAL
hs-source-dirs:
test
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson
, base >=4.7 && <5
, bytestring
, containers
, data-default
, halCrawler
, http-client
, http-client-tls
, lens
, scientific
, servant
, servant-client
, split
, text
, utf8-string
, vector
default-language: Haskell2010
......@@ -8,21 +8,20 @@ import Data.Text
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ServantError,
runClientM, mkClientEnv)
import Servant.Client (BaseUrl(..), Scheme(..), ClientM, ClientError, runClientM, mkClientEnv)
import HAL.Client
import HAL.Doc
import HAL.Doc.Corpus
runHalAPIClient :: ClientM (Response Doc) -> IO (Either ServantError (Response Doc))
runHalAPIClient :: ClientM (Response Corpus) -> IO (Either ClientError (Response Corpus))
runHalAPIClient cmd = do
manager' <- newManager tlsManagerSettings
runClientM cmd (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "")
runStructureRequest :: Maybe Text -> IO (Either ServantError (Response Doc))
runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Corpus))
runStructureRequest rq =
runHalAPIClient $ structure def rq (Just 10000)
runHalAPIClient $ structure (Just "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s") rq (Just 10000)
runSearchRequest :: [Text] -> IO (Either ServantError (Response Doc))
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest rq =
runHalAPIClient $ search def rq Nothing Nothing Nothing
runHalAPIClient $ search (Just "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s") rq Nothing Nothing Nothing
......@@ -25,7 +25,7 @@ type HALAPI doc = Search doc
type Search doc = "search"
-- fl determine which fields will be returned it can be a list of fields or *
:> QueryParam "fl" doc
:> QueryParam "fl" Text -- doc
-- TODO: type this monster
-- fq is to filter request
:> QueryParams "fq" Text
......@@ -38,7 +38,7 @@ type Search doc = "search"
:> Get '[JSON] (Response doc)
type Structure doc = "ref" :> "structure"
:> QueryParam "fl" doc
:> QueryParam "fl" Text
:> QueryParam "fq" Text
:> QueryParam "rows" Int
:> Get '[JSON] (Response doc)
......@@ -56,21 +56,6 @@ asc = Just . Asc
desc :: Text -> Maybe SortField
desc = Just . Desc
-- newtype Doc = Doc (Map Text Value)
-- deriving (Generic)
--
--instance Eq Doc where
-- (==) (Doc doc) (Doc doc') = (doc ! "docid") == (doc' ! "docid")
--
--instance Show Doc where
-- show (Doc o) = (UTF.decode $ BSL.unpack $ encode $ o ! "label_s")
-- <> "("
-- <> (show . encode $ o ! "docid")
-- <> ")"
--
--instance FromJSON Doc
--instance ToJSON Doc
-- Response type
data Response doc = Response
{
......@@ -91,7 +76,7 @@ halAPI = Proxy
-- search should always have at least `docid` and `label_s` in his fl params
search :: (FromJSON doc, ToHttpApiData doc) =>
Maybe doc -- fl
Maybe Text -- fl
-> [Text] -- fq
-> Maybe SortField -- sort
-> Maybe Int -- start
......@@ -99,7 +84,7 @@ search :: (FromJSON doc, ToHttpApiData doc) =>
-> ClientM (Response doc)
structure :: (FromJSON doc, ToHttpApiData doc) =>
Maybe doc
Maybe Text
-> Maybe Text -- fq
-> Maybe Int -- rows
-> ClientM (Response doc)
......
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc where
import GHC.Generics
import Data.Aeson ((.:), (.:?), (.!=), Value(..), ToJSON, FromJSON(..), encode)
import Data.Default
import Data.Text (pack, Text)
import Servant.API (ToHttpApiData(..))
data Doc = Doc
{
_docid :: Int,
_label_s :: Maybe Text,
_parentDocid_i :: [Text]
} deriving (Generic)
instance Default Doc where
def = Doc def def def
instance FromJSON Doc where
parseJSON (Object o) = Doc <$>
(o .: "docid")
<*> (o .:? "label_s")
<*> (o .:? "parentDocid_i" .!= [])
instance ToHttpApiData Doc where
toUrlPiece _ = "docid,label_s,parentDocid_i"
instance Show Doc where
show (Doc id label _) = show label
<> "(" <> show id <> ")"
import HAL.Doc.EntityTree
import HAL.Doc.Corpus
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-}
module HAL.Doc.Corpus where
import GHC.Generics
import Data.Aeson
import Data.Default
import Data.Text (pack, Text)
import Control.Applicative ((<|>))
import qualified Control.Lens as L
import Servant.API (ToHttpApiData(..))
data Corpus = Corpus
{
_corpus_docid :: Int,
_corpus_title :: [Text],
_corpus_abstract :: [Text],
_corpus_date :: Maybe Text,
_corpus_source :: Maybe Text,
_corpus_authors_names :: [Text],
_corpus_authors_affiliations :: [Text]
} deriving (Show, Generic)
L.makeLenses ''Corpus
instance Default Corpus where
def = Corpus def def def def def def def
instance FromJSON Corpus where
parseJSON (Object o) = Corpus <$>
(o .: "docid")
<*> (o .: "title_s" <|> return [])
<*> (o .: "abstract_s" <|> return [])
<*> (o .:? "submittedDate_s")
<*> (o .:? "source_s")
<*> (o .: "authFullName_s" <|> return [])
<*> (o .: "authOrganism_s" <|> return [])
instance ToHttpApiData Corpus where
toUrlPiece _ = "docid,title_s,abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
module HAL.Doc.EntityTree where
import GHC.Generics
import Data.Aeson ((.:), (.:?), (.!=), Value(..), ToJSON, FromJSON(..), encode)
import Data.Default
import Data.Text (pack, Text)
import Servant.API (ToHttpApiData(..))
data EntityTree = EntityTree
{
_docid :: Int,
_label_s :: Maybe Text,
_parentEntityTreeid_i :: [Text]
} deriving (Generic)
instance Default EntityTree where
def = EntityTree def def def
instance FromJSON EntityTree where
parseJSON (Object o) = EntityTree <$>
(o .: "docid")
<*> (o .:? "label_s")
<*> (o .:? "parentEntityTreeid_i" .!= [])
instance ToHttpApiData EntityTree where
toUrlPiece _ = "docid,label_s,parentEntityTreeid_i"
instance Show EntityTree where
show (EntityTree id label _) = show label
<> "(" <> show id <> ")"
......@@ -17,7 +17,7 @@
#
# resolver: ./custom-snapshot.yaml
# resolver: https://example.com/snapshots/2018-01-01.yaml
resolver: lts-13.10
resolver: lts-14.1
# User packages to be built.
# Various formats can be used as shown in the example below.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment