First working draft of openalex client

parent 4df691f2
use_nix
export LANG=C.UTF-8
dist-newstyle/
docs/.ipynb_checkpoints/
# Revision history for openalex
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.
This diff is collapsed.
# Open Alex Database API Crawler for GarganText
## Running
``` shell
cabal run openalex-main -- concepts
```
module Main where
import Options.Applicative.Simple
import Protolude
import qualified OpenAlex as OA
main :: IO ()
main = do
(opts, runCmd) <-
simpleOptions "0.1.0.0"
"OpenAlex"
"OpenAlex command line"
(pure ()) $ do
addCommand "concepts"
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
(const fetchConcepts)
(pure ())
runCmd ()
fetchConcepts :: () -> IO ()
fetchConcepts _ = do
-- ec <- OA.fetchConcepts (Just 1) (Just 1) Nothing
ec <- OA.fetchConcepts (Just 1) (Just 1) (Just "*")
case ec of
Left err -> putText $ "error: " <> show err
Right c -> do
putText "c"
putText $ show c
import (builtins.fetchGit {
name = "nixos-22.05";
url = "https://github.com/nixos/nixpkgs";
ref = "refs/heads/nixos-22.05";
rev = "ce6aa13369b667ac2542593170993504932eb836";
})
{ pkgs ? import ./pinned-22.05.nix {} }:
rec {
inherit pkgs;
ghc = pkgs.haskell.compiler.ghc8107;
hsBuildInputs = [
ghc
pkgs.cabal-install
];
pythonEnv = pkgs.python3.withPackages(ps: [
ps.ipywidgets
ps.pandas
ps.tqdm
]);
nonhsBuildInputs = with pkgs; [
jupyter
pythonEnv
zlib
];
#libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
shell = pkgs.mkShell {
name = "openalex";
buildInputs = hsBuildInputs ++ nonhsBuildInputs;
};
}
cabal-version: 2.4
name: openalex
version: 0.1.0.0
-- A short (one-line) description of the package.
-- synopsis: An importer for OpenAlex database.
-- A longer description of the package.
-- description:
-- A URL where users can report bugs.
-- bug-reports: https://gitlab.iscpif.fr/gargantext/crawlers/openalex
-- The license under which the package is released.
-- license:
author: Gargantext Team
maintainer: team@gargantext.org
-- A copyright notice.
copyright: Copyright: (c) 2023-Present: see git logs and README
license: AGPL-3.0-or-later
license-file: LICENSE
category: Data
extra-source-files:
CHANGELOG.md
README.md
source-repository head
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/openalex
library
hs-source-dirs:
src
exposed-modules:
OpenAlex
OpenAlex.Client
OpenAlex.ServantClientLogging
OpenAlex.Types
default-extensions:
DataKinds
DeriveGeneric
DuplicateRecordFields
FlexibleContexts
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
OverloadedStrings
RankNTypes
RecordWildCards
StrictData
TypeOperators
ghc-options: -Wall -Wincomplete-uni-patterns -Wincomplete-record-updates -Wmissing-signatures -Wunused-binds -Wunused-imports -Werror
build-depends: base ^>= 4.14.3.0 && < 5
, aeson >= 2.1.2 && < 2.2
, binary >= 0.8.8 && < 0.9
, bytestring >= 0.10.12 && < 0.11
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.5.1 && < 0.7
, http-client >= 0.7.13.1 && < 0.8
, http-client-tls >= 0.3.6.2 && < 0.4
, http-conduit >= 2.3.8.2 && < 2.4
, protolude >= 0.3.3 && < 0.4
, scientific >= 0.3.7 && < 0.4
, servant >= 0.19 && < 0.20
, servant-client >= 0.19 && < 0.20
, servant-client-core >= 0.19 && < 0.20
, text >= 1.2.4 && < 1.3
, time >= 1.9.3 && < 1.10
default-language: Haskell2010
executable openalex-main
main-is: Main.hs
-- Modules included in this executable, other than Main.
-- other-modules:
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
build-depends: base ^>=4.14.3.0 && < 5
, optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4
, openalex
hs-source-dirs: app
default-language: Haskell2010
default-extensions:
DataKinds
DeriveGeneric
DuplicateRecordFields
FlexibleContexts
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
OverloadedStrings
RankNTypes
RecordWildCards
StrictData
TypeOperators
{ pkgs ? import ./nix/pkgs.nix {} }:
let
in
pkgs.pkgs.mkShell {
name = pkgs.shell.name;
shellHook = pkgs.shell.shellHook;
buildInputs = pkgs.shell.buildInputs;
}
{-|
Module : OpenAlex
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module OpenAlex
( module OpenAlex.Client
, module OpenAlex.Types
-- , fetchConcepts'
, fetchConcepts )
where
-- import Data.Aeson
-- import qualified Data.ByteString.Lazy as BSL
import qualified Data.Text as T
-- import qualified Data.Text.Encoding as TE
-- import Network.HTTP.Client (httpLbs, newManager, parseRequest, requestHeaders, responseBody)
import Network.HTTP.Client (newManager, requestHeaders)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
import OpenAlex.Client
import OpenAlex.ServantClientLogging
import OpenAlex.Types
import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM)
defaultClientEnv :: IO ClientEnv
defaultClientEnv = do
manager <- newManager tlsManagerSettings
let env' = mkClientEnv manager $ BaseUrl Https (T.unpack apiUrl) 443 ""
let makeClientRequest bu req =
let r = defaultMakeClientRequest bu req in
r { requestHeaders = (requestHeaders r) <> [("User-Agent", "servant.client")]}
let env = env' { makeClientRequest }
pure $ addLoggingToClientEnv env
fetchConcepts :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> IO (Either ClientError (ListOf Concept))
fetchConcepts mPage mPerPage mCursor = do
env <- defaultClientEnv
runClientM (concepts mPage mPerPage mCursor) env
-- fetchConcepts' :: IO (Either Text (ListOf Concept))
-- fetchConcepts' = do
-- manager <- newManager tlsManagerSettings
-- req' <- parseRequest "https://api.openalex.org/concepts?page=1&per_page=10"
-- -- let setHeaders r =
-- -- filter (\(h, _) -> h /= "Accept") $
-- -- requestHeaders r -- <> [("Content-Type", "application/json")]
-- let setHeaders = const [("User-Agent", "v.http")]
-- let req = req' { requestHeaders = setHeaders req' }
-- putText $ show req
-- resBS <- httpLbs req manager
-- putText $ TE.decodeUtf8 $ BSL.toStrict $ responseBody resBS
-- pure $ case eitherDecode (responseBody resBS) of
-- Left err -> Left $ T.pack err
-- Right r -> Right $ r
{-|
Module : OpenAlex.Client
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module OpenAlex.Client where
import Protolude
import Servant.API
import Servant.Client
import OpenAlex.Types
type API_URL = Text
apiUrl :: API_URL
apiUrl = "api.openalex.org"
type PageParams =
QueryParam "page" Page
:> QueryParam "per-page" PerPage
:> QueryParam "cursor" Cursor
type OpenAlexAPI =
-- https://api.openalex.org/concepts
"concepts"
:> QueryParam "page" Page
:> QueryParam "per-page" PerPage
:> QueryParam "cursor" Cursor
-- TODO: filter, search, sort
:> Get '[JSON] (ListOf Concept)
openAlexApi :: Proxy OpenAlexAPI
openAlexApi = Proxy
concepts :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Concept)
concepts {- :<|> fetch -} = client openAlexApi
{-|
Module : OpenAlex.ServantClientLogging
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module OpenAlex.ServantClientLogging where
import qualified Data.Binary.Builder as DBB
import Debug.Trace (trace)
import Protolude hiding (trace)
import Servant.Client (ClientEnv(..))
import Servant.Client.Core.Request (RequestF(..))
addLoggingToClientEnv :: ClientEnv -> ClientEnv
addLoggingToClientEnv env@(ClientEnv { makeClientRequest = mcr }) = env { makeClientRequest }
where
makeClientRequest baseUrl req =
trace ("req: " <> showRequestPath req <> " " <> show (requestQueryString req)) $ mcr baseUrl req
showRequestPath req =
show $ DBB.toLazyByteString $ requestPath req
{-# LANGUAGE DeriveAnyClass #-}
{-|
Module : OpenAlex.Types
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module OpenAlex.Types where
import Control.Monad.Fail (fail)
import Data.Aeson
import Data.Scientific (floatingOrInteger)
import Data.Time.Calendar (Day)
import Protolude hiding (Meta)
type ConceptId = Text
type Count = Int
type Cursor = Text
data ExternalID = ExtIDUrl URL | ExtIDUrls [URL] | ExtIDInt Int
deriving (Generic, Show)
instance FromJSON ExternalID where
parseJSON (String v) = pure $ ExtIDUrl v
parseJSON (Number n) =
case (floatingOrInteger n :: Either Double Integer) of
Left _ -> fail "Floating number not supported as external id"
Right i -> pure $ ExtIDInt $ fromIntegral i
parseJSON a@(Array _a) =do
ids <- parseJSONList a
pure $ ExtIDUrls ids
parseJSON _ = fail "Don't know how to handle this external id"
type Language = Text -- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html
type Level = Int
type Page = Int
type PerPage = Int
type URL = Text
type Year = Int
newtype CreatedDate = CreatedDate Day
deriving (Generic, Show)
instance FromJSON CreatedDate
newtype UpdatedDate = UpdatedDate Day
deriving (Generic, Show)
instance FromJSON UpdatedDate
-- https://docs.openalex.org/api-entities/concepts/concept-object#ids
data ExternalDB = MAG | OpenAlex | UMLS_Cui | UMLS_Aui | Wikidata | Wikipedia
deriving (Show, Eq, Ord, Generic, Hashable, FromJSONKey)
instance FromJSON ExternalDB where
parseJSON (String "mag") = pure MAG
parseJSON (String "openalex") = pure OpenAlex
parseJSON (String "umls_cui") = pure UMLS_Cui
parseJSON (String "umls_aui") = pure UMLS_Aui
parseJSON (String "wikidata") = pure Wikidata
parseJSON (String "wikipedia") = pure Wikipedia
parseJSON _ = fail "Don't know how to handle this external db"
data Meta = Meta
{ count :: Count
, db_response_time_ms :: Int
, page :: Maybe Page
, per_page :: PerPage
-- | https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging
, next_cursor :: Maybe Cursor
} deriving (Generic, Show)
instance FromJSON Meta
data ListOf a = ListOf
{ meta :: Meta
, results :: [a]
, group_by :: [Int] -- Not implemented
} deriving (Generic, Show)
instance FromJSON a => FromJSON (ListOf a)
-- | https://docs.openalex.org/api-entities/concepts/concept-object
data Concept = Concept
{ ancestors :: [Ancestor]
, cited_by_count :: Count
, counts_by_year :: [CountByYear]
, created_date :: CreatedDate
, description :: Text
, display_name :: Text
, id :: ConceptId
, ids :: Map Text ExternalID -- TODO ExternalDB
, image_thumbnail_url :: URL
, image_url :: URL
, international :: Map Language Text
, level :: Level
, related_concepts :: [DehydratedConcept]
, summary_stats :: SummaryStats
, updated_date :: UpdatedDate
, wikidata :: URL
, works_api_url :: URL
, works_count :: Count
} deriving (Generic, Show)
instance FromJSON Concept where
parseJSON (Object v) = do
ancestors <- v .: "ancestors"
cited_by_count <- v .: "cited_by_count"
counts_by_year <- v .: "counts_by_year"
created_date <- v .: "created_date"
description <- v .: "description"
display_name <- v .: "display_name"
id <- v .: "id"
ids <- v .: "ids"
image_thumbnail_url <- v .: "image_thumbnail_url"
image_url <- v .: "image_url"
international' <- v .: "international"
international <- international' .: "display_name"
level <- v .: "level"
related_concepts <- v .: "related_concepts"
summary_stats <- v .: "summary_stats"
updated_date <- v .: "updated_date"
wikidata <- v .: "wikidata"
works_api_url <- v .: "works_api_url"
works_count <- v .: "works_count"
pure $ Concept { .. }
parseJSON _ = fail "Cannot parse Concept as a non-object"
-- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object
data DehydratedConcept = DehydratedConcept
{ display_name :: Text
, id :: ConceptId
, level :: Level
, wikidata :: Maybe URL
} deriving (Generic, Show)
instance FromJSON DehydratedConcept
data Ancestor = Ancestor
{ id :: URL
, wikidata :: URL
, display_name :: Text
, level :: Level
} deriving (Generic, Show)
instance FromJSON Ancestor
data CountByYear = CountByYear
{ year :: Year
, works_count :: Count
, cited_by_count :: Count
} deriving (Generic, Show)
instance FromJSON CountByYear
data SummaryStats = SummaryStats
{ two_year_mean_citedness :: Double
, h_index :: Int
, i10_index :: Int
} deriving (Generic, Show)
instance FromJSON SummaryStats where
parseJSON (Object v) =
SummaryStats <$> v .: "2yr_mean_citedness"
<*> v .: "h_index"
<*> v .: "i10_index"
parseJSON _ = fail "Don't know how to parse this as SummaryStats"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment