document parsing works now

parent 72a4c71e
# Open Alex Database API Crawler for GarganText
## Compilation
For non-GHC stuff, use Nix.
For GHC, use ghcup and use GHC 8.10.7.
## Running

``` shell
......
...@@ -16,6 +16,10 @@ main = do ...@@ -16,6 +16,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)" "Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
(const fetchConcepts) (const fetchConcepts)
(pure ()) (pure ())
addCommand "works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
(const fetchWorks)
(pure ())
runCmd () runCmd ()
...@@ -27,5 +31,13 @@ fetchConcepts _ = do ...@@ -27,5 +31,13 @@ fetchConcepts _ = do
case ec of case ec of
Left err -> putText $ "error: " <> show err Left err -> putText $ "error: " <> show err
Right c -> do Right c -> do
putText "c"
putText $ show c putText $ show c
fetchWorks :: () -> IO ()
fetchWorks _ = do
-- ec <- OA.fetchConcepts (Just 1) (Just 1) Nothing
ew <- OA.fetchWorks (Just 1) (Just 1) (Just "*")
case ew of
Left err -> putText $ "error: " <> show err
Right w -> do
putText $ show w
...@@ -14,6 +14,7 @@ rec { ...@@ -14,6 +14,7 @@ rec {
ps.tqdm ps.tqdm
]); ]);
nonhsBuildInputs = with pkgs; [ nonhsBuildInputs = with pkgs; [
gmp
jupyter jupyter
pythonEnv pythonEnv
zlib zlib
...@@ -21,6 +22,6 @@ rec { ...@@ -21,6 +22,6 @@ rec {
#libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs; #libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
shell = pkgs.mkShell { shell = pkgs.mkShell {
name = "openalex"; name = "openalex";
buildInputs = hsBuildInputs ++ nonhsBuildInputs; buildInputs = nonhsBuildInputs;
}; };
} }
...@@ -14,7 +14,9 @@ module OpenAlex ...@@ -14,7 +14,9 @@ module OpenAlex
( module OpenAlex.Client ( module OpenAlex.Client
, module OpenAlex.Types , module OpenAlex.Types
-- , fetchConcepts' -- , fetchConcepts'
, fetchConcepts ) , fetchConcepts
, fetchWorks
)
where where
-- import Data.Aeson -- import Data.Aeson
...@@ -27,7 +29,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings) ...@@ -27,7 +29,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude import Protolude
import OpenAlex.Client import OpenAlex.Client
import OpenAlex.ServantClientLogging import OpenAlex.ServantClientLogging
import OpenAlex.Types import OpenAlex.Types (ListOf(..), Page, PerPage, Cursor, Concept, Work)
import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM) import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM)
defaultClientEnv :: IO ClientEnv defaultClientEnv :: IO ClientEnv
...@@ -46,6 +48,11 @@ fetchConcepts mPage mPerPage mCursor = do ...@@ -46,6 +48,11 @@ fetchConcepts mPage mPerPage mCursor = do
env <- defaultClientEnv env <- defaultClientEnv
runClientM (concepts mPage mPerPage mCursor) env runClientM (concepts mPage mPerPage mCursor) env
-- | Fetch one page of OpenAlex works
-- (https://docs.openalex.org/api-entities/works/work-object),
-- building a fresh TLS client environment for the request.
fetchWorks :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> IO (Either ClientError (ListOf Work))
fetchWorks mPage mPerPage mCursor =
  defaultClientEnv >>= runClientM (works mPage mPerPage mCursor)
-- fetchConcepts' :: IO (Either Text (ListOf Concept)) -- fetchConcepts' :: IO (Either Text (ListOf Concept))
-- fetchConcepts' = do -- fetchConcepts' = do
-- manager <- newManager tlsManagerSettings -- manager <- newManager tlsManagerSettings
......
...@@ -16,7 +16,7 @@ import Protolude ...@@ -16,7 +16,7 @@ import Protolude
import Servant.API import Servant.API
import Servant.Client import Servant.Client
import OpenAlex.Types import OpenAlex.Types (Page, PerPage, Cursor, ListOf(..), Concept, Work)
type API_URL = Text type API_URL = Text
apiUrl :: API_URL apiUrl :: API_URL
...@@ -37,9 +37,19 @@ type OpenAlexAPI = ...@@ -37,9 +37,19 @@ type OpenAlexAPI =
-- TODO: filter, search, sort -- TODO: filter, search, sort
:> Get '[JSON] (ListOf Concept) :> Get '[JSON] (ListOf Concept)
-- https://docs.openalex.org/api-entities/works
:<|> "works"
:> QueryParam "page" Page
:> QueryParam "per-page" PerPage
:> QueryParam "cursor" Cursor
-- TODO: filter, search, sort
:> Get '[JSON] (ListOf Work)
openAlexApi :: Proxy OpenAlexAPI openAlexApi :: Proxy OpenAlexAPI
openAlexApi = Proxy openAlexApi = Proxy
concepts :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Concept) concepts :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Concept)
concepts {- :<|> fetch -} = client openAlexApi works :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Work)
concepts :<|> works = client openAlexApi
...@@ -17,13 +17,16 @@ module OpenAlex.Types where ...@@ -17,13 +17,16 @@ module OpenAlex.Types where
import Control.Monad.Fail (fail) import Control.Monad.Fail (fail)
import Data.Aeson import Data.Aeson
import Data.Scientific (floatingOrInteger) import Data.Scientific (floatingOrInteger)
import qualified Data.Text as T
import Data.Time (UTCTime)
import Data.Time.Calendar (Day) import Data.Time.Calendar (Day)
import Protolude hiding (Meta) import qualified Data.Time.Format as DTF
import Protolude hiding (Location, Meta)
type ConceptId = Text
type Count = Int type Count = Int
type Cursor = Text type Cursor = Text
type DOI = Text
data ExternalID = ExtIDUrl URL | ExtIDUrls [URL] | ExtIDInt Int data ExternalID = ExtIDUrl URL | ExtIDUrls [URL] | ExtIDInt Int
deriving (Generic, Show) deriving (Generic, Show)
instance FromJSON ExternalID where instance FromJSON ExternalID where
...@@ -36,19 +39,47 @@ instance FromJSON ExternalID where ...@@ -36,19 +39,47 @@ instance FromJSON ExternalID where
ids <- parseJSONList a ids <- parseJSONList a
pure $ ExtIDUrls ids pure $ ExtIDUrls ids
parseJSON _ = fail "Don't know how to handle this external id" parseJSON _ = fail "Don't know how to handle this external id"
type ISSN = Text
type ISSNL = Text
type Language = Text -- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html type Language = Text -- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html
type Level = Int type Level = Int
-- | Open-access status of a work, as reported by OpenAlex:
-- https://docs.openalex.org/api-entities/works/work-object#oa_status
data OAStatus = OAGold | OAGreen | OAHybrid | OABronze | OAClosed
  deriving (Generic, Show)

instance FromJSON OAStatus where
  -- Accept exactly the five lowercase status strings; any other string,
  -- and any non-string JSON value, is a parse failure.
  parseJSON (String s) = case s of
    "gold"   -> pure OAGold
    "green"  -> pure OAGreen
    "hybrid" -> pure OAHybrid
    "bronze" -> pure OABronze
    "closed" -> pure OAClosed
    _        -> fail "Don't know how to parse this oa status"
  parseJSON _ = fail "Don't know how to parse this oa status"
type OpenAlexID = Text
type Page = Int type Page = Int
type PerPage = Int type PerPage = Int
type URL = Text type URL = Text
type Year = Int type Year = Int
newtype CreatedDate = CreatedDate Day -- newtype CreatedDate = CreatedDate Day
deriving (Generic, Show) -- deriving (Generic, Show)
instance FromJSON CreatedDate -- instance FromJSON CreatedDate
newtype UpdatedDate = UpdatedDate Day -- newtype UpdatedDate = UpdatedDate Day
-- deriving (Generic, Show)
-- instance FromJSON UpdatedDate
-- | Parse a date/time value out of 'Text' using the given
-- 'Data.Time.Format' format string (interpreted in
-- 'DTF.defaultTimeLocale'), failing in 'MonadFail' when the input does
-- not match the format.
parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE fmt s =
  case DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s) of
    -- Include the offending input in the error: reporting only the
    -- format is useless when a whole document of dates is being parsed.
    Nothing -> fail $ "Cannot parse date " <> show s <> " with format " <> T.unpack fmt
    Just p  -> pure p
-- | A date as it appears in OpenAlex payloads: either a plain calendar
-- day ("2023-01-31") or a full timestamp.
data Date = DDay Day | DUTCTime UTCTime
  deriving (Generic, Show)

instance FromJSON Date where
  -- Try the short ISO day form first, then fall back to the
  -- ISO-8601 timestamp form.
  parseJSON = withText "Date" parseDate
    where
      parseDate txt =
            (DDay     <$> parseTimeE "%F" txt)
        <|> (DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" txt)

-- Both OpenAlex date fields share the same representation.
type CreatedDate = Date
type UpdatedDate = Date
-- https://docs.openalex.org/api-entities/concepts/concept-object#ids -- https://docs.openalex.org/api-entities/concepts/concept-object#ids
data ExternalDB = MAG | OpenAlex | UMLS_Cui | UMLS_Aui | Wikidata | Wikipedia data ExternalDB = MAG | OpenAlex | UMLS_Cui | UMLS_Aui | Wikidata | Wikipedia
...@@ -87,7 +118,7 @@ data Concept = Concept ...@@ -87,7 +118,7 @@ data Concept = Concept
, created_date :: CreatedDate , created_date :: CreatedDate
, description :: Text , description :: Text
, display_name :: Text , display_name :: Text
, id :: ConceptId , id :: OpenAlexID
, ids :: Map Text ExternalID -- TODO ExternalDB , ids :: Map Text ExternalID -- TODO ExternalDB
, image_thumbnail_url :: URL , image_thumbnail_url :: URL
, image_url :: URL , image_url :: URL
...@@ -127,7 +158,7 @@ instance FromJSON Concept where ...@@ -127,7 +158,7 @@ instance FromJSON Concept where
-- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object -- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object
data DehydratedConcept = DehydratedConcept data DehydratedConcept = DehydratedConcept
{ display_name :: Text { display_name :: Text
, id :: ConceptId , id :: OpenAlexID
, level :: Level , level :: Level
, wikidata :: Maybe URL , wikidata :: Maybe URL
} deriving (Generic, Show) } deriving (Generic, Show)
...@@ -143,10 +174,9 @@ instance FromJSON Ancestor ...@@ -143,10 +174,9 @@ instance FromJSON Ancestor
data CountByYear = CountByYear data CountByYear = CountByYear
{ year :: Year { year :: Year
, works_count :: Count , works_count :: Maybe Count
, cited_by_count :: Count , cited_by_count :: Count
} deriving (Generic, Show) } deriving (Generic, Show, FromJSON)
instance FromJSON CountByYear
data SummaryStats = SummaryStats data SummaryStats = SummaryStats
...@@ -160,3 +190,197 @@ instance FromJSON SummaryStats where ...@@ -160,3 +190,197 @@ instance FromJSON SummaryStats where
<*> v .: "h_index" <*> v .: "h_index"
<*> v .: "i10_index" <*> v .: "i10_index"
parseJSON _ = fail "Don't know how to parse this as SummaryStats" parseJSON _ = fail "Don't know how to parse this as SummaryStats"
-- | https://docs.openalex.org/api-entities/works/work-object
--
-- NOTE(review): several of these fields (e.g. doi, best_oa_location,
-- abstract_inverted_index) look nullable in live API responses —
-- confirm whether they need to become Maybe before crawling at scale.
data Work = Work
  { abstract_inverted_index :: Map Text [Int] -- TODO
  , authorships :: [Authorship]
  , apc_list :: APCList
  , apc_paid :: APCPaid
  , best_oa_location :: Location
  , biblio :: Biblio
  , cited_by_api_url :: Text
  , cited_by_count :: Count
  , concepts :: [DehydratedConcept]
  , corresponding_author_ids :: [OpenAlexID]
  , corresponding_institution_ids :: [OpenAlexID]
  , counts_by_year :: [CountByYear]
  , created_date :: CreatedDate
  , display_name :: Text
  , doi :: DOI
  , grants :: [Grant]
  , id :: OpenAlexID
  , ids :: Map Text ExternalID -- TODO ExternalDB
  , is_paratext :: Bool
  , is_retracted :: Bool
  , language :: Text
  , locations :: [Location]
  , locations_count :: Count
  , mesh :: [MeSH]
  , ngrams_url :: URL
  , open_access :: OpenAccess
  , primary_location :: Location
  , publication_date :: CreatedDate
  , publication_year :: Year
  , referenced_works :: [OpenAlexID]
  , related_works :: [OpenAlexID]
  , title :: Text
  , type_ :: Text
  , updated_date :: UpdatedDate
  , is_oa :: Maybe Bool
  , license :: Maybe Text
  , url :: Maybe URL
  , version :: Maybe Text
  } deriving (Generic, Show)

instance FromJSON Work where
  -- Hand-written rather than Generic-derived because the JSON key
  -- "type" maps onto the field type_ (a bare `type` field is illegal
  -- Haskell), and the trailing four fields are optional (.:?).
  parseJSON = withObject "Work" $ \o -> do
    abstract_inverted_index <- o .: "abstract_inverted_index"
    authorships <- o .: "authorships"
    apc_list <- o .: "apc_list"
    apc_paid <- o .: "apc_paid"
    best_oa_location <- o .: "best_oa_location"
    biblio <- o .: "biblio"
    cited_by_api_url <- o .: "cited_by_api_url"
    cited_by_count <- o .: "cited_by_count"
    concepts <- o .: "concepts"
    corresponding_author_ids <- o .: "corresponding_author_ids"
    corresponding_institution_ids <- o .: "corresponding_institution_ids"
    counts_by_year <- o .: "counts_by_year"
    created_date <- o .: "created_date"
    display_name <- o .: "display_name"
    doi <- o .: "doi"
    grants <- o .: "grants"
    id <- o .: "id"
    ids <- o .: "ids"
    is_paratext <- o .: "is_paratext"
    is_retracted <- o .: "is_retracted"
    language <- o .: "language"
    locations <- o .: "locations"
    locations_count <- o .: "locations_count"
    mesh <- o .: "mesh"
    ngrams_url <- o .: "ngrams_url"
    open_access <- o .: "open_access"
    primary_location <- o .: "primary_location"
    publication_date <- o .: "publication_date"
    publication_year <- o .: "publication_year"
    referenced_works <- o .: "referenced_works"
    related_works <- o .: "related_works"
    title <- o .: "title"
    type_ <- o .: "type"
    updated_date <- o .: "updated_date"
    is_oa <- o .:? "is_oa"
    license <- o .:? "license"
    url <- o .:? "url"
    version <- o .:? "version"
    pure $ Work { .. }
-- | Article-processing charge as listed by the publisher.
-- Field order is kept stable: Show and Generic deriving depend on it.
data APCList = APCList
  { value :: Int
  , currency :: Text
  , provenance :: Text
  , value_usd :: Int
  } deriving (Generic, Show, FromJSON)

-- | Article-processing charge actually paid (same shape as 'APCList',
-- kept as a distinct type so the two cannot be mixed up).
data APCPaid = APCPaid
  { value :: Int
  , currency :: Text
  , provenance :: Text
  , value_usd :: Int
  } deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object/authorship-object
data Authorship = Authorship
  { author :: DehydratedAuthor
  , author_position :: Text
  , institutions :: [DehydratedInstitution]
  , is_corresponding :: Maybe Bool
  , raw_affiliation_string :: Text
  } deriving (Generic, Show, FromJSON)

-- | Bibliographic position of a work within its venue.
data Biblio = Biblio
  { volume :: Text
  , issue :: Text
  , first_page :: Text
  , last_page :: Text
  } deriving (Generic, Show, FromJSON)

-- | Minimal author record embedded in an 'Authorship'.
data DehydratedAuthor = DehydratedAuthor
  { id :: OpenAlexID
  , display_name :: Text
  , orcid :: Maybe URL
  } deriving (Generic, Show, FromJSON)
-- | Stripped-down institution record embedded in authorships.
data DehydratedInstitution = DehydratedInstitution
  { id :: OpenAlexID
  , display_name :: Text
  , ror :: Text
  , country_code :: Text
  , type_ :: Text
  } deriving (Generic, Show)

instance FromJSON DehydratedInstitution where
  -- Use withObject (matching the DehydratedSource instance) so a type
  -- mismatch reports what was actually found instead of a fixed string.
  -- Hand-written because the JSON key "type" maps to the field type_.
  parseJSON = withObject "DehydratedInstitution" $ \o -> do
    id <- o .: "id"
    display_name <- o .: "display_name"
    ror <- o .: "ror"
    country_code <- o .: "country_code"
    type_ <- o .: "type"
    pure $ DehydratedInstitution { .. }
-- | Grant acknowledged by a work.
data Grant = Grant
  { funder :: OpenAlexID
  , funder_display_name :: Text
  , award_id :: Text
  } deriving (Generic, Show, FromJSON)

-- | A place a work can be read (repository, journal page, ...).
data Location = Location
  { is_oa :: Bool
  , landing_page_url :: URL
  , license :: Text
  , source :: DehydratedSource
  , pdf_url :: Maybe URL
  , version :: Maybe Text
  } deriving (Generic, Show, FromJSON)

-- | PubMed only, https://docs.openalex.org/api-entities/works/work-object#mesh
data MeSH = MeSH
  { descriptor_ui :: Text
  , descriptor_name :: Text
  , qualifier_ui :: Text
  , qualifier_name :: Text
  , is_major_topic :: Bool
  } deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object#the-openaccess-object
data OpenAccess = OpenAccess
  { any_repository_has_fulltext :: Bool
  , is_oa :: Bool
  , oa_status :: OAStatus
  , oa_url :: URL -- NOTE(review): the API marks oa_url nullable — TODO confirm
  } deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/sources/source-object#the-dehydratedsource-object
data DehydratedSource = DehydratedSource
  { display_name :: Text
  , host_organization :: Text
  , host_organization_lineage :: [OpenAlexID]
  , host_organization_name :: Text
  , id :: OpenAlexID
  , is_in_doaj :: Bool
  , issn :: [ISSN]
  , issn_l :: ISSNL
  , type_ :: Text
  } deriving (Generic, Show)

instance FromJSON DehydratedSource where
  -- Hand-written: the JSON key "type" cannot be a Haskell field name,
  -- so it binds to type_; every other key matches its field verbatim.
  parseJSON = withObject "DehydratedSource" $ \o -> do
    display_name <- o .: "display_name"
    host_organization <- o .: "host_organization"
    host_organization_lineage <- o .: "host_organization_lineage"
    host_organization_name <- o .: "host_organization_name"
    id <- o .: "id"
    is_in_doaj <- o .: "is_in_doaj"
    issn <- o .: "issn"
    issn_l <- o .: "issn_l"
    type_ <- o .: "type"
    pure $ DehydratedSource { .. }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.