Document parsing works now

parent 72a4c71e
# Open Alex Database API Crawler for GarganText
## Compilation
For non-GHC dependencies, use Nix.
For GHC itself, use ghcup and install GHC 8.10.7.
## Running
``` shell
...
```

@@ -16,6 +16,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
(const fetchConcepts)
(pure ())
addCommand "works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
(const fetchWorks)
(pure ())
runCmd ()
@@ -27,5 +31,13 @@ fetchConcepts _ = do
case ec of
Left err -> putText $ "error: " <> show err
Right c -> do
putText "c"
putText $ show c
fetchWorks :: () -> IO ()
fetchWorks _ = do
-- ec <- OA.fetchConcepts (Just 1) (Just 1) Nothing
ew <- OA.fetchWorks (Just 1) (Just 1) (Just "*")
case ew of
Left err -> putText $ "error: " <> show err
Right w -> do
putText $ show w
@@ -14,6 +14,7 @@ rec {
ps.tqdm
]);
nonhsBuildInputs = with pkgs; [
gmp
jupyter
pythonEnv
zlib
@@ -21,6 +22,6 @@ rec {
#libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
shell = pkgs.mkShell {
name = "openalex";
buildInputs = hsBuildInputs ++ nonhsBuildInputs;
buildInputs = nonhsBuildInputs;
};
}
@@ -14,7 +14,9 @@ module OpenAlex
( module OpenAlex.Client
, module OpenAlex.Types
-- , fetchConcepts'
, fetchConcepts )
, fetchConcepts
, fetchWorks
)
where
-- import Data.Aeson
@@ -27,7 +29,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
import OpenAlex.Client
import OpenAlex.ServantClientLogging
import OpenAlex.Types
import OpenAlex.Types (ListOf(..), Page, PerPage, Cursor, Concept, Work)
import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM)
defaultClientEnv :: IO ClientEnv
@@ -46,6 +48,11 @@ fetchConcepts mPage mPerPage mCursor = do
env <- defaultClientEnv
runClientM (concepts mPage mPerPage mCursor) env
fetchWorks :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> IO (Either ClientError (ListOf Work))
fetchWorks mPage mPerPage mCursor = do
env <- defaultClientEnv
runClientM (works mPage mPerPage mCursor) env
-- fetchConcepts' :: IO (Either Text (ListOf Concept))
-- fetchConcepts' = do
-- manager <- newManager tlsManagerSettings
...
@@ -16,7 +16,7 @@ import Protolude
import Servant.API
import Servant.Client
import OpenAlex.Types
import OpenAlex.Types (Page, PerPage, Cursor, ListOf(..), Concept, Work)
type API_URL = Text
apiUrl :: API_URL
@@ -37,9 +37,19 @@ type OpenAlexAPI =
-- TODO: filter, search, sort
:> Get '[JSON] (ListOf Concept)
-- https://docs.openalex.org/api-entities/works
:<|> "works"
:> QueryParam "page" Page
:> QueryParam "per-page" PerPage
:> QueryParam "cursor" Cursor
-- TODO: filter, search, sort
:> Get '[JSON] (ListOf Work)
openAlexApi :: Proxy OpenAlexAPI
openAlexApi = Proxy
concepts :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Concept)
concepts {- :<|> fetch -} = client openAlexApi
works :: Maybe Page -> Maybe PerPage -> Maybe Cursor -> ClientM (ListOf Work)
concepts :<|> works = client openAlexApi
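
The left-hand pattern here mirrors the `:<|>` structure of `OpenAlexAPI`: servant-client's `client` yields one function per endpoint, in declaration order, so adding the `works` route means adding one more name on the left. A minimal, self-contained illustration of that mechanism, using a toy API with hypothetical names rather than anything from this module:

``` haskell
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}

import Data.Proxy (Proxy (..))
import Data.Text (Text)
import Servant.API
import Servant.Client

-- Toy two-endpoint API: 'client' returns one ClientM action per
-- alternative, which the left-hand :<|> pattern splits positionally.
type ToyAPI =
       "ping" :> Get '[JSON] Int
  :<|> "echo" :> QueryParam "msg" Text :> Get '[JSON] Text

ping :: ClientM Int
echo :: Maybe Text -> ClientM Text
ping :<|> echo = client (Proxy :: Proxy ToyAPI)
```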
@@ -17,13 +17,16 @@ module OpenAlex.Types where
import Control.Monad.Fail (fail)
import Data.Aeson
import Data.Scientific (floatingOrInteger)
import qualified Data.Text as T
import Data.Time (UTCTime)
import Data.Time.Calendar (Day)
import Protolude hiding (Meta)
import qualified Data.Time.Format as DTF
import Protolude hiding (Location, Meta)
type ConceptId = Text
type Count = Int
type Cursor = Text
type DOI = Text
data ExternalID = ExtIDUrl URL | ExtIDUrls [URL] | ExtIDInt Int
deriving (Generic, Show)
instance FromJSON ExternalID where
@@ -36,19 +39,47 @@ instance FromJSON ExternalID where
ids <- parseJSONList a
pure $ ExtIDUrls ids
parseJSON _ = fail "Don't know how to handle this external id"
type ISSN = Text
type ISSNL = Text
type Language = Text -- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html
type Level = Int
-- |https://docs.openalex.org/api-entities/works/work-object#oa_status
data OAStatus = OAGold | OAGreen | OAHybrid | OABronze | OAClosed
deriving (Generic, Show)
instance FromJSON OAStatus where
parseJSON (String "gold") = pure OAGold
parseJSON (String "green") = pure OAGreen
parseJSON (String "hybrid") = pure OAHybrid
parseJSON (String "bronze") = pure OABronze
parseJSON (String "closed") = pure OAClosed
parseJSON _ = fail "Don't know how to parse this oa status"
type OpenAlexID = Text
type Page = Int
type PerPage = Int
type URL = Text
type Year = Int
newtype CreatedDate = CreatedDate Day
deriving (Generic, Show)
instance FromJSON CreatedDate
newtype UpdatedDate = UpdatedDate Day
-- newtype CreatedDate = CreatedDate Day
-- deriving (Generic, Show)
-- instance FromJSON CreatedDate
-- newtype UpdatedDate = UpdatedDate Day
-- deriving (Generic, Show)
-- instance FromJSON UpdatedDate
parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE fmt s = case (DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s)) of
Nothing -> fail $ "Cannot parse date with format " <> T.unpack fmt
Just p -> pure p
data Date = DDay Day | DUTCTime UTCTime
deriving (Generic, Show)
instance FromJSON UpdatedDate
instance FromJSON Date where
parseJSON = withText "Date" $ \s ->
(DDay <$> parseTimeE "%F" s) <|>
(DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" s)
type CreatedDate = Date
type UpdatedDate = Date
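
The two branches of the `Date` parser cover both shapes OpenAlex returns: plain `"%F"` dates (e.g. `created_date`) and full ISO-8601 timestamps (e.g. `updated_date`). A quick way to sanity-check the instance with aeson, sketched here under the assumption that `OpenAlex.Types` is on the build path:

``` haskell
{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson (eitherDecode)
import OpenAlex.Types (Date (..))

-- Plain dates should parse as DDay, ISO-8601 timestamps as DUTCTime.
checkDateParsing :: IO ()
checkDateParsing = do
  print (eitherDecode "\"2023-04-01\""          :: Either String Date)
  print (eitherDecode "\"2023-04-01T12:30:00\"" :: Either String Date)
```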
-- https://docs.openalex.org/api-entities/concepts/concept-object#ids
data ExternalDB = MAG | OpenAlex | UMLS_Cui | UMLS_Aui | Wikidata | Wikipedia
@@ -87,7 +118,7 @@ data Concept = Concept
, created_date :: CreatedDate
, description :: Text
, display_name :: Text
, id :: ConceptId
, id :: OpenAlexID
, ids :: Map Text ExternalID -- TODO ExternalDB
, image_thumbnail_url :: URL
, image_url :: URL
@@ -127,7 +158,7 @@ instance FromJSON Concept where
-- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object
data DehydratedConcept = DehydratedConcept
{ display_name :: Text
, id :: ConceptId
, id :: OpenAlexID
, level :: Level
, wikidata :: Maybe URL
} deriving (Generic, Show)
@@ -143,10 +174,9 @@ instance FromJSON Ancestor
data CountByYear = CountByYear
{ year :: Year
, works_count :: Count
, works_count :: Maybe Count
, cited_by_count :: Count
} deriving (Generic, Show)
instance FromJSON CountByYear
} deriving (Generic, Show, FromJSON)
data SummaryStats = SummaryStats
@@ -160,3 +190,197 @@ instance FromJSON SummaryStats where
<*> v .: "h_index"
<*> v .: "i10_index"
parseJSON _ = fail "Don't know how to parse this as SummaryStats"
-- | https://docs.openalex.org/api-entities/works/work-object
data Work = Work
{ abstract_inverted_index :: Map Text [Int] -- TODO
, authorships :: [Authorship]
, apc_list :: APCList
, apc_paid :: APCPaid
, best_oa_location :: Location
, biblio :: Biblio
, cited_by_api_url :: Text
, cited_by_count :: Count
, concepts :: [DehydratedConcept]
, corresponding_author_ids :: [OpenAlexID]
, corresponding_institution_ids :: [OpenAlexID]
, counts_by_year :: [CountByYear]
, created_date :: CreatedDate
, display_name :: Text
, doi :: DOI
, grants :: [Grant]
, id :: OpenAlexID
, ids :: Map Text ExternalID -- TODO ExternalDB
, is_paratext :: Bool
, is_retracted :: Bool
, language :: Text
, locations :: [Location]
, locations_count :: Count
, mesh :: [MeSH]
, ngrams_url :: URL
, open_access :: OpenAccess
, primary_location :: Location
, publication_date :: CreatedDate
, publication_year :: Year
, referenced_works :: [OpenAlexID]
, related_works :: [OpenAlexID]
, title :: Text
, type_ :: Text
, updated_date :: UpdatedDate
, is_oa :: Maybe Bool
, license :: Maybe Text
, url :: Maybe URL
, version :: Maybe Text
} deriving (Generic, Show)
instance FromJSON Work where
parseJSON = withObject "Work" $ \v -> do
abstract_inverted_index <- v .: "abstract_inverted_index"
authorships <- v .: "authorships"
apc_list <- v .: "apc_list"
apc_paid <- v .: "apc_paid"
best_oa_location <- v .: "best_oa_location"
biblio <- v .: "biblio"
cited_by_api_url <- v .: "cited_by_api_url"
cited_by_count <- v .: "cited_by_count"
concepts <- v .: "concepts"
corresponding_author_ids <- v .: "corresponding_author_ids"
corresponding_institution_ids <- v .: "corresponding_institution_ids"
counts_by_year <- v .: "counts_by_year"
created_date <- v .: "created_date"
display_name <- v .: "display_name"
doi <- v .: "doi"
grants <- v .: "grants"
id <- v .: "id"
ids <- v .: "ids"
is_paratext <- v .: "is_paratext"
is_retracted <- v .: "is_retracted"
language <- v .: "language"
locations <- v .: "locations"
locations_count <- v .: "locations_count"
mesh <- v .: "mesh"
ngrams_url <- v .: "ngrams_url"
open_access <- v .: "open_access"
primary_location <- v .: "primary_location"
publication_date <- v .: "publication_date"
publication_year <- v .: "publication_year"
referenced_works <- v .: "referenced_works"
related_works <- v .: "related_works"
title <- v .: "title"
type_ <- v .: "type"
updated_date <- v .: "updated_date"
is_oa <- v .:? "is_oa"
license <- v .:? "license"
url <- v .:? "url"
version <- v .:? "version"
pure $ Work { .. }
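
The `abstract_inverted_index` field (marked TODO above) maps each token of the abstract to the positions at which it occurs. One way to flatten it back into plain text, given here only as an illustrative sketch rather than something this commit adds:

``` haskell
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import Data.Text (Text)
import qualified Data.Text as T

-- Rebuild an abstract from the inverted index: pair every token with
-- each of its positions, sort by position, and join with spaces.
uninvertAbstract :: Map.Map Text [Int] -> Text
uninvertAbstract idx =
  T.unwords . map snd . List.sortOn fst $
    [ (pos, tok) | (tok, positions) <- Map.toList idx, pos <- positions ]
```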
data APCList = APCList
{ value :: Int
, currency :: Text
, provenance :: Text
, value_usd :: Int
} deriving (Generic, Show, FromJSON)
data APCPaid = APCPaid
{ value :: Int
, currency :: Text
, provenance :: Text
, value_usd :: Int
} deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object/authorship-object
data Authorship = Authorship
{ author :: DehydratedAuthor
, author_position :: Text
, institutions :: [DehydratedInstitution]
, is_corresponding :: Maybe Bool
, raw_affiliation_string :: Text
} deriving (Generic, Show, FromJSON)
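
`author_position` is kept as raw `Text`; the OpenAlex docs enumerate only `"first"`, `"middle"` and `"last"`, so a stricter variant could mirror the `OAStatus` approach. A sketch only, with hypothetical constructor names, not something this module defines:

``` haskell
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson
import GHC.Generics (Generic)

-- Hypothetical stricter author_position, parsed the same way as OAStatus.
data AuthorPosition = PositionFirst | PositionMiddle | PositionLast
  deriving (Generic, Show)

instance FromJSON AuthorPosition where
  parseJSON (String "first")  = pure PositionFirst
  parseJSON (String "middle") = pure PositionMiddle
  parseJSON (String "last")   = pure PositionLast
  parseJSON _                 = fail "Don't know how to parse this author position"
```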
data Biblio = Biblio
{ volume :: Text
, issue :: Text
, first_page :: Text
, last_page :: Text
} deriving (Generic, Show, FromJSON)
data DehydratedAuthor = DehydratedAuthor
{ id :: OpenAlexID
, display_name :: Text
, orcid :: Maybe URL
} deriving (Generic, Show, FromJSON)
data DehydratedInstitution = DehydratedInstitution
{ id :: OpenAlexID
, display_name :: Text
, ror :: Text
, country_code :: Text
, type_ :: Text
} deriving (Generic, Show)
instance FromJSON DehydratedInstitution where
parseJSON (Object v) = do
id <- v .: "id"
display_name <- v .: "display_name"
ror <- v .: "ror"
country_code <- v .: "country_code"
type_ <- v .: "type"
pure $ DehydratedInstitution { .. }
parseJSON _ = fail "Don't know how to parse a dehydrated institution from a non-object"
data Grant = Grant
{ funder :: OpenAlexID
, funder_display_name :: Text
, award_id :: Text
} deriving (Generic, Show, FromJSON)
data Location = Location
{ is_oa :: Bool
, landing_page_url :: URL
, license :: Text
, source :: DehydratedSource
, pdf_url :: Maybe URL
, version :: Maybe Text
} deriving (Generic, Show, FromJSON)
-- | PubMed only, https://docs.openalex.org/api-entities/works/work-object#mesh
data MeSH = MeSH
{ descriptor_ui :: Text
, descriptor_name :: Text
, qualifier_ui :: Text
, qualifier_name :: Text
, is_major_topic :: Bool
} deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object#the-openaccess-object
data OpenAccess = OpenAccess
{ any_repository_has_fulltext :: Bool
, is_oa :: Bool
, oa_status :: OAStatus
, oa_url :: URL
} deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/sources/source-object#the-dehydratedsource-object
data DehydratedSource = DehydratedSource
{ display_name :: Text
, host_organization :: Text
, host_organization_lineage :: [OpenAlexID]
, host_organization_name :: Text
, id :: OpenAlexID
, is_in_doaj :: Bool
, issn :: [ISSN]
, issn_l :: ISSNL
, type_ :: Text
} deriving (Generic, Show)
instance FromJSON DehydratedSource where
parseJSON = withObject "DehydratedSource" $ \v -> do
display_name <- v .: "display_name"
host_organization <- v .: "host_organization"
host_organization_lineage <- v .: "host_organization_lineage"
host_organization_name <- v .: "host_organization_name"
id <- v .: "id"
is_in_doaj <- v .: "is_in_doaj"
issn <- v .: "issn"
issn_l <- v .: "issn_l"
type_ <- v .: "type"
pure $ DehydratedSource { .. }
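
The hand-written instances for DehydratedInstitution and DehydratedSource appear to exist mainly because the JSON key `"type"` collides with Haskell's `type` keyword, forcing the `type_` field name. An alternative that keeps generic deriving is aeson's `fieldLabelModifier`; the record below is a hypothetical stand-in, shown only as a sketch of that option rather than what this module does:

``` haskell
{-# LANGUAGE DeriveGeneric #-}

import Data.Aeson
import GHC.Generics (Generic)

-- Drop the trailing underscore used to dodge the "type" keyword, so the
-- field type_ is read from the JSON key "type".
stripTrailingUnderscore :: String -> String
stripTrailingUnderscore s = case reverse s of
  '_' : rest -> reverse rest
  _          -> s

openAlexFieldOptions :: Options
openAlexFieldOptions =
  defaultOptions { fieldLabelModifier = stripTrailingUnderscore }

-- Hypothetical record standing in for DehydratedSource / DehydratedInstitution.
data Sourceish = Sourceish
  { display_name :: String
  , type_        :: String
  } deriving (Generic, Show)

instance FromJSON Sourceish where
  parseJSON = genericParseJSON openAlexFieldOptions
```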