Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
openalex
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
openalex
Commits
5aac73a7
Verified
Commit
5aac73a7
authored
Jun 28, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
document parsing works now
parent
72a4c71e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
278 additions
and
18 deletions
+278
-18
README.md
README.md
+6
-0
Main.hs
app/Main.hs
+13
-1
pkgs.nix
nix/pkgs.nix
+2
-1
OpenAlex.hs
src/OpenAlex.hs
+9
-2
Client.hs
src/OpenAlex/Client.hs
+12
-2
Types.hs
src/OpenAlex/Types.hs
+236
-12
No files found.
README.md
View file @
5aac73a7
# Open Alex Database API Crawler for GarganText
# Open Alex Database API Crawler for GarganText
## Compilation
For everything except GHC (system libraries and other non-Haskell tooling), use Nix.
For GHC itself, install version 8.10.7 with ghcup.
## Running
## Running
```
shell
```
shell
...
...
app/Main.hs
View file @
5aac73a7
...
@@ -16,6 +16,10 @@ main = do
...
@@ -16,6 +16,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
(
const
fetchConcepts
)
(
const
fetchConcepts
)
(
pure
()
)
(
pure
()
)
addCommand
"works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
(
const
fetchWorks
)
(
pure
()
)
runCmd
()
runCmd
()
...
@@ -27,5 +31,13 @@ fetchConcepts _ = do
...
@@ -27,5 +31,13 @@ fetchConcepts _ = do
case
ec
of
case
ec
of
Left
err
->
putText
$
"error: "
<>
show
err
Left
err
->
putText
$
"error: "
<>
show
err
Right
c
->
do
Right
c
->
do
putText
"c"
putText
$
show
c
putText
$
show
c
-- | Handler for the @works@ sub-command.
--
-- Requests works through 'OA.fetchWorks' with page @1@, per-page @1@ and
-- cursor @"*"@, then prints either the client error (prefixed with
-- @"error: "@) or the shown result.
fetchWorks :: () -> IO ()
fetchWorks _ =
  OA.fetchWorks (Just 1) (Just 1) (Just "*") >>= either reportFailure printResult
  where
    -- Failure branch: same "error: " prefix as the concepts command.
    reportFailure err = putText $ "error: " <> show err
    -- Success branch: dump the decoded value using its Show instance.
    printResult w = putText $ show w
nix/pkgs.nix
View file @
5aac73a7
...
@@ -14,6 +14,7 @@ rec {
...
@@ -14,6 +14,7 @@ rec {
ps
.
tqdm
ps
.
tqdm
]);
]);
nonhsBuildInputs
=
with
pkgs
;
[
nonhsBuildInputs
=
with
pkgs
;
[
gmp
jupyter
jupyter
pythonEnv
pythonEnv
zlib
zlib
...
@@ -21,6 +22,6 @@ rec {
...
@@ -21,6 +22,6 @@ rec {
#libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
#libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
shell
=
pkgs
.
mkShell
{
shell
=
pkgs
.
mkShell
{
name
=
"openalex"
;
name
=
"openalex"
;
buildInputs
=
hsBuildInputs
++
nonhsBuildInputs
;
buildInputs
=
nonhsBuildInputs
;
};
};
}
}
src/OpenAlex.hs
View file @
5aac73a7
...
@@ -14,7 +14,9 @@ module OpenAlex
...
@@ -14,7 +14,9 @@ module OpenAlex
(
module
OpenAlex
.
Client
(
module
OpenAlex
.
Client
,
module
OpenAlex
.
Types
,
module
OpenAlex
.
Types
-- , fetchConcepts'
-- , fetchConcepts'
,
fetchConcepts
)
,
fetchConcepts
,
fetchWorks
)
where
where
-- import Data.Aeson
-- import Data.Aeson
...
@@ -27,7 +29,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
...
@@ -27,7 +29,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings)
import
Protolude
import
Protolude
import
OpenAlex.Client
import
OpenAlex.Client
import
OpenAlex.ServantClientLogging
import
OpenAlex.ServantClientLogging
import
OpenAlex.Types
import
OpenAlex.Types
(
ListOf
(
..
),
Page
,
PerPage
,
Cursor
,
Concept
,
Work
)
import
Servant.Client
(
BaseUrl
(
..
),
ClientEnv
(
..
),
ClientError
,
Scheme
(
Https
),
defaultMakeClientRequest
,
mkClientEnv
,
runClientM
)
import
Servant.Client
(
BaseUrl
(
..
),
ClientEnv
(
..
),
ClientError
,
Scheme
(
Https
),
defaultMakeClientRequest
,
mkClientEnv
,
runClientM
)
defaultClientEnv
::
IO
ClientEnv
defaultClientEnv
::
IO
ClientEnv
...
@@ -46,6 +48,11 @@ fetchConcepts mPage mPerPage mCursor = do
...
@@ -46,6 +48,11 @@ fetchConcepts mPage mPerPage mCursor = do
env
<-
defaultClientEnv
env
<-
defaultClientEnv
runClientM
(
concepts
mPage
mPerPage
mCursor
)
env
runClientM
(
concepts
mPage
mPerPage
mCursor
)
env
-- | Run the 'works' client call in 'IO'.
--
-- Obtains a client environment from 'defaultClientEnv' and runs the request
-- with 'runClientM', returning either the 'ClientError' or the decoded list
-- of works. The three arguments are passed to 'works' unchanged.
fetchWorks
  :: Maybe Page
  -> Maybe PerPage
  -> Maybe Cursor
  -> IO (Either ClientError (ListOf Work))
fetchWorks mPage mPerPage mCursor =
  defaultClientEnv >>= runClientM (works mPage mPerPage mCursor)
-- fetchConcepts' :: IO (Either Text (ListOf Concept))
-- fetchConcepts' :: IO (Either Text (ListOf Concept))
-- fetchConcepts' = do
-- fetchConcepts' = do
-- manager <- newManager tlsManagerSettings
-- manager <- newManager tlsManagerSettings
...
...
src/OpenAlex/Client.hs
View file @
5aac73a7
...
@@ -16,7 +16,7 @@ import Protolude
...
@@ -16,7 +16,7 @@ import Protolude
import
Servant.API
import
Servant.API
import
Servant.Client
import
Servant.Client
import
OpenAlex.Types
import
OpenAlex.Types
(
Page
,
PerPage
,
Cursor
,
ListOf
(
..
),
Concept
,
Work
)
type
API_URL
=
Text
type
API_URL
=
Text
apiUrl
::
API_URL
apiUrl
::
API_URL
...
@@ -37,9 +37,19 @@ type OpenAlexAPI =
...
@@ -37,9 +37,19 @@ type OpenAlexAPI =
-- TODO: filter, search, sort
-- TODO: filter, search, sort
:>
Get
'[
J
SON
]
(
ListOf
Concept
)
:>
Get
'[
J
SON
]
(
ListOf
Concept
)
-- https://docs.openalex.org/api-entities/works
:<|>
"works"
:>
QueryParam
"page"
Page
:>
QueryParam
"per-page"
PerPage
:>
QueryParam
"cursor"
Cursor
-- TODO: filter, search, sort
:>
Get
'[
J
SON
]
(
ListOf
Work
)
openAlexApi
::
Proxy
OpenAlexAPI
openAlexApi
::
Proxy
OpenAlexAPI
openAlexApi
=
Proxy
openAlexApi
=
Proxy
concepts
::
Maybe
Page
->
Maybe
PerPage
->
Maybe
Cursor
->
ClientM
(
ListOf
Concept
)
concepts
::
Maybe
Page
->
Maybe
PerPage
->
Maybe
Cursor
->
ClientM
(
ListOf
Concept
)
concepts
{- :<|> fetch -}
=
client
openAlexApi
-- | Client function for the @"works"@ route of 'OpenAlexAPI'. The arguments
-- are the optional @page@, @per-page@ and @cursor@ query parameters, in that
-- order.
works
  :: Maybe Page
  -> Maybe PerPage
  -> Maybe Cursor
  -> ClientM (ListOf Work)

-- 'client' yields one function per route, combined with ':<|>' in the same
-- order as the routes appear in 'OpenAlexAPI' (concepts first, works second).
concepts :<|> works = client openAlexApi
src/OpenAlex/Types.hs
View file @
5aac73a7
...
@@ -17,13 +17,16 @@ module OpenAlex.Types where
...
@@ -17,13 +17,16 @@ module OpenAlex.Types where
import
Control.Monad.Fail
(
fail
)
import
Control.Monad.Fail
(
fail
)
import
Data.Aeson
import
Data.Aeson
import
Data.Scientific
(
floatingOrInteger
)
import
Data.Scientific
(
floatingOrInteger
)
import
qualified
Data.Text
as
T
import
Data.Time
(
UTCTime
)
import
Data.Time.Calendar
(
Day
)
import
Data.Time.Calendar
(
Day
)
import
Protolude
hiding
(
Meta
)
import
qualified
Data.Time.Format
as
DTF
import
Protolude
hiding
(
Location
,
Meta
)
type
ConceptId
=
Text
type
Count
=
Int
type
Count
=
Int
type
Cursor
=
Text
type
Cursor
=
Text
type
DOI
=
Text
data
ExternalID
=
ExtIDUrl
URL
|
ExtIDUrls
[
URL
]
|
ExtIDInt
Int
data
ExternalID
=
ExtIDUrl
URL
|
ExtIDUrls
[
URL
]
|
ExtIDInt
Int
deriving
(
Generic
,
Show
)
deriving
(
Generic
,
Show
)
instance
FromJSON
ExternalID
where
instance
FromJSON
ExternalID
where
...
@@ -36,19 +39,47 @@ instance FromJSON ExternalID where
...
@@ -36,19 +39,47 @@ instance FromJSON ExternalID where
ids
<-
parseJSONList
a
ids
<-
parseJSONList
a
pure
$
ExtIDUrls
ids
pure
$
ExtIDUrls
ids
parseJSON
_
=
fail
"Don't know how to handle this external id"
parseJSON
_
=
fail
"Don't know how to handle this external id"
type
ISSN
=
Text
type
ISSNL
=
Text
type
Language
=
Text
-- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html
type
Language
=
Text
-- TODO: https://doc.wikimedia.org/mediawiki-core/master/php/Names_8php_source.html
type
Level
=
Int
type
Level
=
Int
-- | Open-access status of a work, as found in the @oa_status@ field.
--
-- See <https://docs.openalex.org/api-entities/works/work-object#oa_status>.
--
-- NOTE(review): only the five statuses below are recognised; if the API
-- reports any other value (e.g. a newly introduced status) decoding fails.
-- Confirm the list against the current API documentation.
data OAStatus = OAGold | OAGreen | OAHybrid | OABronze | OAClosed
  deriving (Generic, Show)

-- | Decodes the lowercase status strings @gold@, @green@, @hybrid@,
-- @bronze@ and @closed@.
--
-- Non-string JSON values are rejected by 'withText'; unknown strings fail
-- with a message that includes the offending value, so that an unexpected
-- status can be identified from the error alone.
instance FromJSON OAStatus where
  parseJSON = withText "OAStatus" $ \status ->
    case status of
      "gold"   -> pure OAGold
      "green"  -> pure OAGreen
      "hybrid" -> pure OAHybrid
      "bronze" -> pure OABronze
      "closed" -> pure OAClosed
      other    -> fail $ "Don't know how to parse this oa status: " <> T.unpack other
type
OpenAlexID
=
Text
type
Page
=
Int
type
Page
=
Int
type
PerPage
=
Int
type
PerPage
=
Int
type
URL
=
Text
type
URL
=
Text
type
Year
=
Int
type
Year
=
Int
newtype
CreatedDate
=
CreatedDate
Day
-- newtype CreatedDate = CreatedDate Day
deriving
(
Generic
,
Show
)
-- deriving (Generic, Show)
instance
FromJSON
CreatedDate
-- instance FromJSON CreatedDate
newtype
UpdatedDate
=
UpdatedDate
Day
-- newtype UpdatedDate = UpdatedDate Day
-- deriving (Generic, Show)
-- instance FromJSON UpdatedDate
-- | Parse the text @s@ using the @Data.Time.Format@ format string @fmt@.
--
-- Parsing uses 'DTF.defaultTimeLocale' and accepts surrounding whitespace
-- (the 'True' flag of 'DTF.parseTimeM'). On success the parsed value is
-- returned with 'pure'; otherwise the computation 'fail's with a message
-- naming both the format and the rejected input.
parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE fmt s =
  maybe (fail failureMsg) pure $
    DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s)
  where
    -- Keep the original prefix and append the input, so the error tells the
    -- reader which value could not be parsed.
    failureMsg =
      "Cannot parse date with format " <> T.unpack fmt <> ": " <> T.unpack s
-- | A date value that is either a plain calendar day or a full timestamp.
data Date
  = DDay Day          -- ^ calendar date without a time of day
  | DUTCTime UTCTime  -- ^ date together with a time of day
  deriving (Generic, Show)
deriving
(
Generic
,
Show
)
instance
FromJSON
UpdatedDate
-- | Decodes a JSON string as a 'Date'.
--
-- The day-only format @%F@ is tried first; if it does not match, the
-- timestamp format @%Y-%m-%dT%H:%M:%S%Q@ is tried.
instance FromJSON Date where
  parseJSON = withText "Date" dayOrTimestamp
    where
      -- Order matters: the day-only parser runs before the timestamp parser.
      dayOrTimestamp raw = asDay raw <|> asTimestamp raw
      asDay raw = DDay <$> parseTimeE "%F" raw
      asTimestamp raw = DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" raw
-- | Creation date of an entity; an alias of 'Date', so it is decoded by the
-- 'Date' parser.
type CreatedDate = Date

-- | Last-update date of an entity; an alias of 'Date', so it is decoded by
-- the 'Date' parser.
type UpdatedDate = Date
-- https://docs.openalex.org/api-entities/concepts/concept-object#ids
-- https://docs.openalex.org/api-entities/concepts/concept-object#ids
data
ExternalDB
=
MAG
|
OpenAlex
|
UMLS_Cui
|
UMLS_Aui
|
Wikidata
|
Wikipedia
data
ExternalDB
=
MAG
|
OpenAlex
|
UMLS_Cui
|
UMLS_Aui
|
Wikidata
|
Wikipedia
...
@@ -87,7 +118,7 @@ data Concept = Concept
...
@@ -87,7 +118,7 @@ data Concept = Concept
,
created_date
::
CreatedDate
,
created_date
::
CreatedDate
,
description
::
Text
,
description
::
Text
,
display_name
::
Text
,
display_name
::
Text
,
id
::
ConceptId
,
id
::
OpenAlexID
,
ids
::
Map
Text
ExternalID
-- TODO ExternalDB
,
ids
::
Map
Text
ExternalID
-- TODO ExternalDB
,
image_thumbnail_url
::
URL
,
image_thumbnail_url
::
URL
,
image_url
::
URL
,
image_url
::
URL
...
@@ -127,7 +158,7 @@ instance FromJSON Concept where
...
@@ -127,7 +158,7 @@ instance FromJSON Concept where
-- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object
-- | https://docs.openalex.org/api-entities/concepts/concept-object#the-dehydratedconcept-object
data
DehydratedConcept
=
DehydratedConcept
data
DehydratedConcept
=
DehydratedConcept
{
display_name
::
Text
{
display_name
::
Text
,
id
::
ConceptId
,
id
::
OpenAlexID
,
level
::
Level
,
level
::
Level
,
wikidata
::
Maybe
URL
,
wikidata
::
Maybe
URL
}
deriving
(
Generic
,
Show
)
}
deriving
(
Generic
,
Show
)
...
@@ -143,10 +174,9 @@ instance FromJSON Ancestor
...
@@ -143,10 +174,9 @@ instance FromJSON Ancestor
data
CountByYear
=
CountByYear
data
CountByYear
=
CountByYear
{
year
::
Year
{
year
::
Year
,
works_count
::
Count
,
works_count
::
Maybe
Count
,
cited_by_count
::
Count
,
cited_by_count
::
Count
}
deriving
(
Generic
,
Show
)
}
deriving
(
Generic
,
Show
,
FromJSON
)
instance
FromJSON
CountByYear
data
SummaryStats
=
SummaryStats
data
SummaryStats
=
SummaryStats
...
@@ -160,3 +190,197 @@ instance FromJSON SummaryStats where
...
@@ -160,3 +190,197 @@ instance FromJSON SummaryStats where
<*>
v
.:
"h_index"
<*>
v
.:
"h_index"
<*>
v
.:
"i10_index"
<*>
v
.:
"i10_index"
parseJSON
_
=
fail
"Don't know how to parse this as SummaryStats"
parseJSON
_
=
fail
"Don't know how to parse this as SummaryStats"
-- | An OpenAlex work.
--
-- See <https://docs.openalex.org/api-entities/works/work-object>.
--
-- Field names follow the JSON keys read by the 'FromJSON' instance; the only
-- exception is 'type_', which is read from the @"type"@ key.
--
-- NOTE(review): the 'FromJSON' instance reads every field up to and
-- including 'updated_date' with @.:@, so those keys are required to be
-- present in the JSON. Confirm against the API that none of them can be
-- missing or null.
data Work = Work
  { abstract_inverted_index       :: Map Text [Int] -- TODO
  , authorships                   :: [Authorship]
  , apc_list                      :: APCList
  , apc_paid                      :: APCPaid
  , best_oa_location              :: Location
  , biblio                        :: Biblio
  , cited_by_api_url              :: Text
  , cited_by_count                :: Count
  , concepts                      :: [DehydratedConcept]
  , corresponding_author_ids      :: [OpenAlexID]
  , corresponding_institution_ids :: [OpenAlexID]
  , counts_by_year                :: [CountByYear]
  , created_date                  :: CreatedDate
  , display_name                  :: Text
  , doi                           :: DOI
  , grants                        :: [Grant]
  , id                            :: OpenAlexID
  , ids                           :: Map Text ExternalID -- TODO ExternalDB
  , is_paratext                   :: Bool
  , is_retracted                  :: Bool
  , language                      :: Text
  , locations                     :: [Location]
  , locations_count               :: Count
  , mesh                          :: [MeSH]
  , ngrams_url                    :: URL
  , open_access                   :: OpenAccess
  , primary_location              :: Location
  , publication_date              :: CreatedDate
  , publication_year              :: Year
  , referenced_works              :: [OpenAlexID]
  , related_works                 :: [OpenAlexID]
  , title                         :: Text
  , type_                         :: Text
  , updated_date                  :: UpdatedDate
    -- The remaining fields are optional: the instance reads them with @.:?@.
  , is_oa                         :: Maybe Bool
  , license                       :: Maybe Text
  , url                           :: Maybe URL
  , version                       :: Maybe Text
  }
  deriving (Generic, Show)
-- | Decodes a 'Work' from a JSON object.
--
-- Keys are read in record order. Everything up to @updated_date@ is
-- mandatory (@.:@); @is_oa@, @license@, @url@ and @version@ are optional
-- (@.:?@). The record field 'type_' is filled from the JSON key @"type"@.
instance FromJSON Work where
  parseJSON = withObject "Work" decodeWork
    where
      decodeWork v = do
        -- Mandatory keys.
        abstract_inverted_index       <- v .: "abstract_inverted_index"
        authorships                   <- v .: "authorships"
        apc_list                      <- v .: "apc_list"
        apc_paid                      <- v .: "apc_paid"
        best_oa_location              <- v .: "best_oa_location"
        biblio                        <- v .: "biblio"
        cited_by_api_url              <- v .: "cited_by_api_url"
        cited_by_count                <- v .: "cited_by_count"
        concepts                      <- v .: "concepts"
        corresponding_author_ids      <- v .: "corresponding_author_ids"
        corresponding_institution_ids <- v .: "corresponding_institution_ids"
        counts_by_year                <- v .: "counts_by_year"
        created_date                  <- v .: "created_date"
        display_name                  <- v .: "display_name"
        doi                           <- v .: "doi"
        grants                        <- v .: "grants"
        id                            <- v .: "id"
        ids                           <- v .: "ids"
        is_paratext                   <- v .: "is_paratext"
        is_retracted                  <- v .: "is_retracted"
        language                      <- v .: "language"
        locations                     <- v .: "locations"
        locations_count               <- v .: "locations_count"
        mesh                          <- v .: "mesh"
        ngrams_url                    <- v .: "ngrams_url"
        open_access                   <- v .: "open_access"
        primary_location              <- v .: "primary_location"
        publication_date              <- v .: "publication_date"
        publication_year              <- v .: "publication_year"
        referenced_works              <- v .: "referenced_works"
        related_works                 <- v .: "related_works"
        title                         <- v .: "title"
        -- "type" is a Haskell keyword, hence the trailing underscore.
        type_                         <- v .: "type"
        updated_date                  <- v .: "updated_date"
        -- Optional keys.
        is_oa                         <- v .:? "is_oa"
        license                       <- v .:? "license"
        url                           <- v .:? "url"
        version                       <- v .:? "version"
        pure $ Work { .. }
-- | Value of the @apc_list@ field of a 'Work'.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data APCList = APCList
  { value      :: Int
  , currency   :: Text
  , provenance :: Text
  , value_usd  :: Int
  }
  deriving (Generic, Show, FromJSON)
-- | Value of the @apc_paid@ field of a 'Work'; it has the same fields as
-- 'APCList'.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data APCPaid = APCPaid
  { value      :: Int
  , currency   :: Text
  , provenance :: Text
  , value_usd  :: Int
  }
  deriving (Generic, Show, FromJSON)
-- | One entry of the @authorships@ list of a 'Work'.
--
-- See <https://docs.openalex.org/api-entities/works/work-object/authorship-object>.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data Authorship = Authorship
  { author                 :: DehydratedAuthor
  , author_position        :: Text
  , institutions           :: [DehydratedInstitution]
  , is_corresponding       :: Maybe Bool
  , raw_affiliation_string :: Text
  }
  deriving (Generic, Show, FromJSON)
-- | Value of the @biblio@ field of a 'Work'. All four fields are kept as
-- text.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
--
-- NOTE(review): none of the fields is optional here; confirm that the API
-- always supplies all of them.
data Biblio = Biblio
  { volume     :: Text
  , issue      :: Text
  , first_page :: Text
  , last_page  :: Text
  }
  deriving (Generic, Show, FromJSON)
-- | The author record embedded in an 'Authorship'.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names; 'orcid' is optional.
data DehydratedAuthor = DehydratedAuthor
  { id           :: OpenAlexID
  , display_name :: Text
  , orcid        :: Maybe URL
  }
  deriving (Generic, Show, FromJSON)
-- | The institution record embedded in an 'Authorship'.
--
-- The 'FromJSON' instance is written by hand because 'type_' is read from
-- the JSON key @"type"@.
data DehydratedInstitution = DehydratedInstitution
  { id           :: OpenAlexID
  , display_name :: Text
  , ror          :: Text
  , country_code :: Text
  , type_        :: Text
  }
  deriving (Generic, Show)
-- | Decodes a 'DehydratedInstitution' from a JSON object.
--
-- All five keys are mandatory. The record field 'type_' is filled from the
-- JSON key @"type"@. Non-object input is rejected by 'withObject', in line
-- with the other hand-written instances in this module, which yields a
-- type-mismatch error naming the value that was actually encountered.
instance FromJSON DehydratedInstitution where
  parseJSON = withObject "DehydratedInstitution" $ \v -> do
    id           <- v .: "id"
    display_name <- v .: "display_name"
    ror          <- v .: "ror"
    country_code <- v .: "country_code"
    -- "type" is a Haskell keyword, hence the trailing underscore.
    type_        <- v .: "type"
    pure $ DehydratedInstitution { .. }
-- | One entry of the @grants@ list of a 'Work'.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data Grant = Grant
  { funder              :: OpenAlexID
  , funder_display_name :: Text
  , award_id            :: Text
  }
  deriving (Generic, Show, FromJSON)
-- | A location of a 'Work'; used for its @best_oa_location@,
-- @primary_location@ and @locations@ fields.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names; 'pdf_url' and 'version' are optional.
--
-- NOTE(review): 'license' and 'source' are mandatory here; confirm that the
-- API always supplies them.
data Location = Location
  { is_oa            :: Bool
  , landing_page_url :: URL
  , license          :: Text
  , source           :: DehydratedSource
  , pdf_url          :: Maybe URL
  , version          :: Maybe Text
  }
  deriving (Generic, Show, FromJSON)
-- | One entry of the @mesh@ list of a 'Work' (PubMed only).
--
-- See <https://docs.openalex.org/api-entities/works/work-object#mesh>.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data MeSH = MeSH
  { descriptor_ui   :: Text
  , descriptor_name :: Text
  , qualifier_ui    :: Text
  , qualifier_name  :: Text
  , is_major_topic  :: Bool
  }
  deriving (Generic, Show, FromJSON)
-- | Value of the @open_access@ field of a 'Work'.
--
-- See <https://docs.openalex.org/api-entities/works/work-object#the-openaccess-object>.
--
-- 'FromJSON' is derived, so the JSON keys are expected to match the field
-- names.
data OpenAccess = OpenAccess
  { any_repository_has_fulltext :: Bool
  , is_oa                       :: Bool
  , oa_status                   :: OAStatus
  , oa_url                      :: URL
  }
  deriving (Generic, Show, FromJSON)
-- | The source record embedded in a 'Location'.
--
-- See <https://docs.openalex.org/api-entities/sources/source-object#the-dehydratedsource-object>.
--
-- The 'FromJSON' instance is written by hand because 'type_' is read from
-- the JSON key @"type"@.
data DehydratedSource = DehydratedSource
  { display_name              :: Text
  , host_organization         :: Text
  , host_organization_lineage :: [OpenAlexID]
  , host_organization_name    :: Text
  , id                        :: OpenAlexID
  , is_in_doaj                :: Bool
  , issn                      :: [ISSN]
  , issn_l                    :: ISSNL
  , type_                     :: Text
  }
  deriving (Generic, Show)
-- | Decodes a 'DehydratedSource' from a JSON object.
--
-- All nine keys are mandatory and are read in record order. The record
-- field 'type_' is filled from the JSON key @"type"@.
instance FromJSON DehydratedSource where
  parseJSON = withObject "DehydratedSource" decodeSource
    where
      decodeSource v = do
        display_name              <- v .: "display_name"
        host_organization         <- v .: "host_organization"
        host_organization_lineage <- v .: "host_organization_lineage"
        host_organization_name    <- v .: "host_organization_name"
        id                        <- v .: "id"
        is_in_doaj                <- v .: "is_in_doaj"
        issn                      <- v .: "issn"
        issn_l                    <- v .: "issn_l"
        -- "type" is a Haskell keyword, hence the trailing underscore.
        type_                     <- v .: "type"
        pure $ DehydratedSource { .. }
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment