Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
O
openalex
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
2
Issues
2
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
crawlers
openalex
Commits
35a95e7e
Commit
35a95e7e
authored
Nov 30, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'main'
Dev merge See merge request
!1
parents
d7aeb114
a46abe16
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
144 additions
and
26 deletions
+144
-26
Main.hs
app/Main.hs
+47
-5
openalex.cabal
openalex.cabal
+13
-6
OpenAlex.hs
src/OpenAlex.hs
+15
-7
Types.hs
src/OpenAlex/Types.hs
+69
-8
No files found.
app/Main.hs
View file @
35a95e7e
...
...
@@ -14,16 +14,25 @@ Portability : POSIX
module
Main
where
import
Conduit
import
Data.ByteString
qualified
as
BS
import
Data.ByteString.Lazy
qualified
as
BSL
import
Data.Conduit.Combinators
qualified
as
Conduit
import
Data.Conduit.List
qualified
as
CL
import
Data.Csv
qualified
as
Csv
import
Data.Csv.Conduit
qualified
as
CsvC
import
OpenAlex
qualified
as
OA
import
OpenAlex.Types
qualified
as
OA
import
Options.Applicative.Simple
import
Protolude
import
qualified
OpenAlex
as
OA
import
qualified
OpenAlex.Types
as
OA
data
Options
=
Options
{
filter
::
Maybe
OA
.
Filter
,
search
::
Maybe
OA
.
Search
}
data
ToCSVOptions
=
ToCSVOptions
{
options
::
Options
,
output
::
FilePath
}
main
::
IO
()
main
=
do
let
filterHelp
=
help
"Filter, for example: display_name.search:einstein , see https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/filter-entity-lists"
...
...
@@ -32,6 +41,11 @@ main = do
optional
(
strOption
(
long
"filter"
))
<*>
optional
(
strOption
(
long
"search"
))
let
toCsvOptions
=
ToCSVOptions
<$>
commonOptions
<*>
strOption
(
long
"output"
)
(
opts
,
runCmd
)
<-
simpleOptions
"0.1.0.0"
"OpenAlex"
...
...
@@ -41,6 +55,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
fetchConcepts
commonOptions
addCommand
"to-csv"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object) and save to CSV (format compatible with Gargantext Corpus CSV)"
toCSVC
toCsvOptions
addCommand
"works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
fetchWorks
...
...
@@ -79,7 +97,31 @@ fetchWorksC Options { .. } _ = do
_
<-
runConduit
$
c
.|
takeC
3
.|
mapM_C
(
\
w
@
(
OA
.
Work
{
..
})
->
do
liftIO
$
putText
$
show
id
<>
" :: "
<>
show
display_name
-- liftIO $ putText abstract_reconstructed
liftIO
$
do
putText
$
show
id
<>
" :: "
<>
show
display_name
putText
abstract_reconstructed
putText
"-----"
)
pure
()
toCSVC
::
ToCSVOptions
->
()
->
IO
()
toCSVC
ToCSVOptions
{
options
=
Options
{
..
},
..
}
_
=
do
eWorksC
<-
OA
.
fetchWorksC
Nothing
filter
search
case
eWorksC
of
Left
err
->
putText
$
"error: "
<>
show
err
Right
(
mCount
,
c
)
->
do
putText
$
"Count: "
<>
show
mCount
_d
<-
sourceToList
$
c
-- .| mapC (\(OA.Work { .. }) -> (id, fromMaybe "" display_name, abstract_reconstructed))
-- .| takeC 3
.|
toNamedCsv
Csv
.
defaultEncodeOptions
.|
mapM_C
(
BS
.
appendFile
output
)
-- BSL.writeFile output $ Csv.encodeDefaultOrderedByName d
pure
()
toNamedCsv
::
(
Monad
m
,
Csv
.
DefaultOrdered
a
,
Csv
.
ToNamedRecord
a
)
=>
Csv
.
EncodeOptions
->
ConduitT
a
BS
.
ByteString
m
()
toNamedCsv
opts
=
{-# SCC toCsv_p #-}
CL
.
map
$
BSL
.
toStrict
.
Csv
.
encodeDefaultOrderedByNameWith
opts
.
pure
openalex.cabal
View file @
35a95e7e
...
...
@@ -46,6 +46,7 @@ library
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
...
...
@@ -58,7 +59,8 @@ library
build-depends: base ^>= 4.14.3.0 && < 5
, aeson >= 2.1.2 && < 2.2
, binary >= 0.8.8 && < 0.9
, bytestring >= 0.10.12 && < 0.11
, bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.5.1 && < 0.7
, http-client >= 0.7.13.1 && < 0.8
...
...
@@ -71,6 +73,7 @@ library
, servant-client-core >= 0.19 && < 0.20
, text >= 1.2.4 && < 1.3
, time >= 1.9.3 && < 1.10
, vector >= 0.13 && < 0.14
default-language: Haskell2010
executable openalex-main
...
...
@@ -81,12 +84,15 @@ executable openalex-main
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
build-depends: base ^>=4.14.3.0 && < 5
, conduit >= 1.3.5 && < 1.4
, optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4
build-depends: base ^>=4.14.3.0 && < 5
, bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, cassava-conduit >= 0.6.5 && < 0.7
, conduit >= 1.3.5 && < 1.4
, optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4
, openalex
, openalex
hs-source-dirs: app
default-language: Haskell2010
default-extensions:
...
...
@@ -97,6 +103,7 @@ executable openalex-main
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
...
...
src/OpenAlex.hs
View file @
35a95e7e
...
...
@@ -21,13 +21,14 @@ module OpenAlex
where
import
Conduit
import
qualified
Data.Text
as
T
import
Control.Monad.Fail
(
fail
)
import
Data.Text
qualified
as
T
import
Network.HTTP.Client
(
newManager
,
requestHeaders
)
import
Network.HTTP.Client.TLS
(
tlsManagerSettings
)
import
Protolude
hiding
(
yield
)
import
OpenAlex.Client
import
OpenAlex.ServantClientLogging
import
OpenAlex.Types
(
ListOf
(
..
),
Meta
(
..
),
Page
,
PerPage
,
Cursor
,
Filter
,
Search
,
Concept
,
Work
)
import
OpenAlex.Types
(
ListOf
(
..
),
Meta
(
..
),
Page
,
PerPage
,
Cursor
,
Filter
,
Search
,
Concept
,
Work
,
showDate
)
import
Protolude
hiding
(
yield
)
import
Servant.Client
(
BaseUrl
(
..
),
ClientEnv
(
..
),
ClientError
,
Scheme
(
Https
),
defaultMakeClientRequest
,
mkClientEnv
,
runClientM
)
defaultClientEnv
::
IO
ClientEnv
...
...
@@ -79,11 +80,18 @@ fetchWorksC mCursor mFilter mSearch = do
where
producer
::
ClientEnv
->
Maybe
Cursor
->
ConduitT
()
Work
IO
()
producer
env
mCursor'
=
do
eRes
<-
liftIO
$
runClientM
(
works
Nothing
(
Just
200
)
mCursor'
mFilter
mSearch
)
env
let
batchSize
=
200
eRes
<-
liftIO
$
runClientM
(
works
Nothing
(
Just
batchSize
)
mCursor'
mFilter
mSearch
)
env
-- liftIO $ putText $ "Conduit fetching page with cursor " <> show mCursor'
case
eRes
of
Left
err
->
panic
$
"error: "
<>
show
err
Right
(
ListOf
{
results
,
meta
=
_meta
@
(
Meta
{
next_cursor
})
})
->
do
Left
err
->
fail
$
"error: "
<>
show
err
Right
ListOf
{
results
,
meta
=
_meta
@
(
Meta
{
next_cursor
})
}
->
do
-- liftIO $ putText $ "Meta: " <> show meta
--liftIO $ putText $ "[fetchWorksC] Results length: " <> (show $ length results)
yieldMany
results
producer
env
next_cursor
if
length
results
<
batchSize
then
pure
()
else
do
producer
env
next_cursor
src/OpenAlex/Types.hs
View file @
35a95e7e
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-|
Module : OpenAlex.Types
...
...
@@ -16,13 +17,16 @@ module OpenAlex.Types where
import
Control.Monad.Fail
(
fail
)
import
Data.Aeson
import
Data.Csv
qualified
as
Csv
import
Data.Scientific
(
floatingOrInteger
)
import
qualified
Data.Text
as
T
import
Data.Time
(
UTCTime
)
import
Data.Time.Calendar
(
Day
)
import
qualified
Data.Time.Format
as
DTF
import
Data.Text
qualified
as
T
import
Data.Time
(
UTCTime
(
..
))
import
Data.Time.Calendar
(
Day
,
toGregorian
)
import
Data.Time.Format
qualified
as
DTF
import
Data.Vector
qualified
as
V
import
OpenAlex.Utils
(
reconstructAbstract
)
import
Protolude
hiding
(
Location
,
Meta
)
import
Protolude.Base
(
Show
(
..
))
-- API request types
type
Cursor
=
Text
...
...
@@ -78,14 +82,40 @@ parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE
fmt
s
=
case
(
DTF
.
parseTimeM
True
DTF
.
defaultTimeLocale
(
T
.
unpack
fmt
)
(
T
.
unpack
s
))
of
Nothing
->
fail
$
"Cannot parse date with format "
<>
T
.
unpack
fmt
Just
p
->
pure
p
showDate
::
Date
->
Text
showDate
(
DDay
day
)
=
T
.
pack
$
DTF
.
formatTime
DTF
.
defaultTimeLocale
"%Y-%m-%dT%H:%M:%S"
day
showDate
(
DUTCTime
t
)
=
T
.
pack
$
DTF
.
formatTime
DTF
.
defaultTimeLocale
"%Y-%m-%dT%H:%M:%S"
t
data
Date
=
DDay
Day
|
DUTCTime
UTCTime
deriving
(
Generic
,
Show
)
deriving
(
Generic
)
instance
Show
Date
where
show
=
T
.
unpack
.
showDate
instance
FromJSON
Date
where
parseJSON
=
withText
"Date"
$
\
s
->
(
DDay
<$>
parseTimeE
"%F"
s
)
<|>
(
DUTCTime
<$>
parseTimeE
"%Y-%m-%dT%H:%M:%S%Q"
s
)
-- | Return the 'day' part of `Date`
dateDay
::
Date
->
Int
dateDay
(
DDay
d
)
=
day
where
(
_year
,
_month
,
day
)
=
toGregorian
d
dateDay
(
DUTCTime
t
)
=
dateDay
(
DDay
(
utctDay
t
))
-- | Return the 'month' part of `Date`
dateMonth
::
Date
->
Int
dateMonth
(
DDay
d
)
=
month
where
(
_year
,
month
,
_day
)
=
toGregorian
d
dateMonth
(
DUTCTime
t
)
=
dateMonth
(
DDay
(
utctDay
t
))
-- | Return the 'year' part of `Date`
dateYear
::
Date
->
Integer
dateYear
(
DDay
d
)
=
year
where
(
year
,
_month
,
_day
)
=
toGregorian
d
dateYear
(
DUTCTime
t
)
=
dateYear
(
DDay
(
utctDay
t
))
type
CreatedDate
=
Date
type
UpdatedDate
=
Date
...
...
@@ -284,19 +314,48 @@ instance FromJSON Work where
url
<-
v
.:?
"url"
version
<-
v
.:?
"version"
pure
$
Work
{
..
}
-- | Publication Day,Publication Month,Publication Year,Authors,Title,Source,Abstract
instance
Csv
.
DefaultOrdered
Work
where
headerOrder
_
=
V
.
fromList
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Source"
,
"Abstract"
]
instance
Csv
.
ToNamedRecord
Work
where
toNamedRecord
(
Work
{
..
})
=
Csv
.
namedRecord
[
Csv
.
namedField
"Publication Day"
(
dateDay
$
publication_date
)
,
Csv
.
namedField
"Publication Month"
(
dateMonth
$
publication_date
)
,
Csv
.
namedField
"Publication Year"
(
dateYear
$
publication_date
)
,
Csv
.
namedField
"Authors"
(
T
.
intercalate
", "
authorList
)
,
Csv
.
namedField
"Title"
(
fromMaybe
""
title
)
,
Csv
.
namedField
"Source"
source
,
Csv
.
namedField
"Abstract"
abstract_reconstructed
]
where
authorList
::
[
Text
]
authorList
=
catMaybes
(
authorshipAuthorName
<$>
authorships
)
source
::
Text
source
=
case
primary_location
of
Nothing
->
""
Just
(
Location
{
source
=
s
})
->
case
s
of
Nothing
->
""
Just
(
DehydratedSource
{
display_name
=
dn
})
->
dn
data
APCList
=
APCList
{
value
::
Int
{
value
::
Maybe
Int
,
currency
::
Text
,
provenance
::
Text
,
value_usd
::
Int
,
value_usd
::
Maybe
Int
}
deriving
(
Generic
,
Show
,
FromJSON
)
data
APCPaid
=
APCPaid
{
value
::
Int
,
currency
::
Text
,
provenance
::
Text
,
value_usd
::
Int
,
value_usd
::
Maybe
Int
}
deriving
(
Generic
,
Show
,
FromJSON
)
-- | https://docs.openalex.org/api-entities/works/work-object/authorship-object
...
...
@@ -307,6 +366,8 @@ data Authorship = Authorship
,
is_corresponding
::
Maybe
Bool
,
raw_affiliation_string
::
Text
}
deriving
(
Generic
,
Show
,
FromJSON
)
authorshipAuthorName
::
Authorship
->
Maybe
Text
authorshipAuthorName
(
Authorship
{
author
=
DehydratedAuthor
{
display_name
}
})
=
display_name
data
Biblio
=
Biblio
{
volume
::
Maybe
Text
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment