Commit 35a95e7e authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 'main'

Dev merge

See merge request !1
parents d7aeb114 a46abe16
......@@ -14,16 +14,25 @@ Portability : POSIX
module Main where
import Conduit
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BSL
import Data.Conduit.Combinators qualified as Conduit
import Data.Conduit.List qualified as CL
import Data.Csv qualified as Csv
import Data.Csv.Conduit qualified as CsvC
import OpenAlex qualified as OA
import OpenAlex.Types qualified as OA
import Options.Applicative.Simple
import Protolude
import qualified OpenAlex as OA
import qualified OpenAlex.Types as OA
data Options = Options
{ filter :: Maybe OA.Filter
, search :: Maybe OA.Search }
data ToCSVOptions = ToCSVOptions
{ options :: Options
, output :: FilePath }
main :: IO ()
main = do
let filterHelp = help "Filter, for example: display_name.search:einstein , see https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/filter-entity-lists"
......@@ -32,6 +41,11 @@ main = do
optional (strOption (long "filter")) <*>
optional (strOption (long "search"))
let toCsvOptions =
ToCSVOptions <$>
commonOptions <*>
strOption (long "output")
(opts, runCmd) <-
simpleOptions "0.1.0.0"
"OpenAlex"
......@@ -41,6 +55,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
fetchConcepts
commonOptions
addCommand "to-csv"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object) and save to CSV (format compatible with Gargantext Corpus CSV)"
toCSVC
toCsvOptions
addCommand "works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
fetchWorks
......@@ -79,7 +97,31 @@ fetchWorksC Options { .. } _ = do
_ <- runConduit $ c
.| takeC 3
.| mapM_C (\w@(OA.Work { .. }) -> do
liftIO $ putText $ show id <> " :: " <> show display_name
-- liftIO $ putText abstract_reconstructed
liftIO $ do
putText $ show id <> " :: " <> show display_name
putText abstract_reconstructed
putText "-----"
)
pure ()
toCSVC :: ToCSVOptions -> () -> IO ()
toCSVC ToCSVOptions { options = Options { .. }, .. } _ = do
eWorksC <- OA.fetchWorksC Nothing filter search
case eWorksC of
Left err -> putText $ "error: " <> show err
Right (mCount, c) -> do
putText $ "Count: " <> show mCount
_d <- sourceToList $
c
-- .| mapC (\(OA.Work { .. }) -> (id, fromMaybe "" display_name, abstract_reconstructed))
-- .| takeC 3
.| toNamedCsv Csv.defaultEncodeOptions
.| mapM_C (BS.appendFile output)
-- BSL.writeFile output $ Csv.encodeDefaultOrderedByName d
pure ()
toNamedCsv :: (Monad m, Csv.DefaultOrdered a, Csv.ToNamedRecord a)
=> Csv.EncodeOptions
-> ConduitT a BS.ByteString m ()
toNamedCsv opts = {-# SCC toCsv_p #-} CL.map $ BSL.toStrict . Csv.encodeDefaultOrderedByNameWith opts . pure
......@@ -46,6 +46,7 @@ library
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -58,7 +59,8 @@ library
build-depends: base ^>= 4.14.3.0 && < 5
, aeson >= 2.1.2 && < 2.2
, binary >= 0.8.8 && < 0.9
, bytestring >= 0.10.12 && < 0.11
, bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, conduit >= 1.3.5 && < 1.4
, containers >= 0.6.5.1 && < 0.7
, http-client >= 0.7.13.1 && < 0.8
......@@ -71,6 +73,7 @@ library
, servant-client-core >= 0.19 && < 0.20
, text >= 1.2.4 && < 1.3
, time >= 1.9.3 && < 1.10
, vector >= 0.13 && < 0.14
default-language: Haskell2010
executable openalex-main
......@@ -81,12 +84,15 @@ executable openalex-main
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
build-depends: base ^>=4.14.3.0 && < 5
, conduit >= 1.3.5 && < 1.4
, optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4
build-depends: base ^>=4.14.3.0 && < 5
, bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, cassava-conduit >= 0.6.5 && < 0.7
, conduit >= 1.3.5 && < 1.4
, optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4
, openalex
, openalex
hs-source-dirs: app
default-language: Haskell2010
default-extensions:
......@@ -97,6 +103,7 @@ executable openalex-main
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......
......@@ -21,13 +21,14 @@ module OpenAlex
where
import Conduit
import qualified Data.Text as T
import Control.Monad.Fail (fail)
import Data.Text qualified as T
import Network.HTTP.Client (newManager, requestHeaders)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude hiding (yield)
import OpenAlex.Client
import OpenAlex.ServantClientLogging
import OpenAlex.Types (ListOf(..), Meta(..), Page, PerPage, Cursor, Filter, Search, Concept, Work)
import OpenAlex.Types (ListOf(..), Meta(..), Page, PerPage, Cursor, Filter, Search, Concept, Work, showDate)
import Protolude hiding (yield)
import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM)
defaultClientEnv :: IO ClientEnv
......@@ -79,11 +80,18 @@ fetchWorksC mCursor mFilter mSearch = do
where
producer :: ClientEnv -> Maybe Cursor -> ConduitT () Work IO ()
producer env mCursor' = do
eRes <- liftIO $ runClientM (works Nothing (Just 200) mCursor' mFilter mSearch) env
let batchSize = 200
eRes <- liftIO $ runClientM (works Nothing (Just batchSize) mCursor' mFilter mSearch) env
-- liftIO $ putText $ "Conduit fetching page with cursor " <> show mCursor'
case eRes of
Left err -> panic $ "error: " <> show err
Right (ListOf { results, meta = _meta@(Meta { next_cursor }) }) -> do
Left err -> fail $ "error: " <> show err
Right ListOf { results, meta = _meta@(Meta { next_cursor }) } -> do
-- liftIO $ putText $ "Meta: " <> show meta
--liftIO $ putText $ "[fetchWorksC] Results length: " <> (show $ length results)
yieldMany results
producer env next_cursor
if length results < batchSize then
pure ()
else do
producer env next_cursor
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-|
Module : OpenAlex.Types
......@@ -16,13 +17,16 @@ module OpenAlex.Types where
import Control.Monad.Fail (fail)
import Data.Aeson
import Data.Csv qualified as Csv
import Data.Scientific (floatingOrInteger)
import qualified Data.Text as T
import Data.Time (UTCTime)
import Data.Time.Calendar (Day)
import qualified Data.Time.Format as DTF
import Data.Text qualified as T
import Data.Time (UTCTime(..))
import Data.Time.Calendar (Day, toGregorian)
import Data.Time.Format qualified as DTF
import Data.Vector qualified as V
import OpenAlex.Utils (reconstructAbstract)
import Protolude hiding (Location, Meta)
import Protolude.Base (Show(..))
-- API request types
type Cursor = Text
......@@ -78,14 +82,40 @@ parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE fmt s = case (DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s)) of
Nothing -> fail $ "Cannot parse date with format " <> T.unpack fmt
Just p -> pure p
showDate :: Date -> Text
showDate (DDay day) = T.pack $ DTF.formatTime DTF.defaultTimeLocale "%Y-%m-%dT%H:%M:%S" day
showDate (DUTCTime t) = T.pack $ DTF.formatTime DTF.defaultTimeLocale "%Y-%m-%dT%H:%M:%S" t
data Date = DDay Day | DUTCTime UTCTime
deriving (Generic, Show)
deriving (Generic)
instance Show Date where
show = T.unpack . showDate
instance FromJSON Date where
parseJSON = withText "Date" $ \s ->
(DDay <$> parseTimeE "%F" s) <|>
(DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" s)
-- | Return the 'day' part of `Date`
dateDay :: Date -> Int
dateDay (DDay d) = day
where
(_year, _month, day) = toGregorian d
dateDay (DUTCTime t) = dateDay (DDay (utctDay t))
-- | Return the 'month' part of `Date`
dateMonth :: Date -> Int
dateMonth (DDay d) = month
where
(_year, month, _day) = toGregorian d
dateMonth (DUTCTime t) = dateMonth (DDay (utctDay t))
-- | Return the 'year' part of `Date`
dateYear :: Date -> Integer
dateYear (DDay d) = year
where
(year, _month, _day) = toGregorian d
dateYear (DUTCTime t) = dateYear (DDay (utctDay t))
type CreatedDate = Date
type UpdatedDate = Date
......@@ -284,19 +314,48 @@ instance FromJSON Work where
url <- v .:? "url"
version <- v .:? "version"
pure $ Work { .. }
-- | Publication Day,Publication Month,Publication Year,Authors,Title,Source,Abstract
instance Csv.DefaultOrdered Work where
headerOrder _ = V.fromList [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract" ]
instance Csv.ToNamedRecord Work where
toNamedRecord (Work { .. }) = Csv.namedRecord [
Csv.namedField "Publication Day" (dateDay $ publication_date)
, Csv.namedField "Publication Month" (dateMonth $ publication_date)
, Csv.namedField "Publication Year" (dateYear $ publication_date)
, Csv.namedField "Authors" (T.intercalate ", " authorList)
, Csv.namedField "Title" (fromMaybe "" title)
, Csv.namedField "Source" source
, Csv.namedField "Abstract" abstract_reconstructed
]
where
authorList :: [Text]
authorList = catMaybes (authorshipAuthorName <$> authorships)
source :: Text
source = case primary_location of
Nothing -> ""
Just (Location { source = s }) ->
case s of
Nothing -> ""
Just (DehydratedSource { display_name = dn }) -> dn
data APCList = APCList
{ value :: Int
{ value :: Maybe Int
, currency :: Text
, provenance :: Text
, value_usd :: Int
, value_usd :: Maybe Int
} deriving (Generic, Show, FromJSON)
data APCPaid = APCPaid
{ value :: Int
, currency :: Text
, provenance :: Text
, value_usd :: Int
, value_usd :: Maybe Int
} deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object/authorship-object
......@@ -307,6 +366,8 @@ data Authorship = Authorship
, is_corresponding :: Maybe Bool
, raw_affiliation_string :: Text
} deriving (Generic, Show, FromJSON)
authorshipAuthorName :: Authorship -> Maybe Text
authorshipAuthorName (Authorship { author = DehydratedAuthor { display_name } }) = display_name
data Biblio = Biblio
{ volume :: Maybe Text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment