Commit 35a95e7e authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 'main'

Dev merge

See merge request !1
parents d7aeb114 a46abe16
...@@ -14,16 +14,25 @@ Portability : POSIX ...@@ -14,16 +14,25 @@ Portability : POSIX
module Main where module Main where
import Conduit import Conduit
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BSL
import Data.Conduit.Combinators qualified as Conduit
import Data.Conduit.List qualified as CL
import Data.Csv qualified as Csv
import Data.Csv.Conduit qualified as CsvC
import OpenAlex qualified as OA
import OpenAlex.Types qualified as OA
import Options.Applicative.Simple import Options.Applicative.Simple
import Protolude import Protolude
import qualified OpenAlex as OA
import qualified OpenAlex.Types as OA
data Options = Options data Options = Options
{ filter :: Maybe OA.Filter { filter :: Maybe OA.Filter
, search :: Maybe OA.Search } , search :: Maybe OA.Search }
data ToCSVOptions = ToCSVOptions
{ options :: Options
, output :: FilePath }
main :: IO () main :: IO ()
main = do main = do
let filterHelp = help "Filter, for example: display_name.search:einstein , see https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/filter-entity-lists" let filterHelp = help "Filter, for example: display_name.search:einstein , see https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/filter-entity-lists"
...@@ -32,6 +41,11 @@ main = do ...@@ -32,6 +41,11 @@ main = do
optional (strOption (long "filter")) <*> optional (strOption (long "filter")) <*>
optional (strOption (long "search")) optional (strOption (long "search"))
let toCsvOptions =
ToCSVOptions <$>
commonOptions <*>
strOption (long "output")
(opts, runCmd) <- (opts, runCmd) <-
simpleOptions "0.1.0.0" simpleOptions "0.1.0.0"
"OpenAlex" "OpenAlex"
...@@ -41,6 +55,10 @@ main = do ...@@ -41,6 +55,10 @@ main = do
"Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)" "Fetch OpenAlex concepts (https://docs.openalex.org/api-entities/concepts/concept-object)"
fetchConcepts fetchConcepts
commonOptions commonOptions
addCommand "to-csv"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object) and save to CSV (format compatible with Gargantext Corpus CSV)"
toCSVC
toCsvOptions
addCommand "works" addCommand "works"
"Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)" "Fetch OpenAlex works (https://docs.openalex.org/api-entities/works/work-object)"
fetchWorks fetchWorks
...@@ -79,7 +97,31 @@ fetchWorksC Options { .. } _ = do ...@@ -79,7 +97,31 @@ fetchWorksC Options { .. } _ = do
_ <- runConduit $ c _ <- runConduit $ c
.| takeC 3 .| takeC 3
.| mapM_C (\w@(OA.Work { .. }) -> do .| mapM_C (\w@(OA.Work { .. }) -> do
liftIO $ putText $ show id <> " :: " <> show display_name liftIO $ do
-- liftIO $ putText abstract_reconstructed putText $ show id <> " :: " <> show display_name
putText abstract_reconstructed
putText "-----"
) )
pure () pure ()
toCSVC :: ToCSVOptions -> () -> IO ()
toCSVC ToCSVOptions { options = Options { .. }, .. } _ = do
eWorksC <- OA.fetchWorksC Nothing filter search
case eWorksC of
Left err -> putText $ "error: " <> show err
Right (mCount, c) -> do
putText $ "Count: " <> show mCount
_d <- sourceToList $
c
-- .| mapC (\(OA.Work { .. }) -> (id, fromMaybe "" display_name, abstract_reconstructed))
-- .| takeC 3
.| toNamedCsv Csv.defaultEncodeOptions
.| mapM_C (BS.appendFile output)
-- BSL.writeFile output $ Csv.encodeDefaultOrderedByName d
pure ()
toNamedCsv :: (Monad m, Csv.DefaultOrdered a, Csv.ToNamedRecord a)
=> Csv.EncodeOptions
-> ConduitT a BS.ByteString m ()
toNamedCsv opts = {-# SCC toCsv_p #-} CL.map $ BSL.toStrict . Csv.encodeDefaultOrderedByNameWith opts . pure
...@@ -46,6 +46,7 @@ library ...@@ -46,6 +46,7 @@ library
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -58,7 +59,8 @@ library ...@@ -58,7 +59,8 @@ library
build-depends: base ^>= 4.14.3.0 && < 5 build-depends: base ^>= 4.14.3.0 && < 5
, aeson >= 2.1.2 && < 2.2 , aeson >= 2.1.2 && < 2.2
, binary >= 0.8.8 && < 0.9 , binary >= 0.8.8 && < 0.9
, bytestring >= 0.10.12 && < 0.11 , bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, conduit >= 1.3.5 && < 1.4 , conduit >= 1.3.5 && < 1.4
, containers >= 0.6.5.1 && < 0.7 , containers >= 0.6.5.1 && < 0.7
, http-client >= 0.7.13.1 && < 0.8 , http-client >= 0.7.13.1 && < 0.8
...@@ -71,6 +73,7 @@ library ...@@ -71,6 +73,7 @@ library
, servant-client-core >= 0.19 && < 0.20 , servant-client-core >= 0.19 && < 0.20
, text >= 1.2.4 && < 1.3 , text >= 1.2.4 && < 1.3
, time >= 1.9.3 && < 1.10 , time >= 1.9.3 && < 1.10
, vector >= 0.13 && < 0.14
default-language: Haskell2010 default-language: Haskell2010
executable openalex-main executable openalex-main
...@@ -82,6 +85,9 @@ executable openalex-main ...@@ -82,6 +85,9 @@ executable openalex-main
-- LANGUAGE extensions used by modules in this package. -- LANGUAGE extensions used by modules in this package.
-- other-extensions: -- other-extensions:
build-depends: base ^>=4.14.3.0 && < 5 build-depends: base ^>=4.14.3.0 && < 5
, bytestring >= 0.11 && < 0.12
, cassava >= 0.5.3.0 && < 0.6
, cassava-conduit >= 0.6.5 && < 0.7
, conduit >= 1.3.5 && < 1.4 , conduit >= 1.3.5 && < 1.4
, optparse-simple >= 0.1.1.4 && < 0.2 , optparse-simple >= 0.1.1.4 && < 0.2
, protolude >= 0.3.3 && < 0.4 , protolude >= 0.3.3 && < 0.4
...@@ -97,6 +103,7 @@ executable openalex-main ...@@ -97,6 +103,7 @@ executable openalex-main
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
......
...@@ -21,13 +21,14 @@ module OpenAlex ...@@ -21,13 +21,14 @@ module OpenAlex
where where
import Conduit import Conduit
import qualified Data.Text as T import Control.Monad.Fail (fail)
import Data.Text qualified as T
import Network.HTTP.Client (newManager, requestHeaders) import Network.HTTP.Client (newManager, requestHeaders)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude hiding (yield)
import OpenAlex.Client import OpenAlex.Client
import OpenAlex.ServantClientLogging import OpenAlex.ServantClientLogging
import OpenAlex.Types (ListOf(..), Meta(..), Page, PerPage, Cursor, Filter, Search, Concept, Work) import OpenAlex.Types (ListOf(..), Meta(..), Page, PerPage, Cursor, Filter, Search, Concept, Work, showDate)
import Protolude hiding (yield)
import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM) import Servant.Client (BaseUrl(..), ClientEnv(..), ClientError, Scheme(Https), defaultMakeClientRequest, mkClientEnv, runClientM)
defaultClientEnv :: IO ClientEnv defaultClientEnv :: IO ClientEnv
...@@ -79,11 +80,18 @@ fetchWorksC mCursor mFilter mSearch = do ...@@ -79,11 +80,18 @@ fetchWorksC mCursor mFilter mSearch = do
where where
producer :: ClientEnv -> Maybe Cursor -> ConduitT () Work IO () producer :: ClientEnv -> Maybe Cursor -> ConduitT () Work IO ()
producer env mCursor' = do producer env mCursor' = do
eRes <- liftIO $ runClientM (works Nothing (Just 200) mCursor' mFilter mSearch) env let batchSize = 200
eRes <- liftIO $ runClientM (works Nothing (Just batchSize) mCursor' mFilter mSearch) env
-- liftIO $ putText $ "Conduit fetching page with cursor " <> show mCursor' -- liftIO $ putText $ "Conduit fetching page with cursor " <> show mCursor'
case eRes of case eRes of
Left err -> panic $ "error: " <> show err Left err -> fail $ "error: " <> show err
Right (ListOf { results, meta = _meta@(Meta { next_cursor }) }) -> do Right ListOf { results, meta = _meta@(Meta { next_cursor }) } -> do
-- liftIO $ putText $ "Meta: " <> show meta -- liftIO $ putText $ "Meta: " <> show meta
--liftIO $ putText $ "[fetchWorksC] Results length: " <> (show $ length results)
yieldMany results yieldMany results
if length results < batchSize then
pure ()
else do
producer env next_cursor producer env next_cursor
{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-| {-|
Module : OpenAlex.Types Module : OpenAlex.Types
...@@ -16,13 +17,16 @@ module OpenAlex.Types where ...@@ -16,13 +17,16 @@ module OpenAlex.Types where
import Control.Monad.Fail (fail) import Control.Monad.Fail (fail)
import Data.Aeson import Data.Aeson
import Data.Csv qualified as Csv
import Data.Scientific (floatingOrInteger) import Data.Scientific (floatingOrInteger)
import qualified Data.Text as T import Data.Text qualified as T
import Data.Time (UTCTime) import Data.Time (UTCTime(..))
import Data.Time.Calendar (Day) import Data.Time.Calendar (Day, toGregorian)
import qualified Data.Time.Format as DTF import Data.Time.Format qualified as DTF
import Data.Vector qualified as V
import OpenAlex.Utils (reconstructAbstract) import OpenAlex.Utils (reconstructAbstract)
import Protolude hiding (Location, Meta) import Protolude hiding (Location, Meta)
import Protolude.Base (Show(..))
-- API request types -- API request types
type Cursor = Text type Cursor = Text
...@@ -78,14 +82,40 @@ parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t ...@@ -78,14 +82,40 @@ parseTimeE :: (MonadFail m, DTF.ParseTime t) => Text -> Text -> m t
parseTimeE fmt s = case (DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s)) of parseTimeE fmt s = case (DTF.parseTimeM True DTF.defaultTimeLocale (T.unpack fmt) (T.unpack s)) of
Nothing -> fail $ "Cannot parse date with format " <> T.unpack fmt Nothing -> fail $ "Cannot parse date with format " <> T.unpack fmt
Just p -> pure p Just p -> pure p
showDate :: Date -> Text
showDate (DDay day) = T.pack $ DTF.formatTime DTF.defaultTimeLocale "%Y-%m-%dT%H:%M:%S" day
showDate (DUTCTime t) = T.pack $ DTF.formatTime DTF.defaultTimeLocale "%Y-%m-%dT%H:%M:%S" t
data Date = DDay Day | DUTCTime UTCTime data Date = DDay Day | DUTCTime UTCTime
deriving (Generic, Show) deriving (Generic)
instance Show Date where
show = T.unpack . showDate
instance FromJSON Date where instance FromJSON Date where
parseJSON = withText "Date" $ \s -> parseJSON = withText "Date" $ \s ->
(DDay <$> parseTimeE "%F" s) <|> (DDay <$> parseTimeE "%F" s) <|>
(DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" s) (DUTCTime <$> parseTimeE "%Y-%m-%dT%H:%M:%S%Q" s)
-- | Return the 'day' part of `Date`
dateDay :: Date -> Int
dateDay (DDay d) = day
where
(_year, _month, day) = toGregorian d
dateDay (DUTCTime t) = dateDay (DDay (utctDay t))
-- | Return the 'month' part of `Date`
dateMonth :: Date -> Int
dateMonth (DDay d) = month
where
(_year, month, _day) = toGregorian d
dateMonth (DUTCTime t) = dateMonth (DDay (utctDay t))
-- | Return the 'year' part of `Date`
dateYear :: Date -> Integer
dateYear (DDay d) = year
where
(year, _month, _day) = toGregorian d
dateYear (DUTCTime t) = dateYear (DDay (utctDay t))
type CreatedDate = Date type CreatedDate = Date
type UpdatedDate = Date type UpdatedDate = Date
...@@ -284,19 +314,48 @@ instance FromJSON Work where ...@@ -284,19 +314,48 @@ instance FromJSON Work where
url <- v .:? "url" url <- v .:? "url"
version <- v .:? "version" version <- v .:? "version"
pure $ Work { .. } pure $ Work { .. }
-- | Publication Day,Publication Month,Publication Year,Authors,Title,Source,Abstract
instance Csv.DefaultOrdered Work where
headerOrder _ = V.fromList [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract" ]
instance Csv.ToNamedRecord Work where
toNamedRecord (Work { .. }) = Csv.namedRecord [
Csv.namedField "Publication Day" (dateDay $ publication_date)
, Csv.namedField "Publication Month" (dateMonth $ publication_date)
, Csv.namedField "Publication Year" (dateYear $ publication_date)
, Csv.namedField "Authors" (T.intercalate ", " authorList)
, Csv.namedField "Title" (fromMaybe "" title)
, Csv.namedField "Source" source
, Csv.namedField "Abstract" abstract_reconstructed
]
where
authorList :: [Text]
authorList = catMaybes (authorshipAuthorName <$> authorships)
source :: Text
source = case primary_location of
Nothing -> ""
Just (Location { source = s }) ->
case s of
Nothing -> ""
Just (DehydratedSource { display_name = dn }) -> dn
data APCList = APCList data APCList = APCList
{ value :: Int { value :: Maybe Int
, currency :: Text , currency :: Text
, provenance :: Text , provenance :: Text
, value_usd :: Int , value_usd :: Maybe Int
} deriving (Generic, Show, FromJSON) } deriving (Generic, Show, FromJSON)
data APCPaid = APCPaid data APCPaid = APCPaid
{ value :: Int { value :: Int
, currency :: Text , currency :: Text
, provenance :: Text , provenance :: Text
, value_usd :: Int , value_usd :: Maybe Int
} deriving (Generic, Show, FromJSON) } deriving (Generic, Show, FromJSON)
-- | https://docs.openalex.org/api-entities/works/work-object/authorship-object -- | https://docs.openalex.org/api-entities/works/work-object/authorship-object
...@@ -307,6 +366,8 @@ data Authorship = Authorship ...@@ -307,6 +366,8 @@ data Authorship = Authorship
, is_corresponding :: Maybe Bool , is_corresponding :: Maybe Bool
, raw_affiliation_string :: Text , raw_affiliation_string :: Text
} deriving (Generic, Show, FromJSON) } deriving (Generic, Show, FromJSON)
authorshipAuthorName :: Authorship -> Maybe Text
authorshipAuthorName (Authorship { author = DehydratedAuthor { display_name } }) = display_name
data Biblio = Biblio data Biblio = Biblio
{ volume :: Maybe Text { volume :: Maybe Text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment