Commit adc4c653 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/dev-openalex' into dev

parents 364a44c2 a522479e
Pipeline #4315 canceled with stages
......@@ -143,6 +143,11 @@ source-repository-package
location: https://github.com/rspeer/wikiparsec.git
tag: 9637a82344bb70f7fa8f02e75db3c081ccd434ce
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
tag: 5aac73a7be1b86dcc312936c000e5e5444144491
allow-older: *
allow-newer: *
......
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.34.6.
-- This file has been generated from package.yaml by hpack version 0.35.1.
--
-- see: https://github.com/sol/hpack
......@@ -59,11 +59,11 @@ library
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar
......@@ -202,6 +202,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
......@@ -457,6 +458,7 @@ library
, natural-transformation
, network-uri
, opaleye
, openalex
, pandoc
, parallel
, parsec
......
......@@ -88,11 +88,11 @@ library:
- Gargantext.Core.Text.Corpus.API
- Gargantext.Core.Text.Corpus.API.Arxiv
- Gargantext.Core.Text.Corpus.API.Pubmed
- Gargantext.Core.Text.Corpus.API.OpenAlex
- Gargantext.Core.Text.Corpus.Query
- Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
- Gargantext.Core.Text.Corpus.Parsers.JSON
- Gargantext.Core.Text.List.Formats.CSV
- Gargantext.Core.Text.Metrics
- Gargantext.Core.Text.Metrics.CharByChar
......@@ -245,6 +245,7 @@ library:
- natural-transformation
- network-uri
- opaleye
- openalex
- pandoc
- parallel
- parsec
......
......@@ -34,7 +34,8 @@ instance Arbitrary a => Arbitrary (JobOutput a) where
-- | Main Types
-- TODO IsidoreAuth
data ExternalAPIs = PubMed
data ExternalAPIs = OpenAlex
| PubMed
| Arxiv
| HAL
| IsTex
......
......@@ -19,6 +19,7 @@ import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (DataOrigin(..))
data Database = Empty
| OpenAlex
| PubMed
| Arxiv
| HAL
......@@ -34,12 +35,13 @@ instance ToSchema Database where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin Types.IsTex
database2origin PubMed = ExternalOrigin Types.PubMed
database2origin Arxiv = ExternalOrigin Types.Arxiv
database2origin HAL = ExternalOrigin Types.HAL
database2origin IsTex = ExternalOrigin Types.IsTex
database2origin Isidore = ExternalOrigin Types.Isidore
database2origin Empty = InternalOrigin Types.IsTex
database2origin OpenAlex = ExternalOrigin Types.OpenAlex
database2origin PubMed = ExternalOrigin Types.PubMed
database2origin Arxiv = ExternalOrigin Types.Arxiv
database2origin HAL = ExternalOrigin Types.HAL
database2origin IsTex = ExternalOrigin Types.IsTex
database2origin Isidore = ExternalOrigin Types.Isidore
------------------------------------------------------------------------
data Datafield = Gargantext
......
......@@ -31,6 +31,7 @@ import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Hal as HAL
import qualified Gargantext.Core.Text.Corpus.API.Isidore as ISIDORE
import qualified Gargantext.Core.Text.Corpus.API.Istex as ISTEX
import qualified Gargantext.Core.Text.Corpus.API.OpenAlex as OpenAlex
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as PUBMED
import qualified Gargantext.Core.Text.Corpus.Query as Corpus
import qualified PUBMED.Types as PUBMED
......@@ -55,6 +56,8 @@ get externalAPI la q mPubmedAPIKey limit = do
case Corpus.parseQuery q of
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of
OpenAlex -> first ExternalAPIError <$>
OpenAlex.get (fromMaybe "" Nothing {- email -}) q limit
PubMed -> first ExternalAPIError <$>
PUBMED.get (fromMaybe "" mPubmedAPIKey) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default
......
{-|
Module : Gargantext.Core.Text.Corpus.API.OpenAlex
Description : OpenAlex API connection
Copyright : (c) CNRS, 2023
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.API.OpenAlex where
import Conduit
import qualified Data.Text as T
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Protolude
import qualified OpenAlex as OA
import qualified OpenAlex.Types as OA
import Servant.Client (ClientError)
get :: Text
-> Corpus.RawQuery
-> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get _email q _l = do
eRes <- OA.fetchWorksC Nothing $ Just $ Corpus.getRawQuery q
pure $ (\(len, docsC) -> (len, docsC .| takeC 1000 .| mapC toDoc)) <$> eRes
toDoc :: OA.Work -> HyperdataDocument
toDoc (OA.Work { .. } ) =
HyperdataDocument { _hd_bdd = Just "OpenAlex"
, _hd_doi = doi
, _hd_url = url
, _hd_uniqId = Just id
, _hd_uniqIdBdd = Just id
, _hd_page = firstPage biblio
, _hd_title = title
, _hd_authors = authors authorships
, _hd_institutes = institutes authorships
, _hd_source = source
, _hd_abstract = Just abstract_reconstructed
, _hd_publication_date = Just $ show publication_date
, _hd_publication_year = Just $ publication_year
, _hd_publication_month = Nothing -- TODO
, _hd_publication_day = Nothing -- TODO
, _hd_publication_hour = Nothing -- TODO
, _hd_publication_minute = Nothing -- TODO
, _hd_publication_second = Nothing -- TODO
, _hd_language_iso2 = language }
where
firstPage :: OA.Biblio -> Maybe Int
firstPage OA.Biblio { first_page } = maybe Nothing readMaybe $ T.unpack <$> first_page
authors :: [OA.Authorship] -> Maybe Text
authors [] = Nothing
authors aus = Just $ T.intercalate ", " (getDisplayName <$> aus)
where
getDisplayName :: OA.Authorship -> Text
getDisplayName OA.Authorship { author = OA.DehydratedAuthor { display_name = dn } } = dn
institutes :: [OA.Authorship] -> Maybe Text
institutes [] = Nothing
institutes aus = Just $ T.intercalate ", " ((T.replace ", " " - ") . getInstitutesNames <$> aus)
where
getInstitutesNames OA.Authorship { institutions } = T.intercalate ", " $ getDisplayName <$> institutions
getDisplayName :: OA.DehydratedInstitution -> Text
getDisplayName OA.DehydratedInstitution { display_name = dn } = dn
source :: Maybe Text
source = maybe Nothing getSource primary_location
where
getSource OA.Location { source = s } = getSourceDisplayName <$> s
getSourceDisplayName OA.DehydratedSource { display_name = dn } = dn
......@@ -59,6 +59,8 @@ extra-deps:
commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b
- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
commit: 2d7e5753cbbce248b860b571a0e9885415c846f7
- git: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
commit: 723a641f108e11e0b57ecbf22d279aca81317c0a
# NP libs
- git: https://github.com/alpmestan/servant-job.git
commit: b4182487cfe479777c11ca19f3c0d47840b376f6
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment