Commit b454c5eb authored by Alexandre Delanoë

Merge remote-tracking branch 'origin/121-dev-arxiv' into dev-merge

parents 75afdbb5 b691ddf2
......@@ -107,7 +107,6 @@ library
Gargantext.API.Flow
Gargantext.API.GraphQL
Gargantext.API.GraphQL.AsyncTask
Gargantext.API.GraphQL.Contact
Gargantext.API.GraphQL.IMT
Gargantext.API.GraphQL.Node
Gargantext.API.GraphQL.TreeFirstLevel
......@@ -161,6 +160,7 @@ library
Gargantext.Core.Methods.Matrix.Accelerate.Utils
Gargantext.Core.Statistics
Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex
......@@ -342,6 +342,7 @@ library
, aeson-lens
, aeson-pretty
, array
, arxiv
, async
, attoparsec
, auto-update
......@@ -360,6 +361,7 @@ library
, conduit-extra
, containers
, contravariant
, crawlerArxiv
, crawlerHAL
, crawlerISTEX
, crawlerIsidore
......
......@@ -132,6 +132,7 @@ library:
- aeson-lens
- aeson-pretty
- array
- arxiv
- async
- attoparsec
- auto-update
......@@ -150,6 +151,7 @@ library:
- conduit-extra
- containers
- contravariant
- crawlerArxiv
- crawlerHAL
- crawlerISTEX
- crawlerIsidore
......
......@@ -36,6 +36,7 @@ instance Arbitrary a => Arbitrary (JobOutput a) where
-- TODO IsidoreAuth
data ExternalAPIs = All
| PubMed
| Arxiv
| HAL
| IsTex
| Isidore
......
......@@ -22,6 +22,7 @@ import Gargantext.Database.Action.Flow (DataOrigin(..))
data Database = Empty
| PubMed
| Arxiv
| HAL
| IsTex
| Isidore
......@@ -33,6 +34,7 @@ instance ToSchema Database
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin Arxiv = ExternalOrigin T.Arxiv
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
......
......@@ -25,6 +25,7 @@ import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Hal as HAL
import qualified Gargantext.Core.Text.Corpus.API.Isidore as ISIDORE
import qualified Gargantext.Core.Text.Corpus.API.Istex as ISTEX
......@@ -41,6 +42,7 @@ get :: ExternalAPIs
get PubMed _la q limit = PUBMED.get q limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
get Arxiv la q limit = Arxiv.get la q (fromIntegral <$> limit)
get HAL la q limit = HAL.getC la q limit
get IsTex la q limit = do
docs <- ISTEX.get la q limit
......
{-|
Module : Gargantext.Core.Text.Corpus.API.Arxiv
Description : Arxiv API connection
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans -fno-warn-unused-top-binds #-}
module Gargantext.Core.Text.Corpus.API.Arxiv
where
import Conduit
import Data.Either (Either(..))
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as Text
import Servant.Client (ClientError)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Arxiv as Arxiv
import qualified Network.Api.Arxiv as Ax
type Query = Text
type Limit = Arxiv.Limit
-- | Run a search against the Arxiv API and stream the hits as documents.
--
-- The query text is unpacked and handed to 'Arxiv.apiSimpleC' as a
-- single-element list, together with the optional limit. The first component
-- returned by that call is converted with 'fromIntegral' and wrapped in
-- 'Just'; the second (a conduit of 'Arxiv.Result') is piped through 'toDoc'
-- so that it yields 'HyperdataDocument' values tagged with the given language.
--
-- The outcome is always wrapped in 'Right': nothing in this function builds
-- a 'Left'.
--
-- TODO put default arxiv query in gargantext.ini
-- NOTE(review): the previous comment mentioned a default of 10K docs; no
-- default is applied here (the limit is forwarded as given) -- confirm where
-- that default is meant to be set.
get :: Lang -> Query -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get lang query mLimit = do
  (total, results) <- Arxiv.apiSimpleC mLimit [Text.unpack query]
  let documents = results .| mapC (toDoc lang)
  pure (Right (Just (fromIntegral total), documents))
-- | Convert one Arxiv search result into a 'HyperdataDocument'.
--
-- Textual fields of the result (doi, url, id, title, journal, abstract,
-- publication date) are packed to 'Text' and always wrapped in 'Just'.
-- The source database is recorded as @"Arxiv"@ and the language field is the
-- 'show' rendering of the 'Lang' argument. Only the publication year is
-- filled in among the date components; the finer-grained ones are left as
-- 'Nothing'.
--
-- The @categories@, @primaryCategory@ and @total@ fields of the result are
-- not used.
toDoc :: Lang -> Arxiv.Result -> HyperdataDocument
toDoc lang (Arxiv.Result { abstract
                         , authors = writers
                         , doi
                         , id
                         , journal
                         , publication_date
                         , title
                         , url
                         , year
                         }) =
  HyperdataDocument
    { _hd_bdd                = Just "Arxiv"
    , _hd_doi                = textField doi
    , _hd_url                = textField url
    , _hd_uniqId             = textField id
    , _hd_uniqIdBdd          = Nothing
    , _hd_page               = Nothing
    , _hd_title              = textField title
    , _hd_authors            = joinAuthors (Text.pack . Ax.auName)
    , _hd_institutes         = joinAuthors (Text.replace ", " " - " . Text.pack . Ax.auFil)
    , _hd_source             = textField journal
    , _hd_abstract           = textField abstract
    , _hd_publication_date   = textField publication_date
    , _hd_publication_year   = fromIntegral <$> year
    , _hd_publication_month  = Nothing -- TODO parse publication_date
    , _hd_publication_day    = Nothing
    , _hd_publication_hour   = Nothing
    , _hd_publication_minute = Nothing
    , _hd_publication_second = Nothing
    , _hd_language_iso2      = Just (Text.pack (show lang))
    }
  where
    -- Pack a raw field and mark it as present.
    textField = Just . Text.pack

    -- Render every author with the given function and join the pieces with
    -- ", ". An empty author list gives 'Nothing'.
    --
    -- For institutes the renderer first rewrites ", " inside a single
    -- affiliation to " - ", so the joining comma stays unambiguous.
    -- NOTE(review): authors whose affiliation is empty still contribute an
    -- empty piece, so the institutes value can look like ", , " -- confirm
    -- whether downstream code expects that.
    joinAuthors :: (Ax.Author -> Text) -> Maybe Text
    joinAuthors render = case writers of
      [] -> Nothing
      _  -> Just $ Text.intercalate ", " $ map render writers
resolver:
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/18.yaml
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/28.yaml
flags: {}
extra-package-dbs: []
skip-ghc-check: true
......@@ -76,6 +76,10 @@ extra-deps:
commit: 3bf77f28d3dc71d2e8349cbf422a34cf4c23cd11
- git: https://gitlab.iscpif.fr/gargantext/crawlers/isidore.git
commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b
#- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
- git: https://gitlab.iscpif.fr/cgenie/arxiv-api.git
commit: f3e517cc40d92e282c5245b23d253d2ca3f802e5
- arxiv-0.0.3@sha256:02de1114091d11f1f3ab401d104d125ad4301260806feb7f63b3dcefc7db88cf,1588
# NP libs
#- git: https://github.com/np/servant-job.git # waiting for PR
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment