Commit 63fcd605 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Drop 'All' data constructor from 'Lang'

Removing the `All` data constructor from  the `Lang` datatype has
several benifits, the main one being that we now have total functions
for things like `toISO693`.

This will make possible to write an uniform interface for stemming
`stem :: Lang -> StemmingAlgorithm -> T.Text -> T.Text`, as now
`Lang` will always map to a valid (ISO693) language.
parent 845db556
Pipeline #5725 passed with stages
in 100 minutes and 11 seconds
......@@ -53,8 +53,7 @@ import Network.HTTP.Client.TLS
import Prelude qualified
langToSearx :: Lang -> Text
langToSearx All = "en-US"
langToSearx x = (Text.toLower acronym) <> "-" <> acronym
langToSearx x = Text.toLower acronym <> "-" <> acronym
where
acronym = show x
......
......@@ -46,8 +46,7 @@ import Prelude (userError)
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library
data Lang = All
| DE
data Lang = DE
| EL
| EN
| ES
......@@ -58,7 +57,7 @@ data Lang = All
| RU
| UK
| ZH
deriving (Show, Eq, Ord, Enum, Bounded, Generic, GQLType)
deriving (Read, Show, Eq, Ord, Enum, Bounded, Generic, GQLType)
-- | Defaults to 'EN' in all those places where a language is mandatory,
-- but an optional one has been passed.
......@@ -75,41 +74,30 @@ instance ToSchema Lang where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
instance FromHttpApiData Lang
where
-- parseUrlPiece "All" = pure All
parseUrlPiece "DE" = pure DE
parseUrlPiece "EL" = pure EL
parseUrlPiece "EN" = pure EN
parseUrlPiece "ES" = pure ES
parseUrlPiece "FR" = pure FR
parseUrlPiece "IT" = pure IT
parseUrlPiece "PL" = pure PL
parseUrlPiece "PT" = pure PT
parseUrlPiece "RU" = pure RU
parseUrlPiece "UK" = pure UK
parseUrlPiece "ZH" = pure ZH
parseUrlPiece _ = Left "Unexpected value of Lang"
-- parseUrlPiece is exactly the 'read' instance,
-- if we are disciplined. Either way, this needs to
-- be tested.
parseUrlPiece fragment = case readMaybe fragment of
Nothing -> Left $ "Unexpected value of Lang: " <> fragment
Just lang -> Right lang
instance ToHttpApiData Lang where
toUrlPiece = pack . show
instance Hashable Lang
instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1
toISO639 DE = Just ISO639.DE
toISO639 EL = Just ISO639.EL
toISO639 EN = Just ISO639.EN
toISO639 ES = Just ISO639.ES
toISO639 FR = Just ISO639.FR
toISO639 IT = Just ISO639.IT
toISO639 PL = Just ISO639.PL
toISO639 PT = Just ISO639.PT
toISO639 RU = Just ISO639.RU
toISO639 UK = Just ISO639.UK
toISO639 ZH = Just ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
toISO639 :: Lang -> ISO639.ISO639_1
toISO639 DE = ISO639.DE
toISO639 EL = ISO639.EL
toISO639 EN = ISO639.EN
toISO639 ES = ISO639.ES
toISO639 FR = ISO639.FR
toISO639 IT = ISO639.IT
toISO639 PL = ISO639.PL
toISO639 PT = ISO639.PT
toISO639 RU = ISO639.RU
toISO639 UK = ISO639.UK
toISO639 ZH = ISO639.ZH
iso639ToText :: ISO639.ISO639_1 -> Text
iso639ToText la = pack [a, b]
......@@ -117,19 +105,18 @@ iso639ToText la = pack [a, b]
(a, b) = ISO639.toChars la
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text
toISO639Lang All = Nothing
toISO639Lang DE = Just "de"
toISO639Lang EL = Just "el"
toISO639Lang EN = Just "en"
toISO639Lang ES = Just "es"
toISO639Lang FR = Just "fr"
toISO639Lang IT = Just "it"
toISO639Lang PL = Just "pl"
toISO639Lang PT = Just "pt"
toISO639Lang RU = Just "ru"
toISO639Lang UK = Just "uk"
toISO639Lang ZH = Just "zh"
toISO639Lang :: Lang -> Text
toISO639Lang DE = "de"
toISO639Lang EL = "el"
toISO639Lang EN = "en"
toISO639Lang ES = "es"
toISO639Lang FR = "fr"
toISO639Lang IT = "it"
toISO639Lang PL = "pl"
toISO639Lang PT = "pt"
toISO639Lang RU = "ru"
toISO639Lang UK = "uk"
toISO639Lang ZH = "zh"
allLangs :: [Lang]
allLangs = [minBound .. maxBound]
......@@ -145,7 +132,6 @@ class HasDBid a where
-- once we add a new 'Lang'.
langIds :: Bimap Lang Int
langIds = Bimap.fromList $ allLangs <&> \lid -> case lid of
All -> (lid, 0)
DE -> (lid, 276)
EL -> (lid, 300)
EN -> (lid, 2)
......
......@@ -23,7 +23,7 @@ import Control.Monad.Except
import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..), toISO639, toISO639EN)
import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Core.Text.Corpus.API.Arxiv qualified as Arxiv
import Gargantext.Core.Text.Corpus.API.EPO qualified as EPO
import Gargantext.Core.Text.Corpus.API.Hal qualified as HAL
......@@ -47,6 +47,9 @@ data GetCorpusError
-- | Get External API metadata main function
get :: ExternalAPIs
-> Lang
-- ^ A user-selected language in which documents needs to be retrieved.
-- If the provider doesn't support the search filtered by language, or if the language
-- is not important, the frontend will simply send 'EN' to the backend.
-> Corpus.RawQuery
-> Maybe PUBMED.APIKey
-> Maybe EPO.AuthKey
......@@ -54,26 +57,26 @@ get :: ExternalAPIs
-> Maybe Corpus.Limit
-- -> IO [HyperdataDocument]
-> IO (Either GetCorpusError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
-- For PUBMED, HAL, IsTex, Isidore and OpenAlex, we want to send the query as-it.
-- For Arxiv we parse the query into a structured boolean query we submit over.
case externalAPI of
PubMed ->
first ExternalAPIError <$> PUBMED.get (fromMaybe "" mPubmedAPIKey) q limit
OpenAlex ->
first ExternalAPIError <$> OpenAlex.get (fromMaybe "" Nothing {- email -}) q (toISO639 la) limit
first ExternalAPIError <$> OpenAlex.get (fromMaybe "" Nothing {- email -}) q (Just $ toISO639 lang) limit
Arxiv -> runExceptT $ do
corpusQuery <- ExceptT (pure parse_query)
ExceptT $ fmap Right (Arxiv.get la corpusQuery limit)
ExceptT $ fmap Right (Arxiv.get lang corpusQuery limit)
HAL ->
first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
first ExternalAPIError <$> HAL.getC (Just $ toISO639 lang) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do
docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
docs <- ISTEX.get lang (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do
docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
docs <- ISIDORE.get lang (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
EPO -> do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
......@@ -11,7 +11,12 @@ Portability : POSIX
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.API.Isidore where
module Gargantext.Core.Text.Corpus.API.Isidore (
get
-- * Internals (possibly unused?)
, isidore2csvFile
) where
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
......@@ -26,10 +31,12 @@ import Isidore.Client
import Servant.Client
-- | TODO work with the ServantErr
get :: Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
get :: Lang
-> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery
-> Maybe Isidore.AuthorQuery
-> IO [HyperdataDocument]
get la l q a = do
get lang l q a = do
let
printErr (DecodeFailure e _) = panicTrace e
printErr e = panicTrace (show e)
......@@ -40,18 +47,18 @@ get la l q a = do
iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2csvFile fp la li tq aq = do
hdocs <- get la li tq aq
isidore2csvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Csv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc l (IsidoreDoc t a d u s as) = do
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let
author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
......@@ -88,5 +95,5 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l
, _hd_language_iso2 = Just . Text.pack . show $ lang
}
{-# LANGUAGE LambdaCase #-}
{-|
Module : Gargantext.Core.Text.Corpus.API.Istex
Description : Pubmed API connection
......@@ -11,6 +13,7 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Istex
( get )
where
import Data.List qualified as List
......@@ -18,7 +21,7 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX
import ISTEX.Client qualified as ISTEX
......@@ -42,10 +45,7 @@ get la query' maxResults = do
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True -> ("language:"<> lang la) <> " AND abstract:"<>query'
where
lang FR = "fre"
lang _ = "eng"
True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
False -> query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
......@@ -70,4 +70,18 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs')
-- | Returns the properly-rendered language code according to
-- https://doc.istex.fr/tdm/annexes/codes-langues.html
toISTEXLanguageCode :: Lang -> Text.Text
toISTEXLanguageCode = \case
DE -> "ger"
EL -> "gre"
EN -> "eng"
ES -> "spa"
FR -> "fre"
IT -> "ita"
PL -> "pol"
PT -> "por"
RU -> "Rus"
UK -> "ukr"
ZH -> "chi"
......@@ -33,7 +33,6 @@ data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show)
instance ToJSON JSSpell where
toJSON (JSPOS All) = "pos"
toJSON (JSPOS DE) = "de.pos"
toJSON (JSPOS EL) = "el.pos"
toJSON (JSPOS EN) = "en.pos"
......@@ -46,7 +45,6 @@ instance ToJSON JSSpell where
toJSON (JSPOS UK) = "uk.pos"
toJSON (JSPOS ZH) = "zh.pos"
toJSON (JSLemma All) = "lemma"
toJSON (JSLemma DE) = "de.lemma"
toJSON (JSLemma EL) = "el.lemma"
toJSON (JSLemma EN) = "en.lemma"
......@@ -71,7 +69,6 @@ instance FromJSON JSSpell where
parseJSON (String "ru.pos") = pure $ JSPOS RU
parseJSON (String "uk.pos") = pure $ JSPOS UK
parseJSON (String "zh.pos") = pure $ JSPOS ZH
parseJSON (String "pos") = pure $ JSPOS All
parseJSON (String "de.lemma") = pure $ JSLemma DE
parseJSON (String "en.lemma") = pure $ JSLemma EN
......@@ -84,7 +81,6 @@ instance FromJSON JSSpell where
parseJSON (String "ru.lemma") = pure $ JSLemma RU
parseJSON (String "uk.lemma") = pure $ JSLemma UK
parseJSON (String "zh.lemma") = pure $ JSLemma ZH
parseJSON (String "lemma") = pure $ JSLemma All
parseJSON s =
prependFailure "parsing spell failed, "
(typeMismatch "Spell" s)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment