Commit 63fcd605 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Drop 'All' data constructor from 'Lang'

Removing the `All` data constructor from  the `Lang` datatype has
several benifits, the main one being that we now have total functions
for things like `toISO693`.

This will make possible to write an uniform interface for stemming
`stem :: Lang -> StemmingAlgorithm -> T.Text -> T.Text`, as now
`Lang` will always map to a valid (ISO693) language.
parent 845db556
Pipeline #5725 passed with stages
in 100 minutes and 11 seconds
...@@ -53,8 +53,7 @@ import Network.HTTP.Client.TLS ...@@ -53,8 +53,7 @@ import Network.HTTP.Client.TLS
import Prelude qualified import Prelude qualified
langToSearx :: Lang -> Text langToSearx :: Lang -> Text
langToSearx All = "en-US" langToSearx x = Text.toLower acronym <> "-" <> acronym
langToSearx x = (Text.toLower acronym) <> "-" <> acronym
where where
acronym = show x acronym = show x
......
...@@ -46,8 +46,7 @@ import Prelude (userError) ...@@ -46,8 +46,7 @@ import Prelude (userError)
-- NOTE: Use international country codes -- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes -- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library -- TODO This should be deprecated in favor of iso-639 library
data Lang = All data Lang = DE
| DE
| EL | EL
| EN | EN
| ES | ES
...@@ -58,7 +57,7 @@ data Lang = All ...@@ -58,7 +57,7 @@ data Lang = All
| RU | RU
| UK | UK
| ZH | ZH
deriving (Show, Eq, Ord, Enum, Bounded, Generic, GQLType) deriving (Read, Show, Eq, Ord, Enum, Bounded, Generic, GQLType)
-- | Defaults to 'EN' in all those places where a language is mandatory, -- | Defaults to 'EN' in all those places where a language is mandatory,
-- but an optional one has been passed. -- but an optional one has been passed.
...@@ -75,41 +74,30 @@ instance ToSchema Lang where ...@@ -75,41 +74,30 @@ instance ToSchema Lang where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
instance FromHttpApiData Lang instance FromHttpApiData Lang
where where
-- parseUrlPiece "All" = pure All -- parseUrlPiece is exactly the 'read' instance,
parseUrlPiece "DE" = pure DE -- if we are disciplined. Either way, this needs to
parseUrlPiece "EL" = pure EL -- be tested.
parseUrlPiece "EN" = pure EN parseUrlPiece fragment = case readMaybe fragment of
parseUrlPiece "ES" = pure ES Nothing -> Left $ "Unexpected value of Lang: " <> fragment
parseUrlPiece "FR" = pure FR Just lang -> Right lang
parseUrlPiece "IT" = pure IT
parseUrlPiece "PL" = pure PL
parseUrlPiece "PT" = pure PT
parseUrlPiece "RU" = pure RU
parseUrlPiece "UK" = pure UK
parseUrlPiece "ZH" = pure ZH
parseUrlPiece _ = Left "Unexpected value of Lang"
instance ToHttpApiData Lang where instance ToHttpApiData Lang where
toUrlPiece = pack . show toUrlPiece = pack . show
instance Hashable Lang instance Hashable Lang
instance Arbitrary Lang where instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1 toISO639 :: Lang -> ISO639.ISO639_1
toISO639 DE = Just ISO639.DE toISO639 DE = ISO639.DE
toISO639 EL = Just ISO639.EL toISO639 EL = ISO639.EL
toISO639 EN = Just ISO639.EN toISO639 EN = ISO639.EN
toISO639 ES = Just ISO639.ES toISO639 ES = ISO639.ES
toISO639 FR = Just ISO639.FR toISO639 FR = ISO639.FR
toISO639 IT = Just ISO639.IT toISO639 IT = ISO639.IT
toISO639 PL = Just ISO639.PL toISO639 PL = ISO639.PL
toISO639 PT = Just ISO639.PT toISO639 PT = ISO639.PT
toISO639 RU = Just ISO639.RU toISO639 RU = ISO639.RU
toISO639 UK = Just ISO639.UK toISO639 UK = ISO639.UK
toISO639 ZH = Just ISO639.ZH toISO639 ZH = ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
iso639ToText :: ISO639.ISO639_1 -> Text iso639ToText :: ISO639.ISO639_1 -> Text
iso639ToText la = pack [a, b] iso639ToText la = pack [a, b]
...@@ -117,19 +105,18 @@ iso639ToText la = pack [a, b] ...@@ -117,19 +105,18 @@ iso639ToText la = pack [a, b]
(a, b) = ISO639.toChars la (a, b) = ISO639.toChars la
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes -- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text toISO639Lang :: Lang -> Text
toISO639Lang All = Nothing toISO639Lang DE = "de"
toISO639Lang DE = Just "de" toISO639Lang EL = "el"
toISO639Lang EL = Just "el" toISO639Lang EN = "en"
toISO639Lang EN = Just "en" toISO639Lang ES = "es"
toISO639Lang ES = Just "es" toISO639Lang FR = "fr"
toISO639Lang FR = Just "fr" toISO639Lang IT = "it"
toISO639Lang IT = Just "it" toISO639Lang PL = "pl"
toISO639Lang PL = Just "pl" toISO639Lang PT = "pt"
toISO639Lang PT = Just "pt" toISO639Lang RU = "ru"
toISO639Lang RU = Just "ru" toISO639Lang UK = "uk"
toISO639Lang UK = Just "uk" toISO639Lang ZH = "zh"
toISO639Lang ZH = Just "zh"
allLangs :: [Lang] allLangs :: [Lang]
allLangs = [minBound .. maxBound] allLangs = [minBound .. maxBound]
...@@ -145,7 +132,6 @@ class HasDBid a where ...@@ -145,7 +132,6 @@ class HasDBid a where
-- once we add a new 'Lang'. -- once we add a new 'Lang'.
langIds :: Bimap Lang Int langIds :: Bimap Lang Int
langIds = Bimap.fromList $ allLangs <&> \lid -> case lid of langIds = Bimap.fromList $ allLangs <&> \lid -> case lid of
All -> (lid, 0)
DE -> (lid, 276) DE -> (lid, 276)
EL -> (lid, 300) EL -> (lid, 300)
EN -> (lid, 2) EN -> (lid, 2)
......
...@@ -23,7 +23,7 @@ import Control.Monad.Except ...@@ -23,7 +23,7 @@ import Control.Monad.Except
import Data.Text qualified as T import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs) import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..), toISO639, toISO639EN) import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Core.Text.Corpus.API.Arxiv qualified as Arxiv import Gargantext.Core.Text.Corpus.API.Arxiv qualified as Arxiv
import Gargantext.Core.Text.Corpus.API.EPO qualified as EPO import Gargantext.Core.Text.Corpus.API.EPO qualified as EPO
import Gargantext.Core.Text.Corpus.API.Hal qualified as HAL import Gargantext.Core.Text.Corpus.API.Hal qualified as HAL
...@@ -47,6 +47,9 @@ data GetCorpusError ...@@ -47,6 +47,9 @@ data GetCorpusError
-- | Get External API metadata main function -- | Get External API metadata main function
get :: ExternalAPIs get :: ExternalAPIs
-> Lang -> Lang
-- ^ A user-selected language in which documents needs to be retrieved.
-- If the provider doesn't support the search filtered by language, or if the language
-- is not important, the frontend will simply send 'EN' to the backend.
-> Corpus.RawQuery -> Corpus.RawQuery
-> Maybe PUBMED.APIKey -> Maybe PUBMED.APIKey
-> Maybe EPO.AuthKey -> Maybe EPO.AuthKey
...@@ -54,26 +57,26 @@ get :: ExternalAPIs ...@@ -54,26 +57,26 @@ get :: ExternalAPIs
-> Maybe Corpus.Limit -> Maybe Corpus.Limit
-- -> IO [HyperdataDocument] -- -> IO [HyperdataDocument]
-> IO (Either GetCorpusError (Maybe Integer, ConduitT () HyperdataDocument IO ())) -> IO (Either GetCorpusError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get externalAPI la q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
-- For PUBMED, HAL, IsTex, Isidore and OpenAlex, we want to send the query as-it. -- For PUBMED, HAL, IsTex, Isidore and OpenAlex, we want to send the query as-it.
-- For Arxiv we parse the query into a structured boolean query we submit over. -- For Arxiv we parse the query into a structured boolean query we submit over.
case externalAPI of case externalAPI of
PubMed -> PubMed ->
first ExternalAPIError <$> PUBMED.get (fromMaybe "" mPubmedAPIKey) q limit first ExternalAPIError <$> PUBMED.get (fromMaybe "" mPubmedAPIKey) q limit
OpenAlex -> OpenAlex ->
first ExternalAPIError <$> OpenAlex.get (fromMaybe "" Nothing {- email -}) q (toISO639 la) limit first ExternalAPIError <$> OpenAlex.get (fromMaybe "" Nothing {- email -}) q (Just $ toISO639 lang) limit
Arxiv -> runExceptT $ do Arxiv -> runExceptT $ do
corpusQuery <- ExceptT (pure parse_query) corpusQuery <- ExceptT (pure parse_query)
ExceptT $ fmap Right (Arxiv.get la corpusQuery limit) ExceptT $ fmap Right (Arxiv.get lang corpusQuery limit)
HAL -> HAL ->
first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) first ExternalAPIError <$> HAL.getC (Just $ toISO639 lang) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do IsTex -> do
docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) docs <- ISTEX.get lang (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do Isidore -> do
docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing docs <- ISIDORE.get lang (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
EPO -> do EPO -> do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639EN la) limit first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
...@@ -11,7 +11,12 @@ Portability : POSIX ...@@ -11,7 +11,12 @@ Portability : POSIX
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.API.Isidore where module Gargantext.Core.Text.Corpus.API.Isidore (
get
-- * Internals (possibly unused?)
, isidore2csvFile
) where
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
...@@ -26,10 +31,12 @@ import Isidore.Client ...@@ -26,10 +31,12 @@ import Isidore.Client
import Servant.Client import Servant.Client
-- | TODO work with the ServantErr -- | TODO work with the ServantErr
get :: Lang -> Maybe Isidore.Limit get :: Lang
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery
-> Maybe Isidore.AuthorQuery
-> IO [HyperdataDocument] -> IO [HyperdataDocument]
get la l q a = do get lang l q a = do
let let
printErr (DecodeFailure e _) = panicTrace e printErr (DecodeFailure e _) = panicTrace e
printErr e = panicTrace (show e) printErr e = panicTrace (show e)
...@@ -40,18 +47,18 @@ get la l q a = do ...@@ -40,18 +47,18 @@ get la l q a = do
iDocs <- either printErr _content <$> Isidore.get l q a iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs) hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs pure hDocs
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery -> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO () -> IO ()
isidore2csvFile fp la li tq aq = do isidore2csvFile fp lang li tq aq = do
hdocs <- get la li tq aq hdocs <- get lang li tq aq
writeDocs2Csv fp hdocs writeDocs2Csv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc l (IsidoreDoc t a d u s as) = do isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let let
author :: Author -> Text author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln) author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
...@@ -88,5 +95,5 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -88,5 +95,5 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_hour = Nothing , _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing , _hd_publication_minute = Nothing
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l , _hd_language_iso2 = Just . Text.pack . show $ lang
} }
{-# LANGUAGE LambdaCase #-}
{-| {-|
Module : Gargantext.Core.Text.Corpus.API.Istex Module : Gargantext.Core.Text.Corpus.API.Istex
Description : Pubmed API connection Description : Pubmed API connection
...@@ -11,6 +13,7 @@ Portability : POSIX ...@@ -11,6 +13,7 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Istex module Gargantext.Core.Text.Corpus.API.Istex
( get )
where where
import Data.List qualified as List import Data.List qualified as List
...@@ -18,7 +21,7 @@ import Data.Text qualified as Text ...@@ -18,7 +21,7 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc) import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX import ISTEX qualified as ISTEX
import ISTEX.Client qualified as ISTEX import ISTEX.Client qualified as ISTEX
...@@ -42,10 +45,7 @@ get la query' maxResults = do ...@@ -42,10 +45,7 @@ get la query' maxResults = do
-- In that case we need to enrich his query with 2 parameters -- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT -- First expected language: user has to define it in GTXT
-- Second : query in abstract -- Second : query in abstract
True -> ("language:"<> lang la) <> " AND abstract:"<>query' True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
where
lang FR = "fre"
lang _ = "eng"
False -> query' False -> query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is -- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
...@@ -70,4 +70,18 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument] ...@@ -70,4 +70,18 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs') toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs') --printDebug "ISTEX" (ISTEX._documents_total docs')
-- | Returns the properly-rendered language code according to
-- https://doc.istex.fr/tdm/annexes/codes-langues.html
toISTEXLanguageCode :: Lang -> Text.Text
toISTEXLanguageCode = \case
DE -> "ger"
EL -> "gre"
EN -> "eng"
ES -> "spa"
FR -> "fre"
IT -> "ita"
PL -> "pol"
PT -> "por"
RU -> "Rus"
UK -> "ukr"
ZH -> "chi"
...@@ -33,7 +33,6 @@ data JSSpell = JSPOS Lang | JSLemma Lang ...@@ -33,7 +33,6 @@ data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show) deriving (Show)
instance ToJSON JSSpell where instance ToJSON JSSpell where
toJSON (JSPOS All) = "pos"
toJSON (JSPOS DE) = "de.pos" toJSON (JSPOS DE) = "de.pos"
toJSON (JSPOS EL) = "el.pos" toJSON (JSPOS EL) = "el.pos"
toJSON (JSPOS EN) = "en.pos" toJSON (JSPOS EN) = "en.pos"
...@@ -46,7 +45,6 @@ instance ToJSON JSSpell where ...@@ -46,7 +45,6 @@ instance ToJSON JSSpell where
toJSON (JSPOS UK) = "uk.pos" toJSON (JSPOS UK) = "uk.pos"
toJSON (JSPOS ZH) = "zh.pos" toJSON (JSPOS ZH) = "zh.pos"
toJSON (JSLemma All) = "lemma"
toJSON (JSLemma DE) = "de.lemma" toJSON (JSLemma DE) = "de.lemma"
toJSON (JSLemma EL) = "el.lemma" toJSON (JSLemma EL) = "el.lemma"
toJSON (JSLemma EN) = "en.lemma" toJSON (JSLemma EN) = "en.lemma"
...@@ -71,7 +69,6 @@ instance FromJSON JSSpell where ...@@ -71,7 +69,6 @@ instance FromJSON JSSpell where
parseJSON (String "ru.pos") = pure $ JSPOS RU parseJSON (String "ru.pos") = pure $ JSPOS RU
parseJSON (String "uk.pos") = pure $ JSPOS UK parseJSON (String "uk.pos") = pure $ JSPOS UK
parseJSON (String "zh.pos") = pure $ JSPOS ZH parseJSON (String "zh.pos") = pure $ JSPOS ZH
parseJSON (String "pos") = pure $ JSPOS All
parseJSON (String "de.lemma") = pure $ JSLemma DE parseJSON (String "de.lemma") = pure $ JSLemma DE
parseJSON (String "en.lemma") = pure $ JSLemma EN parseJSON (String "en.lemma") = pure $ JSLemma EN
...@@ -84,7 +81,6 @@ instance FromJSON JSSpell where ...@@ -84,7 +81,6 @@ instance FromJSON JSSpell where
parseJSON (String "ru.lemma") = pure $ JSLemma RU parseJSON (String "ru.lemma") = pure $ JSLemma RU
parseJSON (String "uk.lemma") = pure $ JSLemma UK parseJSON (String "uk.lemma") = pure $ JSLemma UK
parseJSON (String "zh.lemma") = pure $ JSLemma ZH parseJSON (String "zh.lemma") = pure $ JSLemma ZH
parseJSON (String "lemma") = pure $ JSLemma All
parseJSON s = parseJSON s =
prependFailure "parsing spell failed, " prependFailure "parsing spell failed, "
(typeMismatch "Spell" s) (typeMismatch "Spell" s)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment