Commit 32301d6d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX/FEAT] Langs

parent 18067565
Pipeline #3914 failed with stage
in 29 minutes and 58 seconds
......@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9
version: 0.0.6.9.9
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -215,12 +215,12 @@ library
Gargantext.Core.Text.Metrics.FrequentItemSet
Gargantext.Core.Text.Metrics.SpeGen.IncExc
Gargantext.Core.Text.Metrics.Utils
Gargantext.Core.Text.Samples.CN
Gargantext.Core.Text.Samples.DE
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.ES
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.PL
Gargantext.Core.Text.Samples.ZH
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
......
......@@ -2,29 +2,15 @@
module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view)
import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either (Either(..))
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as T
import Data.Time.Calendar (Day, toGregorian)
import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
import Data.Tuple.Select (sel1, sel2, sel3)
import GHC.Generics (Generic)
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import qualified Prelude
import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
import Gargantext.Prelude
import Gargantext.Prelude.Config
import Gargantext.Core (Lang(..))
import Gargantext.Core.NLP (nlpServerGet)
import qualified Gargantext.Core.Text.Corpus.API as API
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Terms (TermType(..))
......@@ -41,18 +27,25 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
import Gargantext.Prelude
import Gargantext.Prelude.Config
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
import qualified Data.Aeson as Aeson
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as T
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Prelude
langToSearx :: Lang -> Text
langToSearx EN = "en-US"
langToSearx FR = "fr-FR"
langToSearx DE = "de-FR"
langToSearx ES = "es-FR"
langToSearx IT = "it-FR"
langToSearx PL = "pl-FR"
langToSearx CN = "cn-FR"
langToSearx All = "en-US"
langToSearx x = (Text.toLower acronym) <> "-" <> acronym
where
acronym = (cs $ show x)
data SearxResult = SearxResult
{ _sr_url :: Text
......
......@@ -14,15 +14,17 @@ Portability : POSIX
module Gargantext.Core
where
import Data.Text (Text, pack)
import Data.Aeson
import Data.Either(Either(Left))
import Data.Hashable (Hashable)
import Data.Morpheus.Types (GQLType)
import Data.Swagger
import Data.Text (Text, pack)
import Data.Tuple.Extra (swap)
import GHC.Generics (Generic)
import Gargantext.Prelude
import Servant.API
import qualified Data.Map as Map
------------------------------------------------------------------------
-- | Language of a Text
......@@ -34,14 +36,25 @@ import Servant.API
-- - IT == italian
-- - ES == spanish
-- - PL == polish
-- - CN == chinese
-- - ZH == chinese
--
-- ... add your language and help us to implement it (:
-- | All languages supported
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
data Lang = EN | FR | DE | IT | PL | PT | ES | EL | CN | UK | RU | ZH | All
data Lang = All
| DE
| EL
| EN
| ES
| FR
| IT
| PL
| PT
| RU
| UK
| ZH
deriving (Show, Eq, Ord, Enum, Bounded, Generic, GQLType)
instance ToJSON Lang
......@@ -50,14 +63,18 @@ instance ToSchema Lang where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
instance FromHttpApiData Lang
where
parseUrlPiece "EN" = pure EN
parseUrlPiece "FR" = pure FR
parseUrlPiece "All" = pure All
parseUrlPiece "DE" = pure DE
parseUrlPiece "EL" = pure EL
parseUrlPiece "EN" = pure EN
parseUrlPiece "ES" = pure ES
parseUrlPiece "FR" = pure FR
parseUrlPiece "IT" = pure IT
parseUrlPiece "PL" = pure PL
parseUrlPiece "CN" = pure CN
parseUrlPiece "All" = pure All
parseUrlPiece "PT" = pure PT
parseUrlPiece "RU" = pure RU
parseUrlPiece "UK" = pure UK
parseUrlPiece "ZH" = pure ZH
parseUrlPiece _ = Left "Unexpected value of Lang"
instance ToHttpApiData Lang where
toUrlPiece = pack . show
......@@ -73,25 +90,29 @@ class HasDBid a where
-- NOTE: We try to use numeric codes for countries
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- https://en.wikipedia.org/wiki/ISO_3166-1_numeric#004
dbIds :: [(Lang, Int)]
dbIds = [ (All, 0 )
, (DE , 276)
, (EL , 300)
, (EN , 2 )
, (ES , 724)
, (FR , 1 )
, (IT , 380)
, (PL , 616)
, (PT , 620)
, (RU , 643)
, (UK , 804)
, (ZH , 156)
]
instance HasDBid Lang where
toDBid All = 0
toDBid FR = 1
toDBid EN = 2
toDBid DE = 276
toDBid ES = 724
toDBid IT = 380
toDBid PL = 616
toDBid CN = 156
toDBid lang = case Map.lookup lang $ Map.fromList dbIds of
Just la -> la
Nothing -> panic "[G.Core] Add this lang to DB ids"
fromDBid 0 = All
fromDBid 1 = FR
fromDBid 2 = EN
fromDBid 276 = DE
fromDBid 724 = ES
fromDBid 380 = IT
fromDBid 616 = PL
fromDBid 156 = CN
fromDBid _ = panic "HasDBid lang, not implemented"
fromDBid dbId = case Map.lookup dbId $ Map.fromList $ map swap dbIds of
Just la -> la
Nothing -> panic "HasDBid lang, not implemented"
------------------------------------------------------------------------
data NLPServerConfig = NLPServerConfig
......
......@@ -39,7 +39,7 @@ import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Core.Text.Terms.Mono (words)
import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
import qualified Gargantext.Core.Text.Samples.CN as CN
import qualified Gargantext.Core.Text.Samples.ZH as ZH
import qualified Gargantext.Core.Text.Samples.DE as DE
import qualified Gargantext.Core.Text.Samples.EN as EN
import qualified Gargantext.Core.Text.Samples.ES as ES
......@@ -115,7 +115,7 @@ detectLangDefault = detectCat 99 eventLang
textSample FR = FR.textSample
textSample DE = DE.textSample
textSample ES = ES.textSample
textSample CN = CN.textSample
textSample ZH = ZH.textSample
textSample PL = PL.textSample
textSample _ = panic "[G.C.T.L:detectLangDefault] not impl yet"
--textSample DE = DE.textSample
......
{-|
Module : Gargantext.Core.Text.Samples.CN
Module : Gargantext.Core.Text.Samples.ZH
Description : Sample of Chinese Text
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
......@@ -14,7 +14,7 @@ Page : text mining
module Gargantext.Core.Text.Samples.CN where
module Gargantext.Core.Text.Samples.ZH where
import Data.String (String)
......
......@@ -52,7 +52,7 @@ tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) l txt = tokenTagsWith l txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = tokenTagsWith l txt $ SpacyNLP.nlp url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
......@@ -74,7 +74,7 @@ tokenTagsWith lang txt nlp = map (groupTokens lang)
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"
groupTokens _ = Fr.groupTokens
-- TODO: make tests here
cleanTextForNLP :: Text -> Text
......
......@@ -27,8 +27,6 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
import Data.Aeson
import Data.ByteString.Lazy.Internal (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
......@@ -37,11 +35,11 @@ import Gargantext.Core.Types
import Gargantext.Prelude
import Network.HTTP.Simple
import Network.URI (URI(..))
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags :: [Token] -> [TokenTag]
......@@ -110,7 +108,7 @@ corenlp' uri lang txt = do
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
, ("tokenize.language", "pl") ]
CN -> [ ("annotators", "tokenize,pos,lemma,ner")
ZH -> [ ("annotators", "tokenize,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
, ("tokenize.language", "zh") ]
......
......@@ -16,60 +16,76 @@ module Gargantext.Utils.JohnSnowNLP where
import Control.Concurrent (threadDelay)
import Control.Lens
import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.Aeson.TH (deriveJSON)
import qualified Data.List.Safe as LS
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe)
import Data.Text hiding (map, group, filter, concat, zip)
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import qualified Data.List.Safe as LS
import qualified Data.Map.Strict as Map
data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show)
instance ToJSON JSSpell where
toJSON (JSPOS EN) = "en.pos"
toJSON (JSPOS FR) = "fr.pos"
toJSON (JSPOS All) = "pos"
toJSON (JSPOS DE) = "de.pos"
toJSON (JSPOS EL) = "el.pos"
toJSON (JSPOS EN) = "en.pos"
toJSON (JSPOS ES) = "es.pos"
toJSON (JSPOS FR) = "fr.pos"
toJSON (JSPOS IT) = "it.pos"
toJSON (JSPOS PL) = "pl.pos"
toJSON (JSPOS CN) = "cn.pos"
toJSON (JSPOS All) = "pos"
toJSON (JSPOS PT) = "pt.pos"
toJSON (JSPOS RU) = "ru.pos"
toJSON (JSPOS UK) = "uk.pos"
toJSON (JSPOS ZH) = "zh.pos"
toJSON (JSLemma EN) = "en.lemma"
toJSON (JSLemma FR) = "fr.lemma"
toJSON (JSLemma All) = "lemma"
toJSON (JSLemma DE) = "de.lemma"
toJSON (JSLemma EL) = "el.lemma"
toJSON (JSLemma EN) = "en.lemma"
toJSON (JSLemma ES) = "es.lemma"
toJSON (JSLemma FR) = "fr.lemma"
toJSON (JSLemma IT) = "it.lemma"
toJSON (JSLemma PL) = "pl.lemma"
toJSON (JSLemma CN) = "cn.lemma"
toJSON (JSLemma All) = "lemma"
toJSON (JSLemma PT) = "pt.lemma"
toJSON (JSLemma RU) = "ru.lemma"
toJSON (JSLemma UK) = "uk.lemma"
toJSON (JSLemma ZH) = "zh.lemma"
instance FromJSON JSSpell where
parseJSON (String "en.pos") = pure $ JSPOS EN
parseJSON (String "fr.pos") = pure $ JSPOS FR
parseJSON (String "de.pos") = pure $ JSPOS DE
parseJSON (String "en.pos") = pure $ JSPOS EN
parseJSON (String "el.pos") = pure $ JSPOS EL
parseJSON (String "es.pos") = pure $ JSPOS ES
parseJSON (String "fr.pos") = pure $ JSPOS FR
parseJSON (String "it.pos") = pure $ JSPOS IT
parseJSON (String "pl.pos") = pure $ JSPOS PL
parseJSON (String "cn.pos") = pure $ JSPOS CN
parseJSON (String "pt.pos") = pure $ JSPOS PT
parseJSON (String "ru.pos") = pure $ JSPOS RU
parseJSON (String "uk.pos") = pure $ JSPOS UK
parseJSON (String "zh.pos") = pure $ JSPOS ZH
parseJSON (String "pos") = pure $ JSPOS All
parseJSON (String "en.lemma") = pure $ JSLemma EN
parseJSON (String "fr.lemma") = pure $ JSLemma FR
parseJSON (String "de.lemma") = pure $ JSLemma DE
parseJSON (String "en.lemma") = pure $ JSLemma EN
parseJSON (String "el.lemma") = pure $ JSLemma EL
parseJSON (String "es.lemma") = pure $ JSLemma ES
parseJSON (String "fr.lemma") = pure $ JSLemma FR
parseJSON (String "it.lemma") = pure $ JSLemma IT
parseJSON (String "pl.lemma") = pure $ JSLemma PL
parseJSON (String "cn.lemma") = pure $ JSLemma CN
parseJSON (String "pt.lemma") = pure $ JSLemma PT
parseJSON (String "ru.lemma") = pure $ JSLemma RU
parseJSON (String "uk.lemma") = pure $ JSLemma UK
parseJSON (String "zh.lemma") = pure $ JSLemma ZH
parseJSON (String "lemma") = pure $ JSLemma All
parseJSON s =
prependFailure "parsing spell failed, "
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment