Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
32301d6d
Commit
32301d6d
authored
Apr 20, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX/FEAT] Langs
parent
18067565
Pipeline
#3914
failed with stage
in 29 minutes and 58 seconds
Changes
8
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
107 additions
and
79 deletions
+107
-79
gargantext.cabal
gargantext.cabal
+2
-2
Searx.hs
src/Gargantext/API/Node/Corpus/Searx.hs
+14
-21
Core.hs
src/Gargantext/Core.hs
+45
-24
Learn.hs
src/Gargantext/Core/Text/Learn.hs
+2
-2
ZH.hs
src/Gargantext/Core/Text/Samples/ZH.hs
+2
-2
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+2
-2
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+3
-5
JohnSnowNLP.hs
src/Gargantext/Utils/JohnSnowNLP.hs
+37
-21
No files found.
gargantext.cabal
View file @
32301d6d
...
...
@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9
version:
0.0.6.9.9
synopsis: Search, map, share
description: Please see README.md
category: Data
...
...
@@ -215,12 +215,12 @@ library
Gargantext.Core.Text.Metrics.FrequentItemSet
Gargantext.Core.Text.Metrics.SpeGen.IncExc
Gargantext.Core.Text.Metrics.Utils
Gargantext.Core.Text.Samples.CN
Gargantext.Core.Text.Samples.DE
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.ES
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.PL
Gargantext.Core.Text.Samples.ZH
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
...
...
src/Gargantext/API/Node/Corpus/Searx.hs
View file @
32301d6d
...
...
@@ -2,29 +2,15 @@
module
Gargantext.API.Node.Corpus.Searx
where
import
Control.Lens
(
view
)
import
qualified
Data.Aeson
as
Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Either
(
Either
(
..
))
import
qualified
Data.HashMap.Strict
as
HashMap
import
qualified
Data.Text
as
T
import
Data.Time.Calendar
(
Day
,
toGregorian
)
import
Data.Time.Format
(
defaultTimeLocale
,
formatTime
,
parseTimeM
)
import
Data.Tuple.Select
(
sel1
,
sel2
,
sel3
)
import
GHC.Generics
(
Generic
)
import
Network.HTTP.Client
import
Network.HTTP.Client.TLS
import
qualified
Prelude
import
Protolude
(
catMaybes
,
encodeUtf8
,
rightToMaybe
,
Text
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Config
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.NLP
(
nlpServerGet
)
import
qualified
Gargantext.Core.Text.Corpus.API
as
API
import
Gargantext.Core.Text.List
(
buildNgramsLists
)
import
Gargantext.Core.Text.List.Group.WithStem
(
{-StopSize(..),-}
GroupParams
(
..
))
import
Gargantext.Core.Text.Terms
(
TermType
(
..
))
...
...
@@ -41,18 +27,25 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
import
Gargantext.Database.Prelude
(
hasConfig
)
import
Gargantext.Database.Query.Table.Node
(
defaultListMaybe
,
getOrMkList
)
import
Gargantext.Database.Query.Tree.Root
(
getOrMk_RootWithCorpus
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Config
import
Gargantext.Utils.Jobs
(
JobHandle
,
MonadJobStatus
(
..
))
import
Network.HTTP.Client
import
Network.HTTP.Client.TLS
import
Protolude
(
catMaybes
,
encodeUtf8
,
rightToMaybe
,
Text
)
import
qualified
Data.Aeson
as
Aeson
import
qualified
Data.HashMap.Strict
as
HashMap
import
qualified
Data.Text
as
T
import
qualified
Data.Text
as
Text
import
qualified
Gargantext.Core.Text.Corpus.API
as
API
import
qualified
Gargantext.Database.Query.Table.Node.Document.Add
as
Doc
(
add
)
import
qualified
Prelude
langToSearx
::
Lang
->
Text
langToSearx
EN
=
"en-US"
langToSearx
FR
=
"fr-FR"
langToSearx
DE
=
"de-FR"
langToSearx
ES
=
"es-FR"
langToSearx
IT
=
"it-FR"
langToSearx
PL
=
"pl-FR"
langToSearx
CN
=
"cn-FR"
langToSearx
All
=
"en-US"
langToSearx
x
=
(
Text
.
toLower
acronym
)
<>
"-"
<>
acronym
where
acronym
=
(
cs
$
show
x
)
data
SearxResult
=
SearxResult
{
_sr_url
::
Text
...
...
src/Gargantext/Core.hs
View file @
32301d6d
...
...
@@ -14,15 +14,17 @@ Portability : POSIX
module
Gargantext.Core
where
import
Data.Text
(
Text
,
pack
)
import
Data.Aeson
import
Data.Either
(
Either
(
Left
))
import
Data.Hashable
(
Hashable
)
import
Data.Morpheus.Types
(
GQLType
)
import
Data.Swagger
import
Data.Text
(
Text
,
pack
)
import
Data.Tuple.Extra
(
swap
)
import
GHC.Generics
(
Generic
)
import
Gargantext.Prelude
import
Servant.API
import
qualified
Data.Map
as
Map
------------------------------------------------------------------------
-- | Language of a Text
...
...
@@ -34,14 +36,25 @@ import Servant.API
-- - IT == italian
-- - ES == spanish
-- - PL == polish
-- -
CN
== chinese
-- -
ZH
== chinese
--
-- ... add your language and help us to implement it (:
-- | All languages supported
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
data
Lang
=
EN
|
FR
|
DE
|
IT
|
PL
|
PT
|
ES
|
EL
|
CN
|
UK
|
RU
|
ZH
|
All
data
Lang
=
All
|
DE
|
EL
|
EN
|
ES
|
FR
|
IT
|
PL
|
PT
|
RU
|
UK
|
ZH
deriving
(
Show
,
Eq
,
Ord
,
Enum
,
Bounded
,
Generic
,
GQLType
)
instance
ToJSON
Lang
...
...
@@ -50,14 +63,18 @@ instance ToSchema Lang where
declareNamedSchema
=
genericDeclareNamedSchemaUnrestricted
defaultSchemaOptions
instance
FromHttpApiData
Lang
where
parseUrlPiece
"EN"
=
pure
EN
parseUrlPiece
"FR"
=
pure
FR
parseUrlPiece
"All"
=
pure
All
parseUrlPiece
"DE"
=
pure
DE
parseUrlPiece
"EL"
=
pure
EL
parseUrlPiece
"EN"
=
pure
EN
parseUrlPiece
"ES"
=
pure
ES
parseUrlPiece
"FR"
=
pure
FR
parseUrlPiece
"IT"
=
pure
IT
parseUrlPiece
"PL"
=
pure
PL
parseUrlPiece
"CN"
=
pure
CN
parseUrlPiece
"All"
=
pure
All
parseUrlPiece
"PT"
=
pure
PT
parseUrlPiece
"RU"
=
pure
RU
parseUrlPiece
"UK"
=
pure
UK
parseUrlPiece
"ZH"
=
pure
ZH
parseUrlPiece
_
=
Left
"Unexpected value of Lang"
instance
ToHttpApiData
Lang
where
toUrlPiece
=
pack
.
show
...
...
@@ -73,25 +90,29 @@ class HasDBid a where
-- NOTE: We try to use numeric codes for countries
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- https://en.wikipedia.org/wiki/ISO_3166-1_numeric#004
dbIds
::
[(
Lang
,
Int
)]
dbIds
=
[
(
All
,
0
)
,
(
DE
,
276
)
,
(
EL
,
300
)
,
(
EN
,
2
)
,
(
ES
,
724
)
,
(
FR
,
1
)
,
(
IT
,
380
)
,
(
PL
,
616
)
,
(
PT
,
620
)
,
(
RU
,
643
)
,
(
UK
,
804
)
,
(
ZH
,
156
)
]
instance
HasDBid
Lang
where
toDBid
All
=
0
toDBid
FR
=
1
toDBid
EN
=
2
toDBid
DE
=
276
toDBid
ES
=
724
toDBid
IT
=
380
toDBid
PL
=
616
toDBid
CN
=
156
toDBid
lang
=
case
Map
.
lookup
lang
$
Map
.
fromList
dbIds
of
Just
la
->
la
Nothing
->
panic
"[G.Core] Add this lang to DB ids"
fromDBid
0
=
All
fromDBid
1
=
FR
fromDBid
2
=
EN
fromDBid
276
=
DE
fromDBid
724
=
ES
fromDBid
380
=
IT
fromDBid
616
=
PL
fromDBid
156
=
CN
fromDBid
_
=
panic
"HasDBid lang, not implemented"
fromDBid
dbId
=
case
Map
.
lookup
dbId
$
Map
.
fromList
$
map
swap
dbIds
of
Just
la
->
la
Nothing
->
panic
"HasDBid lang, not implemented"
------------------------------------------------------------------------
data
NLPServerConfig
=
NLPServerConfig
...
...
src/Gargantext/Core/Text/Learn.hs
View file @
32301d6d
...
...
@@ -39,7 +39,7 @@ import Gargantext.Core (Lang(..), allLangs)
import
Gargantext.Core.Text.Terms.Mono
(
words
)
import
Gargantext.Core.Text.Metrics.Count
(
occurrencesWith
)
import
qualified
Gargantext.Core.Text.Samples.
CN
as
CN
import
qualified
Gargantext.Core.Text.Samples.
ZH
as
ZH
import
qualified
Gargantext.Core.Text.Samples.DE
as
DE
import
qualified
Gargantext.Core.Text.Samples.EN
as
EN
import
qualified
Gargantext.Core.Text.Samples.ES
as
ES
...
...
@@ -115,7 +115,7 @@ detectLangDefault = detectCat 99 eventLang
textSample
FR
=
FR
.
textSample
textSample
DE
=
DE
.
textSample
textSample
ES
=
ES
.
textSample
textSample
CN
=
CN
.
textSample
textSample
ZH
=
ZH
.
textSample
textSample
PL
=
PL
.
textSample
textSample
_
=
panic
"[G.C.T.L:detectLangDefault] not impl yet"
--textSample DE = DE.textSample
...
...
src/Gargantext/Core/Text/Samples/
CN
.hs
→
src/Gargantext/Core/Text/Samples/
ZH
.hs
View file @
32301d6d
{-|
Module : Gargantext.Core.Text.Samples.
CN
Module : Gargantext.Core.Text.Samples.
ZH
Description : Sample of Chinese Text
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
...
...
@@ -14,7 +14,7 @@ Page : text mining
module
Gargantext.Core.Text.Samples.
CN
where
module
Gargantext.Core.Text.Samples.
ZH
where
import
Data.String
(
String
)
...
...
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
32301d6d
...
...
@@ -52,7 +52,7 @@ tokenTag2terms :: TokenTag -> Terms
tokenTag2terms
(
TokenTag
ws
t
_
_
)
=
Terms
ws
t
tokenTags
::
NLPServerConfig
->
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
l
txt
=
tokenTagsWith
l
txt
$
corenlp
url
tokenTags
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
EN
txt
=
tokenTagsWith
EN
txt
$
corenlp
url
tokenTags
(
NLPServerConfig
{
server
=
Spacy
,
url
})
l
txt
=
tokenTagsWith
l
txt
$
SpacyNLP
.
nlp
url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
...
...
@@ -74,7 +74,7 @@ tokenTagsWith lang txt nlp = map (groupTokens lang)
groupTokens
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
groupTokens
EN
=
En
.
groupTokens
groupTokens
FR
=
Fr
.
groupTokens
groupTokens
_
=
panic
$
pack
"groupTokens :: Lang not implemeted yet"
groupTokens
_
=
Fr
.
groupTokens
-- TODO: make tests here
cleanTextForNLP
::
Text
->
Text
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
32301d6d
...
...
@@ -27,8 +27,6 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
import
Data.Aeson
import
Data.ByteString.Lazy.Internal
(
ByteString
)
import
qualified
Data.ByteString.Lazy.Char8
as
BSL
import
qualified
Data.Map
as
Map
import
Data.Set
(
fromList
)
import
Data.Text
(
Text
,
splitOn
,
pack
,
toLower
)
import
Gargantext.Core
(
Lang
(
..
))
...
...
@@ -37,11 +35,11 @@ import Gargantext.Core.Types
import
Gargantext.Prelude
import
Network.HTTP.Simple
import
Network.URI
(
URI
(
..
))
import
qualified
Data.ByteString.Lazy.Char8
as
BSL
import
qualified
Data.Map
as
Map
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags
::
[
Token
]
->
[
TokenTag
]
...
...
@@ -110,7 +108,7 @@ corenlp' uri lang txt = do
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
,
(
"tokenize.language"
,
"pl"
)
]
CN
->
[
(
"annotators"
,
"tokenize,pos,lemma,ner"
)
ZH
->
[
(
"annotators"
,
"tokenize,pos,lemma,ner"
)
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
,
(
"pos.model"
,
"edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger"
)
,
(
"tokenize.language"
,
"zh"
)
]
...
...
src/Gargantext/Utils/JohnSnowNLP.hs
View file @
32301d6d
...
...
@@ -16,60 +16,76 @@ module Gargantext.Utils.JohnSnowNLP where
import
Control.Concurrent
(
threadDelay
)
import
Control.Lens
import
Data.Aeson
(
encode
,
ToJSON
,
toJSON
,
FromJSON
,
parseJSON
,
Value
(
..
),
(
.:
),
(
.:?
))
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.List.Safe
as
LS
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Map.Strict
as
Map
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
,
zip
)
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
qualified
Data.List.Safe
as
LS
import
qualified
Data.Map.Strict
as
Map
data
JSSpell
=
JSPOS
Lang
|
JSLemma
Lang
deriving
(
Show
)
instance
ToJSON
JSSpell
where
toJSON
(
JSPOS
EN
)
=
"en.pos"
toJSON
(
JSPOS
FR
)
=
"fr.pos"
toJSON
(
JSPOS
All
)
=
"pos"
toJSON
(
JSPOS
DE
)
=
"de.pos"
toJSON
(
JSPOS
EL
)
=
"el.pos"
toJSON
(
JSPOS
EN
)
=
"en.pos"
toJSON
(
JSPOS
ES
)
=
"es.pos"
toJSON
(
JSPOS
FR
)
=
"fr.pos"
toJSON
(
JSPOS
IT
)
=
"it.pos"
toJSON
(
JSPOS
PL
)
=
"pl.pos"
toJSON
(
JSPOS
CN
)
=
"cn.pos"
toJSON
(
JSPOS
All
)
=
"pos"
toJSON
(
JSPOS
PT
)
=
"pt.pos"
toJSON
(
JSPOS
RU
)
=
"ru.pos"
toJSON
(
JSPOS
UK
)
=
"uk.pos"
toJSON
(
JSPOS
ZH
)
=
"zh.pos"
toJSON
(
JSLemma
EN
)
=
"en.lemma"
toJSON
(
JSLemma
FR
)
=
"fr.lemma"
toJSON
(
JSLemma
All
)
=
"lemma"
toJSON
(
JSLemma
DE
)
=
"de.lemma"
toJSON
(
JSLemma
EL
)
=
"el.lemma"
toJSON
(
JSLemma
EN
)
=
"en.lemma"
toJSON
(
JSLemma
ES
)
=
"es.lemma"
toJSON
(
JSLemma
FR
)
=
"fr.lemma"
toJSON
(
JSLemma
IT
)
=
"it.lemma"
toJSON
(
JSLemma
PL
)
=
"pl.lemma"
toJSON
(
JSLemma
CN
)
=
"cn.lemma"
toJSON
(
JSLemma
All
)
=
"lemma"
toJSON
(
JSLemma
PT
)
=
"pt.lemma"
toJSON
(
JSLemma
RU
)
=
"ru.lemma"
toJSON
(
JSLemma
UK
)
=
"uk.lemma"
toJSON
(
JSLemma
ZH
)
=
"zh.lemma"
instance
FromJSON
JSSpell
where
parseJSON
(
String
"en.pos"
)
=
pure
$
JSPOS
EN
parseJSON
(
String
"fr.pos"
)
=
pure
$
JSPOS
FR
parseJSON
(
String
"de.pos"
)
=
pure
$
JSPOS
DE
parseJSON
(
String
"en.pos"
)
=
pure
$
JSPOS
EN
parseJSON
(
String
"el.pos"
)
=
pure
$
JSPOS
EL
parseJSON
(
String
"es.pos"
)
=
pure
$
JSPOS
ES
parseJSON
(
String
"fr.pos"
)
=
pure
$
JSPOS
FR
parseJSON
(
String
"it.pos"
)
=
pure
$
JSPOS
IT
parseJSON
(
String
"pl.pos"
)
=
pure
$
JSPOS
PL
parseJSON
(
String
"cn.pos"
)
=
pure
$
JSPOS
CN
parseJSON
(
String
"pt.pos"
)
=
pure
$
JSPOS
PT
parseJSON
(
String
"ru.pos"
)
=
pure
$
JSPOS
RU
parseJSON
(
String
"uk.pos"
)
=
pure
$
JSPOS
UK
parseJSON
(
String
"zh.pos"
)
=
pure
$
JSPOS
ZH
parseJSON
(
String
"pos"
)
=
pure
$
JSPOS
All
parseJSON
(
String
"en.lemma"
)
=
pure
$
JSLemma
EN
parseJSON
(
String
"fr.lemma"
)
=
pure
$
JSLemma
FR
parseJSON
(
String
"de.lemma"
)
=
pure
$
JSLemma
DE
parseJSON
(
String
"en.lemma"
)
=
pure
$
JSLemma
EN
parseJSON
(
String
"el.lemma"
)
=
pure
$
JSLemma
EL
parseJSON
(
String
"es.lemma"
)
=
pure
$
JSLemma
ES
parseJSON
(
String
"fr.lemma"
)
=
pure
$
JSLemma
FR
parseJSON
(
String
"it.lemma"
)
=
pure
$
JSLemma
IT
parseJSON
(
String
"pl.lemma"
)
=
pure
$
JSLemma
PL
parseJSON
(
String
"cn.lemma"
)
=
pure
$
JSLemma
CN
parseJSON
(
String
"pt.lemma"
)
=
pure
$
JSLemma
PT
parseJSON
(
String
"ru.lemma"
)
=
pure
$
JSLemma
RU
parseJSON
(
String
"uk.lemma"
)
=
pure
$
JSLemma
UK
parseJSON
(
String
"zh.lemma"
)
=
pure
$
JSLemma
ZH
parseJSON
(
String
"lemma"
)
=
pure
$
JSLemma
All
parseJSON
s
=
prependFailure
"parsing spell failed, "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment