Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
32301d6d
Commit
32301d6d
authored
Apr 20, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX/FEAT] Langs
parent
18067565
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
107 additions
and
79 deletions
+107
-79
gargantext.cabal
gargantext.cabal
+2
-2
Searx.hs
src/Gargantext/API/Node/Corpus/Searx.hs
+14
-21
Core.hs
src/Gargantext/Core.hs
+45
-24
Learn.hs
src/Gargantext/Core/Text/Learn.hs
+2
-2
ZH.hs
src/Gargantext/Core/Text/Samples/ZH.hs
+2
-2
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+2
-2
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+3
-5
JohnSnowNLP.hs
src/Gargantext/Utils/JohnSnowNLP.hs
+37
-21
No files found.
gargantext.cabal
View file @
32301d6d
...
...
@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9
version:
0.0.6.9.9
synopsis: Search, map, share
description: Please see README.md
category: Data
...
...
@@ -215,12 +215,12 @@ library
Gargantext.Core.Text.Metrics.FrequentItemSet
Gargantext.Core.Text.Metrics.SpeGen.IncExc
Gargantext.Core.Text.Metrics.Utils
Gargantext.Core.Text.Samples.CN
Gargantext.Core.Text.Samples.DE
Gargantext.Core.Text.Samples.EN
Gargantext.Core.Text.Samples.ES
Gargantext.Core.Text.Samples.FR
Gargantext.Core.Text.Samples.PL
Gargantext.Core.Text.Samples.ZH
Gargantext.Core.Text.Terms.Mono.Stem
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
...
...
src/Gargantext/API/Node/Corpus/Searx.hs
View file @
32301d6d
...
...
@@ -2,29 +2,15 @@
module
Gargantext.API.Node.Corpus.Searx
where
import
Control.Lens
(
view
)
import
qualified
Data.Aeson
as
Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Either
(
Either
(
..
))
import
qualified
Data.HashMap.Strict
as
HashMap
import
qualified
Data.Text
as
T
import
Data.Time.Calendar
(
Day
,
toGregorian
)
import
Data.Time.Format
(
defaultTimeLocale
,
formatTime
,
parseTimeM
)
import
Data.Tuple.Select
(
sel1
,
sel2
,
sel3
)
import
GHC.Generics
(
Generic
)
import
Network.HTTP.Client
import
Network.HTTP.Client.TLS
import
qualified
Prelude
import
Protolude
(
catMaybes
,
encodeUtf8
,
rightToMaybe
,
Text
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Config
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.NLP
(
nlpServerGet
)
import
qualified
Gargantext.Core.Text.Corpus.API
as
API
import
Gargantext.Core.Text.List
(
buildNgramsLists
)
import
Gargantext.Core.Text.List.Group.WithStem
(
{-StopSize(..),-}
GroupParams
(
..
))
import
Gargantext.Core.Text.Terms
(
TermType
(
..
))
...
...
@@ -41,18 +27,25 @@ import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
import
Gargantext.Database.Prelude
(
hasConfig
)
import
Gargantext.Database.Query.Table.Node
(
defaultListMaybe
,
getOrMkList
)
import
Gargantext.Database.Query.Tree.Root
(
getOrMk_RootWithCorpus
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Config
import
Gargantext.Utils.Jobs
(
JobHandle
,
MonadJobStatus
(
..
))
import
Network.HTTP.Client
import
Network.HTTP.Client.TLS
import
Protolude
(
catMaybes
,
encodeUtf8
,
rightToMaybe
,
Text
)
import
qualified
Data.Aeson
as
Aeson
import
qualified
Data.HashMap.Strict
as
HashMap
import
qualified
Data.Text
as
T
import
qualified
Data.Text
as
Text
import
qualified
Gargantext.Core.Text.Corpus.API
as
API
import
qualified
Gargantext.Database.Query.Table.Node.Document.Add
as
Doc
(
add
)
import
qualified
Prelude
langToSearx
::
Lang
->
Text
langToSearx
EN
=
"en-US"
langToSearx
FR
=
"fr-FR"
langToSearx
DE
=
"de-FR"
langToSearx
ES
=
"es-FR"
langToSearx
IT
=
"it-FR"
langToSearx
PL
=
"pl-FR"
langToSearx
CN
=
"cn-FR"
langToSearx
All
=
"en-US"
langToSearx
x
=
(
Text
.
toLower
acronym
)
<>
"-"
<>
acronym
where
acronym
=
(
cs
$
show
x
)
data
SearxResult
=
SearxResult
{
_sr_url
::
Text
...
...
src/Gargantext/Core.hs
View file @
32301d6d
...
...
@@ -14,15 +14,17 @@ Portability : POSIX
module
Gargantext.Core
where
import
Data.Text
(
Text
,
pack
)
import
Data.Aeson
import
Data.Either
(
Either
(
Left
))
import
Data.Hashable
(
Hashable
)
import
Data.Morpheus.Types
(
GQLType
)
import
Data.Swagger
import
Data.Text
(
Text
,
pack
)
import
Data.Tuple.Extra
(
swap
)
import
GHC.Generics
(
Generic
)
import
Gargantext.Prelude
import
Servant.API
import
qualified
Data.Map
as
Map
------------------------------------------------------------------------
-- | Language of a Text
...
...
@@ -34,14 +36,25 @@ import Servant.API
-- - IT == italian
-- - ES == spanish
-- - PL == polish
-- -
CN
== chinese
-- -
ZH
== chinese
--
-- ... add your language and help us to implement it (:
-- | All languages supported
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
data
Lang
=
EN
|
FR
|
DE
|
IT
|
PL
|
PT
|
ES
|
EL
|
CN
|
UK
|
RU
|
ZH
|
All
data
Lang
=
All
|
DE
|
EL
|
EN
|
ES
|
FR
|
IT
|
PL
|
PT
|
RU
|
UK
|
ZH
deriving
(
Show
,
Eq
,
Ord
,
Enum
,
Bounded
,
Generic
,
GQLType
)
instance
ToJSON
Lang
...
...
@@ -50,14 +63,18 @@ instance ToSchema Lang where
declareNamedSchema
=
genericDeclareNamedSchemaUnrestricted
defaultSchemaOptions
instance
FromHttpApiData
Lang
where
parseUrlPiece
"EN"
=
pure
EN
parseUrlPiece
"FR"
=
pure
FR
parseUrlPiece
"All"
=
pure
All
parseUrlPiece
"DE"
=
pure
DE
parseUrlPiece
"EL"
=
pure
EL
parseUrlPiece
"EN"
=
pure
EN
parseUrlPiece
"ES"
=
pure
ES
parseUrlPiece
"FR"
=
pure
FR
parseUrlPiece
"IT"
=
pure
IT
parseUrlPiece
"PL"
=
pure
PL
parseUrlPiece
"CN"
=
pure
CN
parseUrlPiece
"All"
=
pure
All
parseUrlPiece
"PT"
=
pure
PT
parseUrlPiece
"RU"
=
pure
RU
parseUrlPiece
"UK"
=
pure
UK
parseUrlPiece
"ZH"
=
pure
ZH
parseUrlPiece
_
=
Left
"Unexpected value of Lang"
instance
ToHttpApiData
Lang
where
toUrlPiece
=
pack
.
show
...
...
@@ -73,25 +90,29 @@ class HasDBid a where
-- NOTE: We try to use numeric codes for countries
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- https://en.wikipedia.org/wiki/ISO_3166-1_numeric#004
dbIds
::
[(
Lang
,
Int
)]
dbIds
=
[
(
All
,
0
)
,
(
DE
,
276
)
,
(
EL
,
300
)
,
(
EN
,
2
)
,
(
ES
,
724
)
,
(
FR
,
1
)
,
(
IT
,
380
)
,
(
PL
,
616
)
,
(
PT
,
620
)
,
(
RU
,
643
)
,
(
UK
,
804
)
,
(
ZH
,
156
)
]
instance
HasDBid
Lang
where
toDBid
All
=
0
toDBid
FR
=
1
toDBid
EN
=
2
toDBid
DE
=
276
toDBid
ES
=
724
toDBid
IT
=
380
toDBid
PL
=
616
toDBid
CN
=
156
toDBid
lang
=
case
Map
.
lookup
lang
$
Map
.
fromList
dbIds
of
Just
la
->
la
Nothing
->
panic
"[G.Core] Add this lang to DB ids"
fromDBid
0
=
All
fromDBid
1
=
FR
fromDBid
2
=
EN
fromDBid
276
=
DE
fromDBid
724
=
ES
fromDBid
380
=
IT
fromDBid
616
=
PL
fromDBid
156
=
CN
fromDBid
_
=
panic
"HasDBid lang, not implemented"
fromDBid
dbId
=
case
Map
.
lookup
dbId
$
Map
.
fromList
$
map
swap
dbIds
of
Just
la
->
la
Nothing
->
panic
"HasDBid lang, not implemented"
------------------------------------------------------------------------
data
NLPServerConfig
=
NLPServerConfig
...
...
src/Gargantext/Core/Text/Learn.hs
View file @
32301d6d
...
...
@@ -39,7 +39,7 @@ import Gargantext.Core (Lang(..), allLangs)
import
Gargantext.Core.Text.Terms.Mono
(
words
)
import
Gargantext.Core.Text.Metrics.Count
(
occurrencesWith
)
import
qualified
Gargantext.Core.Text.Samples.
CN
as
CN
import
qualified
Gargantext.Core.Text.Samples.
ZH
as
ZH
import
qualified
Gargantext.Core.Text.Samples.DE
as
DE
import
qualified
Gargantext.Core.Text.Samples.EN
as
EN
import
qualified
Gargantext.Core.Text.Samples.ES
as
ES
...
...
@@ -115,7 +115,7 @@ detectLangDefault = detectCat 99 eventLang
textSample
FR
=
FR
.
textSample
textSample
DE
=
DE
.
textSample
textSample
ES
=
ES
.
textSample
textSample
CN
=
CN
.
textSample
textSample
ZH
=
ZH
.
textSample
textSample
PL
=
PL
.
textSample
textSample
_
=
panic
"[G.C.T.L:detectLangDefault] not impl yet"
--textSample DE = DE.textSample
...
...
src/Gargantext/Core/Text/Samples/
CN
.hs
→
src/Gargantext/Core/Text/Samples/
ZH
.hs
View file @
32301d6d
{-|
Module : Gargantext.Core.Text.Samples.
CN
Module : Gargantext.Core.Text.Samples.
ZH
Description : Sample of Chinese Text
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
...
...
@@ -14,7 +14,7 @@ Page : text mining
module
Gargantext.Core.Text.Samples.
CN
where
module
Gargantext.Core.Text.Samples.
ZH
where
import
Data.String
(
String
)
...
...
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
32301d6d
...
...
@@ -52,7 +52,7 @@ tokenTag2terms :: TokenTag -> Terms
tokenTag2terms
(
TokenTag
ws
t
_
_
)
=
Terms
ws
t
tokenTags
::
NLPServerConfig
->
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
l
txt
=
tokenTagsWith
l
txt
$
corenlp
url
tokenTags
(
NLPServerConfig
{
server
=
CoreNLP
,
url
})
EN
txt
=
tokenTagsWith
EN
txt
$
corenlp
url
tokenTags
(
NLPServerConfig
{
server
=
Spacy
,
url
})
l
txt
=
tokenTagsWith
l
txt
$
SpacyNLP
.
nlp
url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
...
...
@@ -74,7 +74,7 @@ tokenTagsWith lang txt nlp = map (groupTokens lang)
groupTokens
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
groupTokens
EN
=
En
.
groupTokens
groupTokens
FR
=
Fr
.
groupTokens
groupTokens
_
=
panic
$
pack
"groupTokens :: Lang not implemeted yet"
groupTokens
_
=
Fr
.
groupTokens
-- TODO: make tests here
cleanTextForNLP
::
Text
->
Text
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
32301d6d
...
...
@@ -27,8 +27,6 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
import
Data.Aeson
import
Data.ByteString.Lazy.Internal
(
ByteString
)
import
qualified
Data.ByteString.Lazy.Char8
as
BSL
import
qualified
Data.Map
as
Map
import
Data.Set
(
fromList
)
import
Data.Text
(
Text
,
splitOn
,
pack
,
toLower
)
import
Gargantext.Core
(
Lang
(
..
))
...
...
@@ -37,11 +35,11 @@ import Gargantext.Core.Types
import
Gargantext.Prelude
import
Network.HTTP.Simple
import
Network.URI
(
URI
(
..
))
import
qualified
Data.ByteString.Lazy.Char8
as
BSL
import
qualified
Data.Map
as
Map
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags
::
[
Token
]
->
[
TokenTag
]
...
...
@@ -110,7 +108,7 @@ corenlp' uri lang txt = do
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
,
(
"tokenize.language"
,
"pl"
)
]
CN
->
[
(
"annotators"
,
"tokenize,pos,lemma,ner"
)
ZH
->
[
(
"annotators"
,
"tokenize,pos,lemma,ner"
)
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
,
(
"pos.model"
,
"edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger"
)
,
(
"tokenize.language"
,
"zh"
)
]
...
...
src/Gargantext/Utils/JohnSnowNLP.hs
View file @
32301d6d
...
...
@@ -16,60 +16,76 @@ module Gargantext.Utils.JohnSnowNLP where
import
Control.Concurrent
(
threadDelay
)
import
Control.Lens
import
Data.Aeson
(
encode
,
ToJSON
,
toJSON
,
FromJSON
,
parseJSON
,
Value
(
..
),
(
.:
),
(
.:?
))
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.List.Safe
as
LS
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Map.Strict
as
Map
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
,
zip
)
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
qualified
Data.List.Safe
as
LS
import
qualified
Data.Map.Strict
as
Map
data
JSSpell
=
JSPOS
Lang
|
JSLemma
Lang
deriving
(
Show
)
instance
ToJSON
JSSpell
where
toJSON
(
JSPOS
EN
)
=
"en.pos"
toJSON
(
JSPOS
FR
)
=
"fr.pos"
toJSON
(
JSPOS
All
)
=
"pos"
toJSON
(
JSPOS
DE
)
=
"de.pos"
toJSON
(
JSPOS
EL
)
=
"el.pos"
toJSON
(
JSPOS
EN
)
=
"en.pos"
toJSON
(
JSPOS
ES
)
=
"es.pos"
toJSON
(
JSPOS
FR
)
=
"fr.pos"
toJSON
(
JSPOS
IT
)
=
"it.pos"
toJSON
(
JSPOS
PL
)
=
"pl.pos"
toJSON
(
JSPOS
CN
)
=
"cn.pos"
toJSON
(
JSPOS
All
)
=
"pos"
toJSON
(
JSPOS
PT
)
=
"pt.pos"
toJSON
(
JSPOS
RU
)
=
"ru.pos"
toJSON
(
JSPOS
UK
)
=
"uk.pos"
toJSON
(
JSPOS
ZH
)
=
"zh.pos"
toJSON
(
JSLemma
EN
)
=
"en.lemma"
toJSON
(
JSLemma
FR
)
=
"fr.lemma"
toJSON
(
JSLemma
All
)
=
"lemma"
toJSON
(
JSLemma
DE
)
=
"de.lemma"
toJSON
(
JSLemma
EL
)
=
"el.lemma"
toJSON
(
JSLemma
EN
)
=
"en.lemma"
toJSON
(
JSLemma
ES
)
=
"es.lemma"
toJSON
(
JSLemma
FR
)
=
"fr.lemma"
toJSON
(
JSLemma
IT
)
=
"it.lemma"
toJSON
(
JSLemma
PL
)
=
"pl.lemma"
toJSON
(
JSLemma
CN
)
=
"cn.lemma"
toJSON
(
JSLemma
All
)
=
"lemma"
toJSON
(
JSLemma
PT
)
=
"pt.lemma"
toJSON
(
JSLemma
RU
)
=
"ru.lemma"
toJSON
(
JSLemma
UK
)
=
"uk.lemma"
toJSON
(
JSLemma
ZH
)
=
"zh.lemma"
instance
FromJSON
JSSpell
where
parseJSON
(
String
"en.pos"
)
=
pure
$
JSPOS
EN
parseJSON
(
String
"fr.pos"
)
=
pure
$
JSPOS
FR
parseJSON
(
String
"de.pos"
)
=
pure
$
JSPOS
DE
parseJSON
(
String
"en.pos"
)
=
pure
$
JSPOS
EN
parseJSON
(
String
"el.pos"
)
=
pure
$
JSPOS
EL
parseJSON
(
String
"es.pos"
)
=
pure
$
JSPOS
ES
parseJSON
(
String
"fr.pos"
)
=
pure
$
JSPOS
FR
parseJSON
(
String
"it.pos"
)
=
pure
$
JSPOS
IT
parseJSON
(
String
"pl.pos"
)
=
pure
$
JSPOS
PL
parseJSON
(
String
"cn.pos"
)
=
pure
$
JSPOS
CN
parseJSON
(
String
"pt.pos"
)
=
pure
$
JSPOS
PT
parseJSON
(
String
"ru.pos"
)
=
pure
$
JSPOS
RU
parseJSON
(
String
"uk.pos"
)
=
pure
$
JSPOS
UK
parseJSON
(
String
"zh.pos"
)
=
pure
$
JSPOS
ZH
parseJSON
(
String
"pos"
)
=
pure
$
JSPOS
All
parseJSON
(
String
"en.lemma"
)
=
pure
$
JSLemma
EN
parseJSON
(
String
"fr.lemma"
)
=
pure
$
JSLemma
FR
parseJSON
(
String
"de.lemma"
)
=
pure
$
JSLemma
DE
parseJSON
(
String
"en.lemma"
)
=
pure
$
JSLemma
EN
parseJSON
(
String
"el.lemma"
)
=
pure
$
JSLemma
EL
parseJSON
(
String
"es.lemma"
)
=
pure
$
JSLemma
ES
parseJSON
(
String
"fr.lemma"
)
=
pure
$
JSLemma
FR
parseJSON
(
String
"it.lemma"
)
=
pure
$
JSLemma
IT
parseJSON
(
String
"pl.lemma"
)
=
pure
$
JSLemma
PL
parseJSON
(
String
"cn.lemma"
)
=
pure
$
JSLemma
CN
parseJSON
(
String
"pt.lemma"
)
=
pure
$
JSLemma
PT
parseJSON
(
String
"ru.lemma"
)
=
pure
$
JSLemma
RU
parseJSON
(
String
"uk.lemma"
)
=
pure
$
JSLemma
UK
parseJSON
(
String
"zh.lemma"
)
=
pure
$
JSLemma
ZH
parseJSON
(
String
"lemma"
)
=
pure
$
JSLemma
All
parseJSON
s
=
prependFailure
"parsing spell failed, "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment