Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
97d74a5e
Commit
97d74a5e
authored
Jan 28, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[john snow nlp] implement api for pos & lemma
parent
64789260
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
263 additions
and
47 deletions
+263
-47
package.yaml
package.yaml
+1
-0
Core.hs
src/Gargantext/Core.hs
+1
-0
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+1
-0
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+1
-39
Types.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
+54
-0
Types.hs
src/Gargantext/Core/Types.hs
+20
-8
JohnSnowNLP.hs
src/Gargantext/Utils/JohnSnowNLP.hs
+185
-0
No files found.
package.yaml
View file @
97d74a5e
...
...
@@ -178,6 +178,7 @@ library:
-
jose
-
json-stream
-
lens
-
listsafe
-
located-base
-
logging-effect
-
matrix
...
...
src/Gargantext/Core.hs
View file @
97d74a5e
...
...
@@ -84,6 +84,7 @@ instance Hashable PosTagAlgo
instance
HasDBid
PosTagAlgo
where
toDBid
CoreNLP
=
1
toDBid
JohnSnowServer
=
2
fromDBid
1
=
CoreNLP
fromDBid
2
=
JohnSnowServer
fromDBid
_
=
panic
"HasDBid posTagAlgo : Not implemented"
...
...
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
97d74a5e
...
...
@@ -23,6 +23,7 @@ import Gargantext.Core (Lang(..))
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.En
as
En
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.Fr
as
Fr
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
97d74a5e
...
...
@@ -26,31 +26,15 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
where
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.ByteString.Lazy.Internal
(
ByteString
)
import
Data.Set
(
fromList
)
import
Data.Text
(
Text
,
splitOn
,
pack
,
toLower
)
import
GHC.Generics
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
Network.HTTP.Simple
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Token
=
Token
{
_tokenIndex
::
Int
,
_tokenWord
::
Text
,
_tokenOriginalText
::
Text
,
_tokenLemma
::
Text
,
_tokenCharacterOffsetBegin
::
Int
,
_tokenCharacterOffsetEnd
::
Int
,
_tokenPos
::
Maybe
POS
,
_tokenNer
::
Maybe
NER
,
_tokenBefore
::
Maybe
Text
,
_tokenAfter
::
Maybe
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_token"
)
''
T
oken
)
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags
::
[
Token
]
->
[
TokenTag
]
...
...
@@ -69,23 +53,6 @@ filter' xs = filter isNgrams xs
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
------------------------------------------------------------------------
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_sentence"
)
''
S
entence
)
data
Properties
=
Properties
{
_propertiesAnnotators
::
Text
,
_propertiesOutputFormat
::
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_properties"
)
''
P
roperties
)
data
PosSentences
=
PosSentences
{
_sentences
::
[
Sentence
]}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_"
)
''
P
osSentences
)
-- request =
-- "fr" : {
...
...
@@ -144,8 +111,3 @@ tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
-- We need the PosTagging according to the language and the lems
serverNLP
::
Lang
->
Text
->
IO
PosSentences
serverNLP
=
undefined
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
0 → 100644
View file @
97d74a5e
{-|
Module : Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
where
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Text
(
Text
)
import
Gargantext.Core.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
GHC.Generics
data
Token
=
Token
{
_tokenIndex
::
Int
,
_tokenWord
::
Text
,
_tokenOriginalText
::
Text
,
_tokenLemma
::
Text
,
_tokenCharacterOffsetBegin
::
Int
,
_tokenCharacterOffsetEnd
::
Int
,
_tokenPos
::
Maybe
POS
,
_tokenNer
::
Maybe
NER
,
_tokenBefore
::
Maybe
Text
,
_tokenAfter
::
Maybe
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_token"
)
''
T
oken
)
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_sentence"
)
''
S
entence
)
data
Properties
=
Properties
{
_propertiesAnnotators
::
Text
,
_propertiesOutputFormat
::
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_properties"
)
''
P
roperties
)
data
PosSentences
=
PosSentences
{
_sentences
::
[
Sentence
]}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_"
)
''
P
osSentences
)
src/Gargantext/Core/Types.hs
View file @
97d74a5e
...
...
@@ -71,26 +71,38 @@ data Tag = POS | NER
data
POS
=
NP
|
JJ
|
VB
|
CC
|
IN
|
DT
|
ADV
|
NoPos
deriving
(
Show
,
Generic
,
Eq
,
Ord
)
------------------------------------------------------------------------
-- https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/
instance
FromJSON
POS
where
parseJSON
=
withText
"String"
(
\
x
->
pure
(
pos
$
unpack
x
))
where
pos
::
[
Char
]
->
POS
pos
"NP"
=
NP
pos
"NN"
=
NP
pos
"ADJ"
=
JJ
pos
"CC"
=
CC
pos
"DT"
=
DT
pos
"IN"
=
IN
pos
"JJ"
=
JJ
pos
"JJR"
=
JJ
pos
"JJS"
=
JJ
pos
"NC"
=
NP
pos
"NN"
=
NP
pos
"NNS"
=
NP
pos
"NNP"
=
NP
pos
"
JJ"
=
JJ
pos
"
ADJ"
=
JJ
pos
"
NNPS"
=
NP
pos
"
NP"
=
NP
pos
"VB"
=
VB
pos
"VB
N
"
=
VB
pos
"VB
D
"
=
VB
pos
"VBG"
=
VB
pos
"CC"
=
CC
pos
"IN"
=
IN
pos
"DT"
=
DT
pos
"VBN"
=
VB
pos
"VBP"
=
VB
pos
"VBZ"
=
VB
pos
"RB"
=
ADV
pos
"RBR"
=
ADV
pos
"RBS"
=
ADV
pos
"WRB"
=
ADV
-- French specific
pos
"P"
=
IN
pos
_
=
NoPos
...
...
src/Gargantext/Utils/JohnSnowNLP.hs
0 → 100644
View file @
97d74a5e
{-|
Module : Gargantext.Utils.JohnSnowNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Utils.JohnSnowNLP
where
import
Control.Concurrent
(
threadDelay
)
import
Control.Lens
import
Data.Aeson
(
encode
,
ToJSON
,
toJSON
,
FromJSON
,
parseJSON
,
Value
(
..
),
(
.:
))
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.List.Safe
as
LS
import
Data.Map
(
Map
)
import
qualified
Data.Map
as
Map
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
,
zip
)
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
data
JSSpell
=
JSPOS
|
JSLemma
deriving
(
Show
)
instance
ToJSON
JSSpell
where
toJSON
JSPOS
=
"pos"
toJSON
JSLemma
=
"lemma"
instance
FromJSON
JSSpell
where
parseJSON
(
String
"pos"
)
=
pure
JSPOS
parseJSON
(
String
"lemma"
)
=
pure
JSLemma
parseJSON
s
=
prependFailure
"parsing spell failed, "
(
typeMismatch
"Spell"
s
)
data
JSRequest
=
JSRequest
{
_jsRequest_data
::
!
Text
,
_jsRequest_format
::
!
Text
,
_jsRequest_grouping
::
!
(
Maybe
Text
)
,
_jsRequest_spell
::
!
JSSpell
}
deriving
(
Show
)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON
(
unPrefix
"_jsRequest_"
)
''
J
SRequest
-- | JohnSnow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for task status and then get it's
-- result.
data
JSAsyncTask
=
JSAsyncTask
{
_jsAsyncTask_uuid
::
!
Text
}
deriving
(
Show
)
deriveJSON
(
unPrefix
"_jsAsyncTask_"
)
''
J
SAsyncTask
-- | Task status.
data
JSAsyncTaskStatus
=
JSAsyncTaskStatus
{
_jsAsyncTaskStatus_code
::
!
Text
,
_jsAsyncTaskStatus_message
::
!
(
Maybe
Text
)
}
deriving
(
Show
)
taskReady
::
JSAsyncTaskStatus
->
Bool
taskReady
(
JSAsyncTaskStatus
{
..
})
=
_jsAsyncTaskStatus_code
==
"success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance
FromJSON
JSAsyncTaskStatus
where
parseJSON
(
Object
v
)
=
do
status
<-
v
.:
"status"
code
<-
status
.:
"code"
message
<-
status
.:
"message"
pure
$
JSAsyncTaskStatus
{
_jsAsyncTaskStatus_code
=
code
,
_jsAsyncTaskStatus_message
=
message
}
parseJSON
s
=
prependFailure
"parsing status failed"
(
typeMismatch
"status"
s
)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data
JSAsyncTaskResponse
=
JSAsyncTaskResponse
{
_jsAsyncTaskResponse_index
::
Map
Text
Int
,
_jsAsyncTaskResponse_document
::
Map
Text
Text
,
_jsAsyncTaskResponse_sentence
::
Map
Text
[
Text
]
,
_jsAsyncTaskResponse_lem
::
Maybe
(
Map
Text
[
Text
])
,
_jsAsyncTaskResponse_pos
::
Maybe
(
Map
Text
[
POS
])
,
_jsAsyncTaskResponse_token
::
Map
Text
[
Text
]
}
deriving
(
Show
)
deriveJSON
(
unPrefix
"_jsAsyncTaskResponse_"
)
''
J
SAsyncTaskResponse
makeLenses
''
J
SAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences
::
JSAsyncTaskResponse
->
JSAsyncTaskResponse
->
PosSentences
jsAsyncTaskResponseToSentences
jsPos
jsLemma
=
PosSentences
{
_sentences
}
where
_sentences
=
Map
.
elems
$
Map
.
mapWithKey
mapSentence
(
jsPos
^.
jsAsyncTaskResponse_sentence
)
mapSentence
idx
sentence
=
Sentence
{
_sentenceIndex
=
sIndex
,
_sentenceTokens
=
sTokens
}
where
sIndex
=
Map
.
findWithDefault
(
-
1
)
idx
(
jsPos
^.
jsAsyncTaskResponse_index
)
lemmas
=
fromMaybe
[]
$
if
Just
sentence
==
Map
.
lookup
idx
(
jsLemma
^.
jsAsyncTaskResponse_sentence
)
then
Map
.
lookup
idx
$
fromMaybe
Map
.
empty
(
jsLemma
^.
jsAsyncTaskResponse_lem
)
else
Nothing
sTokens
=
imap
mapPosToken
$
zip
(
Map
.
findWithDefault
[]
idx
$
fromMaybe
Map
.
empty
(
jsPos
^.
jsAsyncTaskResponse_pos
))
(
Map
.
findWithDefault
[]
idx
(
jsPos
^.
jsAsyncTaskResponse_token
))
mapPosToken
idx'
(
pos
,
token
)
=
Token
{
_tokenIndex
=
-
1
,
_tokenWord
=
token
,
_tokenOriginalText
=
""
,
_tokenLemma
=
fromMaybe
""
$
(
LS
.!!
)
lemmas
idx'
,
_tokenCharacterOffsetBegin
=
-
1
,
_tokenCharacterOffsetEnd
=
-
1
,
_tokenPos
=
Just
pos
,
_tokenNer
=
Nothing
,
_tokenBefore
=
Nothing
,
_tokenAfter
=
Nothing
}
-----------------------------------------------------
jsRequest
::
Text
->
JSSpell
->
IO
JSAsyncTask
jsRequest
t
s
=
do
url
<-
parseRequest
$
"POST http://localhost:5000/api/results"
let
jsReq
=
JSRequest
{
_jsRequest_data
=
t
,
_jsRequest_format
=
"text"
,
_jsRequest_grouping
=
Nothing
,
_jsRequest_spell
=
s
}
let
request
=
setRequestBodyLBS
(
encode
jsReq
)
url
task
<-
httpJSON
request
::
IO
(
Response
JSAsyncTask
)
pure
$
getResponseBody
task
jsTaskStatus
::
JSAsyncTask
->
IO
JSAsyncTaskStatus
jsTaskStatus
(
JSAsyncTask
uuid
)
=
do
url
<-
parseRequest
$
unpack
$
"GET http://localhost:5000/api/results/"
<>
uuid
<>
"/status"
status
<-
httpJSON
url
pure
$
getResponseBody
status
jsTaskResponse
::
JSAsyncTask
->
IO
JSAsyncTaskResponse
jsTaskResponse
(
JSAsyncTask
uuid
)
=
do
url
<-
parseRequest
$
unpack
$
"GET http://localhost:5000/api/results/"
<>
uuid
result
<-
httpJSON
url
pure
$
getResponseBody
result
waitForJsTask
::
JSAsyncTask
->
IO
JSAsyncTaskResponse
waitForJsTask
jsTask
=
wait'
0
where
wait'
::
Int
->
IO
JSAsyncTaskResponse
wait'
counter
=
do
status
<-
jsTaskStatus
jsTask
if
taskReady
status
then
jsTaskResponse
jsTask
else
if
counter
>
60
then
panic
"[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
else
do
printDebug
"[waitForJsTask] task not ready, waiting"
counter
_
<-
threadDelay
$
100000
*
1
wait'
$
counter
+
1
getPosTagAndLems
::
Lang
->
Text
->
IO
PosSentences
getPosTagAndLems
_l
t
=
do
jsPosTask
<-
jsRequest
t
JSPOS
jsPos
<-
waitForJsTask
jsPosTask
jsLemmaTask
<-
jsRequest
t
JSLemma
jsLemma
<-
waitForJsTask
jsLemmaTask
printDebug
"[getPosTagAndLems] sentences"
$
jsAsyncTaskResponseToSentences
jsPos
jsLemma
pure
$
PosSentences
[]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment