Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
152
Issues
152
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
97d74a5e
Commit
97d74a5e
authored
Jan 28, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[john snow nlp] implement api for pos & lemma
parent
64789260
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
263 additions
and
47 deletions
+263
-47
package.yaml
package.yaml
+1
-0
Core.hs
src/Gargantext/Core.hs
+1
-0
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+1
-0
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+1
-39
Types.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
+54
-0
Types.hs
src/Gargantext/Core/Types.hs
+20
-8
JohnSnowNLP.hs
src/Gargantext/Utils/JohnSnowNLP.hs
+185
-0
No files found.
package.yaml
View file @
97d74a5e
...
...
@@ -178,6 +178,7 @@ library:
-
jose
-
json-stream
-
lens
-
listsafe
-
located-base
-
logging-effect
-
matrix
...
...
src/Gargantext/Core.hs
View file @
97d74a5e
...
...
@@ -84,6 +84,7 @@ instance Hashable PosTagAlgo
instance
HasDBid
PosTagAlgo
where
toDBid
CoreNLP
=
1
toDBid
JohnSnowServer
=
2
fromDBid
1
=
CoreNLP
fromDBid
2
=
JohnSnowServer
fromDBid
_
=
panic
"HasDBid posTagAlgo : Not implemented"
...
...
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
97d74a5e
...
...
@@ -23,6 +23,7 @@ import Gargantext.Core (Lang(..))
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.En
as
En
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.Fr
as
Fr
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
97d74a5e
...
...
@@ -26,31 +26,15 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
where
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.ByteString.Lazy.Internal
(
ByteString
)
import
Data.Set
(
fromList
)
import
Data.Text
(
Text
,
splitOn
,
pack
,
toLower
)
import
GHC.Generics
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
Network.HTTP.Simple
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Token
=
Token
{
_tokenIndex
::
Int
,
_tokenWord
::
Text
,
_tokenOriginalText
::
Text
,
_tokenLemma
::
Text
,
_tokenCharacterOffsetBegin
::
Int
,
_tokenCharacterOffsetEnd
::
Int
,
_tokenPos
::
Maybe
POS
,
_tokenNer
::
Maybe
NER
,
_tokenBefore
::
Maybe
Text
,
_tokenAfter
::
Maybe
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_token"
)
''
T
oken
)
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags
::
[
Token
]
->
[
TokenTag
]
...
...
@@ -69,23 +53,6 @@ filter' xs = filter isNgrams xs
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
------------------------------------------------------------------------
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_sentence"
)
''
S
entence
)
data
Properties
=
Properties
{
_propertiesAnnotators
::
Text
,
_propertiesOutputFormat
::
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_properties"
)
''
P
roperties
)
data
PosSentences
=
PosSentences
{
_sentences
::
[
Sentence
]}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_"
)
''
P
osSentences
)
-- request =
-- "fr" : {
...
...
@@ -144,8 +111,3 @@ tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
-- We need the PosTagging according to the language and the lems
serverNLP
::
Lang
->
Text
->
IO
PosSentences
serverNLP
=
undefined
src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs
0 → 100644
View file @
97d74a5e
{-|
Module : Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
where
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Text
(
Text
)
import
Gargantext.Core.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
GHC.Generics
data
Token
=
Token
{
_tokenIndex
::
Int
,
_tokenWord
::
Text
,
_tokenOriginalText
::
Text
,
_tokenLemma
::
Text
,
_tokenCharacterOffsetBegin
::
Int
,
_tokenCharacterOffsetEnd
::
Int
,
_tokenPos
::
Maybe
POS
,
_tokenNer
::
Maybe
NER
,
_tokenBefore
::
Maybe
Text
,
_tokenAfter
::
Maybe
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_token"
)
''
T
oken
)
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_sentence"
)
''
S
entence
)
data
Properties
=
Properties
{
_propertiesAnnotators
::
Text
,
_propertiesOutputFormat
::
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_properties"
)
''
P
roperties
)
data
PosSentences
=
PosSentences
{
_sentences
::
[
Sentence
]}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_"
)
''
P
osSentences
)
src/Gargantext/Core/Types.hs
View file @
97d74a5e
...
...
@@ -71,26 +71,38 @@ data Tag = POS | NER
data
POS
=
NP
|
JJ
|
VB
|
CC
|
IN
|
DT
|
ADV
|
NoPos
deriving
(
Show
,
Generic
,
Eq
,
Ord
)
------------------------------------------------------------------------
-- https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/
instance
FromJSON
POS
where
parseJSON
=
withText
"String"
(
\
x
->
pure
(
pos
$
unpack
x
))
where
pos
::
[
Char
]
->
POS
pos
"NP"
=
NP
pos
"NN"
=
NP
pos
"ADJ"
=
JJ
pos
"CC"
=
CC
pos
"DT"
=
DT
pos
"IN"
=
IN
pos
"JJ"
=
JJ
pos
"JJR"
=
JJ
pos
"JJS"
=
JJ
pos
"NC"
=
NP
pos
"NN"
=
NP
pos
"NNS"
=
NP
pos
"NNP"
=
NP
pos
"
JJ"
=
JJ
pos
"
ADJ"
=
JJ
pos
"
NNPS"
=
NP
pos
"
NP"
=
NP
pos
"VB"
=
VB
pos
"VB
N
"
=
VB
pos
"VB
D
"
=
VB
pos
"VBG"
=
VB
pos
"CC"
=
CC
pos
"IN"
=
IN
pos
"DT"
=
DT
pos
"VBN"
=
VB
pos
"VBP"
=
VB
pos
"VBZ"
=
VB
pos
"RB"
=
ADV
pos
"RBR"
=
ADV
pos
"RBS"
=
ADV
pos
"WRB"
=
ADV
-- French specific
pos
"P"
=
IN
pos
_
=
NoPos
...
...
src/Gargantext/Utils/JohnSnowNLP.hs
0 → 100644
View file @
97d74a5e
{-|
Module : Gargantext.Utils.JohnSnowNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Utils.JohnSnowNLP
where
import
Control.Concurrent
(
threadDelay
)
import
Control.Lens
import
Data.Aeson
(
encode
,
ToJSON
,
toJSON
,
FromJSON
,
parseJSON
,
Value
(
..
),
(
.:
))
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.Aeson.TH
(
deriveJSON
)
import
qualified
Data.List.Safe
as
LS
import
Data.Map
(
Map
)
import
qualified
Data.Map
as
Map
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
,
zip
)
import
Network.HTTP.Simple
(
parseRequest
,
httpJSON
,
setRequestBodyLBS
,
getResponseBody
,
Response
)
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
POS
(
..
))
import
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
data
JSSpell
=
JSPOS
|
JSLemma
deriving
(
Show
)
instance
ToJSON
JSSpell
where
toJSON
JSPOS
=
"pos"
toJSON
JSLemma
=
"lemma"
instance
FromJSON
JSSpell
where
parseJSON
(
String
"pos"
)
=
pure
JSPOS
parseJSON
(
String
"lemma"
)
=
pure
JSLemma
parseJSON
s
=
prependFailure
"parsing spell failed, "
(
typeMismatch
"Spell"
s
)
data
JSRequest
=
JSRequest
{
_jsRequest_data
::
!
Text
,
_jsRequest_format
::
!
Text
,
_jsRequest_grouping
::
!
(
Maybe
Text
)
,
_jsRequest_spell
::
!
JSSpell
}
deriving
(
Show
)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON
(
unPrefix
"_jsRequest_"
)
''
J
SRequest
-- | JohnSnow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for task status and then get it's
-- result.
data
JSAsyncTask
=
JSAsyncTask
{
_jsAsyncTask_uuid
::
!
Text
}
deriving
(
Show
)
deriveJSON
(
unPrefix
"_jsAsyncTask_"
)
''
J
SAsyncTask
-- | Task status.
data
JSAsyncTaskStatus
=
JSAsyncTaskStatus
{
_jsAsyncTaskStatus_code
::
!
Text
,
_jsAsyncTaskStatus_message
::
!
(
Maybe
Text
)
}
deriving
(
Show
)
taskReady
::
JSAsyncTaskStatus
->
Bool
taskReady
(
JSAsyncTaskStatus
{
..
})
=
_jsAsyncTaskStatus_code
==
"success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance
FromJSON
JSAsyncTaskStatus
where
parseJSON
(
Object
v
)
=
do
status
<-
v
.:
"status"
code
<-
status
.:
"code"
message
<-
status
.:
"message"
pure
$
JSAsyncTaskStatus
{
_jsAsyncTaskStatus_code
=
code
,
_jsAsyncTaskStatus_message
=
message
}
parseJSON
s
=
prependFailure
"parsing status failed"
(
typeMismatch
"status"
s
)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data
JSAsyncTaskResponse
=
JSAsyncTaskResponse
{
_jsAsyncTaskResponse_index
::
Map
Text
Int
,
_jsAsyncTaskResponse_document
::
Map
Text
Text
,
_jsAsyncTaskResponse_sentence
::
Map
Text
[
Text
]
,
_jsAsyncTaskResponse_lem
::
Maybe
(
Map
Text
[
Text
])
,
_jsAsyncTaskResponse_pos
::
Maybe
(
Map
Text
[
POS
])
,
_jsAsyncTaskResponse_token
::
Map
Text
[
Text
]
}
deriving
(
Show
)
deriveJSON
(
unPrefix
"_jsAsyncTaskResponse_"
)
''
J
SAsyncTaskResponse
makeLenses
''
J
SAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences
::
JSAsyncTaskResponse
->
JSAsyncTaskResponse
->
PosSentences
jsAsyncTaskResponseToSentences
jsPos
jsLemma
=
PosSentences
{
_sentences
}
where
_sentences
=
Map
.
elems
$
Map
.
mapWithKey
mapSentence
(
jsPos
^.
jsAsyncTaskResponse_sentence
)
mapSentence
idx
sentence
=
Sentence
{
_sentenceIndex
=
sIndex
,
_sentenceTokens
=
sTokens
}
where
sIndex
=
Map
.
findWithDefault
(
-
1
)
idx
(
jsPos
^.
jsAsyncTaskResponse_index
)
lemmas
=
fromMaybe
[]
$
if
Just
sentence
==
Map
.
lookup
idx
(
jsLemma
^.
jsAsyncTaskResponse_sentence
)
then
Map
.
lookup
idx
$
fromMaybe
Map
.
empty
(
jsLemma
^.
jsAsyncTaskResponse_lem
)
else
Nothing
sTokens
=
imap
mapPosToken
$
zip
(
Map
.
findWithDefault
[]
idx
$
fromMaybe
Map
.
empty
(
jsPos
^.
jsAsyncTaskResponse_pos
))
(
Map
.
findWithDefault
[]
idx
(
jsPos
^.
jsAsyncTaskResponse_token
))
mapPosToken
idx'
(
pos
,
token
)
=
Token
{
_tokenIndex
=
-
1
,
_tokenWord
=
token
,
_tokenOriginalText
=
""
,
_tokenLemma
=
fromMaybe
""
$
(
LS
.!!
)
lemmas
idx'
,
_tokenCharacterOffsetBegin
=
-
1
,
_tokenCharacterOffsetEnd
=
-
1
,
_tokenPos
=
Just
pos
,
_tokenNer
=
Nothing
,
_tokenBefore
=
Nothing
,
_tokenAfter
=
Nothing
}
-----------------------------------------------------
jsRequest
::
Text
->
JSSpell
->
IO
JSAsyncTask
jsRequest
t
s
=
do
url
<-
parseRequest
$
"POST http://localhost:5000/api/results"
let
jsReq
=
JSRequest
{
_jsRequest_data
=
t
,
_jsRequest_format
=
"text"
,
_jsRequest_grouping
=
Nothing
,
_jsRequest_spell
=
s
}
let
request
=
setRequestBodyLBS
(
encode
jsReq
)
url
task
<-
httpJSON
request
::
IO
(
Response
JSAsyncTask
)
pure
$
getResponseBody
task
jsTaskStatus
::
JSAsyncTask
->
IO
JSAsyncTaskStatus
jsTaskStatus
(
JSAsyncTask
uuid
)
=
do
url
<-
parseRequest
$
unpack
$
"GET http://localhost:5000/api/results/"
<>
uuid
<>
"/status"
status
<-
httpJSON
url
pure
$
getResponseBody
status
jsTaskResponse
::
JSAsyncTask
->
IO
JSAsyncTaskResponse
jsTaskResponse
(
JSAsyncTask
uuid
)
=
do
url
<-
parseRequest
$
unpack
$
"GET http://localhost:5000/api/results/"
<>
uuid
result
<-
httpJSON
url
pure
$
getResponseBody
result
waitForJsTask
::
JSAsyncTask
->
IO
JSAsyncTaskResponse
waitForJsTask
jsTask
=
wait'
0
where
wait'
::
Int
->
IO
JSAsyncTaskResponse
wait'
counter
=
do
status
<-
jsTaskStatus
jsTask
if
taskReady
status
then
jsTaskResponse
jsTask
else
if
counter
>
60
then
panic
"[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
else
do
printDebug
"[waitForJsTask] task not ready, waiting"
counter
_
<-
threadDelay
$
100000
*
1
wait'
$
counter
+
1
getPosTagAndLems
::
Lang
->
Text
->
IO
PosSentences
getPosTagAndLems
_l
t
=
do
jsPosTask
<-
jsRequest
t
JSPOS
jsPos
<-
waitForJsTask
jsPosTask
jsLemmaTask
<-
jsRequest
t
JSLemma
jsLemma
<-
waitForJsTask
jsLemmaTask
printDebug
"[getPosTagAndLems] sentences"
$
jsAsyncTaskResponseToSentences
jsPos
jsLemma
pure
$
PosSentences
[]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment