1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
{-|
Module : Gargantext.Utils.JohnSnow
Description : John Snow NLP API connexion
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Utils.JohnSnowNLP where
import Control.Lens ( FunctorWithIndex(imap) )
import Data.Aeson (encode, Value(..), (.:), (.:?))
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.List.Safe qualified as LS
import Data.Map.Strict qualified as Map
import Data.Text (unpack)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude hiding (All)
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Prelude (userError)
data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show)
instance ToJSON JSSpell where
toJSON (JSPOS DE) = "de.pos"
toJSON (JSPOS EL) = "el.pos"
toJSON (JSPOS EN) = "en.pos"
toJSON (JSPOS ES) = "es.pos"
toJSON (JSPOS FR) = "fr.pos"
toJSON (JSPOS IT) = "it.pos"
toJSON (JSPOS PL) = "pl.pos"
toJSON (JSPOS PT) = "pt.pos"
toJSON (JSPOS RU) = "ru.pos"
toJSON (JSPOS UK) = "uk.pos"
toJSON (JSPOS ZH) = "zh.pos"
toJSON (JSLemma DE) = "de.lemma"
toJSON (JSLemma EL) = "el.lemma"
toJSON (JSLemma EN) = "en.lemma"
toJSON (JSLemma ES) = "es.lemma"
toJSON (JSLemma FR) = "fr.lemma"
toJSON (JSLemma IT) = "it.lemma"
toJSON (JSLemma PL) = "pl.lemma"
toJSON (JSLemma PT) = "pt.lemma"
toJSON (JSLemma RU) = "ru.lemma"
toJSON (JSLemma UK) = "uk.lemma"
toJSON (JSLemma ZH) = "zh.lemma"
instance FromJSON JSSpell where
parseJSON (String "de.pos") = pure $ JSPOS DE
parseJSON (String "en.pos") = pure $ JSPOS EN
parseJSON (String "el.pos") = pure $ JSPOS EL
parseJSON (String "es.pos") = pure $ JSPOS ES
parseJSON (String "fr.pos") = pure $ JSPOS FR
parseJSON (String "it.pos") = pure $ JSPOS IT
parseJSON (String "pl.pos") = pure $ JSPOS PL
parseJSON (String "pt.pos") = pure $ JSPOS PT
parseJSON (String "ru.pos") = pure $ JSPOS RU
parseJSON (String "uk.pos") = pure $ JSPOS UK
parseJSON (String "zh.pos") = pure $ JSPOS ZH
parseJSON (String "de.lemma") = pure $ JSLemma DE
parseJSON (String "en.lemma") = pure $ JSLemma EN
parseJSON (String "el.lemma") = pure $ JSLemma EL
parseJSON (String "es.lemma") = pure $ JSLemma ES
parseJSON (String "fr.lemma") = pure $ JSLemma FR
parseJSON (String "it.lemma") = pure $ JSLemma IT
parseJSON (String "pl.lemma") = pure $ JSLemma PL
parseJSON (String "pt.lemma") = pure $ JSLemma PT
parseJSON (String "ru.lemma") = pure $ JSLemma RU
parseJSON (String "uk.lemma") = pure $ JSLemma UK
parseJSON (String "zh.lemma") = pure $ JSLemma ZH
parseJSON s =
prependFailure "parsing spell failed, "
(typeMismatch "Spell" s)
data JSRequest =
JSRequest { _jsRequest_data :: !Text
, _jsRequest_format :: !Text
, _jsRequest_grouping :: !(Maybe Text)
, _jsRequest_spell :: !JSSpell }
deriving (Show)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON (unPrefix "_jsRequest_") ''JSRequest
-- | JohnSnow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for task status and then get it's
-- result.
data JSAsyncTask =
JSAsyncTask { _jsAsyncTask_uuid :: !Text }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
-- | Task status.
data JSAsyncTaskStatus =
JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
, _jsAsyncTaskStatus_message :: !(Maybe Text) }
deriving (Show)
taskReady :: JSAsyncTaskStatus -> Bool
taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance FromJSON JSAsyncTaskStatus where
parseJSON (Object v) = do
status <- v .: "status"
code <- status .: "code"
message <- status .:? "message"
pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
, _jsAsyncTaskStatus_message = message }
parseJSON s =
prependFailure "parsing status failed"
(typeMismatch "status" s)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data JSAsyncTaskResponse =
JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
, _jsAsyncTaskResponse_document :: Map Text Text
, _jsAsyncTaskResponse_sentence :: Map Text [Text]
, _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
, _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
, _jsAsyncTaskResponse_token :: Map Text [Text] }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
makeLenses ''JSAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
jsAsyncTaskResponseToSentences jsPos jsLemma =
PosSentences { _sentences }
where
_sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
, _sentenceTokens = sTokens }
where
sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
lemmas = fromMaybe [] $
if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
else
Nothing
sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
(Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
, _tokenWord = token
, _tokenOriginalText = ""
, _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
, _tokenCharacterOffsetBegin = -1
, _tokenCharacterOffsetEnd = -1
, _tokenPos = Just pos
, _tokenNer = Nothing
, _tokenBefore = Nothing
, _tokenAfter = Nothing }
-----------------------------------------------------
jsRequest :: Text -> JSSpell -> IO JSAsyncTask
jsRequest t s = do
url <- parseRequest $ "POST http://localhost:5000/api/results"
let jsReq = JSRequest { _jsRequest_data = t
, _jsRequest_format = "text"
, _jsRequest_grouping = Nothing
, _jsRequest_spell = s }
let request = setRequestBodyLBS (encode jsReq) url
task <- httpJSON request :: IO (Response JSAsyncTask)
pure $ getResponseBody task
jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
jsTaskStatus (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
status <- httpJSON url
pure $ getResponseBody status
jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
jsTaskResponse (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
result <- httpJSON url
pure $ getResponseBody result
waitForJsTask :: HasCallStack => JSAsyncTask -> IO JSAsyncTaskResponse
waitForJsTask jsTask = wait' 0
where
wait' :: Int -> IO JSAsyncTaskResponse
wait' counter = do
status <- jsTaskStatus jsTask
if taskReady status then
jsTaskResponse jsTask
else
if counter > 60 then
throwIO $ withStacktrace $ userError "waited for 1 minute and still no answer from JohnSnow NLP"
else do
-- printDebug "[waitForJsTask] task not ready, waiting" counter
_ <- threadDelay $ 1000000*1
wait' $ counter + 1
getPosTagAndLems :: Lang -> Text -> IO PosSentences
getPosTagAndLems l t = do
jsPosTask <- jsRequest t (JSPOS l)
jsLemmaTask <- jsRequest t (JSLemma l)
-- wait for both tasks
jsPos <- waitForJsTask jsPosTask
jsLemma <- waitForJsTask jsLemmaTask
pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
nlp :: Lang -> Text -> IO PosSentences
nlp = getPosTagAndLems