1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
{-|
Module : Gargantext.Utils.JohnSnow
Description : John Snow NLP API connexion
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Utils.JohnSnowNLP where
import Control.Concurrent (threadDelay)
import Control.Lens
import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.Aeson.TH (deriveJSON)
import qualified Data.List.Safe as LS
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe)
import Data.Text hiding (map, group, filter, concat, zip)
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show)
instance ToJSON JSSpell where
toJSON (JSPOS EN) = "en.pos"
toJSON (JSPOS FR) = "fr.pos"
toJSON (JSPOS All) = "pos"
toJSON (JSLemma EN) = "en.lemma"
toJSON (JSLemma FR) = "fr.lemma"
toJSON (JSLemma All) = "lemma"
instance FromJSON JSSpell where
parseJSON (String "en.pos") = pure $ JSPOS EN
parseJSON (String "fr.pos") = pure $ JSPOS FR
parseJSON (String "pos") = pure $ JSPOS All
parseJSON (String "en.lemma") = pure $ JSLemma EN
parseJSON (String "fr.lemma") = pure $ JSLemma FR
parseJSON (String "lemma") = pure $ JSLemma All
parseJSON s =
prependFailure "parsing spell failed, "
(typeMismatch "Spell" s)
data JSRequest =
JSRequest { _jsRequest_data :: !Text
, _jsRequest_format :: !Text
, _jsRequest_grouping :: !(Maybe Text)
, _jsRequest_spell :: !JSSpell }
deriving (Show)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON (unPrefix "_jsRequest_") ''JSRequest
-- | JohnSnow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for task status and then get it's
-- result.
data JSAsyncTask =
JSAsyncTask { _jsAsyncTask_uuid :: !Text }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
-- | Task status.
data JSAsyncTaskStatus =
JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
, _jsAsyncTaskStatus_message :: !(Maybe Text) }
deriving (Show)
taskReady :: JSAsyncTaskStatus -> Bool
taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance FromJSON JSAsyncTaskStatus where
parseJSON (Object v) = do
status <- v .: "status"
code <- status .: "code"
message <- status .:? "message"
pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
, _jsAsyncTaskStatus_message = message }
parseJSON s =
prependFailure "parsing status failed"
(typeMismatch "status" s)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data JSAsyncTaskResponse =
JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
, _jsAsyncTaskResponse_document :: Map Text Text
, _jsAsyncTaskResponse_sentence :: Map Text [Text]
, _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
, _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
, _jsAsyncTaskResponse_token :: Map Text [Text] }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
makeLenses ''JSAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
jsAsyncTaskResponseToSentences jsPos jsLemma =
PosSentences { _sentences }
where
_sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
, _sentenceTokens = sTokens }
where
sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
lemmas = fromMaybe [] $
if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
else
Nothing
sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
(Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
, _tokenWord = token
, _tokenOriginalText = ""
, _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
, _tokenCharacterOffsetBegin = -1
, _tokenCharacterOffsetEnd = -1
, _tokenPos = Just pos
, _tokenNer = Nothing
, _tokenBefore = Nothing
, _tokenAfter = Nothing }
-----------------------------------------------------
jsRequest :: Text -> JSSpell -> IO JSAsyncTask
jsRequest t s = do
url <- parseRequest $ "POST http://localhost:5000/api/results"
let jsReq = JSRequest { _jsRequest_data = t
, _jsRequest_format = "text"
, _jsRequest_grouping = Nothing
, _jsRequest_spell = s }
let request = setRequestBodyLBS (encode jsReq) url
task <- httpJSON request :: IO (Response JSAsyncTask)
pure $ getResponseBody task
jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
jsTaskStatus (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
status <- httpJSON url
pure $ getResponseBody status
jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
jsTaskResponse (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
result <- httpJSON url
pure $ getResponseBody result
waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
waitForJsTask jsTask = wait' 0
where
wait' :: Int -> IO JSAsyncTaskResponse
wait' counter = do
status <- jsTaskStatus jsTask
if taskReady status then
jsTaskResponse jsTask
else
if counter > 60 then
panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
else do
printDebug "[waitForJsTask] task not ready, waiting" counter
_ <- threadDelay $ 1000000*1
wait' $ counter + 1
getPosTagAndLems :: Lang -> Text -> IO PosSentences
getPosTagAndLems l t = do
jsPosTask <- jsRequest t (JSPOS l)
jsLemmaTask <- jsRequest t (JSLemma l)
-- wait for both tasks
jsPos <- waitForJsTask jsPosTask
jsLemma <- waitForJsTask jsLemmaTask
pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
nlp :: Lang -> Text -> IO PosSentences
nlp = getPosTagAndLems