Commit 97d74a5e authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[john snow nlp] implement api for pos & lemma

parent 64789260
......@@ -178,6 +178,7 @@ library:
- jose
- json-stream
- lens
- listsafe
- located-base
- logging-effect
- matrix
......
......@@ -84,6 +84,7 @@ instance Hashable PosTagAlgo
instance HasDBid PosTagAlgo where
toDBid CoreNLP = 1
toDBid JohnSnowServer = 2
fromDBid 1 = CoreNLP
fromDBid 2 = JohnSnowServer
fromDBid _ = panic "HasDBid posTagAlgo : Not implemented"
......
......@@ -23,6 +23,7 @@ import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
......
......@@ -26,31 +26,15 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
where
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower)
import GHC.Generics
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple
------------------------------------------------------------------------
------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
, _tokenOriginalText :: Text
, _tokenLemma :: Text
, _tokenCharacterOffsetBegin :: Int
, _tokenCharacterOffsetEnd :: Int
, _tokenPos :: Maybe POS
, _tokenNer :: Maybe NER
, _tokenBefore :: Maybe Text
, _tokenAfter :: Maybe Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags :: [Token] -> [TokenTag]
......@@ -69,23 +53,6 @@ filter' xs = filter isNgrams xs
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token]
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_sentence") ''Sentence)
data Properties = Properties { _propertiesAnnotators :: Text
, _propertiesOutputFormat :: Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_properties") ''Properties)
data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
-- request =
-- "fr" : {
......@@ -144,8 +111,3 @@ tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
-- We need the PosTagging according to the language and the lems
serverNLP :: Lang -> Text -> IO PosSentences
serverNLP = undefined
{-|
Module : Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Terms.Multi.PosTagging.Types where
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import GHC.Generics
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
, _tokenOriginalText :: Text
, _tokenLemma :: Text
, _tokenCharacterOffsetBegin :: Int
, _tokenCharacterOffsetEnd :: Int
, _tokenPos :: Maybe POS
, _tokenNer :: Maybe NER
, _tokenBefore :: Maybe Text
, _tokenAfter :: Maybe Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token]
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_sentence") ''Sentence)
data Properties = Properties { _propertiesAnnotators :: Text
, _propertiesOutputFormat :: Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_properties") ''Properties)
data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
......@@ -71,26 +71,38 @@ data Tag = POS | NER
data POS = NP
| JJ | VB
| CC | IN | DT
| ADV
| NoPos
deriving (Show, Generic, Eq, Ord)
------------------------------------------------------------------------
-- https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "ADJ" = JJ
pos "CC" = CC
pos "DT" = DT
pos "IN" = IN
pos "JJ" = JJ
pos "JJR" = JJ
pos "JJS" = JJ
pos "NC" = NP
pos "NN" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "NNPS" = NP
pos "NP" = NP
pos "VB" = VB
pos "VBN" = VB
pos "VBD" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
pos "VBN" = VB
pos "VBP" = VB
pos "VBZ" = VB
pos "RB" = ADV
pos "RBR" = ADV
pos "RBS" = ADV
pos "WRB" = ADV
-- French specific
pos "P" = IN
pos _ = NoPos
......
{-|
Module : Gargantext.Utils.JohnSnowNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Utils.JohnSnowNLP where
import Control.Concurrent (threadDelay)
import Control.Lens
import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:))
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.Aeson.TH (deriveJSON)
import qualified Data.List.Safe as LS
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe)
import Data.Text hiding (map, group, filter, concat, zip)
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
data JSSpell = JSPOS | JSLemma
deriving (Show)
instance ToJSON JSSpell where
toJSON JSPOS = "pos"
toJSON JSLemma = "lemma"
instance FromJSON JSSpell where
parseJSON (String "pos") = pure JSPOS
parseJSON (String "lemma") = pure JSLemma
parseJSON s =
prependFailure "parsing spell failed, "
(typeMismatch "Spell" s)
data JSRequest =
JSRequest { _jsRequest_data :: !Text
, _jsRequest_format :: !Text
, _jsRequest_grouping :: !(Maybe Text)
, _jsRequest_spell :: !JSSpell }
deriving (Show)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON (unPrefix "_jsRequest_") ''JSRequest
-- | JohnSnow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for task status and then get it's
-- result.
data JSAsyncTask =
JSAsyncTask { _jsAsyncTask_uuid :: !Text }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
-- | Task status.
data JSAsyncTaskStatus =
JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
, _jsAsyncTaskStatus_message :: !(Maybe Text) }
deriving (Show)
taskReady :: JSAsyncTaskStatus -> Bool
taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance FromJSON JSAsyncTaskStatus where
parseJSON (Object v) = do
status <- v .: "status"
code <- status .: "code"
message <- status .: "message"
pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
, _jsAsyncTaskStatus_message = message }
parseJSON s =
prependFailure "parsing status failed"
(typeMismatch "status" s)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data JSAsyncTaskResponse =
JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
, _jsAsyncTaskResponse_document :: Map Text Text
, _jsAsyncTaskResponse_sentence :: Map Text [Text]
, _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
, _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
, _jsAsyncTaskResponse_token :: Map Text [Text] }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
makeLenses ''JSAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
jsAsyncTaskResponseToSentences jsPos jsLemma =
PosSentences { _sentences }
where
_sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
, _sentenceTokens = sTokens }
where
sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
lemmas = fromMaybe [] $
if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
else
Nothing
sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
(Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
, _tokenWord = token
, _tokenOriginalText = ""
, _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
, _tokenCharacterOffsetBegin = -1
, _tokenCharacterOffsetEnd = -1
, _tokenPos = Just pos
, _tokenNer = Nothing
, _tokenBefore = Nothing
, _tokenAfter = Nothing }
-----------------------------------------------------
jsRequest :: Text -> JSSpell -> IO JSAsyncTask
jsRequest t s = do
url <- parseRequest $ "POST http://localhost:5000/api/results"
let jsReq = JSRequest { _jsRequest_data = t
, _jsRequest_format = "text"
, _jsRequest_grouping = Nothing
, _jsRequest_spell = s }
let request = setRequestBodyLBS (encode jsReq) url
task <- httpJSON request :: IO (Response JSAsyncTask)
pure $ getResponseBody task
jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
jsTaskStatus (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
status <- httpJSON url
pure $ getResponseBody status
jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
jsTaskResponse (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
result <- httpJSON url
pure $ getResponseBody result
waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
waitForJsTask jsTask = wait' 0
where
wait' :: Int -> IO JSAsyncTaskResponse
wait' counter = do
status <- jsTaskStatus jsTask
if taskReady status then
jsTaskResponse jsTask
else
if counter > 60 then
panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
else do
printDebug "[waitForJsTask] task not ready, waiting" counter
_ <- threadDelay $ 100000*1
wait' $ counter + 1
getPosTagAndLems :: Lang -> Text -> IO PosSentences
getPosTagAndLems _l t = do
jsPosTask <- jsRequest t JSPOS
jsPos <- waitForJsTask jsPosTask
jsLemmaTask <- jsRequest t JSLemma
jsLemma <- waitForJsTask jsLemmaTask
printDebug "[getPosTagAndLems] sentences" $ jsAsyncTaskResponseToSentences jsPos jsLemma
pure $ PosSentences []
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment