diff --git a/devops/docker/docker-compose.yaml b/devops/docker/docker-compose.yaml index cbe1aafefab4675a53c09a2b6b6326808fa8f909..9ab541a6dd53877a384d5d4efd5fb9b7d4380f48 100644 --- a/devops/docker/docker-compose.yaml +++ b/devops/docker/docker-compose.yaml @@ -35,5 +35,13 @@ services: ports: - 9000:9000 + johnsnownlp: + image: 'johnsnowlabs/nlp-server:latest' + volumes: + - js-cache:/home/johnsnowlabs/cache_pretrained + ports: + - 5000:5000 + volumes: garg-pgdata: + js-cache: diff --git a/package.yaml b/package.yaml index ee8d29b1d8872369bb207a931ac00cb20b6d0b94..e9fece5ebf87b4cddf1ee7858a491f22a34535ca 100644 --- a/package.yaml +++ b/package.yaml @@ -183,6 +183,7 @@ library: - jose - json-stream - lens + - listsafe - located-base - logging-effect - matrix diff --git a/src/Gargantext/Core.hs b/src/Gargantext/Core.hs index 691e8cf00df6fdb21adee1209d8b9041a1261478..5bf0e2ba8a3173c01344c0f524e06b87e937c8b0 100644 --- a/src/Gargantext/Core.hs +++ b/src/Gargantext/Core.hs @@ -77,13 +77,15 @@ instance HasDBid Lang where type Form = Text type Lem = Text ------------------------------------------------------------------------ -data PosTagAlgo = CoreNLP +data PosTagAlgo = CoreNLP | JohnSnowServer deriving (Show, Read, Eq, Ord, Generic) instance Hashable PosTagAlgo instance HasDBid PosTagAlgo where toDBid CoreNLP = 1 + toDBid JohnSnowServer = 2 fromDBid 1 = CoreNLP + fromDBid 2 = JohnSnowServer fromDBid _ = panic "HasDBid posTagAlgo : Not implemented" diff --git a/src/Gargantext/Core/Text/Terms/Multi.hs b/src/Gargantext/Core/Text/Terms/Multi.hs index ab3363a41125afa6882f5089129749534da95dee..53e7f28b2d786bd58c2fbea305d495f9eea8a00f 100644 --- a/src/Gargantext/Core/Text/Terms/Multi.hs +++ b/src/Gargantext/Core/Text/Terms/Multi.hs @@ -23,6 +23,7 @@ import Gargantext.Core (Lang(..)) import Gargantext.Core.Types import Gargantext.Core.Text.Terms.Multi.PosTagging +import Gargantext.Core.Text.Terms.Multi.PosTagging.Types import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr diff --git a/src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs b/src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs index 782bdfaee32463350a2ae6d313a5fce9c4098a8b..c2751853f3b12b2e81c06462be2a127f87172c2c 100644 --- a/src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs +++ b/src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs @@ -26,31 +26,15 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging where import Data.Aeson -import Data.Aeson.TH (deriveJSON) import Data.ByteString.Lazy.Internal (ByteString) import Data.Set (fromList) import Data.Text (Text, splitOn, pack, toLower) -import GHC.Generics import Gargantext.Core (Lang(..)) +import Gargantext.Core.Text.Terms.Multi.PosTagging.Types import Gargantext.Core.Types -import Gargantext.Core.Utils.Prefix (unPrefix) import Gargantext.Prelude import Network.HTTP.Simple ------------------------------------------------------------------------- ------------------------------------------------------------------------- -data Token = Token { _tokenIndex :: Int - , _tokenWord :: Text - , _tokenOriginalText :: Text - , _tokenLemma :: Text - , _tokenCharacterOffsetBegin :: Int - , _tokenCharacterOffsetEnd :: Int - , _tokenPos :: Maybe POS - , _tokenNer :: Maybe NER - , _tokenBefore :: Maybe Text - , _tokenAfter :: Maybe Text - } deriving (Show, Generic) -$(deriveJSON (unPrefix "_token") ''Token) ------------------------------------------------------------------------ ------------------------------------------------------------------------ tokens2tokensTags :: [Token] -> [TokenTag] @@ -69,23 +53,6 @@ filter' xs = filter isNgrams xs isNgrams (TokenTag _ _ p n) = isJust p || isJust n ------------------------------------------------------------------------ -data Sentence = Sentence { _sentenceIndex :: Int - , _sentenceTokens :: [Token] - } deriving (Show, Generic) - -$(deriveJSON (unPrefix "_sentence") ''Sentence) - -data Properties = Properties { _propertiesAnnotators :: Text - , _propertiesOutputFormat :: Text - } deriving (Show, Generic) - -$(deriveJSON (unPrefix "_properties") ''Properties) - -data PosSentences = PosSentences { _sentences :: [Sentence]} - deriving (Show, Generic) - -$(deriveJSON (unPrefix "_") ''PosSentences) - -- request = -- "fr" : { @@ -139,4 +106,8 @@ tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> _sentences <$> corenlp lang s - +---------------------------------------------------------------------------------- +-- Here connect to the JohnSnow Server as it has been done above with the corenlp' +-- We need the PosTagging according to the language and the lems +serverNLP :: Lang -> Text -> IO PosSentences +serverNLP = undefined diff --git a/src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs b/src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs new file mode 100644 index 0000000000000000000000000000000000000000..2a2e9177162389d5e3a948a7946176916a6b5c69 --- /dev/null +++ b/src/Gargantext/Core/Text/Terms/Multi/PosTagging/Types.hs @@ -0,0 +1,54 @@ +{-| +Module : Gargantext.Core.Text.Terms.Multi.PosTagging.Types +Description : PosTagging module using Stanford java REST API +Copyright : (c) CNRS, 2017 +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +-} + +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE TypeOperators #-} + +module Gargantext.Core.Text.Terms.Multi.PosTagging.Types where + +import Data.Aeson.TH (deriveJSON) +import Data.Text (Text) +import Gargantext.Core.Types +import Gargantext.Core.Utils.Prefix (unPrefix) +import Gargantext.Prelude +import GHC.Generics + + +data Token = Token { _tokenIndex :: Int + , _tokenWord :: Text + , _tokenOriginalText :: Text + , _tokenLemma :: Text + , _tokenCharacterOffsetBegin :: Int + , _tokenCharacterOffsetEnd :: Int + , _tokenPos :: Maybe POS + , _tokenNer :: Maybe NER + , _tokenBefore :: Maybe Text + , _tokenAfter :: Maybe Text + } deriving (Show, Generic) +$(deriveJSON (unPrefix "_token") ''Token) + +data Sentence = Sentence { _sentenceIndex :: Int + , _sentenceTokens :: [Token] + } deriving (Show, Generic) + +$(deriveJSON (unPrefix "_sentence") ''Sentence) + +data Properties = Properties { _propertiesAnnotators :: Text + , _propertiesOutputFormat :: Text + } deriving (Show, Generic) + +$(deriveJSON (unPrefix "_properties") ''Properties) + +data PosSentences = PosSentences { _sentences :: [Sentence]} + deriving (Show, Generic) + +$(deriveJSON (unPrefix "_") ''PosSentences) + diff --git a/src/Gargantext/Core/Types.hs b/src/Gargantext/Core/Types.hs index b3a9b40a783a51380c6fed780140d7bd885fbc32..81f9434e9bc74c459b4de2098bcf586baf15da9c 100644 --- a/src/Gargantext/Core/Types.hs +++ b/src/Gargantext/Core/Types.hs @@ -71,26 +71,38 @@ data Tag = POS | NER data POS = NP | JJ | VB | CC | IN | DT + | ADV | NoPos deriving (Show, Generic, Eq, Ord) ------------------------------------------------------------------------ +-- https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/ instance FromJSON POS where parseJSON = withText "String" (\x -> pure (pos $ unpack x)) where pos :: [Char] -> POS - pos "NP" = NP - pos "NN" = NP + pos "ADJ" = JJ + pos "CC" = CC + pos "DT" = DT + pos "IN" = IN + pos "JJ" = JJ + pos "JJR" = JJ + pos "JJS" = JJ pos "NC" = NP + pos "NN" = NP pos "NNS" = NP pos "NNP" = NP - pos "JJ" = JJ - pos "ADJ" = JJ + pos "NNPS" = NP + pos "NP" = NP pos "VB" = VB - pos "VBN" = VB + pos "VBD" = VB pos "VBG" = VB - pos "CC" = CC - pos "IN" = IN - pos "DT" = DT + pos "VBN" = VB + pos "VBP" = VB + pos "VBZ" = VB + pos "RB" = ADV + pos "RBR" = ADV + pos "RBS" = ADV + pos "WRB" = ADV -- French specific pos "P" = IN pos _ = NoPos diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs index 8a6afbaa4cb0c890462ae22bb8233ed9803bfe7b..9fc372b7aa541fe228934779108e113dba4484c6 100644 --- a/src/Gargantext/Database/Action/Flow.hs +++ b/src/Gargantext/Database/Action/Flow.hs @@ -295,7 +295,8 @@ flowCorpusUser l user corpusName ctype ids mfslw = do (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype - --let gp = (GroupParams l 2 3 (StopSize 3)) + --let gp = (GroupParams l 2 3 (StopSize 3)) + -- Here the PosTagAlgo should be chosen according the Lang let gp = GroupWithPosTag l CoreNLP HashMap.empty ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp diff --git a/src/Gargantext/Utils/JohnSnowNLP.hs b/src/Gargantext/Utils/JohnSnowNLP.hs new file mode 100644 index 0000000000000000000000000000000000000000..eece6e7e2f4b53bb7927ee381b3009c39db82af4 --- /dev/null +++ b/src/Gargantext/Utils/JohnSnowNLP.hs @@ -0,0 +1,194 @@ +{-| +Module : Gargantext.Utils.JohnSnowNLP +Description : PosTagging module using Stanford java REST API +Copyright : (c) CNRS, 2017 +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +-} + +{-# LANGUAGE TemplateHaskell #-} + +module Gargantext.Utils.JohnSnowNLP where + +import Control.Concurrent (threadDelay) +import Control.Lens +import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?)) +import Data.Aeson.Types (prependFailure, typeMismatch) +import Data.Aeson.TH (deriveJSON) +import qualified Data.List.Safe as LS +import Data.Map (Map) +import qualified Data.Map as Map +import Data.Maybe (fromMaybe) +import Data.Text hiding (map, group, filter, concat, zip) +import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response) + +import Gargantext.Prelude +import Gargantext.Core (Lang(..)) +import Gargantext.Core.Types (POS(..)) +import Gargantext.Core.Text.Terms.Multi.PosTagging.Types +import Gargantext.Core.Utils.Prefix (unPrefix) + + +data JSSpell = JSPOS Lang | JSLemma Lang + deriving (Show) + +instance ToJSON JSSpell where + toJSON (JSPOS EN) = "en.pos" + toJSON (JSPOS FR) = "fr.pos" + toJSON (JSPOS All) = "pos" + toJSON (JSLemma EN) = "en.lemma" + toJSON (JSLemma FR) = "fr.lemma" + toJSON (JSLemma All) = "lemma" + +instance FromJSON JSSpell where + parseJSON (String "en.pos") = pure $ JSPOS EN + parseJSON (String "fr.pos") = pure $ JSPOS FR + parseJSON (String "pos") = pure $ JSPOS All + parseJSON (String "en.lemma") = pure $ JSLemma EN + parseJSON (String "fr.lemma") = pure $ JSLemma FR + parseJSON (String "lemma") = pure $ JSLemma All + parseJSON s = + prependFailure "parsing spell failed, " + (typeMismatch "Spell" s) + +data JSRequest = + JSRequest { _jsRequest_data :: !Text + , _jsRequest_format :: !Text + , _jsRequest_grouping :: !(Maybe Text) + , _jsRequest_spell :: !JSSpell } + deriving (Show) + +-- "spell" options: +-- https://nlu.johnsnowlabs.com/docs/en/spellbook + +deriveJSON (unPrefix "_jsRequest_") ''JSRequest + +-- | JohnSnow NLP works via asynchronous tasks: send a query and get a +-- task in response. One must poll for task status and then get it's +-- result. +data JSAsyncTask = + JSAsyncTask { _jsAsyncTask_uuid :: !Text } + deriving (Show) + +deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask + +-- | Task status. +data JSAsyncTaskStatus = + JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text + , _jsAsyncTaskStatus_message :: !(Maybe Text) } + deriving (Show) + +taskReady :: JSAsyncTaskStatus -> Bool +taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success" + +--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus +instance FromJSON JSAsyncTaskStatus where + parseJSON (Object v) = do + status <- v .: "status" + code <- status .: "code" + message <- status .:? "message" + pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code + , _jsAsyncTaskStatus_message = message } + parseJSON s = + prependFailure "parsing status failed" + (typeMismatch "status" s) + +-- | Response for our query. The `Maybe` types are here because we +-- combine 2 types of responses into one: `pos` and `lemma`. +data JSAsyncTaskResponse = + JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int + , _jsAsyncTaskResponse_document :: Map Text Text + , _jsAsyncTaskResponse_sentence :: Map Text [Text] + , _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text]) + , _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS]) + , _jsAsyncTaskResponse_token :: Map Text [Text] } + deriving (Show) + +deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse +makeLenses ''JSAsyncTaskResponse + +-- | We need to combine 2 responses: `pos` and `lemma` spells. +jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences +jsAsyncTaskResponseToSentences jsPos jsLemma = + PosSentences { _sentences } + where + _sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence) + mapSentence idx sentence = Sentence { _sentenceIndex = sIndex + , _sentenceTokens = sTokens } + where + sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index) + lemmas = fromMaybe [] $ + if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then + Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem) + else + Nothing + sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos)) + (Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token)) + mapPosToken idx' (pos, token) = Token { _tokenIndex = -1 + , _tokenWord = token + , _tokenOriginalText = "" + , _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx' + , _tokenCharacterOffsetBegin = -1 + , _tokenCharacterOffsetEnd = -1 + , _tokenPos = Just pos + , _tokenNer = Nothing + , _tokenBefore = Nothing + , _tokenAfter = Nothing } + +----------------------------------------------------- + +jsRequest :: Text -> JSSpell -> IO JSAsyncTask +jsRequest t s = do + url <- parseRequest $ "POST http://localhost:5000/api/results" + let jsReq = JSRequest { _jsRequest_data = t + , _jsRequest_format = "text" + , _jsRequest_grouping = Nothing + , _jsRequest_spell = s } + let request = setRequestBodyLBS (encode jsReq) url + task <- httpJSON request :: IO (Response JSAsyncTask) + pure $ getResponseBody task + +jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus +jsTaskStatus (JSAsyncTask uuid) = do + url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status" + status <- httpJSON url + pure $ getResponseBody status + +jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse +jsTaskResponse (JSAsyncTask uuid) = do + url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid + result <- httpJSON url + pure $ getResponseBody result + +waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse +waitForJsTask jsTask = wait' 0 + where + wait' :: Int -> IO JSAsyncTaskResponse + wait' counter = do + status <- jsTaskStatus jsTask + if taskReady status then + jsTaskResponse jsTask + else + if counter > 60 then + panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP" + else do + printDebug "[waitForJsTask] task not ready, waiting" counter + _ <- threadDelay $ 1000000*1 + wait' $ counter + 1 + +getPosTagAndLems :: Lang -> Text -> IO PosSentences +getPosTagAndLems l t = do + jsPosTask <- jsRequest t (JSPOS l) + jsLemmaTask <- jsRequest t (JSLemma l) + + -- wait for both tasks + jsPos <- waitForJsTask jsPosTask + jsLemma <- waitForJsTask jsLemmaTask + + printDebug "[getPosTagAndLems] sentences" $ jsAsyncTaskResponseToSentences jsPos jsLemma + + pure $ PosSentences [] +