[ngrams] add multi/corenlp.hs

parent ff01b79c
{-|
Module : Gargantext.Core.Text.Terms.Multi.CoreNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech,[1] based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
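
For example, a tagger using the Penn Treebank tagset would annotate the
sentence "The cat sleeps" roughly as
[("The","DT"), ("cat","NN"), ("sleeps","VBZ")].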
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Terms.Multi.CoreNLP where
import Data.Aeson
import Data.ByteString.Lazy.Char8 qualified as BSL
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Map qualified as Map
import Data.Set (fromList)
import Data.Text (splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Prelude hiding (ByteString, toLower)
import Network.HTTP.Simple
import Network.URI (URI(..))
import Text.CoreNLP.Types qualified as CoreNLP
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
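-- | Keep only the tokens that carry a recognized POS or NER tag
-- (see filter' below).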
tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: CoreNLP.Token -> TokenTag
tokenTag (CoreNLP.Token { .. }) =
  TokenTag { _my_token_word         = w'
           , _my_token_lemma        = l'
           , _my_token_pos          = fromCoreNLPPOS pos
           , _my_token_ner          = fromCoreNLPNER ner
           , _my_token_offset_begin = characterOffsetBegin
           , _my_token_offset_end   = characterOffsetEnd }
  where
    w' = split word
    l' = fromList (split lemma)

    split :: Text -> [Text]
    split = splitOn " " . toLower

-- Convert CoreNLP's POS and NER types into our own by round-tripping
-- through their JSON encodings; decoding returns Nothing for tags we
-- do not model.
fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
fromCoreNLPPOS = decode . encode

fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
fromCoreNLPNER = decode . encode

filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
  where
    isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
------------------------------------------------------------------------
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
-- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
-- // dependency parser
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
properties :: Lang -> [(Text, Text)]
properties EN = [ ("annotators", "tokenize,ssplit,pos,ner") ]
properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
                , ("tokenize.language", "fr") ]
properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
                , ("tokenize.language", "de") ]
properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
                , ("tokenize.language", "es") ]
properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("tokenize.language", "it") ]
properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("tokenize.language", "pl") ]
properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
                , ("tokenize.language", "zh") ]
properties l  = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"

corenlp' :: ( FromJSON a
            , ConvertibleStrings p ByteString )
         => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
  req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
  -- Equivalent request from the command line:
  -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
  -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
  catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
    case e of
      JSONParseException _req res _err -> do
        let body = getResponseBody res
        printDebug "[corenlp'] request text" (cs txt :: ByteString)
        printDebug "[corenlp'] response body (error)" body
        throwIO e
      JSONConversionException _req _res _err -> throwIO e
  where
    query = "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList props)
    props = properties lang <> [ ("outputFormat", "json") ]
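
-- For EN, the query built above is (before any URL-encoding):
-- ?properties={"annotators":"tokenize,ssplit,pos,ner","outputFormat":"json"}
-- i.e. the same properties the curl example passes percent-encoded.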

corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
corenlp uri lang txt = do
  response <- corenlp' uri lang txt
  pure (getResponseBody response)
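
-- Usage sketch (assumes a CoreNLP server listening on http://localhost:9000,
-- as in the curl example above; 'parseURI' is from Network.URI):
--
-- >>> Just uri <- pure (parseURI "http://localhost:9000")
-- >>> doc <- corenlp uri EN "Hello world, hello."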

coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]]
coreNLPTokenTags uri lang txt = do
  document <- corenlp uri lang txt
  pure $ map tokens2tokensTags $ allTokens document

-- | Pair every token of every sentence with the result of an accessor.
--
-- Part-of-speech example:
--   tokenWith uri _tokenPos EN "Hello world."
--   == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity-recognition example:
--   tokenWith uri _tokenNer EN "Hello world of Peter."
--   == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = do
  document <- corenlp uri lang s
  pure $ map (map (\t -> (CoreNLP.word t, f t))) $ allTokens document

allTokens :: CoreNLP.Document -> [[CoreNLP.Token]]
allTokens (CoreNLP.Document { sentences }) = CoreNLP.tokens <$> sentences
----------------------------------------------------------------------------------
-- Connect to the JohnSnow NLP server here, as was done above with corenlp'.
-- We need the PoS tagging according to the language, as well as the lemmas.
serverNLP :: Lang -> Text -> IO CoreNLP.Document
serverNLP = undefined
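
-- A minimal sketch of what serverNLP could look like, assuming the JohnSnow
-- server speaks the same JSON protocol and returns a CoreNLP-compatible
-- document (the endpoint below is hypothetical):
--
-- serverNLP lang txt = do
--   Just uri <- pure (parseURI "http://localhost:5000")
--   getResponseBody <$> corenlp' uri lang txt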