[ngrams] add multi/corenlp.hs

parent ff01b79c
{-|
Module : Gargantext.Core.Text.Terms.Multi.CoreNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech,[1] based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
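
For example, a tagger using the Penn Treebank tagset would annotate the
sentence "The cat sleeps" roughly as
[("The","DT"), ("cat","NN"), ("sleeps","VBZ")].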
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Terms.Multi.CoreNLP where
import Data.Aeson
import Data.ByteString.Lazy.Char8 qualified as BSL
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Map qualified as Map
import Data.Set (fromList)
import Data.Text (splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Prelude hiding (ByteString, toLower)
import Network.HTTP.Simple
import Network.URI (URI(..))
import Text.CoreNLP.Types qualified as CoreNLP
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
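-- | Keep only the tokens that carry a recognized POS or NER tag
-- (see filter' below).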
tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: CoreNLP.Token -> TokenTag
tokenTag (CoreNLP.Token { .. }) =
  TokenTag { _my_token_word         = w'
           , _my_token_lemma        = l'
           , _my_token_pos          = fromCoreNLPPOS pos
           , _my_token_ner          = fromCoreNLPNER ner
           , _my_token_offset_begin = characterOffsetBegin
           , _my_token_offset_end   = characterOffsetEnd }
  where
    w' = split word
    l' = fromList (split lemma)

    split :: Text -> [Text]
    split = splitOn " " . toLower

-- Convert CoreNLP's POS and NER types into our own by round-tripping
-- through their JSON encodings; decoding returns Nothing for tags we
-- do not model.
fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
fromCoreNLPPOS = decode . encode

fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
fromCoreNLPNER = decode . encode

filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
  where
    isNgrams (TokenTag { .. }) = isJust _my_token_pos || isJust _my_token_ner
------------------------------------------------------------------------
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
-- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
-- // dependency parser
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
properties :: Lang -> [(Text, Text)]
properties EN = [ ("annotators", "tokenize,ssplit,pos,ner") ]
properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
                , ("tokenize.language", "fr") ]
properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
                , ("tokenize.language", "de") ]
properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
                , ("tokenize.language", "es") ]
properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("tokenize.language", "it") ]
properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
                , ("tokenize.language", "pl") ]
properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
                , ("tokenize.language", "zh") ]
properties l  = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"

corenlp' :: ( FromJSON a
            , ConvertibleStrings p ByteString )
         => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
  req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
  -- Equivalent request from the command line:
  -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
  -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
  catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
    case e of
      JSONParseException _req res _err -> do
        let body = getResponseBody res
        printDebug "[corenlp'] request text" (cs txt :: ByteString)
        printDebug "[corenlp'] response body (error)" body
        throwIO e
      JSONConversionException _req _res _err -> throwIO e
  where
    query = "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList props)
    props = properties lang <> [ ("outputFormat", "json") ]
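
-- For EN, the query built above is (before any URL-encoding):
-- ?properties={"annotators":"tokenize,ssplit,pos,ner","outputFormat":"json"}
-- i.e. the same properties the curl example passes percent-encoded.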

corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
corenlp uri lang txt = do
  response <- corenlp' uri lang txt
  pure (getResponseBody response)
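
-- Usage sketch (assumes a CoreNLP server listening on http://localhost:9000,
-- as in the curl example above; 'parseURI' is from Network.URI):
--
-- >>> Just uri <- pure (parseURI "http://localhost:9000")
-- >>> doc <- corenlp uri EN "Hello world, hello."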

coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]]
coreNLPTokenTags uri lang txt = do
  document <- corenlp uri lang txt
  pure $ map tokens2tokensTags $ allTokens document

-- | Pair every token of every sentence with the result of an accessor.
--
-- Part-of-speech example:
--   tokenWith uri _tokenPos EN "Hello world."
--   == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity-recognition example:
--   tokenWith uri _tokenNer EN "Hello world of Peter."
--   == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = do
  document <- corenlp uri lang s
  pure $ map (map (\t -> (CoreNLP.word t, f t))) $ allTokens document

allTokens :: CoreNLP.Document -> [[CoreNLP.Token]]
allTokens (CoreNLP.Document { sentences }) = CoreNLP.tokens <$> sentences
----------------------------------------------------------------------------------
-- Connect to the JohnSnow NLP server here, as was done above with corenlp'.
-- We need the PoS tagging according to the language, as well as the lemmas.
serverNLP :: Lang -> Text -> IO CoreNLP.Document
serverNLP = undefined
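
-- A minimal sketch of what serverNLP could look like, assuming the JohnSnow
-- server speaks the same JSON protocol and returns a CoreNLP-compatible
-- document (the endpoint below is hypothetical):
--
-- serverNLP lang txt = do
--   Just uri <- pure (parseURI "http://localhost:5000")
--   getResponseBody <$> corenlp' uri lang txt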