[ngrams] use the corenlp-types library

Now we no longer keep custom CoreNLP types in our own code, which makes
it more portable. Spacy is also refactored to use its own types, and so
is JohnSnowNLP.
parent bdeed282
@@ -11,7 +11,7 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
# `expected_cabal_project_freeze_hash` with the
# `sha256sum` result calculated on the `cabal.project` and `cabal.project.freeze`.
# This ensures the files stay deterministic so that CI cache can kick in.
expected_cabal_project_hash="f2efe6832145c093cfe16832b17b06cd4e2d94e85dd0390e713c46c40ee9e461"
expected_cabal_project_hash="eb74b26c841d56bfeb59506b95ebb964f90d9255c64e48580f56dfa57f3d4a6c"
expected_cabal_project_freeze_hash="796f0109611f3381278b1885ae1fa257c4177b99885eb04701938f1107c06ee5"
cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-11-23T20:05:40Z'
......
-- Generated by stack2cabal
index-state: 2023-12-04T09:05:40Z
index-state: 2023-11-23T20:05:40Z
with-compiler: ghc-8.10.7
@@ -76,6 +76,11 @@ source-repository-package
location: https://gitlab.iscpif.fr/cgenie/patches-class.git
tag: 125c7cb90ab8f0cd6ac4a526dbdf302d10c945e9
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/corenlp-types
tag: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
@@ -141,6 +146,11 @@ source-repository-package
location: https://gitlab.iscpif.fr/gargantext/iso639.git
tag: eab929d106833ded8011a0d6705135e3fc506a9c
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
tag: 339fd608341bd2652cf5c0e9e76a3293acffbea6
source-repository-package
type: git
location: https://github.com/haskell-servant/servant.git
......
@@ -273,7 +273,7 @@ library
Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search
@@ -426,6 +426,7 @@ library
, conduit-extra ^>= 1.3.5
, containers ^>= 0.6.5.1
, contravariant ^>= 1.5.5
, corenlp-types ^>= 0.1.0.0
, crawlerArxiv
, crawlerHAL
, crawlerISTEX
......
@@ -13,7 +13,11 @@ Multi-terms are ngrams where n > 1.
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi
( multiterms
, multiterms_rake
, tokenTags
, cleanTextForNLP )
where
import Control.Applicative
@@ -22,8 +26,7 @@ import Data.Text hiding (map, group, filter, concat)
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Text.Terms.Multi.CoreNLP qualified as CoreNLP
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
@@ -31,9 +34,6 @@ import Gargantext.Prelude
import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
import Replace.Attoparsec.Text as RAT
-------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences
-------------------------------------------------------------------
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do
@@ -54,14 +54,20 @@ multiterms nsc l txt = do
-------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTag2terms (TokenTag ws t _ _ _ _) = Terms ws t
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = do
tt <- CoreNLP.coreNLPTokenTags url EN txt
pure $ map (groupTokens EN) tt
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = do
tt <- CoreNLP.coreNLPTokenTags url FR txt
pure $ map (groupTokens FR) tt
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- printDebug "NLP Debug" txt
tokenTagsWith l txt $ SpacyNLP.nlp url
tt <- SpacyNLP.nlpTokenTags url txt
pure $ map (groupTokens l) tt
-- tokenTagsWith l txt $ SpacyNLP.nlp url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
-- if txt == ""
@@ -69,12 +75,6 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- else tokenTagsWith FR txt SpacyNLP.nlp
tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
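For illustration, a hedged usage sketch of the new `tokenTags` dispatch (the URL and input text are invented; assumes the project modules above are in scope):

```haskell
-- Hypothetical usage only; parseURI comes from Network.URI.
exampleTokenTags :: IO ()
exampleTokenTags = do
  Just uri <- pure $ parseURI "http://localhost:9000"
  let cfg = NLPServerConfig { server = CoreNLP, url = uri }
  tags <- tokenTags cfg EN "Hello world of Peter."
  mapM_ print tags  -- one [TokenTag] per sentence, already grouped
```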
---- | This function analyses and groups (or not) ngrams according to
......
@@ -21,12 +21,12 @@ import Gargantext.Prelude
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _ _ _):y@(TokenTag _ _ (Just p2') _ _ _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
then group2 p1 p2 (combineTokenTags x y : z)
-- then (x : y : group2 p1 p2 (x<>y : z))
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _ _ _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _ _ _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _ _ _):y@(TokenTag _ _ Nothing _ _ _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
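As a standalone model of what `group2` does, here is a minimal sketch with invented, simplified types: it merges each adjacent pair whose tags match `p1` then `p2`, and recurses on the merged element so longer runs collapse left to right:

```haskell
-- Simplified stand-in for TokenTag: some words plus an optional tag.
data Tok = Tok { tokWords :: [String], tokPos :: Maybe String }
  deriving (Show)

-- Keep the left-hand tag when merging, roughly like combineTokenTags.
merge :: Tok -> Tok -> Tok
merge a b = Tok (tokWords a ++ tokWords b) (tokPos a)

group2' :: String -> String -> [Tok] -> [Tok]
group2' p1 p2 (x : y : z)
  | tokPos x == Just p1 && tokPos y == Just p2 = group2' p1 p2 (merge x y : z)
  | otherwise                                  = x : group2' p1 p2 (y : z)
group2' _ _ xs = xs

-- ghci> group2' "JJ" "NN" [Tok ["big"] (Just "JJ"), Tok ["cat"] (Just "NN")]
-- [Tok {tokWords = ["big","cat"], tokPos = Just "JJ"}]
```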
@@ -33,30 +33,37 @@ import Data.Map qualified as Map
import Data.Set (fromList)
import Data.Text (splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types
import Gargantext.Prelude hiding (ByteString, toLower)
import Network.HTTP.Simple
import Network.URI (URI(..))
import Text.CoreNLP.Types qualified as CoreNLP
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: Token -> TokenTag
tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
tokenTag :: CoreNLP.Token -> TokenTag
tokenTag (CoreNLP.Token { .. }) = TokenTag { _my_token_word = w'
, _my_token_lemma = l'
, _my_token_pos = _tokenPos
, _my_token_ner = _tokenNer }
, _my_token_pos = fromCoreNLPPOS pos
, _my_token_ner = fromCoreNLPNER ner }
where
w' = split _tokenWord
l' = fromList (split _tokenLemma)
w' = split word
l' = fromList (split lemma)
split :: Text -> [Text]
split = splitOn " " . toLower
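-- e.g. split "New York" == ["new","york"]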
fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
fromCoreNLPPOS = decode . encode
fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
fromCoreNLPNER = decode . encode
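The `decode . encode` trick presumably relies on the two tag types sharing their JSON tag strings: a tag that exists on both sides round-trips to `Just`, anything else to `Nothing`. A self-contained sketch of the same pattern, with types invented for the demo:

```haskell
{-# LANGUAGE DeriveGeneric #-}
import Data.Aeson (FromJSON (..), ToJSON (..), decode, encode)
import Data.Aeson.Types (Options (..), defaultOptions, genericParseJSON, genericToJSON)
import GHC.Generics (Generic)

-- "Their" tag set (a superset).
data Penn = Penn_NN | Penn_VB | Penn_FW deriving (Show, Generic)
instance ToJSON Penn where
  toJSON = genericToJSON defaultOptions { constructorTagModifier = drop 5 }

-- "Our" tag set: the overlap uses the same JSON tag strings.
data Pos = Pos_NN | Pos_VB deriving (Show, Generic)
instance FromJSON Pos where
  parseJSON = genericParseJSON defaultOptions { constructorTagModifier = drop 4 }

fromPenn :: Penn -> Maybe Pos
fromPenn = decode . encode

main :: IO ()
main = do
  print (fromPenn Penn_NN)  -- Just Pos_NN
  print (fromPenn Penn_FW)  -- Nothing: FW has no counterpart on our side
```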
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
@@ -76,59 +83,58 @@ filter' xs = filter isNgrams xs
-- },
--
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
)
=> URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where
properties_ :: [(Text, Text)]
properties_ = case lang of
-- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties :: Lang -> [(Text, Text)]
properties EN = [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
, ("tokenize.language", "fr") ]
DE -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
, ("tokenize.language", "de") ]
ES -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
, ("tokenize.language", "es") ]
IT -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
, ("tokenize.language", "it") ]
PL -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
, ("tokenize.language", "pl") ]
ZH -> [ ("annotators", "tokenize,pos,lemma,ner")
properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
, ("tokenize.language", "zh") ]
l -> panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
properties l = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
properties = properties_ <> [ ("outputFormat", "json") ]
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
)
=> URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where
query = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList props)
props = (properties lang) <> [ ("outputFormat", "json") ]
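The query is the same URL-embedded JSON shown by the curl line above; a standalone sketch of how `props` turns into `?properties=...` (a real request would also percent-encode it, as curl shows):

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Data.Aeson (encode, toJSON)
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
import Data.Text (Text)

main :: IO ()
main = do
  let props = [("annotators", "tokenize,ssplit,pos,ner"), ("outputFormat", "json")] :: [(Text, Text)]
  putStrLn $ "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList props)
-- prints (modulo key order):
-- ?properties={"annotators":"tokenize,ssplit,pos,ner","outputFormat":"json"}
```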
corenlp :: URI -> Lang -> Text -> IO PosSentences
corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
corenlp uri lang txt = do
response <- corenlp' uri lang txt
pure (getResponseBody response)
@@ -141,14 +147,14 @@ corenlp uri lang txt = do
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: URI -> (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = map (map (\t -> (_tokenWord t, f t)))
<$> map _sentenceTokens
<$> _sentences
<$> corenlp uri lang s
tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = do
CoreNLP.Document { sentences } <- corenlp uri lang s
pure (map (map (\t -> (CoreNLP.word t, f t)))
$ map CoreNLP.tokens sentences)
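-- Post-refactor, the `parseWith _tokenNer` example above would presumably
-- use the new field accessor instead (hypothetical, but CoreNLP.ner is the
-- field read in tokenTag):
-- tokenWith uri CoreNLP.ner EN "Hello world of Peter."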
----------------------------------------------------------------------------------
-- Here, connect to the JohnSnow server as was done above with corenlp'.
-- We need the PoS tagging according to the language, plus the lemmas.
serverNLP :: Lang -> Text -> IO PosSentences
serverNLP :: Lang -> Text -> IO CoreNLP.Document
serverNLP = undefined
@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
NOTE: This started out as a wrapper around CoreNLP types, and was then
forced onto Spacy and JohnSnowNLP as well. It is now deprecated: CoreNLP
and Spacy use their own types; JohnSnowNLP still uses this module, but
JohnSnowNLP itself is hardly used either.
-}
{-# LANGUAGE BangPatterns #-}
@@ -46,7 +51,7 @@ data Properties = Properties { _propertiesAnnotators :: !Text
$(deriveJSON (unPrefix "_properties") ''Properties)
data PosSentences = PosSentences { _sentences :: [Sentence]}
data PosSentences = PosSentences { _sentences :: [Sentence] }
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
@@ -22,6 +22,7 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, combineTokenTags, emptyTokenTag
, Label, Stems
, HasValidationError(..), assertValid
, Name
@@ -36,7 +37,6 @@ import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Maybe
import Data.Monoid
import Data.Semigroup
import Data.Set (empty)
import Data.String
import Data.Swagger (ToParamSchema)
@@ -154,10 +154,15 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_lemma :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
, _my_token_offset_begin :: Int
, _my_token_offset_end :: Int
} deriving (Show)
instance Semigroup TokenTag where
(<>) (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _) = TokenTag (w1 <> w2) (s1 <> s2) p3 n1
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me, and
-- combining lemmas is just wrong: you can't append "cat" <> "woman"
-- and get "Catwoman".
combineTokenTags :: TokenTag -> TokenTag -> TokenTag
combineTokenTags (TokenTag w1 l1 p1 n1 s1 e1) (TokenTag w2 l2 p2 _ s2 e2) = TokenTag (w1 <> w2) (l1 <> l2) p3 n1 (minimum [s1, s2]) (maximum [e1, e2])
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
@@ -165,10 +170,13 @@ instance Semigroup TokenTag where
_ -> p1
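-- e.g. merging an adjective into a following proper noun keeps NP
-- ("big" JJ + "Apple" NP ==> "big Apple" NP), while the offsets widen
-- to span both tokens (minimum begin, maximum end).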
instance Monoid TokenTag where
mempty = TokenTag [] empty Nothing Nothing
mconcat = foldl mappend mempty
-- mappend t1 t2 = (<>) t1 t2
emptyTokenTag :: TokenTag
emptyTokenTag = TokenTag [] empty Nothing Nothing 0 0
-- instance Monoid TokenTag where
-- mempty = TokenTag [] empty Nothing Nothing 0 0
-- mconcat = foldl mappend mempty
-- -- mappend t1 t2 = (<>) t1 t2
class HasValidationError e where
......
@@ -20,22 +20,23 @@ module Gargantext.Utils.SpacyNLP where
import Control.Lens
import Data.Aeson (encode)
import Data.Aeson.TH (deriveJSON)
import Data.Set qualified as Set
import Data.Text hiding (map, group, filter, concat, zip)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..), NER(..))
import Gargantext.Core.Types (POS(..), NER(..), TokenTag(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Network.URI (URI(..))
data SpacyData = SpacyData { _spacy_data :: ![SpacyText]}
data SpacyData = SpacyData { _spacy_data :: ![SpacyText] }
deriving (Show)
data SpacyText = SpacyText { _spacy_text :: !Text
, _spacy_tags :: ![SpacyTags]
} deriving (Show)
-- | https://spacy.io/api/token/#attributes
data SpacyTags =
SpacyTags { _spacyTags_text :: !Text
, _spacyTags_text_with_ws :: !Text
@@ -100,28 +101,24 @@ makeLenses ''SpacyTags
makeLenses ''SpacyRequest
----------------------------------------------------------------
spacyTagsToToken :: SpacyTags -> Token
spacyTagsToToken st = Token (st ^. spacyTags_index)
(st ^. spacyTags_normalized)
(st ^. spacyTags_text)
(st ^. spacyTags_lemma)
(st ^. spacyTags_head_index)
(st ^. spacyTags_char_offset)
(Just $ st ^. spacyTags_pos)
(Just $ st ^. spacyTags_ent_type)
(Just $ st ^. spacyTags_prefix)
(Just $ st ^. spacyTags_suffix)
spacyDataToPosSentences :: SpacyData -> PosSentences
spacyDataToPosSentences (SpacyData ds) = PosSentences
$ map (\(i, ts) -> Sentence i ts)
$ zip [1..]
$ map (\(SpacyText _ tags)-> map spacyTagsToToken tags) ds
spacyDataToTokenTags :: SpacyData -> [[TokenTag]]
spacyDataToTokenTags (SpacyData ds) =
map (\(SpacyText _ tags) -> spacyTagsToTokenTag <$> tags) ds
spacyTagsToTokenTag :: SpacyTags -> TokenTag
spacyTagsToTokenTag st = TokenTag { _my_token_word = split' (st ^. spacyTags_normalized)
, _my_token_lemma = Set.fromList (split' $ st ^. spacyTags_lemma)
, _my_token_pos = Just $ st ^. spacyTags_pos
, _my_token_ner = Just $ st ^. spacyTags_ent_type
, _my_token_offset_begin = 0 -- TODO
, _my_token_offset_end = 0 -- TODO
}
where
split' :: Text -> [Text]
split' = splitOn " " . Data.Text.toLower
-----------------------------------------------------------------
nlp :: URI -> Lang -> Text -> IO PosSentences
nlp uri _lang txt = spacyDataToPosSentences <$> spacyRequest uri txt
-- nlp _ _ _ = panic "Make sure you have the right model for your lang for spacy Server"
-- nlp FR txt = spacyDataToPosSentences <$> spacyRequest txt
-- nlp _ _ = panic "Make sure you have the right model for your lang for spacy Server"
nlpTokenTags :: URI -> Text -> IO [[TokenTag]]
nlpTokenTags uri txt = spacyDataToTokenTags <$> spacyRequest uri txt
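A hedged usage sketch (the endpoint path and port are invented; `spacyRequest` is assumed to POST the text to the given URI):

```haskell
-- Hypothetical usage only; parseURI comes from Network.URI.
exampleSpacy :: IO ()
exampleSpacy = do
  Just uri <- pure $ parseURI "http://localhost:8001/pos"  -- invented endpoint
  tts <- nlpTokenTags uri "Bears like honey."
  mapM_ print tts  -- one [TokenTag] per SpacyText block
```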
@@ -43,6 +43,8 @@ extra-deps:
- servant-auth/servant-auth/
- servant-auth/servant-auth-client/
- servant-auth/servant-auth-server/
- git: https://gitlab.iscpif.fr/gargantext/corenlp-types
commit: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
- git: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
commit: 339fd608341bd2652cf5c0e9e76a3293acffbea6
- git: https://github.com/alpmestan/ekg-json.git
......