[ngrams] use the corenlp-types library

Now we no longer keep custom CoreNLP types in our own code, which makes
it more portable. Spacy and JohnSnowNLP are likewise refactored to use
their own types.
parent bdeed282
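
In short: each NLP backend now produces `[[TokenTag]]` directly, so the shared `PosSentences` intermediate and the `tokenTagsWith` adapter disappear. A condensed sketch of the new dispatch, with names taken from the diffs below (the real code pattern-matches EN and FR separately; this excerpt is not compilable on its own):

-- Condensed sketch of the new flow (names from the diffs below):
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) lang txt = do
  tt <- CoreNLP.coreNLPTokenTags url lang txt  -- new corenlp-types-based module
  pure $ map (groupTokens lang) tt
tokenTags (NLPServerConfig { server = Spacy, url }) lang txt = do
  tt <- SpacyNLP.nlpTokenTags url txt          -- Spacy now builds TokenTag directly
  pure $ map (groupTokens lang) tt
tokenTags _ lang _ = panic $ "Lang NLP API not implemented yet " <> show lang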
@@ -11,7 +11,7 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
 # `expected_cabal_project_freeze_hash` with the
 # `sha256sum` result calculated on the `cabal.project` and `cabal.project.freeze`.
 # This ensures the files stay deterministic so that CI cache can kick in.
-expected_cabal_project_hash="f2efe6832145c093cfe16832b17b06cd4e2d94e85dd0390e713c46c40ee9e461"
+expected_cabal_project_hash="eb74b26c841d56bfeb59506b95ebb964f90d9255c64e48580f56dfa57f3d4a6c"
 expected_cabal_project_freeze_hash="796f0109611f3381278b1885ae1fa257c4177b99885eb04701938f1107c06ee5"

 cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-11-23T20:05:40Z'
...
 -- Generated by stack2cabal
-index-state: 2023-12-04T09:05:40Z
+index-state: 2023-11-23T20:05:40Z
 with-compiler: ghc-8.10.7
@@ -76,6 +76,11 @@ source-repository-package
     location: https://gitlab.iscpif.fr/cgenie/patches-class.git
     tag: 125c7cb90ab8f0cd6ac4a526dbdf302d10c945e9

+source-repository-package
+    type: git
+    location: https://gitlab.iscpif.fr/gargantext/corenlp-types
+    tag: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
+
 source-repository-package
     type: git
     location: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
@@ -141,6 +146,11 @@ source-repository-package
     location: https://gitlab.iscpif.fr/gargantext/iso639.git
     tag: eab929d106833ded8011a0d6705135e3fc506a9c

+source-repository-package
+    type: git
+    location: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
+    tag: 339fd608341bd2652cf5c0e9e76a3293acffbea6
+
 source-repository-package
     type: git
     location: https://github.com/haskell-servant/servant.git
...
@@ -273,7 +273,7 @@ library
       Gargantext.Core.Text.Terms.Mono.Token
       Gargantext.Core.Text.Terms.Mono.Token.En
       Gargantext.Core.Text.Terms.Multi.Group
-      Gargantext.Core.Text.Terms.Multi.PosTagging
+      Gargantext.Core.Text.Terms.Multi.CoreNLP
       Gargantext.Core.Text.Terms.Multi.PosTagging.Types
       Gargantext.Core.Text.Upload
      Gargantext.Core.Types.Search
@@ -426,6 +426,7 @@ library
      , conduit-extra ^>= 1.3.5
      , containers ^>= 0.6.5.1
      , contravariant ^>= 1.5.5
+     , corenlp-types ^>= 0.1.0.0
      , crawlerArxiv
      , crawlerHAL
      , crawlerISTEX
...
@@ -13,7 +13,11 @@ Multi-terms are ngrams where n > 1.
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
+module Gargantext.Core.Text.Terms.Multi
+  ( multiterms
+  , multiterms_rake
+  , tokenTags
+  , cleanTextForNLP )
   where

 import Control.Applicative
@@ -22,8 +26,7 @@ import Data.Text hiding (map, group, filter, concat)
 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
 import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
 import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
-import Gargantext.Core.Text.Terms.Multi.PosTagging
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
+import Gargantext.Core.Text.Terms.Multi.CoreNLP qualified as CoreNLP
 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
 import Gargantext.Core.Types
 import Gargantext.Core.Utils (groupWithCounts)
@@ -31,9 +34,6 @@ import Gargantext.Prelude
 import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
 import Replace.Attoparsec.Text as RAT

--------------------------------------------------------------------
-type NLP_API = Lang -> Text -> IO PosSentences
-
 -------------------------------------------------------------------
 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
 multiterms nsc l txt = do
@@ -54,14 +54,20 @@ multiterms nsc l txt = do
 -------------------------------------------------------------------
 tokenTag2terms :: TokenTag -> Terms
-tokenTag2terms (TokenTag ws t _ _) = Terms ws t
+tokenTag2terms (TokenTag ws t _ _ _ _) = Terms ws t

 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
-tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
-tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
+tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = do
+  tt <- CoreNLP.coreNLPTokenTags url EN txt
+  pure $ map (groupTokens EN) tt
+tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = do
+  tt <- CoreNLP.coreNLPTokenTags url FR txt
+  pure $ map (groupTokens FR) tt
 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
   -- printDebug "NLP Debug" txt
-  tokenTagsWith l txt $ SpacyNLP.nlp url
+  tt <- SpacyNLP.nlpTokenTags url txt
+  pure $ map (groupTokens l) tt
+  -- tokenTagsWith l txt $ SpacyNLP.nlp url
 -- tokenTags FR txt = do
 --   -- printDebug "[Spacy Debug]" txt
 --   if txt == ""
@@ -69,12 +75,6 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
 --   else tokenTagsWith FR txt SpacyNLP.nlp
 tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)

-tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
-tokenTagsWith lang txt nlp = map (groupTokens lang)
-                         <$> map tokens2tokensTags
-                         <$> map _sentenceTokens
-                         <$> _sentences
-                         <$> nlp lang txt
-
 ---- | This function analyses and groups (or not) ngrams according to
...
@@ -21,12 +21,12 @@ import Gargantext.Prelude
 -- | FIXME p1 and p2 not really taken into account
 group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
-group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
+group2 p1 p2 (x@(TokenTag _ _ (Just p1') _ _ _):y@(TokenTag _ _ (Just p2') _ _ _):z) =
   if (p1 == p1') && (p2 == p2')
-    then group2 p1 p2 (x<>y : z)
+    then group2 p1 p2 (combineTokenTags x y : z)
 --      then (x : y : group2 p1 p2 (x<>y : z))
     else (x : group2 p1 p2 (y:z))
-group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
-group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
-group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
+group2 p1 p2 (x@(TokenTag _ _ Nothing _ _ _):y) = (x: group2 p1 p2 y)
+group2 _ _ [x@(TokenTag _ _ (Just _) _ _ _)] = [x]
+group2 p1 p2 (x@(TokenTag _ _ (Just _) _ _ _):y@(TokenTag _ _ Nothing _ _ _):z) = (x:y: group2 p1 p2 (y:z))
 group2 _ _ [] = []
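
A hypothetical trace of the new merging, with invented token values (offsets are character positions):

-- Hypothetical example (all values invented):
--   black = TokenTag ["black"] (fromList ["black"]) (Just JJ) Nothing 0 5
--   cat   = TokenTag ["cat"]   (fromList ["cat"])   (Just NP) Nothing 6 9
--   group2 JJ NP [black, cat]
--     == [TokenTag ["black","cat"] (fromList ["black","cat"]) (Just NP) Nothing 0 9]
-- Adjacent JJ+NP tokens collapse into one TokenTag: words and lemmas are
-- concatenated, the POS is promoted to NP, the NER tag of the first token
-- wins, and the offsets span both tokens.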
@@ -33,30 +33,37 @@ import Data.Map qualified as Map
 import Data.Set (fromList)
 import Data.Text (splitOn, pack, toLower)
 import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
 import Gargantext.Core.Types
 import Gargantext.Prelude hiding (ByteString, toLower)
 import Network.HTTP.Simple
 import Network.URI (URI(..))
+import Text.CoreNLP.Types qualified as CoreNLP
 -- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP

 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
-tokens2tokensTags :: [Token] -> [TokenTag]
+tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
 tokens2tokensTags ts = filter' $ map tokenTag ts
 ------------------------------------------------------------------------
-tokenTag :: Token -> TokenTag
-tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
-                                   , _my_token_lemma = l'
-                                   , _my_token_pos = _tokenPos
-                                   , _my_token_ner = _tokenNer }
+tokenTag :: CoreNLP.Token -> TokenTag
+tokenTag (CoreNLP.Token { .. }) = TokenTag { _my_token_word = w'
+                                           , _my_token_lemma = l'
+                                           , _my_token_pos = fromCoreNLPPOS pos
+                                           , _my_token_ner = fromCoreNLPNER ner }
   where
-    w' = split _tokenWord
-    l' = fromList (split _tokenLemma)
+    w' = split word
+    l' = fromList (split lemma)
     split :: Text -> [Text]
     split = splitOn " " . toLower

+fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
+fromCoreNLPPOS = decode . encode
+
+fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
+fromCoreNLPNER = decode . encode
+
 filter' :: [TokenTag] -> [TokenTag]
 filter' xs = filter isNgrams xs
   where
@@ -76,13 +83,41 @@ filter' xs = filter isNgrams xs
 --   },
 --

+properties :: Lang -> [(Text, Text)]
+properties EN = [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
+properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
+                , ("tokenize.language", "fr") ]
+properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
+                , ("tokenize.language", "de") ]
+properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
+                , ("tokenize.language", "es") ]
+properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
+                , ("tokenize.language", "it") ]
+properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
+                , ("tokenize.language", "pl") ]
+properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
+                , ("tokenize.language", "zh") ]
+properties l = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
+
 corenlp' :: ( FromJSON a
             , ConvertibleStrings p ByteString
             )
          => URI -> Lang -> p -> IO (Response a)
 corenlp' uri lang txt = do
-  req <- parseRequest $
-    "POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
+  req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
   -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
   -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
   catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
@@ -94,41 +129,12 @@ corenlp' uri lang txt = do
         throwIO e
       JSONConversionException _req _res _err -> throwIO e
   where
-    properties_ :: [(Text, Text)]
-    properties_ = case lang of
-      -- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
-      EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
-      FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
-            , ("tokenize.language", "fr") ]
-      DE -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
-            , ("tokenize.language", "de") ]
-      ES -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
-            , ("tokenize.language", "es") ]
-      IT -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
-            , ("tokenize.language", "it") ]
-      PL -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
-            , ("tokenize.language", "pl") ]
-      ZH -> [ ("annotators", "tokenize,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
-            , ("tokenize.language", "zh") ]
-      l -> panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
-    properties = properties_ <> [ ("outputFormat", "json") ]
+    query = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList props)
+    props = (properties lang) <> [ ("outputFormat", "json") ]

-corenlp :: URI -> Lang -> Text -> IO PosSentences
+corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
 corenlp uri lang txt = do
   response <- corenlp' uri lang txt
   pure (getResponseBody response)
@@ -141,14 +147,14 @@ corenlp uri lang txt = do
 -- Named Entity Recognition example
 -- parseWith  _tokenNer "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-tokenWith :: URI -> (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
-tokenWith uri f lang s = map (map (\t -> (_tokenWord t, f t)))
-                       <$> map _sentenceTokens
-                       <$> _sentences
-                       <$> corenlp uri lang s
+tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
+tokenWith uri f lang s = do
+  CoreNLP.Document { sentences } <- corenlp uri lang s
+  pure (map (map (\t -> (CoreNLP.word t, f t)))
+    $ map CoreNLP.tokens sentences)

 ----------------------------------------------------------------------------------
 -- Here connect to the JohnSnow Server as it has been done above with the corenlp'
 -- We need the PosTagging according to the language and the lems
-serverNLP :: Lang -> Text -> IO PosSentences
+serverNLP :: Lang -> Text -> IO CoreNLP.Document
 serverNLP = undefined
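
For reference, a hypothetical round trip through the new corenlp-types-based API (the server address is an assumption lifted from the curl example above, and error handling is elided):

-- Hypothetical usage; localhost:9000 is an assumption:
--   Just uri <- pure $ parseURI "http://localhost:9000"   -- Network.URI
--   doc <- corenlp uri EN "Hello world of Peter."
--   -- doc :: CoreNLP.Document; the sentences/tokens/word/lemma/pos/ner
--   -- fields come from the corenlp-types library.
--   print $ map (map CoreNLP.word . CoreNLP.tokens) (CoreNLP.sentences doc)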
@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

+NOTE: This started out as a wrapper around CoreNLP types, then was
+forced onto Spacy and JohnSnowNLP. It is now deprecated: CoreNLP and
+Spacy use their own types, and while JohnSnowNLP still imports this
+module, JohnSnowNLP itself is hardly used either.
+
 -}

 {-# LANGUAGE BangPatterns #-}
@@ -46,7 +51,7 @@ data Properties = Properties { _propertiesAnnotators :: !Text
 $(deriveJSON (unPrefix "_properties") ''Properties)

-data PosSentences = PosSentences { _sentences :: [Sentence]}
+data PosSentences = PosSentences { _sentences :: [Sentence] }
   deriving (Show, Generic)

 $(deriveJSON (unPrefix "_") ''PosSentences)
@@ -22,6 +22,7 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
                              , DebugMode(..), withDebugMode
                              , Term(..), Terms(..), TermsCount, TermsWithCount
                              , TokenTag(..), POS(..), NER(..)
+                             , combineTokenTags, emptyTokenTag
                              , Label, Stems
                              , HasValidationError(..), assertValid
                              , Name
@@ -36,7 +37,6 @@ import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
 import Data.Maybe
 import Data.Monoid
-import Data.Semigroup
 import Data.Set (empty)
 import Data.String
 import Data.Swagger (ToParamSchema)
@@ -150,25 +150,33 @@ instance FromJSON NER where
 instance ToJSON NER

 data TokenTag = TokenTag { _my_token_word  :: [Text]
                          , _my_token_lemma :: Set Text
                          , _my_token_pos   :: Maybe POS
                          , _my_token_ner   :: Maybe NER
+                         , _my_token_offset_begin :: Int
+                         , _my_token_offset_end   :: Int
                          } deriving (Show)

-instance Semigroup TokenTag where
-  (<>) (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _) = TokenTag (w1 <> w2) (s1 <> s2) p3 n1
-    where
-      p3 = case (p1,p2) of
-        (Just JJ, Just NP) -> Just NP
-        (Just VB, Just NP) -> Just NP
-        _ -> p1
-
-instance Monoid TokenTag where
-  mempty = TokenTag [] empty Nothing Nothing
-  mconcat = foldl mappend mempty
---  mappend t1 t2 = (<>) t1 t2
+-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
+-- lemma combining is just wrong. You can't just "cat" <> "woman" to
+-- get a "Catwoman".
+combineTokenTags :: TokenTag -> TokenTag -> TokenTag
+combineTokenTags (TokenTag w1 l1 p1 n1 s1 e1) (TokenTag w2 l2 p2 _ s2 e2) = TokenTag (w1 <> w2) (l1 <> l2) p3 n1 (minimum [s1, s2]) (maximum [e1, e2])
+  where
+    p3 = case (p1,p2) of
+      (Just JJ, Just NP) -> Just NP
+      (Just VB, Just NP) -> Just NP
+      _ -> p1
+
+emptyTokenTag :: TokenTag
+emptyTokenTag = TokenTag [] empty Nothing Nothing 0 0
+
+-- instance Monoid TokenTag where
+--   mempty = TokenTag [] empty Nothing Nothing 0 0
+--   mconcat = foldl mappend mempty
+--   -- mappend t1 t2 = (<>) t1 t2

 class HasValidationError e where
...
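
The POS promotion rule in `combineTokenTags` is easiest to see with `emptyTokenTag` (hypothetical snippets, not from the diff):

-- Hypothetical: only JJ+NP and VB+NP promote to NP; anything else keeps
-- the left tag's POS.
--   combineTokenTags (emptyTokenTag { _my_token_pos = Just VB })
--                    (emptyTokenTag { _my_token_pos = Just NP })
--   -- ==> _my_token_pos = Just NP
--   combineTokenTags (emptyTokenTag { _my_token_pos = Just NP })
--                    (emptyTokenTag { _my_token_pos = Just JJ })
--   -- ==> _my_token_pos = Just NP (left tag kept)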
@@ -20,22 +20,23 @@ module Gargantext.Utils.SpacyNLP where
 import Control.Lens
 import Data.Aeson (encode)
 import Data.Aeson.TH (deriveJSON)
+import Data.Set qualified as Set
 import Data.Text hiding (map, group, filter, concat, zip)
-import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
-import Gargantext.Core.Types (POS(..), NER(..))
+import Gargantext.Core.Types (POS(..), NER(..), TokenTag(..))
 import Gargantext.Core.Utils.Prefix (unPrefix)
 import Gargantext.Prelude
 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
 import Network.URI (URI(..))

-data SpacyData = SpacyData { _spacy_data :: ![SpacyText]}
+data SpacyData = SpacyData { _spacy_data :: ![SpacyText] }
   deriving (Show)

 data SpacyText = SpacyText { _spacy_text :: !Text
                            , _spacy_tags :: ![SpacyTags]
                            } deriving (Show)
+
+-- | https://spacy.io/api/token/#attributes
 data SpacyTags =
   SpacyTags { _spacyTags_text :: !Text
             , _spacyTags_text_with_ws :: !Text
@@ -100,28 +101,24 @@ makeLenses ''SpacyTags
 makeLenses ''SpacyRequest

 ----------------------------------------------------------------
-spacyTagsToToken :: SpacyTags -> Token
-spacyTagsToToken st = Token (st ^. spacyTags_index)
-                            (st ^. spacyTags_normalized)
-                            (st ^. spacyTags_text)
-                            (st ^. spacyTags_lemma)
-                            (st ^. spacyTags_head_index)
-                            (st ^. spacyTags_char_offset)
-                            (Just $ st ^. spacyTags_pos)
-                            (Just $ st ^. spacyTags_ent_type)
-                            (Just $ st ^. spacyTags_prefix)
-                            (Just $ st ^. spacyTags_suffix)
-
-spacyDataToPosSentences :: SpacyData -> PosSentences
-spacyDataToPosSentences (SpacyData ds) = PosSentences
-  $ map (\(i, ts) -> Sentence i ts)
-  $ zip [1..]
-  $ map (\(SpacyText _ tags)-> map spacyTagsToToken tags) ds
+spacyDataToTokenTags :: SpacyData -> [[TokenTag]]
+spacyDataToTokenTags (SpacyData ds) =
+  map (\(SpacyText _ tags) -> spacyTagsToTokenTag <$> tags) ds
+
+spacyTagsToTokenTag :: SpacyTags -> TokenTag
+spacyTagsToTokenTag st = TokenTag { _my_token_word = split' (st ^. spacyTags_normalized)
+                                  , _my_token_lemma = Set.fromList (split' $ st ^. spacyTags_lemma)
+                                  , _my_token_pos = Just $ st ^. spacyTags_pos
+                                  , _my_token_ner = Just $ st ^. spacyTags_ent_type
+                                  , _my_token_offset_begin = 0 -- TODO
+                                  , _my_token_offset_end = 0 -- TODO
+                                  }
+  where
+    split' :: Text -> [Text]
+    split' = splitOn " " . Data.Text.toLower

 -----------------------------------------------------------------
-nlp :: URI -> Lang -> Text -> IO PosSentences
-nlp uri _lang txt = spacyDataToPosSentences <$> spacyRequest uri txt
--- nlp FR txt = spacyDataToPosSentences <$> spacyRequest txt
--- nlp _ _ = panic "Make sure you have the right model for your lang for spacy Server"
+nlpTokenTags :: URI -> Text -> IO [[TokenTag]]
+nlpTokenTags uri txt = spacyDataToTokenTags <$> spacyRequest uri txt
+-- nlp _ _ _ = panic "Make sure you have the right model for your lang for spacy Server"
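
And the matching Spacy side, which no longer needs a `Lang` argument (a hypothetical call; the server address is an assumption):

-- Hypothetical usage; the spacy-server address is an assumption:
--   Just uri <- pure $ parseURI "http://localhost:8001"
--   tags <- nlpTokenTags uri "hello world"   -- tags :: [[TokenTag]]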
@@ -43,6 +43,8 @@ extra-deps:
 - servant-auth/servant-auth/
 - servant-auth/servant-auth-client/
 - servant-auth/servant-auth-server/
+- git: https://gitlab.iscpif.fr/gargantext/corenlp-types
+  commit: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
 - git: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
   commit: 339fd608341bd2652cf5c0e9e76a3293acffbea6
 - git: https://github.com/alpmestan/ekg-json.git
...