[tokenize] working backend version of tokenization/highlighting

parent e66f7257
......@@ -106,6 +106,8 @@ library
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.RAKE
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Terms.Tokenize.Types
Gargantext.Core.Text.Terms.WithList
Gargantext.Core.Types
Gargantext.Core.Types.Individu
......@@ -152,6 +154,7 @@ library
Gargantext.Database.Schema.User
Gargantext.Defaults
Gargantext.System.Logging
Gargantext.Utils.Array
Gargantext.Utils.Dict
Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.Internal
......@@ -275,7 +278,6 @@ library
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search
Gargantext.Core.Utils.DateUtils
......@@ -436,6 +438,7 @@ library
, crawlerPubMed
, cron ^>= 0.7.0
, cryptohash ^>= 0.11.9
, data-interval ^>= 2.1.1
, data-time-segment ^>= 0.1.0.0
, deepseq ^>= 1.4.4.0
, directory ^>= 1.3.6.0
......@@ -444,6 +447,7 @@ library
, ekg-json ^>= 0.1.0.7
, epo-api-client
, exceptions ^>= 0.10.4
, extended-reals ^>= 0.2.4.0
, extra ^>= 1.7.9
, fast-logger ^>= 3.0.5
, fclabels ^>= 2.0.5
......@@ -934,6 +938,7 @@ test-suite garg-test-tasty
Test.Core.Text.Corpus.Query
Test.Core.Text.Examples
Test.Core.Text.Flow
Test.Core.Text.Tokenize
Test.Core.Utils
Test.Database.Operations
Test.Database.Operations.DocumentSearch
......
......@@ -11,18 +11,25 @@ Portability : POSIX
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Public
where
import Control.Lens ((^?), (^.), _Just)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Swagger hiding (title, url)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.API.Node.File
import Gargantext.API.Prelude
import Gargantext.Core (Lang)
import Gargantext.Core.Text.Terms.Tokenize qualified as Tokenize
import Gargantext.Core.Text.Terms.Tokenize.Types qualified as Tokenize
import Gargantext.Core.Types (TokenTag(..))
import Gargantext.Core.Utils.DateUtils (utc2year)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Hyperdata.CorpusField
......@@ -32,6 +39,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNode (selectPublicNodes)
import Gargantext.Database.Schema.Node -- (NodePoly(..))
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Utils.Aeson qualified as GUA
import Servant
import Test.QuickCheck (elements)
......@@ -40,10 +48,12 @@ import Test.QuickCheck.Arbitrary
------------------------------------------------------------------------
type API = API_Home
:<|> API_Node
:<|> API_NLP
api :: Text -> GargServer API
api baseUrl = (api_home baseUrl)
:<|> api_node
:<|> api_nlp
-------------------------------------------------------------------------
type API_Home = Summary " Public Home API"
......@@ -70,6 +80,60 @@ api_node nId = do
-------------------------------------------------------------------------
-- | Public (unauthenticated) NLP endpoints, mounted under @/nlp@:
-- tokenization, term highlighting, and highlighting with gaps filled
-- in so the whole text is covered.
type API_NLP = Summary " NLP"
             :> "nlp"
             :> ("tokenize"
                 :> ReqBody '[JSON] TokenizeData
                 :> Post '[JSON] [TokenTag]
                :<|> "highlight"
                 :> ReqBody '[JSON] HighlightData
                 :> Post '[JSON] [Tokenize.HighlightedTerm]
                :<|> "highlight-total"
                 :> ReqBody '[JSON] HighlightData
                 :> Post '[JSON] [Tokenize.HighlightResult] )

-- | Server for 'API_NLP'; handlers are listed in the same order as
-- the alternatives in the API type.
api_nlp :: GargServer API_NLP
api_nlp = api_tokenize
     :<|> api_highlight
     :<|> api_highlight_total
-- | Request payload for @/nlp/tokenize@.
data TokenizeData = TokenizeData
  { _td_lang :: Lang  -- ^ language used to pick the NLP server
  , _td_text :: Text  -- ^ text to tokenize
  }
  deriving (Show, Eq, Generic)

instance ToSchema TokenizeData where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_td_")
-- | Tokenize the given text using the NLP server configured for the
-- given language; returns one 'TokenTag' per token.
api_tokenize :: (GargServerC env err m) => TokenizeData -> m [TokenTag]
api_tokenize (TokenizeData { .. }) =
  -- NOTE(review): dropped the leftover debug 'putText' that dumped the
  -- whole token list to stdout on every request.
  Tokenize.tokenize _td_lang _td_text
-- | Request payload for @/nlp/highlight@ and @/nlp/highlight-total@.
data HighlightData = HighlightData
  { _hd_lang  :: Lang    -- ^ language used to pick the NLP server
  , _hd_text  :: Text    -- ^ text to search in
  , _hd_terms :: [Text]  -- ^ terms to highlight inside the text
  }
  deriving (Show, Eq, Generic)

instance ToSchema HighlightData where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_hd_")
-- | Highlight the requested terms inside the request's text.
api_highlight :: (GargServerC env err m) => HighlightData -> m [Tokenize.HighlightedTerm]
api_highlight hd =
  Tokenize.highlightTerms (NgramsTerm <$> _hd_terms hd) (_hd_lang hd) (_hd_text hd)
-- | Similar to 'api_highlight' but returns an array containing the
-- whole text, i.e. if it's not highlighted, it is still part of the
-- returned array.
api_highlight_total :: (GargServerC env err m) => HighlightData -> m [Tokenize.HighlightResult]
api_highlight_total hd@(HighlightData { .. }) = do
  hts <- api_highlight hd
  -- NOTE(review): dropped the leftover debug 'putText' that echoed the
  -- full request text to stdout on every call.
  pure $ Tokenize.fillHighlightGaps hts _hd_text
-------------------------------------------------------------------------
selectPublic :: HasNodeError err
=> DBCmd err [( Node HyperdataFolder, Maybe Int)]
......@@ -153,3 +217,8 @@ defaultPublicData =
, date = "YY/MM/DD"
, database = "database"
, author = "Author" }
$(deriveJSON (unPrefix "_td_") ''TokenizeData)
$(deriveJSON (unPrefix "_hd_") ''HighlightData)
......@@ -17,6 +17,7 @@ module Gargantext.Core.Text.Terms.Multi
( multiterms
, multiterms_rake
, tokenTags
, tokenTagsNoGroup
, cleanTextForNLP )
where
......@@ -76,6 +77,23 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
tokenTags _ l _ = panicTrace $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
-- | Like 'tokenTags', but without grouping tokens into multi-token
-- terms: each token keeps its own tag. Dispatches on the configured
-- NLP backend and language.
tokenTagsNoGroup :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTagsNoGroup (NLPServerConfig { server = CoreNLP, url }) EN txt = do
  CoreNLP.coreNLPTokenTags url EN txt
tokenTagsNoGroup (NLPServerConfig { server = CoreNLP, url }) FR txt = do
  CoreNLP.coreNLPTokenTags url FR txt
-- Spacy is called the same way regardless of language.
tokenTagsNoGroup (NLPServerConfig { server = Spacy, url }) _l txt = do
  -- printDebug "NLP Debug" txt
  SpacyNLP.nlpTokenTags url txt
  -- tokenTagsWith l txt $ SpacyNLP.nlp url
-- tokenTags FR txt = do
--   -- printDebug "[Spacy Debug]" txt
--   if txt == ""
--     then pure [[]]
--     else tokenTagsWith FR txt SpacyNLP.nlp
-- Any other backend/language combination is unsupported.
tokenTagsNoGroup _ l _ = panicTrace $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
......
......@@ -147,6 +147,7 @@ corenlp uri lang txt = do
-- | Run CoreNLP on the given text and convert the parsed document's
-- tokens into 'TokenTag's (grouped as produced by 'allTokens').
coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]]
coreNLPTokenTags uri lang txt = do
  document <- corenlp uri lang txt
  -- NOTE(review): removed the debug 'putText' that dumped the entire
  -- parsed document to stdout on every call.
  pure $ map tokens2tokensTags $ allTokens document
-- | parseWith
......
......@@ -9,21 +9,119 @@ Portability : POSIX
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms.Tokenize
where
import Control.Lens (view)
import Control.Monad.Trans.Control (MonadBaseControl)
import Gargantext.Core (Lang)
import Gargantext.Core.NLP (nlpServerGet, HasNLPServer)
import Gargantext.Core.Text.Terms.Multi (tokenTags)
import Gargantext.Core.Types (TokenTag(..))
import Control.Lens (view) -- over
import Data.ExtendedReal (Extended(..))
import Data.Interval ((<=..<=))
import Data.Interval qualified as I
import Data.IntervalSet qualified as IS
import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang, NLPServerConfig(..)) --, PosTagAlgo(CoreNLP))
import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.Text.Terms.Multi (tokenTagsNoGroup)
import Gargantext.Core.Text.Terms.Tokenize.Types
import Gargantext.Core.Types (TokenTag(..), POS(..)) --, my_token_offset_end)
import Gargantext.Prelude
import Gargantext.Utils.Array (window)
-- | Just pick an NLP server and tokenize the given string using given
-- language.
tokenize :: HasTokenizer env m
         => Lang -> Text -> m [TokenTag]
tokenize lang txt = do
  nlp <- view (nlpServerGet lang)
  -- NOTE(review): removed the dead post-processing hook
  -- ('let f = case server nlp of _ -> identity'); if CoreNLP end
  -- offsets ever need the "-1" adjustment hinted at in the commented
  -- code, reintroduce it via 'my_token_offset_end'.
  liftBase $ concat <$> tokenTagsNoGroup nlp lang txt
-------
-- | This function, given a list of 'NgramsTerm' and a text,
-- highlights these terms using the 'tokenize' function above.
-- Each term is tokenized itself, then matched against sliding windows
-- of the text's tokens via lemma-set intersection (see 'highlight').
highlightTerms :: HasTokenizer env m
               => [NgramsTerm] -> Lang -> Text -> m [HighlightedTerm]
highlightTerms ngramsTerms lang txt = do
  txtTokens' <- tokenize lang txt
  -- Drop tokens whose POS came back as NotFound (e.g. punctuation),
  -- see 'relevantTokens'.
  let txtTokens = relevantTokens txtTokens'
  -- NOTE(review): debug output; consider removing or gating it before release
  liftBase $ putText $ "[highlightTerms] txtTokens: " <> show txtTokens
  tokenizedTerms <- mapM tokenizeTerms ngramsTerms
  liftBase $ putText $ "[highlightTerms] tokenizedTerms: " <> show tokenizedTerms
  -- TODO This isn't the most optimal, of O(n*m) complexity. One can
  -- try to compute hashes, incrementally, for the windowed tokens
  let ht = highlight txt txtTokens <$> tokenizedTerms
  pure $ catMaybes $ concat ht
  where
    -- Tokenize a single term, keeping only its relevant tokens.
    tokenizeTerms :: HasTokenizer env m => NgramsTerm -> m (NgramsTerm, [TokenTag])
    tokenizeTerms t = do
      tt' <- tokenize lang $ unNgramsTerm t
      let tt = relevantTokens tt'
      pure (t, tt)
-- | Fills in all "gaps" created by 'highlightTerms', i.e. inserts
-- text parts where there are no highlights. The result covers the
-- whole input text, sorted by start position.
fillHighlightGaps :: [HighlightedTerm] -> Text -> [HighlightResult]
fillHighlightGaps hts txt = sortBy compareHR ((HRHighlighted <$> hts) <> gapHt)
  where
    -- The whole text as a single [0, length] interval set
    txtInt = IS.singleton (Finite 0 <=..<= (Finite $ T.length txt))
    -- Order results by the lower bound of their interval
    compareHR hr1 hr2 = compare (I.lowerBound $ hrToInterval hr1) (I.lowerBound $ hrToInterval hr2)
    htIntervals = IS.fromList (htToInterval <$> hts)
    -- Removing the highlight intervals from the full-text interval
    -- leaves exactly the gaps
    intDiff = IS.toList (IS.difference txtInt htIntervals)
    gapHt = HRNormal <$> intervalToNt txt <$> intDiff
------- UTILITY FUNCTIONS

-- | Keep only the tokens that matter for highlighting: tokens whose
-- POS tag came back as 'NotFound' (e.g. punctuation such as hyphens)
-- are dropped, so terms separated by dashes can still be matched.
relevantTokens :: [TokenTag] -> [TokenTag]
relevantTokens = filter keep
  where
    keep tok =
      case _my_token_pos tok of
        Just (NotFound {}) -> False
        _                  -> True
-- | Find occurrences of one tokenized term in the text's token list,
-- by sliding a window of the term's length over the text tokens and
-- comparing lemma sets position-by-position ('compareSets').
highlight :: Text -> [TokenTag] -> (NgramsTerm, [TokenTag]) -> [Maybe HighlightedTerm]
highlight txt txtTokens (ngramsTerm, tokenizedTerms) =
  highlightInWindow (ngramsTerm, tokenizedTerms) <$> (window (length tokenizedTerms) txtTokens)
  where
    -- A window matches when each position's lemma sets intersect;
    -- the highlight's offsets come from the window's first and last
    -- token ('head' here is the safe, Maybe-returning variant).
    highlightInWindow :: (NgramsTerm, [TokenTag]) -> [TokenTag] -> Maybe HighlightedTerm
    highlightInWindow (nt, tt) windowTxtTokens =
      case ( compareSets (_my_token_lemma <$> tt) (_my_token_lemma <$> windowTxtTokens)
           , head windowTxtTokens
           , lastMay windowTxtTokens ) of
        ( True, Just h, Just l ) ->
          let ( lb, ub ) = ( _my_token_offset_begin h, _my_token_offset_end l )
          in
            Just (HighlightedTerm { _ht_term = unNgramsTerm nt
                                  , _ht_original_text = T.take (ub - lb) $ T.drop lb txt
                                  , _ht_start = lb
                                  , _ht_end = ub })
        _ -> Nothing
-- | True when the two sets share at least one element.
intersects :: Ord a => Set a -> Set a -> Bool
intersects a b = not (Set.disjoint a b)

-- | We treat lemmas as equal, if sets intersect. This is a comparison
-- function for a list of such sets: both lists must have the same
-- length and the sets at every position must overlap.
compareSets :: Ord a => [Set a] -> [Set a] -> Bool
compareSets xs ys = length xs == length ys && and (zipWith intersects xs ys)
{-|
Module : Gargantext.Core.Text.Terms.Tokenize.Types
Description : String tokenization
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms.Tokenize.Types
where
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Aeson.TH (defaultOptions, deriveJSON)
import Data.ExtendedReal (Extended(..))
import Data.Interval ((<=..<=))
import Data.Interval qualified as I
import Data.Swagger (ToSchema, declareNamedSchema, defaultSchemaOptions, genericDeclareNamedSchema, genericDeclareNamedSchemaUnrestricted)
import Data.Text qualified as T
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Prelude
type HasTokenizer env m = ( HasNLPServer env
, MonadReader env m
, MonadBaseControl IO m )
------------------------------
-- NOTE: To highlight terms, we actually need to know what these terms
-- are. Terms consist of compounds of, possibly, multiple tokens and
-- it's not the same as NLP tokenization.
-- https://en.wikipedia.org/wiki/Terminology_extraction

-- | A term found in a text, together with its location.
data HighlightedTerm =
  HighlightedTerm { _ht_term :: Text           -- ^ the term that matched
                  , _ht_original_text :: Text  -- ^ the fragment as it appears in the input
                  , _ht_start :: Int -- start position of the term
                  , _ht_end :: Int -- end position of the term
                  }
  deriving (Show, Eq, Generic)

instance ToSchema HighlightedTerm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_ht_")
-- | The closed character interval covered by a highlighted term.
htToInterval :: HighlightedTerm -> I.Interval Int
htToInterval ht = Finite (_ht_start ht) <=..<= Finite (_ht_end ht)
------------------------------
-- | A non-highlighted piece of the input text, with its location.
data NormalText =
  NormalText { _nt_text :: Text   -- ^ the raw text fragment
             , _nt_start :: Int   -- ^ start position within the input
             , _nt_end :: Int }   -- ^ end position within the input
  deriving (Show, Eq, Generic)

instance ToSchema NormalText where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nt_")
-- | The closed character interval covered by a plain-text piece.
ntToInterval :: NormalText -> I.Interval Int
ntToInterval nt = Finite (_nt_start nt) <=..<= Finite (_nt_end nt)
-- | Cut the slice of @txt@ covered by the given interval and wrap it
-- as a 'NormalText' carrying its positions.
intervalToNt :: Text -> I.Interval Int -> NormalText
intervalToNt txt int =
  NormalText { _nt_text = T.take (ub - lb) (T.drop lb txt)
             , _nt_start = lb
             , _nt_end = ub }
  where
    lb = clampFinite (I.lowerBound int)
    ub = clampFinite (I.upperBound int)
    -- Infinite bounds collapse to 0; in practice the intervals built
    -- here are always finite.
    clampFinite b = case b of
      Finite x -> x
      _        -> 0
------------------------------
-- | Either a highlighted term or a plain piece of text; together they
-- cover the whole input (see 'fillHighlightGaps').
data HighlightResult =
  HRHighlighted HighlightedTerm
  | HRNormal NormalText
  deriving (Show, Eq, Generic)

instance ToSchema HighlightResult where
  declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions

-- | The character interval covered by either kind of result.
hrToInterval :: HighlightResult -> I.Interval Int
hrToInterval (HRHighlighted ht) = htToInterval ht
hrToInterval (HRNormal nt) = ntToInterval nt

$(deriveJSON (unPrefix "_ht_") ''HighlightedTerm)
$(deriveJSON (unPrefix "_nt_") ''NormalText)
$(deriveJSON defaultOptions ''HighlightResult)
......@@ -22,6 +22,12 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, my_token_offset_begin
, my_token_offset_end
, my_token_lemma
, my_token_ner
, my_token_pos
, my_token_word
, combineTokenTags, emptyTokenTag
, Label, Stems
, HasValidationError(..), assertValid
......@@ -39,13 +45,13 @@ import Data.Maybe
import Data.Monoid
import Data.Set (empty)
import Data.String
import Data.Swagger (ToParamSchema)
import Data.Swagger (ToParamSchema, defaultSchemaOptions, genericDeclareNamedSchema, genericDeclareNamedSchemaUnrestricted)
import Data.Swagger (ToSchema(..))
import Data.Text (unpack)
import Data.Validity
import GHC.Generics
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, wellNamedSchema)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger, wellNamedSchema)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Prelude hiding (Ordering, empty)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
......@@ -133,9 +139,11 @@ instance FromJSON POS where
instance ToJSON POS
instance Hashable POS
instance ToSchema POS where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER { noNer :: !Text }
deriving (Show, Generic)
deriving (Show, Eq, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
......@@ -149,6 +157,8 @@ instance FromJSON NER where
ner x = NoNER (cs x)
instance ToJSON NER
instance ToSchema NER where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_lemma :: Set Text
......@@ -156,8 +166,11 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_ner :: Maybe NER
, _my_token_offset_begin :: Int
, _my_token_offset_end :: Int
} deriving (Show)
} deriving (Show, Eq, Generic)
instance ToSchema TokenTag where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_my_token_")
$(deriveJSON (unPrefix "_my_token_") ''TokenTag)
makeLenses ''TokenTag
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
-- lemma combining is just wrong. You can't just "cat" <> "woman" to
......
{-|
Module : Gargantext.Utils.Array
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling arrays.
-}
module Gargantext.Utils.Array where
import Protolude
-- | A sliding window of given size over a list, e.g.
-- @window 2 [1,2,3] == [[1,2],[2,3]]@. Returns @[]@ when the size is
-- non-positive or the list is shorter than the window.
-- https://stackoverflow.com/questions/27726739/implementing-an-efficient-sliding-window-algorithm-in-haskell
--
-- NOTE(review): the previous version returned @repeat []@ (an
-- infinite list) for @size <= 0@, which made consumers hang — e.g.
-- term highlighting calls this with the term's token count, which can
-- be 0 after token filtering.
window :: Int -> [a] -> [[a]]
window size xs
  | size <= 0 = []
  | otherwise = foldr (zipWith (:)) (repeat []) (take size (tails xs))
......@@ -14,6 +14,7 @@ import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils
import qualified Test.Core.Text.Tokenize as Tokenize
import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery
......@@ -35,6 +36,7 @@ main = do
cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test
tokenizeSpec <- testSpec "Tokenize" Tokenize.test
defaultMain $ testGroup "Gargantext"
[ utilSpec
......@@ -44,6 +46,7 @@ main = do
, cryptoSpec
, nlpSpec
, jobsSpec
, tokenizeSpec
, NgramsQuery.tests
, CorpusQuery.tests
, JSON.tests
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment