[tokenize] working backend version of tokenization/highlighting

parent e66f7257
...@@ -106,6 +106,8 @@ library ...@@ -106,6 +106,8 @@ library
Gargantext.Core.Text.Terms.Multi.Lang.En Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.RAKE Gargantext.Core.Text.Terms.Multi.RAKE
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Terms.Tokenize.Types
Gargantext.Core.Text.Terms.WithList Gargantext.Core.Text.Terms.WithList
Gargantext.Core.Types Gargantext.Core.Types
Gargantext.Core.Types.Individu Gargantext.Core.Types.Individu
...@@ -152,6 +154,7 @@ library ...@@ -152,6 +154,7 @@ library
Gargantext.Database.Schema.User Gargantext.Database.Schema.User
Gargantext.Defaults Gargantext.Defaults
Gargantext.System.Logging Gargantext.System.Logging
Gargantext.Utils.Array
Gargantext.Utils.Dict Gargantext.Utils.Dict
Gargantext.Utils.Jobs Gargantext.Utils.Jobs
Gargantext.Utils.Jobs.Internal Gargantext.Utils.Jobs.Internal
...@@ -275,7 +278,6 @@ library ...@@ -275,7 +278,6 @@ library
Gargantext.Core.Text.Terms.Multi.Group Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.CoreNLP Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Upload Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search Gargantext.Core.Types.Search
Gargantext.Core.Utils.DateUtils Gargantext.Core.Utils.DateUtils
...@@ -436,6 +438,7 @@ library ...@@ -436,6 +438,7 @@ library
, crawlerPubMed , crawlerPubMed
, cron ^>= 0.7.0 , cron ^>= 0.7.0
, cryptohash ^>= 0.11.9 , cryptohash ^>= 0.11.9
, data-interval ^>= 2.1.1
, data-time-segment ^>= 0.1.0.0 , data-time-segment ^>= 0.1.0.0
, deepseq ^>= 1.4.4.0 , deepseq ^>= 1.4.4.0
, directory ^>= 1.3.6.0 , directory ^>= 1.3.6.0
...@@ -444,6 +447,7 @@ library ...@@ -444,6 +447,7 @@ library
, ekg-json ^>= 0.1.0.7 , ekg-json ^>= 0.1.0.7
, epo-api-client , epo-api-client
, exceptions ^>= 0.10.4 , exceptions ^>= 0.10.4
, extended-reals ^>= 0.2.4.0
, extra ^>= 1.7.9 , extra ^>= 1.7.9
, fast-logger ^>= 3.0.5 , fast-logger ^>= 3.0.5
, fclabels ^>= 2.0.5 , fclabels ^>= 2.0.5
...@@ -934,6 +938,7 @@ test-suite garg-test-tasty ...@@ -934,6 +938,7 @@ test-suite garg-test-tasty
Test.Core.Text.Corpus.Query Test.Core.Text.Corpus.Query
Test.Core.Text.Examples Test.Core.Text.Examples
Test.Core.Text.Flow Test.Core.Text.Flow
Test.Core.Text.Tokenize
Test.Core.Utils Test.Core.Utils
Test.Database.Operations Test.Database.Operations
Test.Database.Operations.DocumentSearch Test.Database.Operations.DocumentSearch
......
...@@ -11,18 +11,25 @@ Portability : POSIX ...@@ -11,18 +11,25 @@ Portability : POSIX
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Public module Gargantext.API.Public
where where
import Control.Lens ((^?), (^.), _Just) import Control.Lens ((^?), (^.), _Just)
import Data.Aeson import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.List qualified as List import Data.List qualified as List
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Set qualified as Set import Data.Set qualified as Set
import Data.Swagger hiding (title, url) import Data.Swagger hiding (title, url)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.API.Node.File import Gargantext.API.Node.File
import Gargantext.API.Prelude import Gargantext.API.Prelude
import Gargantext.Core (Lang)
import Gargantext.Core.Text.Terms.Tokenize qualified as Tokenize
import Gargantext.Core.Text.Terms.Tokenize.Types qualified as Tokenize
import Gargantext.Core.Types (TokenTag(..))
import Gargantext.Core.Utils.DateUtils (utc2year) import Gargantext.Core.Utils.DateUtils (utc2year)
import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Hyperdata.CorpusField import Gargantext.Database.Admin.Types.Hyperdata.CorpusField
...@@ -32,6 +39,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..)) ...@@ -32,6 +39,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNode (selectPublicNodes) import Gargantext.Database.Query.Table.NodeNode (selectPublicNodes)
import Gargantext.Database.Schema.Node -- (NodePoly(..)) import Gargantext.Database.Schema.Node -- (NodePoly(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Utils.Aeson qualified as GUA import Gargantext.Utils.Aeson qualified as GUA
import Servant import Servant
import Test.QuickCheck (elements) import Test.QuickCheck (elements)
...@@ -40,10 +48,12 @@ import Test.QuickCheck.Arbitrary ...@@ -40,10 +48,12 @@ import Test.QuickCheck.Arbitrary
------------------------------------------------------------------------ ------------------------------------------------------------------------
type API = API_Home type API = API_Home
:<|> API_Node :<|> API_Node
:<|> API_NLP
api :: Text -> GargServer API api :: Text -> GargServer API
api baseUrl = (api_home baseUrl) api baseUrl = (api_home baseUrl)
:<|> api_node :<|> api_node
:<|> api_nlp
------------------------------------------------------------------------- -------------------------------------------------------------------------
type API_Home = Summary " Public Home API" type API_Home = Summary " Public Home API"
...@@ -70,6 +80,60 @@ api_node nId = do ...@@ -70,6 +80,60 @@ api_node nId = do
------------------------------------------------------------------------- -------------------------------------------------------------------------
-- | Public NLP endpoints:
--
--   * @POST /nlp/tokenize@        — token tags for a text
--   * @POST /nlp/highlight@       — occurrences of the given terms in a text
--   * @POST /nlp/highlight-total@ — highlights plus the non-highlighted
--     gaps, i.e. a partition of the whole text
type API_NLP = Summary " NLP"
            :> "nlp"
            :> ("tokenize"
                :> ReqBody '[JSON] TokenizeData
                :> Post '[JSON] [TokenTag]
               :<|> "highlight"
                :> ReqBody '[JSON] HighlightData
                :> Post '[JSON] [Tokenize.HighlightedTerm]
               :<|> "highlight-total"
                :> ReqBody '[JSON] HighlightData
                :> Post '[JSON] [Tokenize.HighlightResult] )

-- | Server for 'API_NLP'; handlers are defined below, in route order.
api_nlp :: GargServer API_NLP
api_nlp = api_tokenize
     :<|> api_highlight
     :<|> api_highlight_total
-- | Request body for the @/nlp/tokenize@ endpoint.
data TokenizeData = TokenizeData
  { _td_lang :: Lang  -- ^ language used to pick the NLP server
  , _td_text :: Text  -- ^ raw text to tokenize
  }
  deriving (Show, Eq, Generic)

instance ToSchema TokenizeData where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_td_")
-- | Handler for @/nlp/tokenize@: tokenize '_td_text' with the NLP
-- server configured for '_td_lang'.
api_tokenize :: (GargServerC env err m) => TokenizeData -> m [TokenTag]
api_tokenize (TokenizeData { .. }) =
  -- NOTE(review): dropped the leftover @putText@ debug statement that
  -- dumped the full tokenization result on every request.
  Tokenize.tokenize _td_lang _td_text
-- | Request body for the @/nlp/highlight@ and @/nlp/highlight-total@
-- endpoints.
data HighlightData = HighlightData
  { _hd_lang  :: Lang    -- ^ language used to pick the NLP server
  , _hd_text  :: Text    -- ^ text to search in
  , _hd_terms :: [Text]  -- ^ terms to highlight inside '_hd_text'
  }
  deriving (Show, Eq, Generic)

instance ToSchema HighlightData where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_hd_")
-- | Handler for @/nlp/highlight@: return the occurrences of
-- '_hd_terms' found (token-wise) inside '_hd_text'.
api_highlight :: (GargServerC env err m) => HighlightData -> m [Tokenize.HighlightedTerm]
api_highlight (HighlightData { .. }) =
  -- Redundant 'do' removed: this is a single monadic expression.
  Tokenize.highlightTerms (NgramsTerm <$> _hd_terms) _hd_lang _hd_text
-- | Similar to 'api_highlight' but returns an array containing the
-- whole text, i.e. if it's not highlighted, it is still part of the
-- returned array (non-highlighted slices become gap entries).
api_highlight_total :: (GargServerC env err m) => HighlightData -> m [Tokenize.HighlightResult]
api_highlight_total hd@(HighlightData { .. }) = do
  hts <- api_highlight hd
  -- NOTE(review): dropped the leftover @putText@ debug statement that
  -- echoed the full user-supplied text into the logs on every request.
  pure $ Tokenize.fillHighlightGaps hts _hd_text
-------------------------------------------------------------------------
selectPublic :: HasNodeError err selectPublic :: HasNodeError err
=> DBCmd err [( Node HyperdataFolder, Maybe Int)] => DBCmd err [( Node HyperdataFolder, Maybe Int)]
...@@ -153,3 +217,8 @@ defaultPublicData = ...@@ -153,3 +217,8 @@ defaultPublicData =
, date = "YY/MM/DD" , date = "YY/MM/DD"
, database = "database" , database = "database"
, author = "Author" } , author = "Author" }
-- JSON instances for the request bodies, stripping the record prefixes
-- so the wire format uses e.g. @lang@/@text@/@terms@.
$(deriveJSON (unPrefix "_td_") ''TokenizeData)
$(deriveJSON (unPrefix "_hd_") ''HighlightData)
...@@ -17,6 +17,7 @@ module Gargantext.Core.Text.Terms.Multi ...@@ -17,6 +17,7 @@ module Gargantext.Core.Text.Terms.Multi
( multiterms ( multiterms
, multiterms_rake , multiterms_rake
, tokenTags , tokenTags
, tokenTagsNoGroup
, cleanTextForNLP ) , cleanTextForNLP )
where where
...@@ -76,6 +77,23 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do ...@@ -76,6 +77,23 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
tokenTags _ l _ = panicTrace $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l) tokenTags _ l _ = panicTrace $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
-- | Like 'tokenTags', but skips the language-specific ngram grouping:
-- returns the raw, per-sentence token tags from the NLP backend.
-- CoreNLP is supported for EN and FR only; Spacy handles any language;
-- everything else is a hard error, mirroring 'tokenTags'.
tokenTagsNoGroup :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTagsNoGroup (NLPServerConfig { server = CoreNLP, url }) EN txt =
  CoreNLP.coreNLPTokenTags url EN txt
tokenTagsNoGroup (NLPServerConfig { server = CoreNLP, url }) FR txt =
  CoreNLP.coreNLPTokenTags url FR txt
tokenTagsNoGroup (NLPServerConfig { server = Spacy, url }) _l txt =
  -- NOTE(review): removed the block of commented-out dead code that was
  -- copied along with this equation.
  SpacyNLP.nlpTokenTags url txt
tokenTagsNoGroup _ l _ =
  panicTrace $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
---- | This function analyses and groups (or not) ngrams according to ---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language. ---- specific grammars of each language.
......
...@@ -147,6 +147,7 @@ corenlp uri lang txt = do ...@@ -147,6 +147,7 @@ corenlp uri lang txt = do
coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]] coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]]
coreNLPTokenTags uri lang txt = do coreNLPTokenTags uri lang txt = do
document <- corenlp uri lang txt document <- corenlp uri lang txt
putText $ "[coreNLPTokenTags] document: " <> show document
pure $ map tokens2tokensTags $ allTokens document pure $ map tokens2tokensTags $ allTokens document
-- | parseWith -- | parseWith
......
...@@ -9,21 +9,119 @@ Portability : POSIX ...@@ -9,21 +9,119 @@ Portability : POSIX
-} -}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms.Tokenize module Gargantext.Core.Text.Terms.Tokenize
where where
import Control.Lens (view) import Control.Lens (view) -- over
import Control.Monad.Trans.Control (MonadBaseControl) import Data.ExtendedReal (Extended(..))
import Gargantext.Core (Lang) import Data.Interval ((<=..<=))
import Gargantext.Core.NLP (nlpServerGet, HasNLPServer) import Data.Interval qualified as I
import Gargantext.Core.Text.Terms.Multi (tokenTags) import Data.IntervalSet qualified as IS
import Gargantext.Core.Types (TokenTag(..)) import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang, NLPServerConfig(..)) --, PosTagAlgo(CoreNLP))
import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.Text.Terms.Multi (tokenTagsNoGroup)
import Gargantext.Core.Text.Terms.Tokenize.Types
import Gargantext.Core.Types (TokenTag(..), POS(..)) --, my_token_offset_end)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.Array (window)
tokenize :: ( HasNLPServer env -- | Just pick an NLP server and tokenize the given string using given
, MonadReader env m -- language.
, MonadBaseControl IO m) => Lang -> Text -> m [TokenTag] tokenize :: HasTokenizer env m
=> Lang -> Text -> m [TokenTag]
tokenize lang txt = do tokenize lang txt = do
nlp <- view (nlpServerGet lang) nlp <- view (nlpServerGet lang)
liftBase $ concat <$> tokenTags nlp lang txt ret <- liftBase $ concat <$> tokenTagsNoGroup nlp lang txt
let f = case server nlp of
-- CoreNLP -> over my_token_offset_end (\o -> o - 1)
_ -> identity
pure $ f <$> ret
-------
-- | This function, given a list of 'NgramsTerm' and a text,
-- highlights these terms using the 'tokenize' function above:
-- each term is itself tokenized, then matched (lemma-wise) against
-- sliding windows of the text's tokens.
highlightTerms :: HasTokenizer env m
               => [NgramsTerm] -> Lang -> Text -> m [HighlightedTerm]
highlightTerms ngramsTerms lang txt = do
  -- Tokenize the haystack once, dropping not-found/punctuation tokens.
  txtTokens <- relevantTokens <$> tokenize lang txt
  tokenizedTerms <- mapM tokenizeTerms ngramsTerms
  -- NOTE(review): removed the two leftover @putText@ debug statements
  -- that dumped the token lists on every call.
  -- TODO This isn't the most optimal, of O(n*m) complexity. One can
  -- try to compute hashes, incrementally, for the windowed tokens
  pure $ catMaybes $ concat $ highlight txt txtTokens <$> tokenizedTerms
  where
    -- Tokenize a single term, keeping only its relevant tokens.
    tokenizeTerms :: HasTokenizer env m => NgramsTerm -> m (NgramsTerm, [TokenTag])
    tokenizeTerms t = do
      tt <- relevantTokens <$> tokenize lang (unNgramsTerm t)
      pure (t, tt)
-- | Fills in all "gaps" created by 'highlightTerms', i.e. inserts
-- text parts where there are no highlights, so the result covers the
-- whole input text, sorted by start offset.
fillHighlightGaps :: [HighlightedTerm] -> Text -> [HighlightResult]
fillHighlightGaps hts txt = sortBy compareHR ((HRHighlighted <$> hts) <> gapHt)
  where
    -- The whole text as a single closed interval [0, length].
    txtInt = IS.singleton (Finite 0 <=..<= (Finite $ T.length txt))
    -- Order results by the start offset of their character span.
    compareHR hr1 hr2 = compare (I.lowerBound $ hrToInterval hr1) (I.lowerBound $ hrToInterval hr2)
    -- Intervals covered by highlights ...
    htIntervals = IS.fromList (htToInterval <$> hts)
    -- ... and their complement within the text: the gaps.
    intDiff = IS.toList (IS.difference txtInt htIntervals)
    -- (f <$> g) uses the function Functor, i.e. this maps
    -- (HRNormal . intervalToNt txt) over the gap intervals.
    gapHt = HRNormal <$> intervalToNt txt <$> intDiff
------- UTILITY FUNCTIONS
-- | Keep only the tokens that matter for term highlighting.
-- Tokens whose POS tag resolved to 'NotFound' (hyphens, punctuation and
-- the like) would otherwise prevent matching terms separated e.g. with
-- dashes, so they are dropped.
relevantTokens :: [TokenTag] -> [TokenTag]
relevantTokens = filter isRelevant
  where
    -- A token is irrelevant exactly when its POS tag is 'NotFound'.
    isRelevant (TokenTag { _my_token_pos = Just (NotFound {}) }) = False
    isRelevant _                                                 = True
-- | Slide a window (sized to the term's own token count) over the
-- text's tokens and produce a 'HighlightedTerm' for every window whose
-- lemma sets match the term's (one 'Maybe' per window position).
highlight :: Text -> [TokenTag] -> (NgramsTerm, [TokenTag]) -> [Maybe HighlightedTerm]
highlight txt txtTokens (ngramsTerm, tokenizedTerms) =
  highlightInWindow (ngramsTerm, tokenizedTerms) <$> (window (length tokenizedTerms) txtTokens)
  where
    -- Match one window of text tokens against the tokenized term.
    highlightInWindow :: (NgramsTerm, [TokenTag]) -> [TokenTag] -> Maybe HighlightedTerm
    highlightInWindow (nt, tt) windowTxtTokens =
      -- NOTE(review): 'head' here is matched against 'Just', so it must
      -- be the total, Maybe-returning head from the project prelude,
      -- not Prelude's partial one.
      case ( compareSets (_my_token_lemma <$> tt) (_my_token_lemma <$> windowTxtTokens)
           , head windowTxtTokens
           , lastMay windowTxtTokens ) of
        ( True, Just h, Just l ) ->
          -- Character span: from the first window token's begin offset
          -- to the last window token's end offset.
          let ( lb, ub ) = ( _my_token_offset_begin h, _my_token_offset_end l )
          in
            Just (HighlightedTerm { _ht_term = unNgramsTerm nt
                                  , _ht_original_text = T.take (ub - lb) $ T.drop lb txt
                                  , _ht_start = lb
                                  , _ht_end = ub })
        _ -> Nothing
-- | True iff the two sets share at least one element.
intersects :: Ord a => Set a -> Set a -> Bool
intersects a b = not (Set.disjoint a b)

-- | We treat lemmas as equal, if sets intersect. This is a comparison
-- function for a list of such sets: both lists must have the same
-- length and every positional pair of sets must intersect.
compareSets :: Ord a => [Set a] -> [Set a] -> Bool
compareSets xs ys = length xs == length ys && and (zipWith intersects xs ys)
{-|
Module : Gargantext.Core.Text.Terms.Tokenize.Types
Description : String tokenization
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Terms.Tokenize.Types
where
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Aeson.TH (defaultOptions, deriveJSON)
import Data.ExtendedReal (Extended(..))
import Data.Interval ((<=..<=))
import Data.Interval qualified as I
import Data.Swagger (ToSchema, declareNamedSchema, defaultSchemaOptions, genericDeclareNamedSchema, genericDeclareNamedSchemaUnrestricted)
import Data.Text qualified as T
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Prelude
-- | Constraints needed by the tokenizing functions: access to an NLP
-- server via the reader environment, plus the ability to lift 'IO'.
type HasTokenizer env m = ( HasNLPServer env
                          , MonadReader env m
                          , MonadBaseControl IO m )
------------------------------
-- NOTE: To highlight terms, we actually need to know what these terms
-- are. Terms consist of compounds of, possibly, multiple tokens and
-- it's not the same as NLP tokenization.
-- https://en.wikipedia.org/wiki/Terminology_extraction

-- | One occurrence of a term found in a text, with character offsets
-- into the original text.
data HighlightedTerm =
  HighlightedTerm { _ht_term          :: Text  -- ^ the term that matched
                  , _ht_original_text :: Text  -- ^ exact slice of the source text
                  , _ht_start         :: Int -- start position of the term
                  , _ht_end           :: Int -- end position of the term
                  }
  deriving (Show, Eq, Generic)

instance ToSchema HighlightedTerm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_ht_")

-- | Character span of a highlight as a closed interval [start, end].
htToInterval :: HighlightedTerm -> I.Interval Int
htToInterval (HighlightedTerm { .. }) = (Finite _ht_start) <=..<= (Finite _ht_end)
------------------------------
-- | A non-highlighted slice of the source text, with character offsets
-- into the original text.
data NormalText =
  NormalText { _nt_text  :: Text  -- ^ the raw text slice
             , _nt_start :: Int   -- ^ start offset in the source text
             , _nt_end   :: Int } -- ^ end offset in the source text
  deriving (Show, Eq, Generic)

instance ToSchema NormalText where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nt_")

-- | Character span of the slice as a closed interval [start, end].
ntToInterval :: NormalText -> I.Interval Int
ntToInterval (NormalText { .. }) = (Finite _nt_start) <=..<= (Finite _nt_end)
-- | Slice @txt@ at the bounds of @int@, producing a 'NormalText'.
-- Non-finite bounds are clamped to 0 (callers such as
-- 'fillHighlightGaps' only ever pass finite intervals).
intervalToNt :: Text -> I.Interval Int -> NormalText
intervalToNt txt int =
  NormalText { _nt_text  = T.take (ub - lb) (T.drop lb txt)
             , _nt_start = lb
             , _nt_end   = ub }
  where
    -- Extract a finite bound, defaulting infinities to 0.
    finiteOrZero (Finite n) = n
    finiteOrZero _          = 0
    lb = finiteOrZero (I.lowerBound int)
    ub = finiteOrZero (I.upperBound int)
------------------------------
-- | Either a highlighted term or a plain-text gap; a list of these can
-- cover the whole source text (see 'fillHighlightGaps').
data HighlightResult =
  HRHighlighted HighlightedTerm
  | HRNormal NormalText
  deriving (Show, Eq, Generic)

instance ToSchema HighlightResult where
  declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions

-- | Character span of either result variant.
hrToInterval :: HighlightResult -> I.Interval Int
hrToInterval (HRHighlighted ht) = htToInterval ht
hrToInterval (HRNormal nt) = ntToInterval nt
-- JSON instances: record types strip their field prefixes;
-- 'HighlightResult' keeps the default (constructor-tagged) encoding.
$(deriveJSON (unPrefix "_ht_") ''HighlightedTerm)
$(deriveJSON (unPrefix "_nt_") ''NormalText)
$(deriveJSON defaultOptions ''HighlightResult)
...@@ -22,6 +22,12 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main ...@@ -22,6 +22,12 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, DebugMode(..), withDebugMode , DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount , Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..) , TokenTag(..), POS(..), NER(..)
, my_token_offset_begin
, my_token_offset_end
, my_token_lemma
, my_token_ner
, my_token_pos
, my_token_word
, combineTokenTags, emptyTokenTag , combineTokenTags, emptyTokenTag
, Label, Stems , Label, Stems
, HasValidationError(..), assertValid , HasValidationError(..), assertValid
...@@ -39,13 +45,13 @@ import Data.Maybe ...@@ -39,13 +45,13 @@ import Data.Maybe
import Data.Monoid import Data.Monoid
import Data.Set (empty) import Data.Set (empty)
import Data.String import Data.String
import Data.Swagger (ToParamSchema) import Data.Swagger (ToParamSchema, defaultSchemaOptions, genericDeclareNamedSchema, genericDeclareNamedSchemaUnrestricted)
import Data.Swagger (ToSchema(..)) import Data.Swagger (ToSchema(..))
import Data.Text (unpack) import Data.Text (unpack)
import Data.Validity import Data.Validity
import GHC.Generics import GHC.Generics
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, wellNamedSchema) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger, wellNamedSchema)
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Prelude hiding (Ordering, empty) import Gargantext.Prelude hiding (Ordering, empty)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary) import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
...@@ -133,9 +139,11 @@ instance FromJSON POS where ...@@ -133,9 +139,11 @@ instance FromJSON POS where
instance ToJSON POS instance ToJSON POS
instance Hashable POS instance Hashable POS
-- Swagger schema for 'POS'; the unrestricted generic variant is used,
-- presumably because the plain one rejects this sum type's shape — TODO confirm.
instance ToSchema POS where
  declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
------------------------------------------------------------------------ ------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER { noNer :: !Text } data NER = PERSON | ORGANIZATION | LOCATION | NoNER { noNer :: !Text }
deriving (Show, Generic) deriving (Show, Eq, Generic)
------------------------------------------------------------------------ ------------------------------------------------------------------------
instance FromJSON NER where instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x)) parseJSON = withText "String" (\x -> pure (ner $ unpack x))
...@@ -149,6 +157,8 @@ instance FromJSON NER where ...@@ -149,6 +157,8 @@ instance FromJSON NER where
ner x = NoNER (cs x) ner x = NoNER (cs x)
instance ToJSON NER instance ToJSON NER
-- Swagger schema for 'NER'; uses the unrestricted generic variant,
-- matching the 'POS' instance above.
instance ToSchema NER where
  declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
data TokenTag = TokenTag { _my_token_word :: [Text] data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_lemma :: Set Text , _my_token_lemma :: Set Text
...@@ -156,8 +166,11 @@ data TokenTag = TokenTag { _my_token_word :: [Text] ...@@ -156,8 +166,11 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_ner :: Maybe NER , _my_token_ner :: Maybe NER
, _my_token_offset_begin :: Int , _my_token_offset_begin :: Int
, _my_token_offset_end :: Int , _my_token_offset_end :: Int
} deriving (Show) } deriving (Show, Eq, Generic)
-- Swagger schema for 'TokenTag', stripping the "_my_token_" prefix
-- from field names.
instance ToSchema TokenTag where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_my_token_")
$(deriveJSON (unPrefix "_my_token_") ''TokenTag) $(deriveJSON (unPrefix "_my_token_") ''TokenTag)
-- Lenses for 'TokenTag' fields (re-exported from this module's export list).
makeLenses ''TokenTag
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And -- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
-- lemma combining is just wrong. You can't just "cat" <> "woman" to -- lemma combining is just wrong. You can't just "cat" <> "woman" to
......
{-|
Module : Gargantext.Utils.Array
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling arrays.
-}
module Gargantext.Utils.Array where
import Protolude
-- | A sliding window of given size over a list.
-- https://stackoverflow.com/questions/27726739/implementing-an-efficient-sliding-window-algorithm-in-haskell
--
-- >>> window 2 [1,2,3]
-- [[1,2],[2,3]]
--
-- NOTE(review): guard non-positive sizes. Without it,
-- @foldr (zipWith (:)) (repeat []) . take 0 . tails@ reduces to the
-- INFINITE list @repeat []@, which hangs callers (e.g. highlighting a
-- term that tokenizes to zero relevant tokens).
window :: Int -> [a] -> [[a]]
window size
  | size <= 0 = const []
  | otherwise = foldr (zipWith (:)) (repeat []) . take size . tails
...@@ -14,6 +14,7 @@ import Gargantext.Prelude ...@@ -14,6 +14,7 @@ import Gargantext.Prelude
import qualified Test.Core.Text.Corpus.Query as CorpusQuery import qualified Test.Core.Text.Corpus.Query as CorpusQuery
import qualified Test.Core.Utils as Utils import qualified Test.Core.Utils as Utils
import qualified Test.Core.Text.Tokenize as Tokenize
import qualified Test.Graph.Clustering as Graph import qualified Test.Graph.Clustering as Graph
import qualified Test.Ngrams.NLP as NLP import qualified Test.Ngrams.NLP as NLP
import qualified Test.Ngrams.Query as NgramsQuery import qualified Test.Ngrams.Query as NgramsQuery
...@@ -35,6 +36,7 @@ main = do ...@@ -35,6 +36,7 @@ main = do
cryptoSpec <- testSpec "Crypto" Crypto.test cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test jobsSpec <- testSpec "Jobs" Jobs.test
tokenizeSpec <- testSpec "Tokenize" Tokenize.test
defaultMain $ testGroup "Gargantext" defaultMain $ testGroup "Gargantext"
[ utilSpec [ utilSpec
...@@ -44,6 +46,7 @@ main = do ...@@ -44,6 +46,7 @@ main = do
, cryptoSpec , cryptoSpec
, nlpSpec , nlpSpec
, jobsSpec , jobsSpec
, tokenizeSpec
, NgramsQuery.tests , NgramsQuery.tests
, CorpusQuery.tests , CorpusQuery.tests
, JSON.tests , JSON.tests
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment