{-|
Module      : Gargantext.Core.Text.Terms.Tokenize.Types
Description : String tokenization
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}


{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Core.Text.Terms.Tokenize.Types
where

import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Aeson.TH (defaultOptions, deriveJSON)
import Data.ExtendedReal (Extended(..))
import Data.Interval ((<=..<=))
import Data.Interval qualified as I
import Data.Swagger (ToSchema, declareNamedSchema, defaultSchemaOptions, genericDeclareNamedSchema, genericDeclareNamedSchemaUnrestricted)
import Data.Text qualified as T
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Prelude



type HasTokenizer env m = ( HasNLPServer env
                          , MonadReader env m
                          , MonadBaseControl IO m )

------------------------------

-- NOTE: To highlight terms, we actually need to know what these terms
-- are. Terms consist of compounds of, possibly, multiple tokens and
-- it's not the same as NLP tokenization.
-- https://en.wikipedia.org/wiki/Terminology_extraction

data HighlightedTerm =
  HighlightedTerm { _ht_term          :: Text
                  , _ht_original_text :: Text
                  , _ht_start         :: Int  -- start position of the term
                  , _ht_end           :: Int  -- end position of the term
                  }
  deriving (Show, Eq, Generic)
instance ToSchema HighlightedTerm where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_ht_")

htToInterval :: HighlightedTerm -> I.Interval Int
htToInterval (HighlightedTerm { .. }) = (Finite _ht_start) <=..<= (Finite _ht_end)

------------------------------

data NormalText =
  NormalText { _nt_text  :: Text
             , _nt_start :: Int
             , _nt_end   :: Int }
  deriving (Show, Eq, Generic)
instance ToSchema NormalText where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nt_")

ntToInterval :: NormalText -> I.Interval Int
ntToInterval (NormalText { .. }) = (Finite _nt_start) <=..<= (Finite _nt_end)

intervalToNt :: Text -> I.Interval Int -> NormalText
intervalToNt txt int = NormalText { _nt_text  = T.take (ub - lb) $ T.drop lb txt
                                  , _nt_start = lb
                                  , _nt_end   = ub }
  where
    lb' = I.lowerBound int
    lb = case lb' of
           Finite l -> l
           _        -> 0
    ub' = I.upperBound int
    ub = case ub' of
           Finite u -> u
           _        -> 0


------------------------------

data HighlightResult =
    HRHighlighted HighlightedTerm
  | HRNormal NormalText
  deriving (Show, Eq, Generic)
instance ToSchema HighlightResult where
  declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions

hrToInterval :: HighlightResult -> I.Interval Int
hrToInterval (HRHighlighted ht) = htToInterval ht
hrToInterval (HRNormal nt)  = ntToInterval nt


$(deriveJSON (unPrefix "_ht_") ''HighlightedTerm)
$(deriveJSON (unPrefix "_nt_") ''NormalText)
$(deriveJSON defaultOptions ''HighlightResult)
