{-|
Module      : Gargantext.Core.Text.Terms.Tokenize
Description : String tokenization
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}


{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Core.Text.Terms.Tokenize
where

import Control.Lens (view)  -- over
import Data.ExtendedReal (Extended(..))
import Data.Interval ((<=..<=))
import Data.Interval qualified as I
import Data.IntervalSet qualified as IS
import Data.Set qualified as Set
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang, NLPServerConfig(..)) --, PosTagAlgo(CoreNLP))
import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.Text.Terms.Multi (tokenTagsNoGroup)
import Gargantext.Core.Text.Terms.Tokenize.Types
import Gargantext.Core.Types (TokenTag(..), POS(..))  --, my_token_offset_end)
import Gargantext.Prelude
import Gargantext.Utils.Array (window)


-- | Just pick an NLP server and tokenize the given string using given
-- language.
tokenize :: HasTokenizer env m
         => Lang -> Text -> m [TokenTag]
tokenize lang txt = do
  nlp <- view (nlpServerGet lang)
  ret <- liftBase $ concat <$> tokenTagsNoGroup nlp lang txt

  let f = case server nlp of
            -- CoreNLP -> over my_token_offset_end (\o -> o - 1)
            _ -> identity

  pure $ f <$> ret


-------

-- | This function, given a list of 'NgramsTerm' and a text,
-- highlights these terms using the 'tokenize' function above.
highlightTerms :: HasTokenizer env m
               => [NgramsTerm] -> Lang -> Text -> m [HighlightedTerm]
highlightTerms ngramsTerms lang txt = do
  txtTokens' <- tokenize lang txt
  let txtTokens = relevantTokens txtTokens'
  liftBase $ putText $ "[highlightTerms] txtTokens: " <> show txtTokens
  tokenizedTerms <- mapM tokenizeTerms ngramsTerms
  liftBase $ putText $ "[highlightTerms] tokenizedTerms: " <> show tokenizedTerms
  -- TODO This isn't the most optimal, of O(n*m) complexity. One can
  -- try to compute hashes, incrementally, for the windowed tokens
  let ht = highlight txt txtTokens <$> tokenizedTerms
  pure $ catMaybes $ concat ht
  where
    tokenizeTerms :: HasTokenizer env m => NgramsTerm -> m (NgramsTerm, [TokenTag])
    tokenizeTerms t = do
      tt' <- tokenize lang $ unNgramsTerm t
      let tt = relevantTokens tt'
      pure (t, tt)



-- | Fills in all "gaps" created by 'highlightTerms', i.e. inserts
-- text parts where there are no highlights.
fillHighlightGaps :: [HighlightedTerm] -> Text -> [HighlightResult]
fillHighlightGaps hts txt = sortBy compareHR ((HRHighlighted <$> hts) <> gapHt)
  where
    txtInt = IS.singleton (Finite 0 <=..<= (Finite $ T.length txt))
    compareHR hr1 hr2 = compare (I.lowerBound $ hrToInterval hr1) (I.lowerBound $ hrToInterval hr2)
    htIntervals = IS.fromList (htToInterval <$> hts)
    intDiff = IS.toList (IS.difference txtInt htIntervals)
    gapHt = HRNormal <$> intervalToNt txt <$> intDiff


------- UTILITY FUNCTIONS

-- | Keep only relevant tokens for token highlight. This is because
-- things like hyphens etc prevent us from highlighting terms
-- separated e.g. with dashes.
relevantTokens :: [TokenTag] -> [TokenTag]
relevantTokens = filter f
  where
    f (TokenTag { .. }) =
      case _my_token_pos of
        Just (NotFound { }) -> False
        _            -> True

highlight :: Text -> [TokenTag] -> (NgramsTerm, [TokenTag]) -> [Maybe HighlightedTerm]
highlight txt txtTokens (ngramsTerm, tokenizedTerms) =
  highlightInWindow (ngramsTerm, tokenizedTerms) <$> (window (length tokenizedTerms) txtTokens)
  where
    highlightInWindow :: (NgramsTerm, [TokenTag]) -> [TokenTag] -> Maybe HighlightedTerm
    highlightInWindow (nt, tt) windowTxtTokens =
      case ( compareSets (_my_token_lemma <$> tt) (_my_token_lemma <$> windowTxtTokens)
           , head windowTxtTokens
           , lastMay windowTxtTokens ) of
        ( True, Just h, Just l ) ->
          let ( lb, ub ) = ( _my_token_offset_begin h, _my_token_offset_end l )
          in
          Just (HighlightedTerm { _ht_term = unNgramsTerm nt
                                , _ht_original_text = T.take (ub - lb) $ T.drop lb txt
                                , _ht_start = lb
                                , _ht_end = ub })
        _ -> Nothing


intersects :: Ord a => Set a -> Set a -> Bool
intersects s1 s2 = not $ Set.disjoint s1 s2


-- | We treat lemmas as equal, if sets intersect. This is a comparison
-- function for a list of such sets.
compareSets :: Ord a => [Set a] -> [Set a] -> Bool
compareSets ss1 ss2 = (length ss1 == length ss2) &&
  all (\(s1, s2) -> intersects s1 s2) (zip ss1 ss2)
