{-|
Module      : Gargantext.Core.Text.Terms.WithList
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.

-}
{-# LANGUAGE ViewPatterns      #-}

module Gargantext.Core.Text.Terms.WithList (
    termsInText
  , buildPatterns
  , Patterns
  , Pattern(_pat_length, _pat_terms)
  , MatchedText
  , buildPatternsWith
  , extractTermsWithList
  , extractTermsWithList'

  -- * Properties
  , prop_patterns_internal_consistency

  -- * For debugging
  , ReplaceTerms(..)
  , replaceTerms
  ) where

import Prelude (show)

import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List
import Data.Ord
import Data.Text (concat)
import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context ( Corpus, TermList, Label, MultiTerm )
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat)
import Test.QuickCheck (Property, counterexample, conjoin, (===))

------------------------------------------------------------------------

-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern
  { -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
    -- To not be leaked outside.
    _pat_table  :: !(KMP.Table Text)
  , _pat_length :: !Int
  , _pat_terms  :: ![Text]
  }

instance Show Pattern where
  show Pattern{..} = "Pattern { _pat_length = " <> Prelude.show _pat_length <> ", _pat_terms = " <> Prelude.show _pat_terms <> "}"

type Patterns = [Pattern]

------------------------------------------------------------------------

data ReplaceTerms = KeepAll | LongestOnly

-- | Given a 'ReplaceTerms' strategy, patterns and a split text,
-- return matching terms according to strategy.  This function is
-- usually applied to words in the whole sentence (i.e. 'terms'
-- variable contains a tokenized sentence, coming from
-- 'monoTextsBySentence').
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rTerms pats terms =
  List.concat (
    mapMaybe (\(ix, _t) ->
                case IntMap.lookup ix m of
                  Nothing -> Nothing
                  -- lst :: [(Int, [Text])]
                  -- snd <$> lst :: [[Text]]
                  Just lst -> Just (snd <$> lst)) $ zip [0..] terms
    )
--replaceTerms rTerms pats terms = go 0
  where
    -- termsLen :: Int
    -- termsLen = length terms

    -- go :: Int -> [[Text]]
    -- go ix | ix >= termsLen = []
    --       | otherwise =
    --   case IntMap.lookup ix m of
    --     Nothing -> go (ix + 1)
    --     Just (len, term) ->
    --       term : go (ix + len)

    m :: IntMap [(Int, [Text])]
    m = toMap
        [ (ix, (_pat_length, _pat_terms))
        | Pattern { .. } <- pats
        , ix <- KMP.match _pat_table terms ]

    toMap :: [(IntMap.Key, (Int, [Text]))] -> IntMap [(Int, [Text])]
    toMap kv = case rTerms of
      KeepAll -> IntMap.fromListWith (<>) (second (:[]) <$> kv)
      LongestOnly -> IntMap.map (:[]) $ IntMap.fromListWith merge kv
        where
          merge (len1, lab1) (len2, lab2) =
            if len2 < len1 then (len1, lab1) else (len2, lab2)

buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1  k, [])) ts
buildPatternsWith _  ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts

buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
  where
    buildPattern :: Label -> [MultiTerm] -> [Pattern]
    buildPattern label alts = mapMaybe (mkPattern label) $ map (filter (/= "")) (label : alts)

    mkPattern :: Label -> [Text] -> Maybe Pattern
    mkPattern label alt
      | "" `elem` alt = Nothing
      | null alt      = Nothing
      | otherwise     = Just $
        -- /IMPORTANT/ The call to 'toLower' is crucial, because it means
        -- we will be building a pattern that will match lowercase terms,
        -- which is exactly what we will be given as part of 'termsInText',
        -- that calls 'monoTextsBySentence'. If we don't lower here it
        -- means we won't be getting matches, whereas in theory we could.
        Pattern { _pat_table  = KMP.build $ map T.toLower alt
                , _pat_length = length alt
                , _pat_terms  = map T.toLower label }
                         --(Terms label $ Set.empty) -- TODO check stems


--------------------------------------------------------------------------
-- Utils
type MatchedText = Text

termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText lang pats (manipulateText lang -> txt) =
  groupWithCounts $ List.concatMap (map unwords) (extractTermsWithList pats txt)

-- | Manipulates the input 'Text' before passing it to 'termsInText'.
-- In particular, if the language is Chinese (ZH), we add spaces.
manipulateText :: Lang -> Text -> Text
manipulateText ZH txt = addSpaces txt
manipulateText _  txt = txt

--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence

-- | Extract terms
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats txt = map concat $ List.concat $ extractTermsWithList pats txt

--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = T.intersperse ' '


--------------------------------------------------------------------------

{- | Not used
filterWith :: TermList
           -> (a -> Text)
           -> [a]
           -> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs


filterWith' :: TermList
           -> (a -> Text)
           -> ([a] -> [[Text]] -> [b])
           -> [a]
           -> [b]
filterWith' termList f f' xs = f' xs
                            $ map (extractTermsWithList' pats)
                            $ map f xs
    where
      pats = buildPatterns termList
-}

prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
  conjoin [
    counterexample "{\RS1"     $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
  , counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
  , counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
  , counterexample "no match"  $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
  ]
