{-|
Module      : Gargantext.Core.Text.Terms.WithList
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.

-}
{-# LANGUAGE BangPatterns      #-}
{-# LANGUAGE ViewPatterns      #-}

module Gargantext.Core.Text.Terms.WithList (
    termsInText
  , buildPatterns
  , Patterns
  , Pattern(_pat_length, _pat_terms)
  , MatchedText
  , buildPatternsWith
  , extractTermsWithList
  , extractTermsWithList'

  -- * Properties
  , prop_patterns_internal_consistency
  ) where

import Prelude (show)

import Data.Algorithms.KMP qualified as KMP
import Data.IntMap.Strict qualified as IntMap
import Data.List qualified as List
import Data.Ord
import Data.Text (concat)
import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude hiding (concat)
import Test.QuickCheck (Property, counterexample, conjoin, (===))

------------------------------------------------------------------------

-- | A 'Pattern' encapsulate a string search (i.e. a pattern) in a text.
data Pattern = Pattern
  { -- | The internal state of the underlying library implementing the Knuth-Morris-Pratt.
    -- To not be leaked outside.
    _pat_table  :: !(KMP.Table Text)
  , _pat_length :: !Int
  , _pat_terms  :: ![Text]
  }

instance Show Pattern where
  show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"

type Patterns = [Pattern]

------------------------------------------------------------------------

data ReplaceTerms = KeepAll | LongestOnly

replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix | ix >= terms_len = []
          | otherwise =
      case IntMap.lookup ix m of
        Nothing -> go (ix + 1)
        Just (len, term) ->
          term : go (ix + len)

    m = toMap
        [ (ix, (len, term))
        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]

    toMap = case rplaceTerms of
      KeepAll -> IntMap.fromList
      LongestOnly -> IntMap.fromListWith merge
        where
          merge (len1, lab1) (len2, lab2) =
            if len2 < len1 then (len1, lab1) else (len2, lab2)

buildPatternsWith :: Lang -> [NgramsTerm] -> Patterns
buildPatternsWith ZH ts = buildPatterns $ map (\(NgramsTerm k) -> (T.chunksOf 1  k, [])) ts
buildPatternsWith _  ts = buildPatterns $ map (\(NgramsTerm k) -> (T.splitOn " " k, [])) ts

buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
  where
    buildPattern :: Label -> [MultiTerm] -> [Pattern]
    buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)

    mkPattern :: Label -> [Text] -> Maybe Pattern
    mkPattern label alt
      | "" `elem` alt = Nothing
      | null alt      = Nothing
      | otherwise     = Just $
        -- /IMPORTANT/ The call to 'toLower' is crucial, because it means
        -- we will be building a pattern that will match lowercase terms,
        -- which is exactly what we will be given as part of 'termsInText',
        -- that calls 'monoTextsBySentence'. If we don't lower here it
        -- means we won't be getting matches, whereas in theory we could.
        Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
                         --(Terms label $ Set.empty) -- TODO check stems


--------------------------------------------------------------------------
-- Utils
type MatchedText = Text

termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText lang pats (manipulateText lang -> txt) =
  groupWithCounts $ List.concat
                  $ map (map unwords)
                  $ extractTermsWithList pats txt

-- | Manipulates the input 'Text' before passing it to 'termsInText'.
-- In particular, if the language is Chinese (ZH), we add spaces.
manipulateText :: Lang -> Text -> Text
manipulateText ZH txt = addSpaces txt
manipulateText _  txt = txt

--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence

-- | Extract terms
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
                           . monoTextsBySentence

--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = T.unwords . (T.chunksOf 1)


--------------------------------------------------------------------------

{- | Not used
filterWith :: TermList
           -> (a -> Text)
           -> [a]
           -> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs


filterWith' :: TermList
           -> (a -> Text)
           -> ([a] -> [[Text]] -> [b])
           -> [a]
           -> [b]
filterWith' termList f f' xs = f' xs
                            $ map (extractTermsWithList' pats)
                            $ map f xs
    where
      pats = buildPatterns termList
-}

prop_patterns_internal_consistency :: Text -> Property
prop_patterns_internal_consistency termFragment =
  conjoin [
    counterexample "{\RS1"     $ KMP.match (KMP.build [ "{\RS1" :: Text ]) [ "{\RS1" ] === [ 0 ]
  , counterexample "\1099831?" $ KMP.match (KMP.build [ "\1099831?" :: Text ]) [ "\1099831?" ] === [ 0 ]
  , counterexample "k2(" $ KMP.match (KMP.build [ "k2(" :: Text ]) [ "k2(" ] === [ 0 ]
  , counterexample "no match"  $ length (KMP.match (KMP.build [termFragment]) [termFragment]) > 0
  ]
