{-|
Module      : Gargantext.Text.Terms.WithList
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.

-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns      #-}

module Gargantext.Text.Terms.WithList where

import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap

import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)

import Prelude (error)
import Gargantext.Prelude
import Data.List (null, concatMap)
import Data.Ord


------------------------------------------------------------------------

data Pattern = Pattern
  { _pat_table  :: !(KMP.Table Text)
  , _pat_length :: !Int
  , _pat_terms  :: ![Text]
  }
type Patterns = [Pattern]

------------------------------------------------------------------------

replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix | ix >= terms_len = []
          | otherwise =
      case IntMap.lookup ix m of
        Nothing -> go (ix + 1)
        Just (len, term) ->
          term : go (ix + len)


    merge (len1, lab1) (len2, lab2) =
      if len2 < len1 then (len1, lab1) else (len2, lab2)

    m =
      IntMap.fromListWith merge
        [ (ix, (len, term))
        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]

buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
  where
    buildPattern (label, alts) = map f (label : alts)
      where
        f alt | "" `elem` alt = error "buildPatterns: ERR1"
              | null alt      = error "buildPatterns: ERR2"
              | otherwise     =
                Pattern (KMP.build alt) (length alt) label
                        --(Terms label $ Set.empty) -- TODO check stems

extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence