{-|
Module      : Gargantext.Database.TextSearch
Description : Postgres text search experimentation
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

{-# OPTIONS_GHC -fno-warn-deprecations #-}

{-# LANGUAGE Arrows            #-}
{-# LANGUAGE LambdaCase         #-}

module Gargantext.Database.Action.Search (
    searchInCorpus
  , searchInCorpusWithContacts
  , searchCountInCorpus
  , searchInCorpusWithNgrams
  , searchDocInDatabase
  ) where

import Control.Arrow (returnA)
import Control.Lens (view)
import Data.BoolExpr ( BoolExpr(..), Signed(Negative, Positive) )
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Profunctor.Product (p4)
import Data.Set qualified as Set
import Data.Text (unpack)
import Data.Text qualified as T
import Data.Time (UTCTime)
import Gargantext.Core ( Lang(EN), HasDBid(toDBid) )
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Types
import Gargantext.Core.Types.Query (IsTrash, Limit, Offset)
import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Database.Prelude (runOpaQuery, runCountOpaQuery, DBCmd)
import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Filter ( limit', offset' )
import Gargantext.Database.Query.Table.Context ( queryContextSearchTable )
import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
import Gargantext.Database.Query.Table.Node ( queryNodeSearchTable, defaultList )
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Table.NodeContext
import Gargantext.Database.Schema.NodeContext_NodeContext ( NodeContext_NodeContextRead, queryNodeContext_NodeContextTable, ncnc_nodecontext2, ncnc_nodecontext1 )
import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
import Gargantext.Database.Schema.Node ( NodePolySearch(_ns_hyperdata, _ns_search, _ns_typename, _ns_id) )
import Gargantext.Prelude hiding (groupBy)
import Opaleye hiding (Order)
import Opaleye qualified as O hiding (Order)
import Opaleye.TextSearch

--
-- Interpreting a query into a Postgres' TSQuery
--

queryToTsSearch :: API.Query -> Field SqlTSQuery
queryToTsSearch q =
  let (dictionary, transformed) = API.interpretQuery q transformAST
  in sqlToTSQuery dictionary (T.unpack transformed)
  where

    -- It's important to understand how things work under the hood: When we perform
    -- a search, we do it on a /ts vector/ in Postgres, which is already stemmed in
    -- lexemes. For example, this:
    --
    -- SELECT to_tsvector('Effects on postpartum on vitamins and minerals in women');
    --
    -- yields:
    --
    -- 'effect':1 'miner':7 'postpartum':3 'vitamin':5 'women':9
    --
    -- As you can see, minimum processing has happened: plurals have been stripped and
    -- what it looks like the Porter stemming has been applied (we get 'miner' instead
    -- of the original /mineral/, for example.
    --
    -- Therefore, in case of exact match searches, we need to perform stemming /regardless/,
    -- and this stemming should ideally match the one performed by Postgres.
    --
    -- Now, if the user is doing a partial match search (like \"~postpartum\" for example)
    -- then we need to stem /AND/ use the \":*\" operator to perform a
    -- sort of fuzzy search. Compare the followings:
    --
    -- SELECT to_tsvector('Effects on postpartum on vitamins and minerals in women') @@ to_tsquery('postpartum');
    -- SELECT to_tsvector('Effects on postpartum on vitamins and minerals in women') @@ to_tsquery('postpart');
    -- SELECT to_tsvector('Effects on postpartum on vitamins and minerals in women') @@ to_tsquery('postpart:*');
    --
    -- The first will match, the second won't, the third will.
    renderQueryTerms :: [API.QueryTerm] -> T.Text
    renderQueryTerms trms = T.intercalate " & " $ trms <&> \case
      API.QT_exact_match (Term term)
        -> stem EN GargPorterAlgorithm term
      API.QT_partial_match (Term term)
        -> stem EN GargPorterAlgorithm term <> ":*"

    -- Transforms the input query terms and returns the full SQL query to feed Postgres AND
    -- the dictionary to use, see: https://www.postgresql.org/docs/current/textsearch-dictionaries.html
    -- In a nutshell, if we have a partial match operator in our query, we use the \"simple\" dictionary
    -- under the hood, which won't filter stop words, which are sometimes useful, see issue #265.
    transformAST :: BoolExpr [API.QueryTerm] -> (Maybe Dictionary, T.Text)
    transformAST ast = case ast of
      BAnd sub1 sub2
        -> let (d1, sub1Expr) = transformAST sub1
               (d2, sub2Expr) = transformAST sub2
           in (d1 <|> d2, " (" <> sub1Expr <> " & " <> sub2Expr <> ") ")
      BOr sub1 sub2
        -> let (d1, sub1Expr) = transformAST sub1
               (d2, sub2Expr) = transformAST sub2
           in (d1 <|> d2, " (" <> sub1Expr <> " | " <> sub2Expr <> ") ")
      BNot (BConst (Negative term))
        -> transformAST (BConst (Positive term)) -- double negation
      BNot sub
        -> second (\e -> "!" <> e) $ transformAST sub
      -- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
      BTrue
        -> (Nothing, T.empty)
      -- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
      BFalse
        -> (Nothing, T.empty)
      BConst (Positive queryTerms)
        -> (pickDictionary queryTerms,  renderQueryTerms queryTerms)
      -- We can handle negatives via `ANDNOT` with itself.
      BConst (Negative queryTerms)
        -> (pickDictionary queryTerms, "!" <> renderQueryTerms queryTerms)


pickDictionary :: [API.QueryTerm] -> Maybe Dictionary
pickDictionary qs = if any isPartialMatch qs then Just (Dictionary "simple") else Nothing
  where
    isPartialMatch :: API.QueryTerm -> Bool
    isPartialMatch = \case
      API.QT_partial_match{} -> True
      _                      -> False

------------------------------------------------------------------------
searchDocInDatabase :: HasDBid NodeType
                    => ParentId
                    -> Text
                    -> DBCmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
  where
    -- | Global search query where ParentId is Master Node Corpus Id
    queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
    queryDocInDatabase _p q = proc () -> do
        row <- queryNodeSearchTable -< ()
        restrict -< (_ns_search row)    @@ (sqlPlainToTSQuery (unpack q))
        restrict -< (_ns_typename row) .== (sqlInt4 $ toDBid NodeDocument)
        returnA  -< (_ns_id row, _ns_hyperdata row)

------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams :: HasDBid NodeType
               => CorpusId
               -> ListId
               -> IsTrash
               -> NgramsType
               -> [[Text]]
               -> Maybe Offset
               -> Maybe Limit
               -> Maybe OrderBy
               -> DBCmd err [FacetDoc]
searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined

-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
_tfidfAll :: (HasDBid NodeType, HasNodeError err) => CorpusId -> [Int] -> DBCmd err [Int]
_tfidfAll cId ngramIds = do
  let ngramIdsSet = Set.fromList ngramIds
  lId <- defaultList cId
  docsWithNgrams <- runOpaQuery (_queryListWithNgrams lId ngramIds) :: DBCmd err [(Int, Int, Int)]
  -- NOTE The query returned docs with ANY ngramIds. We need to further
  -- restrict to ALL ngramIds.
  let docsNgramsM =
        Map.fromListWith (Set.union)
            [ (ctxId, Set.singleton ngrams_id)
            | (ctxId, ngrams_id, _) <- docsWithNgrams]
  let docsWithAllNgramsS = Set.fromList $ List.map fst $
        List.filter (\(_, docNgrams) ->
                        ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
  let docsWithAllNgrams =
        List.filter (\(ctxId, _, _) ->
                       Set.member ctxId docsWithAllNgramsS) docsWithNgrams
  -- printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
  let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
                                            | (ctxId, _, doc_count) <- docsWithAllNgrams]
  -- printDebug "[tfidfAll] docsWithCounts" docsWithCounts
  let totals = [ ( ctxId
                 , ngrams_id
                 , fromIntegral doc_count :: Double
                 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
               | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
  let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
                                     | (ctxId, _, doc_count, s) <- totals]
  pure $ List.map fst $ List.reverse tfidf_sorted

-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
_queryListWithNgrams :: ListId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
_queryListWithNgrams lId ngramIds = proc () -> do
  row <- queryContextNodeNgramsTable -< ()
  restrict -< (_cnng_node_id row) .== (pgNodeId lId)
  restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
  returnA -< ( _cnng_context_id row
             , _cnng_ngrams_id row
             , _cnng_doc_count row )
  --returnA -< row
  -- returnA -< ( _cnng_context_id row
  --            , _cnng_node_id row
  --            , _cnng_ngrams_id row
  --            , _cnng_ngramsType row
  --            , _cnng_weight row
  --            , _cnng_doc_count row)


------------------------------------------------------------------------
-- | todo add limit and offset and order
searchInCorpus :: HasDBid NodeType
               => CorpusId
               -> IsTrash
               -> API.Query
               -> Maybe Offset
               -> Maybe Limit
               -> Maybe OrderBy
               -> DBCmd err [FacetDoc]
searchInCorpus cId t q o l order = runOpaQuery
                                 $ filterWith o l order
                                 $ queryInCorpus cId t
                                 $ q

searchCountInCorpus :: HasDBid NodeType
                    => CorpusId
                    -> IsTrash
                    -> API.Query
                    -> DBCmd err Int
searchCountInCorpus cId t q = runCountOpaQuery
                            $ queryInCorpus cId t
                            $ q

queryInCorpus :: HasDBid NodeType
              => CorpusId
              -> IsTrash
              -> API.Query
              -> O.Select FacetDocRead
queryInCorpus cId t q = proc () -> do
  c <- queryContextSearchTable -< ()
  nc <- optionalRestrict queryNodeContextTable -<
    \nc' -> (nc' ^. nc_context_id) .== _cs_id c
  restrict -< (view nc_node_id <$> nc) .=== justFields (pgNodeId cId)
  restrict -< if t
                 then (view nc_category <$> nc) .=== justFields (sqlInt4 0)
                 else matchMaybe (view nc_category <$> nc) $ \case
                        Nothing -> toFields False
                        Just c' -> c' .>= sqlInt4 1
  restrict -< (c ^. cs_search)           @@ queryToTsSearch q
  restrict -< (c ^. cs_typename )       .== sqlInt4 (toDBid NodeDocument)
  returnA  -< FacetDoc { facetDoc_id         = c^.cs_id
                       , facetDoc_created    = c^.cs_date
                       , facetDoc_title      = c^.cs_name
                       , facetDoc_hyperdata  = c^.cs_hyperdata
                       , facetDoc_category   = maybeFieldsToNullable (view nc_category <$> nc)
                       , facetDoc_ngramCount = maybeFieldsToNullable (view nc_score <$> nc)
                       , facetDoc_score      = maybeFieldsToNullable (view nc_score <$> nc)
                       }

------------------------------------------------------------------------
searchInCorpusWithContacts
  :: HasDBid NodeType
  => CorpusId
  -> AnnuaireId
  -> API.Query
  -> Maybe Offset
  -> Maybe Limit
  -> Maybe OrderBy
  -> DBCmd err [FacetPaired Int UTCTime HyperdataContact Int]
searchInCorpusWithContacts cId aId q o l _order =
  runOpaQuery $ limit'   l
              $ offset'  o
              $ orderBy (desc _fp_score)
              $ selectGroup cId aId
              $ q

selectGroup :: HasDBid NodeType
            => CorpusId
            -> AnnuaireId
            -> API.Query
            -> Select FacetPairedRead
selectGroup cId aId q = proc () -> do
  (a, b, c, d) <- aggregate (p4 (groupBy, groupBy, groupBy, O.sum))
                            (selectContactViaDoc cId aId q) -< ()
  returnA -< FacetPaired a b c d


selectContactViaDoc
  :: HasDBid NodeType
  => CorpusId
  -> AnnuaireId
  -> API.Query
  -> SelectArr ()
               ( Field SqlInt4
               , Field SqlTimestamptz
               , Field SqlJsonb
               , Field SqlInt4
               )
selectContactViaDoc cId aId query = proc () -> do
  --(doc, (corpus, (_nodeContext_nodeContext, (annuaire, contact)))) <- queryContactViaDoc -< ()
  (contact, annuaire, _, corpus, doc) <- queryContactViaDoc -< ()
  restrict -< matchMaybe (view cs_search <$> doc) $ \case
    Nothing -> toFields False
    Just s  -> s @@ queryToTsSearch query
  restrict -< (view cs_typename <$> doc)          .=== justFields (sqlInt4 (toDBid NodeDocument))
  restrict -< (view nc_node_id <$> corpus)        .=== justFields (pgNodeId cId)
  restrict -< (view nc_node_id <$> annuaire)      .=== justFields (pgNodeId aId)
  restrict -< (contact ^. context_typename) .== sqlInt4 (toDBid NodeContact)
  returnA  -< ( contact ^. context_id
              , contact ^. context_date
              , contact ^. context_hyperdata
              , sqlInt4 1
              )

queryContactViaDoc :: O.Select ( ContextRead
                               , MaybeFields NodeContextRead
                               , MaybeFields NodeContext_NodeContextRead
                               , MaybeFields NodeContextRead
                               , MaybeFields ContextSearchRead )
queryContactViaDoc = proc () -> do
  contact <- queryContextTable -< ()
  annuaire <- optionalRestrict queryNodeContextTable -<
    \annuaire' -> (annuaire' ^. nc_context_id) .== (contact ^. context_id)
  nodeContext_nodeContext <- optionalRestrict queryNodeContext_NodeContextTable -<
    \ncnc' -> justFields (ncnc' ^. ncnc_nodecontext2) .=== (view nc_id <$> annuaire)
  corpus <- optionalRestrict queryNodeContextTable -<
    \corpus' -> justFields (corpus' ^. nc_id) .=== (view ncnc_nodecontext1 <$> nodeContext_nodeContext)
  doc <- optionalRestrict queryContextSearchTable -<
    \doc' -> justFields (doc' ^. cs_id) .=== (view nc_context_id <$> corpus)

  returnA -< (contact, annuaire, nodeContext_nodeContext, corpus, doc)