{-|
Module      : Gargantext.Database.Action.Metrics.NgramsByContext
Description : Ngrams by Node user and master
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Indexing ngrams by context enables contextual metrics.

-}

{-# LANGUAGE QuasiQuotes       #-}

module Gargantext.Database.Action.Metrics.NgramsByContext
  where

import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HM
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Database.PostgreSQL.Simple qualified as DPS
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.ToField qualified as DPS
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Database.PostgreSQL.Simple.Types qualified as DPST
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core ( HasDBid(toDBid) )
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Data.HashMap.Strict.Utils as HM ( unionsWith )
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId (..), MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
import Gargantext.Database.Prelude
import Gargantext.Database.Schema.Ngrams ()  -- toDBid instance
import Gargantext.Prelude

-- | Group terms under a representative and count the contexts they occur in.
--
-- Every term @t@ of the input map is filed under the key @f t@. The result is:
--
--   * fst: the number of distinct contexts found across the whole input map;
--   * snd: for each representative, the number of distinct contexts of the
--     terms filed under it, paired with the set of those original terms.
countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
                       -> HashMap NgramsTerm (Set ContextId)
                       -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
countContextsByNgramsWith f m = (corpusSize, HM.map summarise grouped)
  where
    -- Size of the union of all context sets.
    corpusSize :: Double
    corpusSize = fromIntegral (Set.size (Set.unions (HM.elems m)))

    -- Replace the context set by its cardinality and put it first.
    summarise :: (Set NgramsTerm, Set ContextId) -> (Double, Set NgramsTerm)
    summarise (members, ctxs) = (fromIntegral (Set.size ctxs), members)

    -- Terms sharing a representative have their term sets and context sets unioned.
    grouped :: HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
    grouped = HM.fromListWith (<>)
      [ (f term, (Set.singleton term, ctxs)) | (term, ctxs) <- HM.toList m ]

------------------------------------------------------------------------
-- | For a corpus and an ngrams type, map every term to the set of contexts
-- it is attached to. Only contexts whose @nodes_contexts.category@ is
-- strictly positive are considered (see the query below).
getContextsByNgramsUser :: HasDBid NodeType
                        => CorpusId
                        -> NgramsType
                        -> DBQuery err x (HashMap NgramsTerm (Set ContextId))
getContextsByNgramsUser cId nt = indexByTerm <$> fetchRows cId nt
    where

      -- Turn (context, term) rows into a term-indexed map of context sets.
      indexByTerm :: [(ContextId, Text)] -> HashMap NgramsTerm (Set ContextId)
      indexByTerm rows =
        HM.fromListWith (<>)
          [ (NgramsTerm term, Set.singleton ctxId) | (ctxId, term) <- rows ]

      -- Placeholders, in order: corpus id, document typename id, ngrams type id.
      fetchRows :: HasDBid NodeType
                => CorpusId
                -> NgramsType
                -> DBQuery err x [(ContextId, Text)]
      fetchRows corpusId ngramsType =
        mkPGQuery contextTermQuery
                  ( corpusId
                  , toDBid NodeDocument
                  , toDBid ngramsType
                  )

      contextTermQuery :: DPS.Query
      contextTermQuery = [sql|
        SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
          JOIN ngrams         ng ON cng.ngrams_id = ng.id
          JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
          JOIN contexts        c ON nc.context_id   = c.id
          WHERE nc.node_id      = ? -- CorpusId
            AND c.typename      = ? -- toDBid
            AND cng.ngrams_type = ? -- NgramsTypeId
            AND nc.category     > 0 -- is not in Trash
            GROUP BY cng.context_id, ng.terms
        |]

-- | Merge the institutes trees stored in the hyperdata of every document
-- returned by 'selectHyperDataByContextUser'. Documents without a tree
-- contribute nothing; values under the same key are concatenated.
getTreeInstitutesUser :: HasDBid NodeType
                        => CorpusId
                        -> NgramsType
                        -> DBQuery err x (HashMap Text [Text])
getTreeInstitutesUser cId nt = mergeTrees <$> selectHyperDataByContextUser cId nt
  where
    mergeTrees :: [(ContextId, HyperdataDocument)] -> HashMap Text [Text]
    mergeTrees docs = HM.unionsWith (++) [ institutesOf hd | (_, hd) <- docs ]

    -- A missing tree is treated as an empty one.
    institutesOf :: HyperdataDocument -> HashMap Text [Text]
    institutesOf = maybe HM.empty (HM.fromList . Map.toList) . _hd_institutes_tree

-- | Run 'queryHyperDataByContextUser' for the given corpus and ngrams type,
-- returning one @(context id, hyperdata)@ pair per row.
selectHyperDataByContextUser :: HasDBid NodeType
                              => CorpusId
                              -> NgramsType
                              -> DBQuery err x [(ContextId, HyperdataDocument)]
selectHyperDataByContextUser corpusId ngramsType =
  -- Placeholders, in order: corpus id, ngrams type id.
  mkPGQuery queryHyperDataByContextUser (corpusId, toDBid ngramsType)

-- | SQL used by 'selectHyperDataByContextUser'.
--
-- Returns distinct @(context_id, hyperdata)@ pairs for contexts that carry at
-- least one ngram of the requested type and are linked to the given node
-- through @nodes_contexts@ with a strictly positive category.
--
-- Placeholders, in order: @nodes_contexts.node_id@ (corpus id),
-- @context_node_ngrams.ngrams_type@ (ngrams type id).
queryHyperDataByContextUser :: DPS.Query
queryHyperDataByContextUser = [sql|
  SELECT cng.context_id, c.hyperdata FROM context_node_ngrams cng
    JOIN ngrams         ng ON cng.ngrams_id = ng.id
        JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
        JOIN contexts        c ON nc.context_id   = c.id
        WHERE nc.node_id      = ? -- CorpusId
          AND cng.ngrams_type = ? -- NgramsTypeId
          AND nc.category     > 0 -- is not in Trash
      GROUP BY cng.context_id, c.hyperdata
|]

------------------------------------------------------------------------
-- | Occurrence counts per term, computed on a sample of the corpus contexts.
-- Rows returned by 'selectNgramsOccurrencesOnlyByContextUser_withSample'
-- that share a term have their counts added together.
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
                                  => CorpusId
                                  -> Int
                                  -> NgramsType
                                  -> [NgramsTerm]
                                  -> DBQuery err x (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast_withSample cId int nt ngs =
  sumByTerm <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
  where
    sumByTerm :: [(NgramsTerm, Int)] -> HashMap NgramsTerm Int
    sumByTerm = HM.fromListWith (+)


-- | For every term stored in @node_stories@ for the given list and ngrams
-- type, return the contexts of the given corpus (with a strictly positive
-- category) in which that term occurs. A term without any matching context
-- is still present in the result, mapped to an empty list.
getOccByNgramsOnlyFast :: CorpusId
                       -> ListId
                       -> NgramsType
                       -> DBQuery err x (HashMap NgramsTerm [ContextId])
getOccByNgramsOnlyFast cId lId nt = HM.fromList . map toEntry <$> fetch cId lId nt
    where

      -- Wrap the raw term and unpack the aggregated context ids.
      toEntry :: (Text, DPST.PGArray Int) -> (NgramsTerm, [ContextId])
      toEntry (term, ctxIds) =
        (NgramsTerm term, map UnsafeMkContextId (DPST.fromPGArray ctxIds))

      -- Placeholders, in order: list id, ngrams type id, corpus id.
      fetch :: CorpusId
            -> ListId
            -> NgramsType
            -> DBQuery err x [(Text, DPST.PGArray Int)]
      fetch corpusId listId ngramsType = mkPGQuery occurrencesQuery
                ( listId      -- node_stories.node_id
                , toDBid ngramsType
                , corpusId    -- nodes_contexts.node_id (corpus filter)
                )

      occurrencesQuery :: DPS.Query
      occurrencesQuery = [sql|
                WITH ns AS (
                  SELECT ngrams_id, terms 
                  FROM node_stories 
                  JOIN ngrams ON ngrams_id = ngrams.id
                  WHERE node_id = ? AND ngrams_type_id = ?
                ),
                matching_contexts AS (
                  SELECT DISTINCT cng.ngrams_id, cng.context_id
                  FROM ns
                  JOIN context_node_ngrams cng ON cng.ngrams_id = ns.ngrams_id
                  JOIN nodes_contexts nc ON nc.context_id = cng.context_id
                  WHERE nc.node_id = ? 
                    AND nc.category > 0
                )
                SELECT 
                  ns.terms,
                  array_remove(array_agg(mc.context_id), NULL) as agg
                FROM ns
                LEFT JOIN matching_contexts mc ON ns.ngrams_id = mc.ngrams_id
                GROUP BY ns.ngrams_id, ns.terms
              |]


-- | Count, on a sample of @int@ context rows, the occurrences of the given
-- terms in the corpus. Only the first 10000 terms of the list are sent to
-- the database; the rest are ignored.
selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
                                                    => CorpusId
                                                    -> Int
                                                    -> NgramsType
                                                    -> [NgramsTerm]
                                                    -> DBQuery err x [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
  map (first NgramsTerm) <$>
    mkPGQuery queryNgramsOccurrencesOnlyByContextUser_withSample
              -- Placeholders, in order: sample size, document typename id,
              -- corpus id, term list, corpus id, ngrams type id.
              ( int
              , toDBid NodeDocument
              , cId
              , cappedTerms
              , cId
              , toDBid nt
              )
  where
    -- Rendered as the right-hand side of @terms IN ?@.
    cappedTerms = DPS.In (map unNgramsTerm (List.take 10000 tms))

-- | SQL used by 'selectNgramsOccurrencesOnlyByContextUser_withSample'.
--
-- * @nodes_sample@: ids of contexts drawn with @TABLESAMPLE SYSTEM_ROWS (?)@,
--   restricted to one typename and one @nodes_contexts.node_id@.
-- * @input_rows@: the @ngrams@ rows whose @terms@ are in the supplied list.
-- * Main query: counts @context_id@ per @(cng.node_id, terms)@ over the
--   sampled contexts, keeping only rows with a strictly positive category.
--   Because the grouping also includes @cng.node_id@, the same term can be
--   returned in several rows.
--
-- Placeholders, in order: sample size, typename id, corpus id, term list,
-- corpus id, ngrams type id.
--
-- NOTE(review): @SYSTEM_ROWS@ is provided by PostgreSQL's @tsm_system_rows@
-- extension — confirm it is installed on the target database.
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
  WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
                          JOIN nodes_contexts nn ON n.id = nn.context_id
                            WHERE n.typename  = ?
                            AND nn.node_id = ?),
       input_rows AS (
           SELECT id, terms
             FROM ngrams
             WHERE terms IN ?
        )
  SELECT ir.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
    JOIN input_rows  ir ON cng.ngrams_id = ir.id
    JOIN nodes_contexts nn ON nn.context_id   = cng.context_id
    JOIN nodes_sample n ON nn.context_id   = n.id
    WHERE nn.node_id      = ? -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nn.category     > 0
      GROUP BY cng.node_id, ir.terms
  |]


-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
--   WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
--                           JOIN nodes_contexts nc ON c.id = nc.context_id
--                             WHERE c.typename  = ?
--                             AND nc.node_id = ?),
--        input_rows(terms) AS (?)
--   SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
--     JOIN ngrams ng      ON cng.ngrams_id = ng.id
--     JOIN input_rows  ir ON ir.terms      = ng.terms
--     JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
--     JOIN nodes_sample ns ON nc.context_id   = ns.id
--     WHERE nc.node_id      = ? -- CorpusId
--       AND cng.ngrams_type = ? -- NgramsTypeId
--       AND nc.category     > 0
--       -- AND nc.context_id IN (SELECT id FROM nodes_sample)
--       GROUP BY cng.node_id, ng.terms
--   |]


-- | Variant of 'selectNgramsOccurrencesOnlyByContextUser_withSample' that
-- takes no explicit term list: the terms come from the @node_stories@ join
-- of 'queryNgramsOccurrencesOnlyByContextUser_withSample''.
--
-- The primed query is used on purpose: it has exactly five placeholders,
-- matching the five parameters supplied here. The un-primed query has a
-- sixth placeholder (@terms IN ?@) and cannot be run with this tuple.
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
                                                     => CorpusId
                                                     -> Int
                                                     -> NgramsType
                                                     -> DBQuery err x [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
  fmap (first NgramsTerm) <$>
  mkPGQuery queryNgramsOccurrencesOnlyByContextUser_withSample'
                -- Placeholders, in order: sample size, document typename id,
                -- corpus id (sample), corpus id (main filter), ngrams type id.
                ( int
                , toDBid NodeDocument
                , cId
                , cId
                , toDBid nt
                )

-- | SQL counting term occurrences over a sample of contexts, restricted to
-- ngrams that appear in @node_stories@.
--
-- * @contexts_sample@: ids of contexts drawn with
--   @TABLESAMPLE SYSTEM_ROWS (?)@, restricted to one typename and one
--   @nodes_contexts.node_id@.
-- * Main query: counts @context_id@ per ngram (grouped by @ng.id@) over the
--   sampled contexts with a strictly positive category.
--
-- Placeholders, in order: sample size, typename id, corpus id, corpus id,
-- ngrams type id.
--
-- NOTE(review): @node_stories@ is joined on @ngrams_id@ only, with no filter
-- on a list node — confirm that is intended.
queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
  WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
                          JOIN nodes_contexts nc ON c.id = nc.context_id
                            WHERE c.typename  = ?
                            AND nc.node_id = ?)
  SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
    JOIN ngrams ng      ON cng.ngrams_id = ng.id
    JOIN node_stories ns ON ns.ngrams_id = ng.id
    JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
    JOIN contexts_sample c ON nc.context_id   = c.id
    WHERE nc.node_id      = ? -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nc.category     > 0
      GROUP BY ng.id
  |]

------------------------------------------------------------------------
-- | Map each of the given terms to the set of contexts it occurs in, for the
-- given corpus, lists and ngrams type. The terms are queried in chunks of
-- 1000 and the per-chunk results are merged by set union.
getContextsByNgramsOnlyUser :: HasDBid NodeType
                            => CorpusId
                            -> [ListId]
                            -> NgramsType
                            -> [NgramsTerm]
                            -> DBQuery err x (HashMap NgramsTerm (Set ContextId))
getContextsByNgramsOnlyUser cId ls nt ngs =
  mergeChunks <$> mapM (selectNgramsOnlyByContextUser cId ls nt) (splitEvery 1000 ngs)
  where
    mergeChunks :: [[(NgramsTerm, ContextId)]] -> HashMap NgramsTerm (Set ContextId)
    mergeChunks chunks = HM.unionsWith (<>) (map chunkIndex chunks)

    -- Index one chunk of (term, context) rows by term.
    chunkIndex :: [(NgramsTerm, ContextId)] -> HashMap NgramsTerm (Set ContextId)
    chunkIndex rows =
      HM.fromListWith (<>) [ (term, Set.singleton ctxId) | (term, ctxId) <- rows ]

-- | Inverse view of 'getContextsByNgramsOnlyUser': map each context to the
-- set of given terms found in it. The terms are queried in chunks of 1000
-- and the per-chunk results are merged by set union.
getNgramsByContextOnlyUser :: HasDBid NodeType
                           => NodeId
                           -> [ListId]
                           -> NgramsType
                           -> [NgramsTerm]
                           -> DBQuery err x (Map ContextId (Set NgramsTerm))
getNgramsByContextOnlyUser cId ls nt ngs =
  mergeChunks <$> mapM (selectNgramsOnlyByContextUser cId ls nt) (splitEvery 1000 ngs)
  where
    mergeChunks :: [[(NgramsTerm, ContextId)]] -> Map ContextId (Set NgramsTerm)
    mergeChunks chunks = Map.unionsWith (<>) (map chunkIndex chunks)

    -- Index one chunk of (term, context) rows by context.
    chunkIndex :: [(NgramsTerm, ContextId)] -> Map ContextId (Set NgramsTerm)
    chunkIndex rows =
      Map.fromListWith (<>) [ (ctxId, Set.singleton term) | (term, ctxId) <- rows ]

------------------------------------------------------------------------
-- | Run 'queryNgramsOnlyByContextUser': return the @(term, context)@ pairs
-- for the given terms, lists, corpus and ngrams type.
selectNgramsOnlyByContextUser :: HasDBid NodeType
                           => CorpusId
                           -> [ListId]
                           -> NgramsType
                           -> [NgramsTerm]
                           -> DBQuery err x [(NgramsTerm, ContextId)]
selectNgramsOnlyByContextUser cId ls nt tms =
  map (first NgramsTerm) <$>
    mkPGQuery queryNgramsOnlyByContextUser
              -- Placeholders, in order: term rows, list-id rows, corpus id,
              -- document typename id, ngrams type id.
              ( termRows
              , listRows
              , cId
              , toDBid NodeDocument
              , toDBid nt
              )
    where
      -- One single-column @text@ row per term.
      termRows = Values [QualifiedIdentifier Nothing "text"]
                        (map (DPS.Only . unNgramsTerm) tms)
      -- One single-column @int4@ row per list id.
      listRows = Values [QualifiedIdentifier Nothing "int4"]
                        (map (DPS.Only . DPS.toField) ls)

-- | SQL used by 'selectNgramsOnlyByContextUser'.
--
-- @input_rows@ holds the requested terms and @input_list@ the list ids; both
-- are supplied as whole row sets. The main query returns distinct
-- @(terms, context_id)@ pairs where the ngram is one of the requested terms,
-- @cng.node_id@ is one of the list ids, and the context is linked to the
-- given node with a strictly positive category.
--
-- Placeholders, in order: term rows, list-id rows, corpus id, typename id,
-- ngrams type id.
queryNgramsOnlyByContextUser :: DPS.Query
queryNgramsOnlyByContextUser = [sql|
  WITH input_rows(terms) AS (?),
       input_list(id)    AS (?)
  SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
    JOIN ngrams         ng ON cng.ngrams_id = ng.id
    JOIN input_rows     ir ON ir.terms      = ng.terms
    JOIN input_list     il ON il.id         = cng.node_id
    JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
    JOIN contexts        c ON nc.context_id   = c.id
    WHERE nc.node_id      = ? -- CorpusId
      AND c.typename      = ? -- toDBid (maybe not useful with context table)
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nc.category     > 0
      GROUP BY ng.terms, cng.context_id
  |]

-- | Map each of the given terms to the set of node ids returned for it by
-- 'selectNgramsOnlyByDocUser'. The terms are queried in chunks of 1000 and
-- the per-chunk results are merged by set union.
getNgramsByDocOnlyUser :: DocId
                       -> [ListId]
                       -> NgramsType
                       -> [NgramsTerm]
                       -> DBQuery err x (HashMap NgramsTerm (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs =
  mergeChunks <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
  where
    mergeChunks :: [[(NgramsTerm, NodeId)]] -> HashMap NgramsTerm (Set NodeId)
    mergeChunks chunks = HM.unionsWith (<>) (map chunkIndex chunks)

    -- Index one chunk of (term, node) rows by term.
    chunkIndex :: [(NgramsTerm, NodeId)] -> HashMap NgramsTerm (Set NodeId)
    chunkIndex rows =
      HM.fromListWith (<>) [ (term, Set.singleton nodeId) | (term, nodeId) <- rows ]


-- | Run 'queryNgramsOnlyByDocUser': return the @(term, node id)@ pairs for
-- the given terms, lists, document and ngrams type.
selectNgramsOnlyByDocUser :: DocId
                          -> [ListId]
                          -> NgramsType
                          -> [NgramsTerm]
                          -> DBQuery err x [(NgramsTerm, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
  map (first NgramsTerm) <$>
    mkPGQuery queryNgramsOnlyByDocUser
              -- Placeholders, in order: term rows, list-id rows, document id,
              -- ngrams type id.
              ( termRows
              , listRows
              , dId
              , toDBid nt
              )
    where
      -- One single-column @text@ row per term.
      termRows = Values [QualifiedIdentifier Nothing "text"]
                        (map (DPS.Only . unNgramsTerm) tms)
      -- One single-column @int4@ row per list id.
      listRows = Values [QualifiedIdentifier Nothing "int4"]
                        (map (DPS.Only . DPS.toField) ls)


-- | SQL used by 'selectNgramsOnlyByDocUser'.
--
-- @input_rows@ holds the requested terms and @input_list@ the list ids; both
-- are supplied as whole row sets. The main query returns distinct
-- @(terms, node_id)@ pairs from @context_node_ngrams@ for the requested terms
-- and ngrams type.
--
-- Placeholders, in order: term rows, list-id rows, document id, ngrams type id.
--
-- NOTE(review): here the list ids are matched against @cng.context_id@ and
-- the document id against @cng.node_id@, whereas
-- 'queryNgramsOnlyByContextUser' matches list ids against @cng.node_id@.
-- The two columns look swapped — confirm against the schema and callers.
queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
  WITH input_rows(terms) AS (?),
       input_list(id)    AS (?)
  SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
    JOIN ngrams ng      ON cng.ngrams_id = ng.id
    JOIN input_rows  ir ON ir.terms      = ng.terms
    JOIN input_list  il ON il.id         = cng.context_id
    WHERE cng.node_id     = ? -- DocId
      AND cng.ngrams_type = ? -- NgramsTypeId
      GROUP BY ng.terms, cng.node_id
  |]

------------------------------------------------------------------------
-- | Map every term to the set of node ids returned for it by
-- 'selectNgramsByContextMaster', queried with a sample size of 1000 at the
-- positions 0, 500, ..., 10000. Per-position results are merged by set union.
--
-- TODO filter by language, database, any social field
getContextsByNgramsMaster :: HasDBid NodeType
                          =>  UserCorpusId
                          -> MasterCorpusId
                          -> DBQuery err x (HashMap Text (Set NodeId))
getContextsByNgramsMaster ucId mcId =
  mergePages <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
  where
    mergePages :: [[(NodeId, Text)]] -> HashMap Text (Set NodeId)
    mergePages pages = unionsWith (<>) (map pageIndex pages)

    -- Index one page of (node, term) rows by term.
    pageIndex :: [(NodeId, Text)] -> HashMap Text (Set NodeId)
    pageIndex rows =
      HM.fromListWith (<>) [ (term, Set.singleton nodeId) | (nodeId, term) <- rows ]

-- | Run 'queryNgramsByContextMaster'' for one position window.
--
-- Arguments: @n@ is the master sample size (@SYSTEM_ROWS@), @ucId@ the user
-- corpus, @mcId@ the master corpus and @p@ the position of the user contexts
-- to consider.
--
-- The user-side window is half-open: @p <= node_pos < p + 1@. Both bounds
-- used to be @p@, which made the condition unsatisfiable and the user CTE
-- always empty.
--
-- NOTE(review): the caller in this module steps @p@ by 500; confirm that a
-- window of width 1 is the intended one.
selectNgramsByContextMaster :: HasDBid NodeType
                            => Int
                            -> UserCorpusId
                            -> MasterCorpusId
                            -> Int
                            -> DBQuery err x [(NodeId, Text)]
selectNgramsByContextMaster n ucId mcId p = mkPGQuery
                               queryNgramsByContextMaster'
                                 ( ucId
                                 , toDBid NgramsTerms
                                 , toDBid   NodeDocument
                                 , p        -- inclusive lower bound of node_pos
                                 , toDBid   NodeDocument
                                 , p + 1    -- exclusive upper bound of node_pos
                                 , n
                                 , mcId
                                 , toDBid   NodeDocument
                                 , toDBid NgramsTerms
                                 )

-- | SQL used by 'selectNgramsByContextMaster'.
--
-- * @contextsByNgramsUser@: @(context id, terms)@ pairs of the user corpus,
--   restricted to one ngrams type, a strictly positive category and a
--   @node_pos@ window.
-- * @contextsByNgramsMaster@: @(context id, terms)@ pairs drawn from a
--   @SYSTEM_ROWS@ sample of the contexts whose parent is the master corpus.
-- * Final select: master rows right-joined to the user rows on the id.
--
-- The final select reads from @contextsByNgramsMaster@. It previously named
-- @nodesByNgramsMaster@, which is not defined by this statement.
--
-- Placeholders, in order: user corpus id, ngrams type id, @node_pos@ argument,
-- lower bound, @node_pos@ argument, upper bound, sample size, master corpus
-- id, typename id, ngrams type id. The commented-out @typename@ line is not
-- counted as a placeholder.
--
-- TODO fix context_node_ngrams relation
queryNgramsByContextMaster' :: DPST.Query
queryNgramsByContextMaster' = [sql|
  WITH contextsByNgramsUser AS (

  SELECT n.id, ng.terms FROM contexts n
    JOIN nodes_contexts  nn  ON n.id = nn.context_id
    JOIN context_node_ngrams cng ON cng.context_id   = n.id
    JOIN ngrams       ng  ON cng.ngrams_id = ng.id
    WHERE nn.node_id      = ?   -- UserCorpusId
      -- AND n.typename   = ?  -- toDBid
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nn.category > 0
      AND node_pos(n.id,?) >= ?
      AND node_pos(n.id,?) <  ?
    GROUP BY n.id, ng.terms

    ),

  contextsByNgramsMaster AS (

  SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
    JOIN context_node_ngrams cng  ON n.id  = cng.context_id
    JOIN ngrams       ng   ON ng.id = cng.ngrams_id

    WHERE n.parent_id  = ?     -- Master Corpus toDBid
      AND n.typename   = ?     -- toDBid
      AND cng.ngrams_type = ? -- NgramsTypeId
    GROUP BY n.id, ng.terms
    )

  SELECT m.id, m.terms FROM contextsByNgramsMaster m
    RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
  |]

-- NOTE: 'refreshNgramsMaterialized' is currently disabled (its definition
-- below is commented out). It refreshes the "context_node_ngrams_view"
-- materialized view and is meant to be run:
--  - periodically
--  - at reindex stage
--  - at the end of each text flow

-- refreshNgramsMaterialized :: Cmd err ()
-- refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
--   where
--     refreshNgramsMaterializedQuery :: DPS.Query
--     refreshNgramsMaterializedQuery =
--       [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]
