{-|
Module      : Test.Database.Operations.NgramsByContext
Description : Tests for ngrams occurrence queries
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Test.Database.Operations.NgramsByContext where

import Data.Aeson.QQ.Simple ( aesonQQ )
import Data.Aeson.Types ( parseEither )
import Data.HashMap.Strict qualified as HM
import Data.Text qualified as T
import Database.PostgreSQL.Simple qualified as PSQL
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types qualified as PSQL
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core ( Lang(EN) )
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Types
import Gargantext.Core.Worker.Env ()
import Gargantext.Database.Action.Flow
import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument )
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Error (HasNodeError, nodeErrorWith)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude
import Test.Database.Operations.Types ( testUsername )
import Test.Database.Types ( TestEnv, runTestMonad )
import Test.HUnit ( Assertion, assertFailure )
import Test.Hspec.Expectations ( shouldBe, shouldSatisfy )

-- Test documents with known ngrams
testDoc_01 :: HyperdataDocument
testDoc_01 = either errorTrace identity $ parseEither parseJSON $ [aesonQQ|
  { "doi":"test01"
  , "publication_year":2023
  , "language_iso2":"EN"
  , "authors":"Alice Smith"
  , "abstract":"This paper discusses functional programming and type systems."
  , "title":"Introduction to Functional Programming"
  }
|]

testDoc_02 :: HyperdataDocument
testDoc_02 = either errorTrace identity $ parseEither parseJSON $ [aesonQQ|
  { "doi":"test02"
  , "publication_year":2023
  , "language_iso2":"EN"
  , "authors":"Bob Jones"
  , "abstract":"We explore functional programming paradigms in modern languages."
  , "title":"Functional Programming Paradigms"
  }
|]

testDoc_03 :: HyperdataDocument
testDoc_03 = either errorTrace identity $ parseEither parseJSON $ [aesonQQ|
  { "doi":"test03"
  , "publication_year":2023
  , "language_iso2":"EN"
  , "authors":"Carol White"
  , "abstract":"Type systems ensure program correctness and safety."
  , "title":"Type Systems in Programming Languages"
  }
|]

getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus)
getCorporaWithParentIdOrFail parentId = do
  xs <- getCorporaWithParentId parentId
  case xs of
    [corpus] -> pure corpus
    _        -> nodeErrorWith $ "getCorporaWithParentIdOrFail failed: " <> T.pack (show xs)

setupNgramsCorpus :: TestEnv -> IO TestEnv
setupNgramsCorpus env = runTestMonad env $ do
  let user = testUsername
  parentId <- runDBQuery $ getRootId user
  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
  let corpusId = _node_id corpus

  let docs = [testDoc_01, testDoc_02, testDoc_03]
  _docIds <- addDocumentsToHyperCorpus (Nothing :: Maybe HyperdataCorpus)
                                       (Multi EN)
                                       corpusId
                                       docs
     
  userId <- runDBQuery $ getUserId user
  listId <- runDBTx $ getOrMkList corpusId userId

  -- Get Terms that were actually extracted
  (extractedTerms :: [(Int, Text)]) <- runDBQuery $ mkPGQuery [sql|
    SELECT DISTINCT ng.id, ng.terms
    FROM context_node_ngrams cng
    JOIN ngrams ng ON cng.ngrams_id = ng.id
    WHERE cng.ngrams_type = 4  -- NgramsTerms
      AND cng.context_id IN (
        SELECT context_id FROM nodes_contexts WHERE node_id = ?
      )
  |] (PSQL.Only corpusId)
  
  -- Directly insert into node_stories
  runDBTx $ void $ mkPGUpdate [sql|
    INSERT INTO node_stories (node_id, ngrams_id, ngrams_type_id, version)
    SELECT ?, unnest(?::int[]), 4, 1
    ON CONFLICT DO NOTHING
  |] (listId, PSQL.PGArray $ map fst extractedTerms)
  
  pure env     

-- Test that getOccByNgramsOnlyFast returns correct occurrences
testGetOccByNgramsOnlyFast :: TestEnv -> Assertion
testGetOccByNgramsOnlyFast env = runTestMonad env $ do
  result <- runDBQuery $ do
    parentId <- getRootId testUsername
    corpus <- getCorporaWithParentIdOrFail parentId
    let corpusId = _node_id corpus
    
    -- Get the list node (should be created during corpus setup)
    lists <- getListsWithParentId corpusId
    case lists of
      [] -> nodeErrorWith "No list found for corpus"
      (listNode:_) -> do
        let listId = _node_id listNode
        
        -- Query occurrences for NgramsTerms type
        getOccByNgramsOnlyFast corpusId listId NgramsTerms

  liftIO $ do
    -- Verify we got a non-empty result
    result `shouldSatisfy` (not . HM.null)
    
    -- Check that known terms appear in results
    -- "functional programming" should appear in doc_01 (on doc_02 it's with "paradigm")
    let functionalProg = NgramsTerm "functional programming"
    case HM.lookup functionalProg result of
      Nothing -> assertFailure "Expected 'functional programming' in results"
      Just contexts -> do
        length contexts `shouldBe` 1
        contexts `shouldSatisfy` (not . null)

    -- "functional programming paradigms" appears in exactly 1 doc (doc_02)
    let functionalProgParadigms = NgramsTerm "functional programming paradigms"
    case HM.lookup functionalProgParadigms result of
      Nothing -> assertFailure "Expected 'functional programming paradigms' in results"
      Just contexts -> do
        length contexts `shouldBe` 1
        contexts `shouldSatisfy` (not . null)
        
    -- "type systems" should appear in doc_01 and doc_03
    let typeSystems = NgramsTerm "type systems"
    case HM.lookup typeSystems result of
      Nothing -> pure () -- might not be extracted depending on ngrams config
      Just contexts -> do
        contexts `shouldSatisfy` (not . null)

-- Test that results contain valid context IDs
testGetOccByNgramsOnlyFastValidContextIds :: TestEnv -> Assertion  
testGetOccByNgramsOnlyFastValidContextIds env = runTestMonad env $ do
  result <- runDBQuery $ do
    parentId <- getRootId testUsername
    corpus <- getCorporaWithParentIdOrFail parentId
    let corpusId = _node_id corpus
    
    lists <- getListsWithParentId corpusId
    case lists of
      [] -> nodeErrorWith "No list found"
      (listNode:_) -> do
        let listId = _node_id listNode
        getOccByNgramsOnlyFast corpusId listId NgramsTerms

  liftIO $ do
    -- All context lists should be non-empty for terms that exist
    HM.toList result `shouldSatisfy` all (\(_, contexts) -> not (null contexts))
    
    -- Context IDs should be positive integers
    let allContexts = concatMap snd $ HM.toList result
    allContexts `shouldSatisfy` all (\(UnsafeMkContextId cid) -> cid > 0)
