{-|
Module      : Test.Database.Operations.DocumentSearch
Description : GarganText database tests
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

{-# LANGUAGE QuasiQuotes       #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Test.Database.Operations.DocumentSearch where

import Control.Lens
import Control.Monad.Reader
import Data.Aeson.QQ.Simple
import Data.Aeson.Types
import Data.Maybe (fromJust)
import Data.Text qualified as T
import Gargantext.Core
import Gargantext.Core.Config
import Gargantext.Core.NLP (HasNLPServer(..))
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Terms.Mono.Stem
import Gargantext.Core.Types
import Gargantext.Core.Types.Individu
import Gargantext.Core.Worker.Env ()  -- instance HasNodeError
import Gargantext.Database.Action.Flow
import Gargantext.Database.Action.Search
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Facet
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Error
import Gargantext.Database.Query.Table.NodeContext (selectCountDocs)
import Gargantext.Database.Query.Tree.Root
import Gargantext.Database.Schema.Node (NodePoly(..))
import Network.URI (parseURI)
import Prelude
import Test.Database.Types
import Test.HUnit hiding (Node)
import Test.Hspec.Expectations


exampleDocument_01 :: HyperdataDocument
exampleDocument_01 = either error id $ parseEither parseJSON $ [aesonQQ|
  { "doi":"01"
  , "publication_day":6
  , "language_iso2":"EN"
  , "publication_minute":0
  , "publication_month":7
  , "language_iso3":"eng"
  , "publication_second":0
  , "authors":"Nils Hovdenak, Kjell Haram"
  , "publication_year":2012
  , "publication_date":"2012-07-06 00:00:00+00:00"
  , "language_name":"English"
  , "realdate_full_":"2012 01 12"
  , "source":"European journal of obstetrics, gynecology, and reproductive biology Institute"
  , "abstract":"The literature was searched for publications on minerals and vitamins during pregnancy and the possible influence of supplements on pregnancy outcome."
  , "title":"Influence of mineral and vitamin supplements on pregnancy outcome."
  , "publication_hour":0
  }
|]

exampleDocument_02 :: HyperdataDocument
exampleDocument_02 = either error id $ parseEither parseJSON $ [aesonQQ|
  { "doi":""
  , "bdd": "Arxiv"
  , "publication_day":6
  , "language_iso2":"EN"
  , "publication_second":0
  , "authors":"Ajeje Brazorf, Manuel Agnelli"
  , "publication_year":2012
  , "publication_date":"2012-07-06 00:00:00+00:00"
  , "language_name":"English"
  , "realdate_full_":"2012 01 12"
  , "source":"Malagrotta Institute of Technology"
  , "abstract":"We present PyPlasm, an innovative approach to computational graphics"
  , "title":"PyPlasm: computational geometry made easy"
  , "publication_hour":0
  }
|]

exampleDocument_03 :: HyperdataDocument
exampleDocument_03 = either error id $ parseEither parseJSON $ [aesonQQ|
{
    "bdd": "Arxiv"
  , "doi": ""
  , "url": "http://arxiv.org/pdf/1405.3072v2"
  , "title": "Haskell for OCaml programmers"
  , "source": ""
  , "authors": "Raphael Poss, Herbert Ballerina"
  , "abstract": "  This introduction to Haskell is written to optimize learning by programmers who already know OCaml. "
  , "institutes": ""
  , "language_iso2": "EN"
  , "publication_date": "2014-05-13T09:10:32Z"
  , "publication_year": 2014
}
|]

exampleDocument_04 :: HyperdataDocument
exampleDocument_04 = either error id $ parseEither parseJSON $ [aesonQQ|
{
    "bdd": "Arxiv"
  , "doi": ""
  , "url": "http://arxiv.org/pdf/1407.5670v1"
  , "title": "Rust for functional programmers"
  , "source": ""
  , "authors": "Raphael Poss"
  , "abstract": "  This article provides an introduction to Rust , a systems language by Mozilla , to programmers already familiar with Haskell , OCaml or other functional languages. " , "institutes": ""
  , "language_iso2": "EN"
  , "publication_date": "2014-07-21T21:20:31Z"
  , "publication_year": 2014
}
|]

exampleDocument_05 :: HyperdataDocument
exampleDocument_05 = either error id $ parseEither parseJSON $ [aesonQQ|
{
    "bdd": "Arxiv"
  , "doi": ""
  , "url": "http://arxiv.org/pdf/1407.5670v1"
  , "title": "GQL for Dummies"
  , "source": ""
  , "authors": "Ennio Annio"
  , "abstract": "  This article provides an introduction to GraphQL" , "institutes": ""
  , "language_iso2": "EN"
  , "publication_date": "2014-07-21T21:20:31Z"
  , "publication_year": 2014
}
|]

exampleDocument_06 :: HyperdataDocument
exampleDocument_06 = exampleDocument_05 { _hd_title = Just "GQL Part 1" }

exampleDocument_07 :: HyperdataDocument
exampleDocument_07 = exampleDocument_05 { _hd_title = Just "GQL Part 2" }

exampleDocument_08 :: HyperdataDocument
exampleDocument_08 = exampleDocument_05 { _hd_title = Just "GQL Part 3" }

getCorporaWithParentIdOrFail :: HasNodeError err => NodeId -> DBQuery err x (Node HyperdataCorpus)
getCorporaWithParentIdOrFail parentId = do
  xs <- getCorporaWithParentId parentId
  case xs of
    [corpus] -> pure corpus
    _        -> nodeErrorWith $ "getCorporaWithParentIdOrFail, impossible: " <> T.pack (show xs)

addCorpusDocuments :: TestEnv -> IO TestEnv
addCorpusDocuments env = runTestMonad env $ do
  -- NOTE(adn) We need to create user 'gargantua'(!!) in order
  -- for 'addDocumentsToHyperCorpus' to work.
  parentId   <- runDBQuery $ getRootId (UserName userMaster)
  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
  let corpusId = _node_id corpus

  let lang = EN
  let docs = [exampleDocument_01, exampleDocument_02, exampleDocument_03, exampleDocument_04]
  ids <- addDocumentsToHyperCorpus (test_job_handle env)
                                 (Just $ _node_hyperdata $ corpus)
                                 (Multi lang)
                                 corpusId
                                 docs
  liftIO $ ids `shouldBe` [ UnsafeMkNodeId 1, UnsafeMkNodeId 2, UnsafeMkNodeId 3, UnsafeMkNodeId 4 ]
  pure env

corpusAddDocNgramsEmpty :: TestEnv -> Assertion
corpusAddDocNgramsEmpty env = runTestMonad env $ do
  cfg <- view hasConfig
  parentId <- runDBQuery $ getRootId (UserName userMaster)
  [corpus] <- runDBQuery $ getCorporaWithParentId parentId

  let doc = exampleDocument_05
  res <- runDBTx $ insertMasterDoc cfg noUncommittedNgrams (Just $ _node_hyperdata corpus) doc
  let wrn = NgramsNotFound (Just (DocumentHashId "\\x0b41caa65782b9570530910d1942a21bb1859386ee4c976e7b4cd0624ee966f3")) (UnsafeMkNodeId 5)
  liftIO $ res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) (Just wrn)

corpusAddDocAlreadyExisting :: TestEnv -> Assertion
corpusAddDocAlreadyExisting env = runTestMonad env $ do
  cfg <- view hasConfig
  parentId <- runDBQuery $ getRootId (UserName userMaster)
  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
  let doc = exampleDocument_05
  let la  = Mono EN

  -- Let's use a bogus NLP server port
  let nlp = NLPServerConfig CoreNLP (fromJust $ parseURI "http://localhost:9999")
  uncommittedNgrams <- extractNgramsFromDocument nlp la doc
  oldCount <- docsInCorpusCount
  res <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
  interimCount <- docsInCorpusCount
  liftIO $ do
    interimCount `shouldBe` oldCount + 1
    res `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing
  -- The second time the node gets cached and returned.
  res1 <- runDBTx $ insertMasterDoc cfg uncommittedNgrams (Just $ _node_hyperdata corpus) doc
  newCount <- docsInCorpusCount
  liftIO $ do
    newCount `shouldBe` interimCount
    res1 `shouldBe` InsertDocSucceded (UnsafeMkNodeId 5) Nothing

-- | We test that if we add 3 documents, out of which the middle one doesn't have any keyed ngrams
-- the other two are still persisted.
corpusAddDocBatchSurvival :: TestEnv -> Assertion
corpusAddDocBatchSurvival env = runTestMonad env $ do
  cfg <- view hasConfig
  parentId <- runDBQuery $ getRootId (UserName userMaster)
  [corpus] <- runDBQuery $ getCorporaWithParentId parentId
  let la  = Mono EN
  nlp <- view (nlpServerGet (_tt_lang la))

  oldCount <- docsInCorpusCount

  let docs = [exampleDocument_06, exampleDocument_07, exampleDocument_08 ]

  uNgrams1 <- extractNgramsFromDocument nlp la exampleDocument_06
  uNgrams2 <- extractNgramsFromDocument nlp la exampleDocument_08

  savedDocs <- insertMasterDocs (test_job_handle env) cfg (uNgrams1 <> uNgrams2) (Just $ _node_hyperdata corpus) docs

  c' <- docsInCorpusCount
  liftIO $ do
    -- failures  `shouldBe` [DocumentInsertionError "InternalNodeError Cannot make node due to: Couldn't find the associated ngrams for document with hash \\x666b8e7bfd7c0af37d630e1097791f7ba438a669ecb6d1cb38014edd0b7a2977, therefore the added document won't have any ngrams."]
    c' `shouldBe` oldCount + 3
    savedDocs `shouldBe` [ UnsafeMkNodeId 5, UnsafeMkNodeId 6, UnsafeMkNodeId 7 ]

corpusAddDocuments :: TestEnv -> Assertion
corpusAddDocuments env = runTestMonad env $ do

  cnt <- runDBQuery $ do
    parentId <- getRootId (UserName userMaster)
    corpus <- getCorporaWithParentIdOrFail parentId
    let corpusId = _node_id corpus
    searchCountInCorpus corpusId False Nothing

  liftIO $ cnt `shouldBe` 4

stemmingTest :: TestEnv -> Assertion
stemmingTest _env = do
  stem EN GargPorterAlgorithm "Ajeje"        `shouldBe` "Ajeje"
  stem EN GargPorterAlgorithm "PyPlasm:"     `shouldBe` "PyPlasm:"
  stem EN GargPorterAlgorithm "soy"          `shouldBe` "soy"
  stem EN GargPorterAlgorithm "cry"          `shouldBe` "cri"
  -- This test outlines the main differences between Porter and Lancaster.
  stem EN GargPorterAlgorithm "dancer"       `shouldBe` "dancer"
  stem EN LancasterAlgorithm  "dancer"       `shouldBe` "dant"
  stem EN GargPorterAlgorithm "postpartum"   `shouldBe` "postpartum"
  stem EN LancasterAlgorithm  "postpartum"   `shouldBe` "postpart"
  stem IT PorterAlgorithm     "catechizzare" `shouldBe` "catechizz"
  stem IT LancasterAlgorithm  "catechizzare" `shouldBe` "catechizzare" -- lancaster doesn't support Italian

mkQ :: T.Text -> API.Query
mkQ txt = either (\e -> error $ "(query) = " <> T.unpack txt <> ": " <> e) id . API.parseQuery . API.RawQuery $ txt

corpusSearch01 :: TestEnv -> Assertion
corpusSearch01 env = do
  runTestMonad env $ do

    (results1, results2) <- runDBQuery $ do
      parentId <- getRootId (UserName userMaster)
      corpus <- getCorporaWithParentIdOrFail parentId

      (,) <$> searchInCorpus (_node_id corpus) False (mkQ "mineral") Nothing Nothing Nothing
          <*> searchInCorpus (_node_id corpus) False (mkQ "computational") Nothing Nothing Nothing

    liftIO $ length results1 `shouldBe` 1
    liftIO $ length results2 `shouldBe` 1

-- | Check that we support more complex queries
corpusSearch02 :: TestEnv -> Assertion
corpusSearch02 env = do
  runTestMonad env $ do

    (results1, results2) <- runDBQuery $ do
      parentId <- getRootId (UserName userMaster)
      corpus <- getCorporaWithParentIdOrFail parentId

      (,) <$> searchInCorpus (_node_id corpus) False (mkQ "Raphael") Nothing Nothing Nothing
          <*> searchInCorpus (_node_id corpus) False (mkQ "Raphael Poss") Nothing Nothing Nothing

    liftIO $ do
      length results1 `shouldBe` 2 -- Haskell & Rust
      map facetDoc_title results2 `shouldBe` ["Haskell for OCaml programmers", "Rust for functional programmers"]

-- | Check that we support more complex queries via the bool API
corpusSearch03 :: TestEnv -> Assertion
corpusSearch03 env = do
  runTestMonad env $ do

    (results1, results2, results3) <- runDBQuery $ do
      parentId <- getRootId (UserName userMaster)
      corpus <- getCorporaWithParentIdOrFail parentId

      (,,) <$> searchInCorpus (_node_id corpus) False (mkQ "\"Manuel Agnelli\"") Nothing Nothing Nothing
           <*> searchInCorpus (_node_id corpus) False (mkQ "Raphael AND -Rust") Nothing Nothing Nothing
           <*> searchInCorpus (_node_id corpus) False (mkQ "(Raphael AND (NOT Rust)) OR PyPlasm") Nothing Nothing Nothing

    liftIO $ do
      length results1 `shouldBe` 1
      map facetDoc_title results2 `shouldBe` ["Haskell for OCaml programmers"]
      map facetDoc_title results3 `shouldBe` ["PyPlasm: computational geometry made easy", "Haskell for OCaml programmers"]

-- | Check that the score doc count is correct
--   TODO This test is unfinished because `updateDocs` needs more work
corpusScore01 :: TestEnv -> Assertion
corpusScore01 env = do
  runTestMonad env $ do

    results <- runDBQuery $ do
      parentId <- getRootId (UserName userMaster)
      corpus <- getCorporaWithParentIdOrFail parentId
      searchInCorpus (_node_id corpus) False (mkQ "Haskell") Nothing Nothing Nothing

    liftIO $ do
      map facetDoc_title results `shouldBe` ["Haskell for OCaml programmers", "Rust for functional programmers"]

      map facetDoc_score results `shouldBe` [Just 0.0, Just 0.0]

    -- _ <- updateDocs (_node_id corpus)

    liftIO $ do
      map facetDoc_score results `shouldBe` [Just 0.0, Just 0.0]

-- | Check that we support search with tsquery
corpusSearchDB01 :: TestEnv -> Assertion
corpusSearchDB01 env = do
  runTestMonad env $ do
    results <- runDBQuery $ do
      parentId <- getRootId (UserName userMaster)
      corpus <- getCorporaWithParentIdOrFail parentId
      searchDocInDatabase (_node_id corpus) ("first second")

    liftIO $ do
      length results `shouldBe` 0 -- doesn't exist, we just check that proper to_tsquery is called

docsInCorpusCount :: TestMonad Int
docsInCorpusCount = runDBQuery $ do
  parentId <- getRootId (UserName userMaster)
  corpus   <- getCorporaWithParentIdOrFail parentId
  selectCountDocs (_node_id corpus)

corpusReturnCount :: TestEnv -> Assertion
corpusReturnCount env = do
  count <- runTestMonad env $ docsInCorpusCount
  liftIO $ count @?= 4
