{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- new corpus
- new data in existing corpus
-}

{-# LANGUAGE TemplateHaskell    #-}
{-# LANGUAGE TypeOperators      #-}
{-# LANGUAGE ViewPatterns       #-}

module Gargantext.API.Node.Corpus.New
      where


import Conduit
import Control.Lens ( view, non )
import Data.Aeson ( genericParseJSON, genericToJSON )
import Data.ByteString.Base64 qualified as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Swagger ( ToSchema(..), genericDeclareNamedSchema )
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Ngrams (commitStatePatch, Versioned(..))
import Gargantext.API.Node.Corpus.New.Types ( FileFormat(..), FileType(..) )
import Gargantext.API.Node.Corpus.Searx ( triggerSearxSearch )
import Gargantext.API.Node.Corpus.Types ( Database, Datafield(Web), database2origin )
import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..), withDefaultLanguage, defaultLanguage)
import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion, NgramsStatePatch')
import Gargantext.API.Admin.Orchestrator.Types qualified as API
import Gargantext.Core.Text.Corpus.Query qualified as API
import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC, _ParseFormatError)
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(..) )
import Gargantext.Database.Admin.Types.Hyperdata.File ( HyperdataFile(..) )
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), ParentId)
import Gargantext.Database.GargDB qualified as GargDB
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith, getOrMkList)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(MkCorpusUserNormalCorpusIds))
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import Gargantext.System.Logging ( logLocM, LogLevel(..) )
import Gargantext.Utils.Jobs.Monad (JobHandle, MonadJobStatus(..))
import Servant ( JSON, type (:>), FormUrlEncoded, Capture, Summary )
import Servant.Job.Utils (jsonOptions)
import Test.QuickCheck.Arbitrary (Arbitrary(..))

------------------------------------------------------------------------
{-
data Query = Query { query_query      :: Text
                   , query_node_id    :: Int
                   , query_lang       :: Lang
                   , query_databases  :: [DataOrigin]
                   }
                   deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
    arbitrary = elements [ Query q n la fs
                         | q <- ["honeybee* AND collapse"
                                ,"covid 19"
                                ]
                         , n <- [0..10]
                         , la <- allLangs
                         , fs <- take 3 $ repeat allDataOrigins
                         ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
        :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post    '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a   -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}

------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)
instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo

instance ToSchema ApiInfo

info :: ApiInfo
info = ApiInfo API.externalAPIs

------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !API.RawQuery
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  , _wq_pubmedAPIKey :: !(Maybe Text)
  , _wq_epoAPIUser   :: !(Maybe Text)
  , _wq_epoAPIToken  :: !(Maybe Text)
  }
  deriving (Show, Eq, Generic)

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")

instance Arbitrary WithQuery where
  arbitrary = WithQuery <$> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary
                        <*> arbitrary

------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "query"
     :> AsyncJobs JobLog '[JSON] WithQuery JobLog

{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
     :> MultipartForm Mem (MultipartData Mem)
     :> QueryParam "fileType"  FileType
   :> "async"
     :> AsyncJobs JobLog '[JSON] () JobLog
-}


------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id


addToCorpusWithQuery :: ( FlowCmdM env err m
                        , MonadJobStatus m
                        , HasNodeStoryImmediateSaver env
                        , HasNodeArchiveStoryImmediateSaver env )
                       => User
                       -> CorpusId
                       -> WithQuery
                       -> Maybe API.Limit
                       -> JobHandle m
                       -> m ()
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw
                                         , _wq_pubmedAPIKey = mPubmedAPIKey
                                         , .. }) maybeLimit jobHandle = do
  -- TODO ...
  $(logLocM) DEBUG $ T.pack $ "(cid, dbs) " <> show (cid, dbs)
  $(logLocM) DEBUG $ T.pack $ "datafield  " <> show datafield
  $(logLocM) DEBUG $ T.pack $ "flowListWith " <> show flw

  let mEPOAuthKey = EPO.AuthKey <$> (EPO.User <$> _wq_epoAPIUser)
                                <*> (EPO.Token <$> _wq_epoAPIToken)

  addLanguageToCorpus cid l

  case datafield of
    Just Web -> do
      $(logLocM) DEBUG $ T.pack $ "processing web request " <> show datafield

      markStarted 1 jobHandle

      _ <- triggerSearxSearch user cid q l jobHandle

      markComplete jobHandle

    _ -> do
      markStarted 3 jobHandle

      -- TODO add cid
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      $(logLocM) DEBUG $ T.pack $ "getDataText with query: " <> show q
      let db = database2origin dbs
      -- mPubmedAPIKey <- getUserPubmedAPIKey user
      -- printDebug "[addToCorpusWithQuery] mPubmedAPIKey" mPubmedAPIKey
      eTxt <- getDataText db (Multi l) q mPubmedAPIKey mEPOAuthKey maybeLimit

      -- printDebug "[G.A.N.C.New] lTxts" lTxts
      case eTxt of
        Right txt -> do
          -- TODO Sum lenghts of each txt elements
          $(logLocM) DEBUG "Processing dataText results"
          markProgress 1 jobHandle

          corpusId <- flowDataText user txt (Multi l) cid (Just flw) jobHandle
          $(logLocM) DEBUG $ T.pack $ "corpus id " <> show corpusId

          _ <- commitCorpus cid user

          -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
          sendMail user
          -- TODO ...
          markComplete jobHandle

        Left err -> do
          -- printDebug "Error: " err
          $(logLocM) ERROR (T.pack $ show err) -- log the full error
          markFailed (Just err) jobHandle

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "form"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog

addToCorpusWithForm :: ( FlowCmdM env err m
                       , MonadJobStatus m
                       , HasNodeStoryImmediateSaver env
                       , HasNodeArchiveStoryImmediateSaver env )
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> JobHandle m
                    -> m ()
addToCorpusWithForm user cid nwf jobHandle = do
  -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  -- printDebug "[addToCorpusWithForm] fileType" ft
  -- printDebug "[addToCorpusWithForm] fileFormat" ff

  let l = nwf ^. wf_lang . non defaultLanguage
  addLanguageToCorpus cid l

  limit' <- view $ hasConfig . gc_max_docs_parsers
  let limit = fromIntegral limit' :: Integer
  let
    parseC = case (nwf ^. wf_filetype) of
      CSV       -> Parser.parseFormatC Parser.CsvGargV3
      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
      Iramuteq  -> Parser.parseFormatC Parser.Iramuteq
      Istex     -> Parser.parseFormatC Parser.Istex
      JSON      -> Parser.parseFormatC Parser.JSON
      PresseRIS -> Parser.parseFormatC Parser.RisPresse
      WOS       -> Parser.parseFormatC Parser.WOS

  -- TODO granularity of the logStatus
  let data' = case (nwf ^. wf_fileformat) of
        Plain -> cs (nwf ^. wf_data)
        ZIP   -> case BSB64.decode $ TE.encodeUtf8 (nwf ^. wf_data) of
          Left err -> panicTrace $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
  eDocsC <- liftBase $ parseC (nwf ^. wf_fileformat) data'
  case eDocsC of
    Right (count, docsC) -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action

      let docsC' = zipSources (yieldMany [1..]) docsC
                  .| mapMC (\(idx, doc) ->
                        if idx > limit then do
                          --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
                          let panicMsg' = [ "[addToCorpusWithForm] number of docs "
                                          , "exceeds the MAX_DOCS_PARSERS limit ("
                                          , show limit
                                          , ")" ]
                          let panicMsg = T.concat $ T.pack <$> panicMsg'
                          --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
                          panicTrace panicMsg
                        else
                          pure doc)
                  .| mapC toHyperdataDocument

      --printDebug "Parsing corpus finished : " cid
      --logStatus jobLog2

      --printDebug "Starting extraction     : " cid
      -- TODO granularity of the logStatus
      -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)

      _cid' <- flowCorpus (MkCorpusUserNormalCorpusIds user [cid])
                          (Multi l)
                          (Just (nwf ^. wf_selection))
                          --(Just $ fromIntegral $ length docs, docsC')
                          (count, transPipe liftBase docsC') -- TODO fix number of docs
                          --(map (map toHyperdataDocument) docs)
                          jobHandle

      _ <- commitCorpus cid user

      -- printDebug "Extraction finished   : " cid
      -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      -- TODO uncomment this
      --sendMail user

      markComplete jobHandle
    Left parseErr -> do
      $(logLocM) ERROR $ "parse error: " <> (Parser._ParseFormatError parseErr)
      markFailed (Just parseErr) jobHandle

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                          , _scst_failed    = Just 2
                          , _scst_remaining = Just 138
                          , _scst_events    = Just []
                          }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure      JobLog { _scst_succeeded = Just 137
                          , _scst_failed    = Just 13
                          , _scst_remaining = Just 0
                          , _scst_events    = Just []
                          }
-}



type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog

addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> JobHandle m
                    -> m ()
addToCorpusWithFile user cid nwf@(NewWithFile _d (withDefaultLanguage -> l) fName) jobHandle = do

  addLanguageToCorpus cid l

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  markStarted 1 jobHandle

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  _ <- case nIds of
    [nId] -> do
        node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
        let hl = node ^. node_hyperdata
        _ <- updateHyperdata nId $ hl { _hff_name = fName
                                      , _hff_path = T.pack fPath }

        printDebug "[addToCorpusWithFile] Created node with id: " nId
    _     -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  markComplete jobHandle



--- UTILITIES

commitCorpus :: ( FlowCmdM env err m
                , HasNodeArchiveStoryImmediateSaver env
                , HasNodeStoryImmediateSaver env )
             => ParentId -> User -> m (Versioned NgramsStatePatch')
commitCorpus cid user = do  
  userId <- getUserId user
  listId <- getOrMkList cid userId
  v <- currentVersion listId
  commitStatePatch listId (Versioned v mempty)