List.hs 9.02 KB
{-|
Module      : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TypeApplications    #-}
{-# LANGUAGE ViewPatterns        #-}

module Gargantext.API.Ngrams.List
  where

import Data.ByteString.Lazy qualified as BSL
import Data.Csv qualified as Tsv
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Map.Strict (toList)
import Data.Set qualified as Set
import Data.Text (concat, pack, splitOn)
import Data.Vector qualified as Vec
import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env)
import Gargantext.API.Errors.Types (BackendInternalError(InternalServerError))
import Gargantext.API.Ngrams.List.Types (_wjf_data, _wtf_data)
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargM, serverError, HasServerError)
import Gargantext.API.Routes.Named.List qualified as Named
import Gargantext.API.Worker (serveWorkerAPI, serveWorkerAPIEJob)
import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType(NgramsTerms))
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Action.Flow (reIndexWith)
import Gargantext.Database.Admin.Types.Node ( NodeId(_NodeId), ListId )
import Gargantext.Database.Query.Table.Node (getNode)
import Gargantext.Database.Schema.Ngrams ( text2ngrams, NgramsId )
import Gargantext.Database.Schema.Node (_node_parent_id)
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude hiding (concat, toList)
import Gargantext.System.Logging (logLocM, MonadLogger)
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Prelude qualified
import Protolude qualified as P
import Servant
import Servant.Server.Generic (AsServerT)
import Gargantext.System.Logging (LogLevel(..))


getAPI :: Named.GETAPI (AsServerT (GargM Env BackendInternalError))
getAPI = Named.GETAPI $ \listId -> Named.ListEndpoints
  { listJSONEp    = getJson listId
  , listJSONZipEp = getJsonZip listId
  , listTSVEp     = getTsv listId
  }

--
-- JSON API
--

jsonAPI :: Named.JSONAPI (AsServerT (GargM Env BackendInternalError))
jsonAPI = jsonPostAsync

------------------------------------------------------------------------
getJson :: HasNodeStory env err m
        => ListId
        -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
getJson lId = do
  lst <- getNgramsList lId
  pure $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                             , pack $ show (_NodeId lId)
                             , ".json"
                             ]
                     ) lst

getJsonZip :: HasNodeStory env err m
           => ListId
           -> m (Headers '[Header "Content-Disposition" Text] NgramsListZIP)
getJsonZip lId = do
  lst <- getNgramsList lId
  let nlz = NgramsListZIP { _nlz_nl = lst, _nlz_list_id = lId}
  pure $ addHeader (concat [ "attachment; filename="
                           , nlzFileName nlz
                           , ".zip"
                           ]
                     ) nlz

getTsv :: HasNodeStory env err m
       => ListId
       -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getTsv lId = do
  lst <- getNgramsList lId
  pure $ case Map.lookup NgramsTerms lst of
    Nothing -> noHeader Map.empty
    Just (Versioned { _v_data }) ->
      addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                        , pack $ show (_NodeId lId)
                        , ".tsv"
                        ]
                ) _v_data

------------------------------------------------------------------------
jsonPostAsync :: Named.JSONAPI (AsServerT (GargM Env BackendInternalError))
jsonPostAsync = Named.JSONAPI {
  updateListJSONEp = \lId -> serveWorkerAPI $ \p ->
      Jobs.JSONPost { _jp_list_id = lId
                    , _jp_ngrams_list = _wjf_data p }
  }

------------------------------------------------------------------------
postAsyncJSON :: (HasNodeStory env err m, MonadJobStatus m, MonadLogger m)
              => ListId
              -> NgramsList
              -> JobHandle m
              -> m ()
postAsyncJSON l ngramsList jobHandle = do

  markStarted 2 jobHandle

  $(logLocM) DEBUG "[postAsyncJSON] Setting the Ngrams list ..."
  setList
  $(logLocM) DEBUG "[postAsyncJSON] Done."

  markProgress 1 jobHandle

  corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
  let corpus_id = fromMaybe (panicTrace "no parent_id") (_node_parent_id corpus_node)
  $(logLocM) DEBUG "[postAsyncJSON] Executing re-indexing..."
  _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
  $(logLocM) DEBUG "[postAsyncJSON] Re-indexing done."

  markComplete jobHandle

  where
    setList :: HasNodeStory env err m => m ()
    setList = do
      -- TODO check with Version for optim
      mapM_ (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList ngramsList
      -- TODO reindex


--
-- TSV API
--

tsvAPI :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvAPI = tsvPostAsync

------------------------------------------------------------------------
tsvPostAsync :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvPostAsync =
  Named.TSVAPI {
    updateListTSVEp = \lId -> serveWorkerAPIEJob $ \p ->
      case ngramsListFromTSVData (_wtf_data p) of
        Left err         -> Left $ InternalServerError $ err500 { errReasonPhrase = err }
        Right ngramsList -> Right $ Jobs.JSONPost { _jp_list_id = lId
                                                  , _jp_ngrams_list = ngramsList }
    }

-- | Tries converting a text file into an 'NgramList', so that we can reuse the
-- existing JSON endpoint for the TSV upload.
ngramsListFromTSVData :: Text -> Either Prelude.String NgramsList
ngramsListFromTSVData tsvData = case decodeTsv of
  -- /NOTE/ The legacy TSV data only supports terms in imports and exports, so this is
  -- all we care about.
  Left err    -> Left $ "Invalid TSV found in ngramsListFromTSVData: " <> err
  Right terms -> pure $ Map.fromList [ (NgramsTerms, Versioned 0 $ mconcat . Vec.toList $ terms) ]
  where
    binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData

    decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
    decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
                               (Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
                               Tsv.HasHeader
                               binaryData

-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap record = case Vec.toList record of
  (map P.decodeUtf8 -> [status, label, forms])
    -> pure $ conv status label forms
  _ -> Prelude.fail "tsvToNgramsTableMap failed"

  where
    conv :: Text -> Text -> Text -> NgramsTableMap
    conv status label forms = Map.singleton (NgramsTerm label)
        $ NgramsRepoElement { _nre_size = 1
                                             , _nre_list = case status == "map" of
                                                             True  -> MapTerm
                                                             False -> case status == "main" of
                                                                True  -> CandidateTerm
                                                                False -> StopTerm
                                             , _nre_root = Nothing
                                             , _nre_parent = Nothing
                                             , _nre_children = MSet
                                                             $ Map.fromList
                                                             $ map (\form -> (NgramsTerm form, ()))
                                                             $ filter (\w -> w /= "" && w /= label)
                                                             $ splitOn "|&|" forms
                                             }

------------------------------------------------------------------------


-- | This is for debugging the TSV parser in the REPL
importTsvFile :: forall env err m. (HasNodeStory env err m, HasServerError err, MonadJobStatus m, MonadLogger m)
              => ListId -> P.FilePath -> m ()
importTsvFile lId fp = do
  contents <- liftBase $ P.readFile fp
  case ngramsListFromTSVData contents of
    Left err         -> serverError $ err500 { errReasonPhrase = err }
    Right ngramsList -> postAsyncJSON lId ngramsList (noJobHandle @m Proxy)

--
-- Utils
--

------------------------------------------------------------------------
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
  where
    i = HashMap.lookup t m
    n = Just (text2ngrams t)