Commit 062f0c68 authored by Fabien Maniere's avatar Fabien Maniere

Merge branch '505-dev-fix-hyperdata-institutes-tree' into 'dev'

[CLI] db fix command, to fix hyperdata #630

See merge request !442
parents 3a6b2629 050f8b99
Pipeline #7877 passed with stages
in 53 minutes and 36 seconds
{-|
Module : CLI.DBFixes
Description : Gargantext CLI DB fixes
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Fixes to the GarganText DB
-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module CLI.DBFixes where
import CLI.Parsers (settings_p)
import CLI.Types
import Control.Lens qualified as Lens
import Data.Aeson qualified as Aeson
import Data.Map.Strict qualified as Map
import Data.Text qualified as T
import Database.PostgreSQL.Simple qualified as PSQL
import Database.PostgreSQL.Simple.FromField qualified as PSQL
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Gargantext.Core.Config (gc_database_config, gc_logging)
import Gargantext.Core.Config.Types (_SettingsFile)
import Gargantext.Core.Config.Utils (readConfig)
import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..), hd_institutes_tree)
import Gargantext.Database.Admin.Types.Hyperdata.Prelude (fieldLabelModifier, omitNothingFields, sumEncoding, SumEncoding(..), genericParseJSON, genericToJSON, defaultOptions, fromField')
import Gargantext.Prelude
import Gargantext.Prelude.Database (runPGSExecute, runPGSQuery)
import Gargantext.System.Logging (LogLevel(..), logLoc, withLogger)
import Options.Applicative
fixBrokenHyperdata630Cmd :: HasCallStack => Mod CommandFields CLI
fixBrokenHyperdata630Cmd =
command "fix-broken-hyperdata-630" (info (helper <*> (fmap CLISub $ fmap CCMD_db_fix_630 dbFixParser))
(progDesc "Fix broken hyperdata, issue #630"))
dbFixParser :: Parser DBFixArgs
dbFixParser = DBFixArgs
<$> settings_p
<*> flag True False ( long "no-dry-run"
<> help "Whether to dry run the DB fix (enabled by default)" )
-- | Fixes issue https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/630
-- Basically, because of Haskell HyperdataDocument type change in 2024, there remain
-- documents in `contexts` table, where the `hyperdata->institutes_tree` is a map
-- from `Text` to `Text`, instead being a map from `Text` to `[Text]` (see
-- G.D.A.T.Hyperdata.Document).
-- We make a SQL query to find hyperdata containing `Text` in value,
-- parse these rows with Haskell to make sure this is indeed broken,
-- then type-safely conver to good values and update the row.
fixBrokenHyperdata630CLI :: DBFixArgs -> IO ()
fixBrokenHyperdata630CLI (DBFixArgs { settings_toml, dry_run }) = do
cfg <- liftIO $ readConfig settings_toml
withLogger (cfg ^. gc_logging) $ \logger -> do
$(logLoc) logger INFO $ "settings file: " <> T.pack (_SettingsFile settings_toml)
let dbConfig = cfg ^. gc_database_config
$(logLoc) logger INFO $ "DB config: " <> show dbConfig
c <- PSQL.connect dbConfig
PSQL.begin c
affectedContexts <- runPGSQuery c affectedDocsQ () :: IO [(Int, Hyperdata630Incorrect)]
mapM_ (\(cId, cHyperdata) -> do
putText $ "[" <> show cId <> "] " <> show (cHyperdata ^. Lens.to _hd630i_institutes_tree)
let fixedHyperdata = fix630Hyperdata cHyperdata
putText $ " -> " <> show (fixedHyperdata ^. hd_institutes_tree)
_ <- runPGSExecute c updateAffectedDoc (Aeson.encode $ fixedHyperdata ^. hd_institutes_tree, cId)
-- Check that the hyperdata can be fetched correctly
newHd <- runPGSQuery c [sql|SELECT hyperdata FROM contexts WHERE id = ?|] (PSQL.Only cId) :: IO [PSQL.Only HyperdataDocument]
unless (length newHd == 1) $ do
PSQL.rollback c
panicTrace $ "[" <> show cId <> "] Error! Can't get new hyperdata: " <> show newHd
putText $ "[" <> show cId <> "] updated correctly"
) affectedContexts
if dry_run then do
$(logLoc) logger INFO $ "dry run mode, rolling back changes"
PSQL.rollback c
else do
$(logLoc) logger INFO $ "no dry run mode, comitting changes"
PSQL.commit c
where
-- | SQL query to find all affected documents:
affectedDocsQ = [sql|
SELECT
c.id, c.hyperdata
FROM
contexts AS c
WHERE
-- does the hyperdata contain values with strings, instead of arrays?
EXISTS (
SELECT 1
FROM jsonb_each(c.hyperdata->'institutes_tree') AS kv(key, value) -- expand the top‑level map
WHERE
c.hyperdata->'institutes_tree' IS NOT NULL
AND jsonb_typeof(kv.value) = 'string' -- a plain string found
);
|]
updateAffectedDoc = [sql|
UPDATE contexts
SET hyperdata = jsonb_set(hyperdata, '{institutes_tree}', ?::jsonb)
WHERE id = ?
|]
data Hyperdata630Incorrect = Hyperdata630Incorrect {
_hd630i_bdd :: !(Maybe Text)
, _hd630i_doi :: !(Maybe Text)
, _hd630i_url :: !(Maybe Text)
, _hd630i_page :: !(Maybe Int)
, _hd630i_title :: !(Maybe Text)
, _hd630i_authors :: !(Maybe Text)
, _hd630i_institutes :: !(Maybe Text)
, _hd630i_source :: !(Maybe Text)
, _hd630i_abstract :: !(Maybe Text)
, _hd630i_publication_date :: !(Maybe Text)
, _hd630i_publication_year :: !(Maybe Int)
, _hd630i_publication_month :: !(Maybe Int)
, _hd630i_publication_day :: !(Maybe Int)
, _hd630i_publication_hour :: !(Maybe Int)
, _hd630i_publication_minute :: !(Maybe Int)
, _hd630i_publication_second :: !(Maybe Int)
, _hd630i_language_iso2 :: !(Maybe Text)
, _hd630i_institutes_tree :: !(Maybe (Map Text Text))
} deriving (Show, Generic)
instance FromJSON Hyperdata630Incorrect
where
parseJSON = genericParseJSON
( defaultOptions { sumEncoding = ObjectWithSingleField
, fieldLabelModifier = unCapitalize . dropPrefix "_hd630i_"
, omitNothingFields = True
}
)
instance ToJSON Hyperdata630Incorrect
where
toJSON = genericToJSON
( defaultOptions { sumEncoding = ObjectWithSingleField
, fieldLabelModifier = unCapitalize . dropPrefix "_hd630i_"
, omitNothingFields = True
}
)
instance PSQL.FromField Hyperdata630Incorrect
where
fromField = fromField'
fix630Hyperdata :: Hyperdata630Incorrect -> HyperdataDocument
fix630Hyperdata Hyperdata630Incorrect { .. } =
HyperdataDocument { _hd_bdd = _hd630i_bdd
, _hd_doi = _hd630i_doi
, _hd_url = _hd630i_url
, _hd_page = _hd630i_page
, _hd_title = _hd630i_title
, _hd_authors = _hd630i_authors
, _hd_institutes = _hd630i_institutes
, _hd_source = _hd630i_source
, _hd_abstract = _hd630i_abstract
, _hd_publication_date = _hd630i_publication_date
, _hd_publication_year = _hd630i_publication_year
, _hd_publication_month = _hd630i_publication_month
, _hd_publication_day = _hd630i_publication_day
, _hd_publication_hour = _hd630i_publication_hour
, _hd_publication_minute = _hd630i_publication_minute
, _hd_publication_second = _hd630i_publication_second
, _hd_language_iso2 = _hd630i_language_iso2
, _hd_institutes_tree }
where
_hd_institutes_tree = Map.map (\v -> [v]) <$> _hd630i_institutes_tree
...@@ -129,6 +129,12 @@ data WorkerStatsArgs = WorkerStatsArgs ...@@ -129,6 +129,12 @@ data WorkerStatsArgs = WorkerStatsArgs
{ ws_toml :: !SettingsFile { ws_toml :: !SettingsFile
} deriving (Show, Eq) } deriving (Show, Eq)
-- arguments when DB fixing is needed
data DBFixArgs = DBFixArgs {
settings_toml :: !SettingsFile
, dry_run :: !Bool
} deriving (Show, Eq)
data CLICmd data CLICmd
= CCMD_admin !AdminArgs = CCMD_admin !AdminArgs
| CCMD_clean_csv_corpus | CCMD_clean_csv_corpus
...@@ -145,6 +151,7 @@ data CLICmd ...@@ -145,6 +151,7 @@ data CLICmd
| CCMD_server !CLIServer | CCMD_server !CLIServer
| CCMD_upgrade !UpgradeArgs | CCMD_upgrade !UpgradeArgs
| CCMD_worker !CLIWorker | CCMD_worker !CLIWorker
| CCMD_db_fix_630 DBFixArgs
deriving (Show, Eq) deriving (Show, Eq)
data CLI = data CLI =
......
...@@ -13,12 +13,12 @@ Main specifications to index a corpus with a term list ...@@ -13,12 +13,12 @@ Main specifications to index a corpus with a term list
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
{-# LANGUAGE Strict #-} {-# LANGUAGE Strict #-}
{-# LANGUAGE TypeApplications #-}
module Main where module Main where
import CLI.Admin (adminCLI, adminCmd) import CLI.Admin (adminCLI, adminCmd)
import CLI.DBFixes (fixBrokenHyperdata630Cmd, fixBrokenHyperdata630CLI)
import CLI.FileDiff (fileDiffCLI, fileDiffCmd) import CLI.FileDiff (fileDiffCLI, fileDiffCmd)
import CLI.FilterTermsAndCooc import CLI.FilterTermsAndCooc
import CLI.Import (importCLI, importCmd) import CLI.Import (importCLI, importCmd)
...@@ -69,6 +69,8 @@ runCLI = \case ...@@ -69,6 +69,8 @@ runCLI = \case
-> serverCLI args -> serverCLI args
CLISub (CCMD_worker args) CLISub (CCMD_worker args)
-> workerCLI args -> workerCLI args
CLISub (CCMD_db_fix_630 args)
-> fixBrokenHyperdata630CLI args
main :: IO () main :: IO ()
...@@ -94,5 +96,6 @@ allOptions = subparser ( ...@@ -94,5 +96,6 @@ allOptions = subparser (
fileDiffCmd <> fileDiffCmd <>
routesCmd <> routesCmd <>
serverCmd <> serverCmd <>
workerCmd workerCmd <>
fixBrokenHyperdata630Cmd
) )
...@@ -292,6 +292,7 @@ library ...@@ -292,6 +292,7 @@ library
Gargantext.Database.Admin.Types.Hyperdata.Corpus Gargantext.Database.Admin.Types.Hyperdata.Corpus
Gargantext.Database.Admin.Types.Hyperdata.Document Gargantext.Database.Admin.Types.Hyperdata.Document
Gargantext.Database.Admin.Types.Hyperdata.Folder Gargantext.Database.Admin.Types.Hyperdata.Folder
Gargantext.Database.Admin.Types.Hyperdata.Prelude
Gargantext.Database.Admin.Types.Node Gargantext.Database.Admin.Types.Node
Gargantext.Database.Class Gargantext.Database.Class
Gargantext.Database.Prelude Gargantext.Database.Prelude
...@@ -461,7 +462,6 @@ library ...@@ -461,7 +462,6 @@ library
Gargantext.Database.Admin.Types.Hyperdata.List Gargantext.Database.Admin.Types.Hyperdata.List
Gargantext.Database.Admin.Types.Hyperdata.Model Gargantext.Database.Admin.Types.Hyperdata.Model
Gargantext.Database.Admin.Types.Hyperdata.Phylo Gargantext.Database.Admin.Types.Hyperdata.Phylo
Gargantext.Database.Admin.Types.Hyperdata.Prelude
Gargantext.Database.Admin.Types.Hyperdata.Texts Gargantext.Database.Admin.Types.Hyperdata.Texts
Gargantext.Database.Admin.Types.Hyperdata.User Gargantext.Database.Admin.Types.Hyperdata.User
Gargantext.Database.Admin.Types.Metrics Gargantext.Database.Admin.Types.Metrics
...@@ -673,6 +673,7 @@ executable gargantext ...@@ -673,6 +673,7 @@ executable gargantext
main-is: Main.hs main-is: Main.hs
other-modules: other-modules:
CLI.Admin CLI.Admin
CLI.DBFixes
CLI.FileDiff CLI.FileDiff
CLI.FilterTermsAndCooc CLI.FilterTermsAndCooc
CLI.Import CLI.Import
...@@ -705,6 +706,7 @@ executable gargantext ...@@ -705,6 +706,7 @@ executable gargantext
, gargantext , gargantext
, gargantext-prelude , gargantext-prelude
, haskell-bee , haskell-bee
, lens >= 5.2.2 && < 5.3
, MonadRandom ^>= 0.6 , MonadRandom ^>= 0.6
, optparse-applicative , optparse-applicative
, postgresql-simple >= 0.6.4 && <= 0.7.0.0 , postgresql-simple >= 0.6.4 && <= 0.7.0.0
......
...@@ -13,11 +13,11 @@ Portability : POSIX ...@@ -13,11 +13,11 @@ Portability : POSIX
module Gargantext.Database.Admin.Types.Hyperdata.Document where module Gargantext.Database.Admin.Types.Hyperdata.Document where
import Gargantext.Prelude hiding (ByteString) import Codec.Serialise.Class hiding (decode)
import Gargantext.Core.Text (HasText(..)) import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix) import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix)
import Gargantext.Database.Admin.Types.Hyperdata.Prelude import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import Codec.Serialise.Class hiding (decode) import Gargantext.Prelude hiding (ByteString)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe Text) data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe Text)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment