Commit 062f0c68 authored by Fabien Maniere's avatar Fabien Maniere

Merge branch '505-dev-fix-hyperdata-institutes-tree' into 'dev'

[CLI] db fix command, to fix hyperdata #630

See merge request !442
parents 3a6b2629 050f8b99
Pipeline #7877 passed with stages
in 53 minutes and 36 seconds
{-|
Module : CLI.DBFixes
Description : Gargantext CLI DB fixes
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Fixes to the GarganText DB
-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module CLI.DBFixes where
import CLI.Parsers (settings_p)
import CLI.Types
import Control.Lens qualified as Lens
import Data.Aeson qualified as Aeson
import Data.Map.Strict qualified as Map
import Data.Text qualified as T
import Database.PostgreSQL.Simple qualified as PSQL
import Database.PostgreSQL.Simple.FromField qualified as PSQL
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Gargantext.Core.Config (gc_database_config, gc_logging)
import Gargantext.Core.Config.Types (_SettingsFile)
import Gargantext.Core.Config.Utils (readConfig)
import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..), hd_institutes_tree)
import Gargantext.Database.Admin.Types.Hyperdata.Prelude (fieldLabelModifier, omitNothingFields, sumEncoding, SumEncoding(..), genericParseJSON, genericToJSON, defaultOptions, fromField')
import Gargantext.Prelude
import Gargantext.Prelude.Database (runPGSExecute, runPGSQuery)
import Gargantext.System.Logging (LogLevel(..), logLoc, withLogger)
import Options.Applicative
fixBrokenHyperdata630Cmd :: HasCallStack => Mod CommandFields CLI
fixBrokenHyperdata630Cmd =
command "fix-broken-hyperdata-630" (info (helper <*> (fmap CLISub $ fmap CCMD_db_fix_630 dbFixParser))
(progDesc "Fix broken hyperdata, issue #630"))
dbFixParser :: Parser DBFixArgs
dbFixParser = DBFixArgs
<$> settings_p
<*> flag True False ( long "no-dry-run"
<> help "Whether to dry run the DB fix (enabled by default)" )
-- | Fixes issue https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/630
-- Basically, because of Haskell HyperdataDocument type change in 2024, there remain
-- documents in `contexts` table, where the `hyperdata->institutes_tree` is a map
-- from `Text` to `Text`, instead being a map from `Text` to `[Text]` (see
-- G.D.A.T.Hyperdata.Document).
-- We make a SQL query to find hyperdata containing `Text` in value,
-- parse these rows with Haskell to make sure this is indeed broken,
-- then type-safely conver to good values and update the row.
fixBrokenHyperdata630CLI :: DBFixArgs -> IO ()
fixBrokenHyperdata630CLI (DBFixArgs { settings_toml, dry_run }) = do
cfg <- liftIO $ readConfig settings_toml
withLogger (cfg ^. gc_logging) $ \logger -> do
$(logLoc) logger INFO $ "settings file: " <> T.pack (_SettingsFile settings_toml)
let dbConfig = cfg ^. gc_database_config
$(logLoc) logger INFO $ "DB config: " <> show dbConfig
c <- PSQL.connect dbConfig
PSQL.begin c
affectedContexts <- runPGSQuery c affectedDocsQ () :: IO [(Int, Hyperdata630Incorrect)]
mapM_ (\(cId, cHyperdata) -> do
putText $ "[" <> show cId <> "] " <> show (cHyperdata ^. Lens.to _hd630i_institutes_tree)
let fixedHyperdata = fix630Hyperdata cHyperdata
putText $ " -> " <> show (fixedHyperdata ^. hd_institutes_tree)
_ <- runPGSExecute c updateAffectedDoc (Aeson.encode $ fixedHyperdata ^. hd_institutes_tree, cId)
-- Check that the hyperdata can be fetched correctly
newHd <- runPGSQuery c [sql|SELECT hyperdata FROM contexts WHERE id = ?|] (PSQL.Only cId) :: IO [PSQL.Only HyperdataDocument]
unless (length newHd == 1) $ do
PSQL.rollback c
panicTrace $ "[" <> show cId <> "] Error! Can't get new hyperdata: " <> show newHd
putText $ "[" <> show cId <> "] updated correctly"
) affectedContexts
if dry_run then do
$(logLoc) logger INFO $ "dry run mode, rolling back changes"
PSQL.rollback c
else do
$(logLoc) logger INFO $ "no dry run mode, comitting changes"
PSQL.commit c
where
-- | SQL query to find all affected documents:
affectedDocsQ = [sql|
SELECT
c.id, c.hyperdata
FROM
contexts AS c
WHERE
-- does the hyperdata contain values with strings, instead of arrays?
EXISTS (
SELECT 1
FROM jsonb_each(c.hyperdata->'institutes_tree') AS kv(key, value) -- expand the top‑level map
WHERE
c.hyperdata->'institutes_tree' IS NOT NULL
AND jsonb_typeof(kv.value) = 'string' -- a plain string found
);
|]
updateAffectedDoc = [sql|
UPDATE contexts
SET hyperdata = jsonb_set(hyperdata, '{institutes_tree}', ?::jsonb)
WHERE id = ?
|]
data Hyperdata630Incorrect = Hyperdata630Incorrect {
_hd630i_bdd :: !(Maybe Text)
, _hd630i_doi :: !(Maybe Text)
, _hd630i_url :: !(Maybe Text)
, _hd630i_page :: !(Maybe Int)
, _hd630i_title :: !(Maybe Text)
, _hd630i_authors :: !(Maybe Text)
, _hd630i_institutes :: !(Maybe Text)
, _hd630i_source :: !(Maybe Text)
, _hd630i_abstract :: !(Maybe Text)
, _hd630i_publication_date :: !(Maybe Text)
, _hd630i_publication_year :: !(Maybe Int)
, _hd630i_publication_month :: !(Maybe Int)
, _hd630i_publication_day :: !(Maybe Int)
, _hd630i_publication_hour :: !(Maybe Int)
, _hd630i_publication_minute :: !(Maybe Int)
, _hd630i_publication_second :: !(Maybe Int)
, _hd630i_language_iso2 :: !(Maybe Text)
, _hd630i_institutes_tree :: !(Maybe (Map Text Text))
} deriving (Show, Generic)
instance FromJSON Hyperdata630Incorrect
where
parseJSON = genericParseJSON
( defaultOptions { sumEncoding = ObjectWithSingleField
, fieldLabelModifier = unCapitalize . dropPrefix "_hd630i_"
, omitNothingFields = True
}
)
instance ToJSON Hyperdata630Incorrect
where
toJSON = genericToJSON
( defaultOptions { sumEncoding = ObjectWithSingleField
, fieldLabelModifier = unCapitalize . dropPrefix "_hd630i_"
, omitNothingFields = True
}
)
instance PSQL.FromField Hyperdata630Incorrect
where
fromField = fromField'
fix630Hyperdata :: Hyperdata630Incorrect -> HyperdataDocument
fix630Hyperdata Hyperdata630Incorrect { .. } =
HyperdataDocument { _hd_bdd = _hd630i_bdd
, _hd_doi = _hd630i_doi
, _hd_url = _hd630i_url
, _hd_page = _hd630i_page
, _hd_title = _hd630i_title
, _hd_authors = _hd630i_authors
, _hd_institutes = _hd630i_institutes
, _hd_source = _hd630i_source
, _hd_abstract = _hd630i_abstract
, _hd_publication_date = _hd630i_publication_date
, _hd_publication_year = _hd630i_publication_year
, _hd_publication_month = _hd630i_publication_month
, _hd_publication_day = _hd630i_publication_day
, _hd_publication_hour = _hd630i_publication_hour
, _hd_publication_minute = _hd630i_publication_minute
, _hd_publication_second = _hd630i_publication_second
, _hd_language_iso2 = _hd630i_language_iso2
, _hd_institutes_tree }
where
_hd_institutes_tree = Map.map (\v -> [v]) <$> _hd630i_institutes_tree
......@@ -129,6 +129,12 @@ data WorkerStatsArgs = WorkerStatsArgs
{ ws_toml :: !SettingsFile
} deriving (Show, Eq)
-- arguments when DB fixing is needed
data DBFixArgs = DBFixArgs {
settings_toml :: !SettingsFile
, dry_run :: !Bool
} deriving (Show, Eq)
data CLICmd
= CCMD_admin !AdminArgs
| CCMD_clean_csv_corpus
......@@ -145,6 +151,7 @@ data CLICmd
| CCMD_server !CLIServer
| CCMD_upgrade !UpgradeArgs
| CCMD_worker !CLIWorker
| CCMD_db_fix_630 DBFixArgs
deriving (Show, Eq)
data CLI =
......
......@@ -13,12 +13,12 @@ Main specifications to index a corpus with a term list
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE TypeApplications #-}
module Main where
import CLI.Admin (adminCLI, adminCmd)
import CLI.DBFixes (fixBrokenHyperdata630Cmd, fixBrokenHyperdata630CLI)
import CLI.FileDiff (fileDiffCLI, fileDiffCmd)
import CLI.FilterTermsAndCooc
import CLI.Import (importCLI, importCmd)
......@@ -69,6 +69,8 @@ runCLI = \case
-> serverCLI args
CLISub (CCMD_worker args)
-> workerCLI args
CLISub (CCMD_db_fix_630 args)
-> fixBrokenHyperdata630CLI args
main :: IO ()
......@@ -94,5 +96,6 @@ allOptions = subparser (
fileDiffCmd <>
routesCmd <>
serverCmd <>
workerCmd
workerCmd <>
fixBrokenHyperdata630Cmd
)
......@@ -292,6 +292,7 @@ library
Gargantext.Database.Admin.Types.Hyperdata.Corpus
Gargantext.Database.Admin.Types.Hyperdata.Document
Gargantext.Database.Admin.Types.Hyperdata.Folder
Gargantext.Database.Admin.Types.Hyperdata.Prelude
Gargantext.Database.Admin.Types.Node
Gargantext.Database.Class
Gargantext.Database.Prelude
......@@ -461,7 +462,6 @@ library
Gargantext.Database.Admin.Types.Hyperdata.List
Gargantext.Database.Admin.Types.Hyperdata.Model
Gargantext.Database.Admin.Types.Hyperdata.Phylo
Gargantext.Database.Admin.Types.Hyperdata.Prelude
Gargantext.Database.Admin.Types.Hyperdata.Texts
Gargantext.Database.Admin.Types.Hyperdata.User
Gargantext.Database.Admin.Types.Metrics
......@@ -673,6 +673,7 @@ executable gargantext
main-is: Main.hs
other-modules:
CLI.Admin
CLI.DBFixes
CLI.FileDiff
CLI.FilterTermsAndCooc
CLI.Import
......@@ -705,6 +706,7 @@ executable gargantext
, gargantext
, gargantext-prelude
, haskell-bee
, lens >= 5.2.2 && < 5.3
, MonadRandom ^>= 0.6
, optparse-applicative
, postgresql-simple >= 0.6.4 && <= 0.7.0.0
......
......@@ -13,11 +13,11 @@ Portability : POSIX
module Gargantext.Database.Admin.Types.Hyperdata.Document where
import Gargantext.Prelude hiding (ByteString)
import Codec.Serialise.Class hiding (decode)
import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Utils.Prefix (unCapitalize, dropPrefix)
import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import Codec.Serialise.Class hiding (decode)
import Gargantext.Prelude hiding (ByteString)
------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe Text)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment