From c0db80b3d85630a026dc722ae0abe7dd51e6ffc8 Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Fri, 19 Jun 2020 06:29:36 -0400 Subject: [PATCH v8 3/5] Implement ALTER SYSTEM READ ONLY using global barrier. Implementation: 1. When a user tries to change the server state to WAL-Prohibited, either with the ALTER SYSTEM READ ONLY command or by calling the pg_alter_wal_prohibit_state(true) SQL function, the current state generation in shared memory is marked as in progress and the checkpointer process is signaled. The checkpointer, noticing that the current state generation has the WALPROHIBIT_TRANSITION_IN_PROGRESS flag set, emits the barrier request and then acknowledges back to the backend that requested the state change once the transition has been completed. The final state is written to the control file to make it persistent across system restarts. 2. When a backend receives the WAL-Prohibited barrier, if it is already in a transaction that has been assigned an XID, the backend is killed by throwing FATAL (XXX: need more discussion on this). 3. Otherwise, if the backend is running a transaction without a valid XID, we don't need to do anything special right now; we simply call ResetLocalXLogInsertAllowed() so that any future WAL insert will check XLogInsertAllowed() first, which sets the read-only state appropriately. 4. A new transaction (in an existing or in a new backend) starts as a read-only transaction. 5. The autovacuum launcher, as well as the checkpointer, will not do anything while the server is in the WAL-Prohibited state until someone wakes it up. E.g. a user might later request, through a backend, to put the system back to read-write by executing ALTER SYSTEM READ WRITE. 6. At shutdown in WAL-Prohibited mode, we'll skip shutdown checkpoint and xlog rotation. 
Starting up again will perform crash recovery (XXX: need some discussion on this as well), but the end-of-recovery checkpoint will be skipped and performed later, when the system is changed to WAL-Permitted mode. 7. ALTER SYSTEM READ ONLY/WRITE is not allowed on a standby server. 8. To execute ALTER SYSTEM READ ONLY/WRITE, the user must have execute permission on the pg_alter_wal_prohibit_state() function. 9. Add a system_is_read_only GUC to show the system state; it is true when the system is WAL prohibited or in recovery. --- src/backend/access/transam/Makefile | 1 + src/backend/access/transam/walprohibit.c | 390 +++++++++++++++++++++++ src/backend/access/transam/xact.c | 37 ++- src/backend/access/transam/xlog.c | 117 ++++++- src/backend/catalog/system_views.sql | 2 + src/backend/postmaster/autovacuum.c | 4 + src/backend/postmaster/bgwriter.c | 2 +- src/backend/postmaster/checkpointer.c | 39 +++ src/backend/postmaster/pgstat.c | 3 + src/backend/storage/ipc/ipci.c | 6 + src/backend/storage/ipc/procsignal.c | 26 +- src/backend/tcop/utility.c | 15 +- src/backend/utils/misc/guc.c | 26 ++ src/bin/pg_controldata/pg_controldata.c | 2 + src/include/access/walprohibit.h | 94 ++++++ src/include/access/xlog.h | 4 + src/include/catalog/pg_control.h | 3 + src/include/catalog/pg_proc.dat | 4 + src/include/pgstat.h | 1 + src/include/postmaster/bgwriter.h | 2 + src/include/storage/procsignal.h | 7 +- src/tools/pgindent/typedefs.list | 1 + 22 files changed, 717 insertions(+), 69 deletions(-) create mode 100644 src/backend/access/transam/walprohibit.c create mode 100644 src/include/access/walprohibit.h diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 595e02de722..b5322a69954 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -26,6 +26,7 @@ OBJS = \ twophase.o \ twophase_rmgr.o \ varsup.o \ + walprohibit.o \ xact.o \ xlog.o \ xlogarchive.o \ diff --git a/src/backend/access/transam/walprohibit.c 
b/src/backend/access/transam/walprohibit.c new file mode 100644 index 00000000000..75f3924cc97 --- /dev/null +++ b/src/backend/access/transam/walprohibit.c @@ -0,0 +1,390 @@ +/*------------------------------------------------------------------------- + * + * walprohibit.c + * PostgreSQL write-ahead log prohibit states + * + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/backend/access/transam/walprohibit.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/walprohibit.h" +#include "fmgr.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "postmaster/bgwriter.h" +#include "storage/condition_variable.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "utils/acl.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" + +/* + * Shared-memory WAL prohibit state + */ +typedef struct WALProhibitStateData +{ + /* + * Indicates current WAL prohibit state generation and the last two bits of + * this generation indicates current wal prohibit state. + */ + pg_atomic_uint32 shared_state_generation; + + /* Signaled when requested WAL prohibit state changes */ + ConditionVariable walprohibit_cv; +} WALProhibitStateData; + +static WALProhibitStateData *WALProhibitState = NULL; + +static void RequestWALProhibitChange(uint32 cur_state_gen); + +/* + * ProcessBarrierWALProhibit() + * + * Handle WAL prohibit state change request. + */ +bool +ProcessBarrierWALProhibit(void) +{ + /* + * Kill off any transactions that have an XID *before* allowing the system + * to go WAL prohibit state. + */ + if (FullTransactionIdIsValid(GetTopFullTransactionIdIfAny())) + { + /* + * Should be here only while transiting towards the WAL prohibit state. 
+ */ + Assert(WALPROHIBIT_CURRENT_STATE(GetWALProhibitStateGen()) == + WALPROHIBIT_STATE_GOING_READ_ONLY); + + /* + * XXX: Kill off the whole session by throwing FATAL instead of + * killing transaction by throwing ERROR due to following reasons that + * need be thought: + * + * 1. Due to some presents challenges with the wire protocol, we could + * not simply kill of idle transaction. + * + * 2. If we are here in subtransaction then the ERROR will kill the + * current subtransaction only. In the case of invalidations, that + * might be good enough, but for XID assignment it's not, because + * assigning an XID to a subtransaction also causes higher + * sub-transaction levels and the parent transaction to get XIDs. + */ + ereport(FATAL, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("system is now read only"), + errhint("Sessions with open write transactions must be terminated."))); + } + + /* Return to "check" state */ + ResetLocalXLogInsertAllowed(); + + return true; +} + +/* + * AlterSystemSetWALProhibitState() + * + * Execute ALTER SYSTEM READ { ONLY | WRITE } statement. + */ +void +AlterSystemSetWALProhibitState(AlterSystemWALProhibitState *stmt) +{ + /* Check permission for pg_alter_wal_prohibit_state() */ + if (pg_proc_aclcheck(F_PG_ALTER_WAL_PROHIBIT_STATE, + GetUserId(), ACL_EXECUTE) != ACLCHECK_OK) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for command ALTER SYSTEM"), + errhint("Get execute permission for pg_alter_wal_prohibit_state() to this user."))); + + /* Alter WAL prohibit state not allowed during recovery */ + PreventCommandDuringRecovery("ALTER SYSTEM"); + + /* Execute function to alter wal prohibit state */ + (void) OidFunctionCall1(F_PG_ALTER_WAL_PROHIBIT_STATE, + BoolGetDatum(stmt->walprohibited)); +} + +/* + * pg_alter_wal_prohibit_state() + * + * SQL callable function to alter system read write state. 
+ */ +Datum +pg_alter_wal_prohibit_state(PG_FUNCTION_ARGS) +{ + bool walprohibited = PG_GETARG_BOOL(0); + uint32 cur_state_gen; + + /* Alter WAL prohibit state not allowed during recovery */ + PreventCommandDuringRecovery("pg_alter_wal_prohibit_state()"); + + /* + * It is not a final state since we yet to convey this WAL prohibit state to + * all backend. + */ + cur_state_gen = SetWALProhibitState(walprohibited, false); + + /* Server is already in requested state */ + if (!cur_state_gen) + PG_RETURN_VOID(); + + /* + * Signal the checkpointer to do the actual state transition, and wait for + * the state change to occur. + */ + RequestWALProhibitChange(cur_state_gen); + + PG_RETURN_VOID(); +} + +/* + * RequestWALProhibitChange() + * + * Request checkpointer to make the WALProhibitState to read-only. + */ +static void +RequestWALProhibitChange(uint32 cur_state_gen) +{ + /* Must not be called from checkpointer */ + Assert(!AmCheckpointerProcess()); + Assert(GetWALProhibitStateGen() & WALPROHIBIT_TRANSITION_IN_PROGRESS); + + /* + * If in a standalone backend, just do it ourselves. + */ + if (!IsPostmasterEnvironment) + { + CompleteWALProhibitChange(cur_state_gen); + return; + } + + /* Signal checkpointer process */ + SendsSignalToCheckpointer(SIGINT); + + /* Wait for the state to change to read-only */ + ConditionVariablePrepareToSleep(&WALProhibitState->walprohibit_cv); + for (;;) + { + /* We'll be done once wal prohibit state generation changes */ + if (GetWALProhibitStateGen() != cur_state_gen) + break; + + ConditionVariableSleep(&WALProhibitState->walprohibit_cv, + WAIT_EVENT_WALPROHIBIT_STATE_CHANGE); + } + ConditionVariableCancelSleep(); +} + +/* + * CompleteWALProhibitChange() + * + * Checkpointer will call this to complete the requested WAL prohibit state + * transition. + */ +void +CompleteWALProhibitChange(uint32 cur_state_gen) +{ + uint64 barrier_gen; + bool wal_prohibited; + + /* + * Must be called from checkpointer. 
Otherwise, it must be single-user + * backend. + */ + Assert(AmCheckpointerProcess() || !IsPostmasterEnvironment); + Assert(cur_state_gen & WALPROHIBIT_TRANSITION_IN_PROGRESS); + + /* + * WAL prohibit state change is initiated. We need to complete the state + * transition by setting requested WAL prohibit state in all backends. + */ + elog(DEBUG1, "waiting for backends to adopt requested WAL prohibit state change"); + + /* Emit global barrier */ + barrier_gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_WALPROHIBIT); + WaitForProcSignalBarrier(barrier_gen); + + /* And flush all inserts. */ + XLogFlush(GetXLogInsertRecPtr()); + + wal_prohibited = + (WALPROHIBIT_NEXT_STATE(cur_state_gen) == WALPROHIBIT_STATE_READ_ONLY); + + /* Set the final state */ + (void) SetWALProhibitState(wal_prohibited, true); + + /* Update the control file to make state persistent */ + SetControlFileWALProhibitFlag(wal_prohibited); + + if (wal_prohibited) + ereport(LOG, (errmsg("system is now read only"))); + else + { + /* + * Request checkpoint if the end-of-recovery checkpoint has been skipped + * previously. + */ + if (LastCheckPointIsSkipped()) + { + RequestCheckpoint(CHECKPOINT_IMMEDIATE); + SetLastCheckPointSkipped(false); + } + ereport(LOG, (errmsg("system is now read write"))); + } + + /* Wake up all backends waiting on this. */ + ConditionVariableBroadcast(&WALProhibitState->walprohibit_cv); +} + +/* + * GetWALProhibitStateGen() + * + * Atomically return the current server WAL prohibited state generation. + */ +uint32 +GetWALProhibitStateGen(void) +{ + return pg_atomic_read_u32(&WALProhibitState->shared_state_generation); +} + +/* + * SetWALProhibitState() + * + * Increments current shared WAL prohibit state generation concerning to + * requested state and returns the same. 
+ * + * For the transition state request where is_final_state is false if the server + * desired transition state is the same as the current state which might have + * been requested by some other backend and has been proceeded then the current + * wal prohibit generation will be returned so that this backend can wait until + * the shared wal prohibited generation change for the final state. And, if the + * server is already completely moved to the requested state then the requester + * backend doesn't need to wait, in that case, 0 will be returned. + * + * For the final state request which can be only requested by the checkpointer + * or by the single-user so that there is no chance that the server already is + * in the desired final state. + */ +uint32 +SetWALProhibitState(bool wal_prohibited, bool is_final_state) +{ + uint32 new_state; + uint32 cur_state; + uint32 cur_state_gen; + uint32 next_state_gen; + + /* Get the current state */ + cur_state_gen = GetWALProhibitStateGen(); + cur_state = WALPROHIBIT_CURRENT_STATE(cur_state_gen); + + /* Compute new state */ + if (is_final_state) + { + /* + * Only checkpointer or single-user can set the final wal prohibit + * state. + */ + Assert(AmCheckpointerProcess() || !IsPostmasterEnvironment); + + new_state = wal_prohibited ? + WALPROHIBIT_STATE_READ_ONLY : WALPROHIBIT_STATE_READ_WRITE; + + /* + * There won't be any other process for the final state setting so that + * the next final state will be the desired state. + */ + Assert(WALPROHIBIT_NEXT_STATE(cur_state) == new_state); + } + else + { + new_state = wal_prohibited ? 
+ WALPROHIBIT_STATE_GOING_READ_ONLY : + WALPROHIBIT_STATE_GOING_READ_WRITE; + + /* Server is already in the requested transition state */ + if (cur_state == new_state) + return cur_state; /* Wait for state transition completion */ + + /* Server is already in requested state */ + if (WALPROHIBIT_NEXT_STATE(new_state) == cur_state) + return 0; /* No wait is needed */ + + /* Prevent concurrent contrary in progress transition state setting */ + if (cur_state & WALPROHIBIT_TRANSITION_IN_PROGRESS) + { + if (cur_state & WALPROHIBIT_STATE_READ_ONLY) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("system state transition to read only is already in progress"), + errhint("Try after sometime again."))); + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("system state transition to read write is already in progress"), + errhint("Try after sometime again."))); + } + } + + /* + * Update new state generation in share memory only if the state generation + * hasn't changed until now we have checked. + */ + next_state_gen = cur_state_gen + 1; + (void) pg_atomic_compare_exchange_u32(&WALProhibitState->shared_state_generation, + &cur_state_gen, next_state_gen); + + /* To be sure that any later reads of memory happen strictly after this. */ + pg_memory_barrier(); + + return next_state_gen; +} + +/* + * WALProhibitStateGenerationInit() + * + * Initialization of shared wal prohibit state generation. + */ +void +WALProhibitStateGenerationInit(bool wal_prohibited) +{ + uint32 new_state; + + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + + new_state = wal_prohibited ? + WALPROHIBIT_STATE_READ_ONLY : WALPROHIBIT_STATE_READ_WRITE; + + pg_atomic_init_u32(&WALProhibitState->shared_state_generation, new_state); +} + +/* + * WALProhibitStateShmemInit() + * + * Initialization of shared memory for WAL prohibit state. 
+ */ +void +WALProhibitStateShmemInit(void) +{ + bool found; + + WALProhibitState = (WALProhibitStateData *) + ShmemInitStruct("WAL Prohibit State", + sizeof(WALProhibitStateData), + &found); + + if (found) + return; + + /* First time through ... */ + memset(WALProhibitState, 0, sizeof(WALProhibitStateData)); + ConditionVariableInit(&WALProhibitState->walprohibit_cv); +} diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index af6afcebb13..188c299bed9 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1962,23 +1962,28 @@ StartTransaction(void) Assert(s->prevSecContext == 0); /* - * Make sure we've reset xact state variables + * Reset xact state variables. * - * If recovery is still in progress, mark this transaction as read-only. - * We have lower level defences in XLogInsert and elsewhere to stop us - * from modifying data during recovery, but this gives the normal - * indication to the user that the transaction is read-only. - */ - if (RecoveryInProgress()) - { - s->startedInRecovery = true; - XactReadOnly = true; - } - else - { - s->startedInRecovery = false; - XactReadOnly = DefaultXactReadOnly; - } + * If it is not currently possible to insert write-ahead log records, + * either because we are still in recovery or because ALTER SYSTEM READ + * ONLY has been executed, force this to be a read-only transaction. + * We have lower level defences in XLogBeginInsert() and elsewhere to stop + * us from modifying data during recovery when !XLogInsertAllowed(), but + * this gives the normal indication to the user that the transaction is + * read-only. + * + * On the other hand, we only need to set the startedInRecovery flag when + * the transaction started during recovery, and not when WAL is otherwise + * prohibited. This information is used by RelationGetIndexScan() to + * decide whether to permit (1) relying on existing killed-tuple markings + * and (2) further killing of index tuples. 
Even when WAL is prohibited + * on the master, it's still the master, so the former is OK; and since + * killing index tuples doesn't generate WAL, the latter is also OK. + * See comments in RelationGetIndexScan() and MarkBufferDirtyHint(). + */ + XactReadOnly = DefaultXactReadOnly || !XLogInsertAllowed(); + s->startedInRecovery = RecoveryInProgress(); + XactDeferrable = DefaultXactDeferrable; XactIsoLevel = DefaultXactIsoLevel; forceSyncCommit = false; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 61754312e26..bfba75bbe80 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/timeline.h" #include "access/transam.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogarchive.h" @@ -246,9 +247,10 @@ static bool LocalPromoteIsTriggered = false; * 0: unconditionally not allowed to insert XLOG * -1: must check RecoveryInProgress(); disallow until it is false * Most processes start with -1 and transition to 1 after seeing that recovery - * is not in progress. But we can also force the value for special cases. - * The coding in XLogInsertAllowed() depends on the first two of these states - * being numerically the same as bool true and false. + * is not in progress or the server state is not a WAL prohibited state. But + * we can also force the value for special cases. The coding in + * XLogInsertAllowed() depends on the first two of these states being + * numerically the same as bool true and false. */ static int LocalXLogInsertAllowed = -1; @@ -723,6 +725,11 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * lastCheckPointSkipped indicates if the last checkpoint is skipped. 
+ */ + bool lastCheckPointSkipped; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -968,6 +975,7 @@ static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static inline bool IsWALProhibited(void); /* * Insert an XLOG record represented by an already-constructed chain of data @@ -6196,6 +6204,32 @@ SetCurrentChunkStartTime(TimestampTz xtime) SpinLockRelease(&XLogCtl->info_lck); } +/* + * Set or unset flag to indicating that the last checkpoint has been skipped. + */ +void +SetLastCheckPointSkipped(bool ChkptSkip) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastCheckPointSkipped = ChkptSkip; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Return value of lastCheckPointSkipped flag. + */ +bool +LastCheckPointIsSkipped(void) +{ + bool ChkptSkipped; + + SpinLockAcquire(&XLogCtl->info_lck); + ChkptSkipped = XLogCtl->lastCheckPointSkipped; + SpinLockRelease(&XLogCtl->info_lck); + + return ChkptSkipped; +} + /* * Fetch timestamp of latest processed commit/abort record. * Startup process maintains an accurate local copy in XLogReceiptTime @@ -7708,6 +7742,12 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; + /* + * Before enabling WAL insertion, initialize WAL prohibit state in shared + * memory that will decide the further WAL insert should be allowed or not. + */ + WALProhibitStateGenerationInit(ControlFile->wal_prohibited); + /* * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE * record before resource manager writes cleanup WAL records or checkpoint @@ -7718,7 +7758,17 @@ StartupXLOG(void) UpdateFullPageWrites(); LocalXLogInsertAllowed = -1; - if (InRecovery) + /* + * Skip end-of-recovery checkpoint if the system is in WAL prohibited state. 
+ */ + if (ControlFile->wal_prohibited && InRecovery) + { + SetLastCheckPointSkipped(true); + + ereport(LOG, + (errmsg("skipping startup checkpoint because the system is read only"))); + } + else if (InRecovery) { /* * Perform a checkpoint to update all our recovery activity to disk. @@ -7964,6 +8014,28 @@ StartupXLOG(void) RequestCheckpoint(CHECKPOINT_FORCE); } +/* Set ControlFile's WAL prohibit flag */ +void +SetControlFileWALProhibitFlag(bool wal_prohibited) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->wal_prohibited = wal_prohibited; + UpdateControlFile(); + LWLockRelease(ControlFileLock); +} + +/* + * Is the system still in WAL prohibited state? + */ +static inline bool +IsWALProhibited(void) +{ + uint32 cur_state = WALPROHIBIT_CURRENT_STATE(GetWALProhibitStateGen()); + + return (cur_state != WALPROHIBIT_STATE_READ_WRITE && + cur_state != WALPROHIBIT_STATE_GOING_READ_WRITE); +} + /* * Checks if recovery has reached a consistent state. When consistency is * reached and we have a valid starting standby snapshot, tell postmaster @@ -8179,9 +8251,9 @@ HotStandbyActiveInReplay(void) /* * Is this process allowed to insert new WAL records? * - * Ordinarily this is essentially equivalent to !RecoveryInProgress(). - * But we also have provisions for forcing the result "true" or "false" - * within specific processes regardless of the global state. + * Ordinarily this is essentially equivalent to !RecoveryInProgress() and + * !IsWALProhibited(). But we also have provisions for forcing the result + * "true" or "false" within specific processes regardless of the global state. */ bool XLogInsertAllowed(void) @@ -8200,9 +8272,20 @@ XLogInsertAllowed(void) if (RecoveryInProgress()) return false; + /* Or, in WAL prohibited state */ + if (IsWALProhibited()) + { + /* + * Set it to "unconditionally false" to avoid checking until it gets + * reset. 
+ */ + LocalXLogInsertAllowed = 0; + return false; + } + /* - * On exit from recovery, reset to "unconditionally true", since there is - * no need to keep checking. + * On exit from recovery or WAL prohibited state, reset to "unconditionally + * true", since there is no need to keep checking. */ LocalXLogInsertAllowed = 1; return true; @@ -8224,6 +8307,12 @@ LocalSetXLogInsertAllowed(void) InitXLOGAccess(); } +void +ResetLocalXLogInsertAllowed(void) +{ + LocalXLogInsertAllowed = -1; +} + /* * Subroutine to try to fetch and validate a prior checkpoint record. * @@ -8513,9 +8602,13 @@ ShutdownXLOG(int code, Datum arg) */ WalSndWaitStopping(); + /* + * The restartpoint, checkpoint, or xlog rotation will be performed if the + * WAL writing is permitted. + */ if (RecoveryInProgress()) CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); - else + else if (XLogInsertAllowed()) { /* * If archiving is enabled, rotate the last XLOG file so that all the @@ -8528,6 +8621,10 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } + else + ereport(LOG, + (errmsg("skipping shutdown checkpoint because the system is read only"))); + ShutdownCLOG(); ShutdownCommitTs(); ShutdownSUBTRANS(); diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ed4f3f142d8..79da249dd5c 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1485,6 +1485,8 @@ REVOKE EXECUTE ON FUNCTION pg_stat_file(text,boolean) FROM public; REVOKE EXECUTE ON FUNCTION pg_ls_dir(text) FROM public; REVOKE EXECUTE ON FUNCTION pg_ls_dir(text,boolean,boolean) FROM public; +REVOKE EXECUTE ON FUNCTION pg_alter_wal_prohibit_state(bool) FROM public; + -- -- We also set up some things as accessible to standard roles. 
-- diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 2cef56f115f..efee35cbc94 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -659,6 +659,10 @@ AutoVacLauncherMain(int argc, char *argv[]) HandleAutoVacLauncherInterrupts(); + /* If the server is read only just go back to sleep. */ + if (!XLogInsertAllowed()) + continue; + /* * a worker finished, or postmaster signaled failure to start a worker */ diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index a7afa758b61..1d9c46de20a 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -278,7 +278,7 @@ BackgroundWriterMain(void) * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ - if (XLogStandbyInfoActive() && !RecoveryInProgress()) + if (XLogStandbyInfoActive() && XLogInsertAllowed()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 3e7dcd4f764..e2ff484d367 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -39,6 +39,7 @@ #include #include +#include "access/walprohibit.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "libpq/pqsignal.h" @@ -342,6 +343,7 @@ CheckpointerMain(void) pg_time_t now; int elapsed_secs; int cur_timeout; + uint32 wal_state_gen; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -352,6 +354,30 @@ CheckpointerMain(void) AbsorbSyncRequests(); HandleCheckpointerInterrupts(); + wal_state_gen = GetWALProhibitStateGen(); + + if (wal_state_gen & WALPROHIBIT_TRANSITION_IN_PROGRESS) + { + /* Complete WAL prohibit state change request */ + CompleteWALProhibitChange(wal_state_gen); + continue; + } + else if (WALPROHIBIT_CURRENT_STATE(wal_state_gen) == + WALPROHIBIT_STATE_READ_ONLY) + { + /* + * Don't do 
anything until someone wakes us up. For example a + * backend might later on request us to put the system back to + * read-write wal prohibit sate. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1, + WAIT_EVENT_CHECKPOINTER_MAIN); + continue; + } + + Assert(WALPROHIBIT_CURRENT_STATE(wal_state_gen) == + WALPROHIBIT_STATE_READ_WRITE); + /* * Detect a pending checkpoint request by checking whether the flags * word in shared memory is nonzero. We shouldn't need to acquire the @@ -1333,3 +1359,16 @@ FirstCallSinceLastCheckpoint(void) return FirstCall; } + +/* + * SendsSignalToCheckpointer allows a process to send a signal to the checkpoint process. + */ +void +SendsSignalToCheckpointer(int signum) +{ + if (CheckpointerShmem->checkpointer_pid == 0) + elog(ERROR, "checkpointer is not running"); + + if (kill(CheckpointerShmem->checkpointer_pid, signum) != 0) + elog(ERROR, "could not signal checkpointer: %m"); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index e6be2b7836a..95a738d7f25 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4060,6 +4060,9 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_REPLICATION_SLOT_WRITE: event_name = "ReplicationSlotWrite"; break; + case WAIT_EVENT_WALPROHIBIT_STATE_CHANGE: + event_name = "SystemWALProhibitStateChange"; + break; case WAIT_EVENT_SLRU_FLUSH_SYNC: event_name = "SLRUFlushSync"; break; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 96c2aaabbd6..2d000ec2ff7 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -22,6 +22,7 @@ #include "access/subtrans.h" #include "access/syncscan.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -222,6 +223,11 @@ CreateSharedMemoryAndSemaphores(void) MultiXactShmemInit(); InitBufferPool(); + /* + * Set up wal probibit shared state + */ + 
WALProhibitStateShmemInit(); + /* * Set up lock manager */ diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 63cb70bcaa4..4f1b67f9d04 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/walprohibit.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -96,7 +97,6 @@ static volatile ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); -static bool ProcessBarrierPlaceholder(void); /* * ProcSignalShmemSize @@ -510,9 +510,9 @@ ProcessProcSignalBarrier(void) * unconditionally, but it's more efficient to call only the ones * that might need us to do something based on the flags. */ - if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_PLACEHOLDER) - && ProcessBarrierPlaceholder()) - BARRIER_CLEAR_BIT(flags, PROCSIGNAL_BARRIER_PLACEHOLDER); + if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_WALPROHIBIT) + && ProcessBarrierWALProhibit()) + BARRIER_CLEAR_BIT(flags, PROCSIGNAL_BARRIER_WALPROHIBIT); } PG_CATCH(); { @@ -554,24 +554,6 @@ ProcessProcSignalBarrier(void) pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); } -static bool -ProcessBarrierPlaceholder(void) -{ - /* - * XXX. This is just a placeholder until the first real user of this - * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to - * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something - * appropriately descriptive. Get rid of this function and instead have - * ProcessBarrierSomethingElse. Most likely, that function should live in - * the file pertaining to that subsystem, rather than here. - * - * The return value should be 'true' if the barrier was successfully - * absorbed and 'false' if not. 
Note that returning 'false' can lead to - * very frequent retries, so try hard to make that an uncommon case. - */ - return true; -} - /* * CheckProcSignal - check to see if a particular reason has been * signaled, and clear the signal flag. Should be called after receiving diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 74c2162cd59..05eac206182 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -19,6 +19,7 @@ #include "access/htup_details.h" #include "access/reloptions.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" @@ -85,7 +86,6 @@ static void ProcessUtilitySlow(ParseState *pstate, DestReceiver *dest, QueryCompletion *qc); static void ExecDropStmt(DropStmt *stmt, bool isTopLevel); -static void AlterSystemSetWALProhibitState(AlterSystemWALProhibitState *stmt); /* * CommandIsReadOnly: is an executable query read-only? @@ -3691,16 +3691,3 @@ GetCommandLogLevel(Node *parsetree) return lev; } - -/* - * AlterSystemSetWALProhibitState - * - * Execute ALTER SYSTEM READ { ONLY | WRITE } statement. 
- */ -static void -AlterSystemSetWALProhibitState(AlterSystemWALProhibitState *stmt) -{ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ALTER SYSTEM READ { ONLY | WRITE } not implemented"))); -} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 596bcb7b842..24113249f67 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -225,6 +225,7 @@ static bool check_recovery_target_lsn(char **newval, void **extra, GucSource sou static void assign_recovery_target_lsn(const char *newval, void *extra); static bool check_primary_slot_name(char **newval, void **extra, GucSource source); static bool check_default_with_oids(bool *newval, void **extra, GucSource source); +static const char *show_system_is_read_only(void); /* Private functions in guc-file.l that need to be called from guc.c */ static ConfigVariable *ProcessConfigFileInternal(GucContext context, @@ -615,6 +616,7 @@ static char *recovery_target_string; static char *recovery_target_xid_string; static char *recovery_target_name_string; static char *recovery_target_lsn_string; +static bool system_is_read_only; /* should be static, but commands/variable.c needs to get at this */ @@ -2036,6 +2038,18 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + /* Not for general use */ + {"system_is_read_only", PGC_INTERNAL, WAL, + gettext_noop("Shows whether the system is read only."), + NULL, + GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &system_is_read_only, + false, + NULL, NULL, show_system_is_read_only + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -12041,4 +12055,16 @@ check_default_with_oids(bool *newval, void **extra, GucSource source) return true; } +/* + * NB: The return string should be the same as the _ShowOption() for boolean + * type. 
+ */ +static const char * +show_system_is_read_only(void) +{ + if (!XLogInsertAllowed()) + return "on"; + return "off"; +} + #include "guc-file.c" diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 3e00ac0f701..922cd9641d8 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -290,6 +290,8 @@ main(int argc, char *argv[]) (uint32) ControlFile->backupEndPoint); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); + printf(_("WAL write prohibited: %s\n"), + ControlFile->wal_prohibited ? _("yes") : _("no")); printf(_("wal_level setting: %s\n"), wal_level_str(ControlFile->wal_level)); printf(_("wal_log_hints setting: %s\n"), diff --git a/src/include/access/walprohibit.h b/src/include/access/walprohibit.h new file mode 100644 index 00000000000..61836d61844 --- /dev/null +++ b/src/include/access/walprohibit.h @@ -0,0 +1,94 @@ +/* + * walprohibit.h + * + * PostgreSQL write-ahead log prohibit states + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/walprohibit.h + */ +#ifndef WALPROHIBIT_H +#define WALPROHIBIT_H + +#include "access/xact.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "nodes/parsenodes.h" + +extern bool ProcessBarrierWALProhibit(void); +extern void AlterSystemSetWALProhibitState(AlterSystemWALProhibitState *stmt); +extern void CompleteWALProhibitChange(uint32 wal_state); +extern uint32 GetWALProhibitStateGen(void); +extern uint32 SetWALProhibitState(bool wal_prohibited, bool is_final_state); +extern void MarkCheckPointSkippedInWalProhibitState(void); +extern void WALProhibitStateGenerationInit(bool wal_prohibited); +extern void WALProhibitStateShmemInit(void); + +/* + * The WAL Prohibit States. + * + * Odd values represent the transition states, whereas even values + * represent the final states. 
These states can be distinguished by checking + * bit 0, the transition bit. + */ +#define WALPROHIBIT_STATE_READ_WRITE ((uint32) 0) /* WAL permitted */ +#define WALPROHIBIT_STATE_GOING_READ_ONLY ((uint32) 1) +#define WALPROHIBIT_STATE_READ_ONLY ((uint32) 2) /* WAL prohibited */ +#define WALPROHIBIT_STATE_GOING_READ_WRITE ((uint32) 3) + +/* The transition bit to distinguish states. */ +#define WALPROHIBIT_TRANSITION_IN_PROGRESS ((uint32) 1 << 0) + +/* Extract the state held in the last two bits */ +#define WALPROHIBIT_CURRENT_STATE(stateGeneration) \ + ((uint32)(stateGeneration) & ((uint32) ((1 << 2) - 1))) +#define WALPROHIBIT_NEXT_STATE(stateGeneration) \ + WALPROHIBIT_CURRENT_STATE((stateGeneration) + 1) + +/* Callers must never reach here while WAL is prohibited. */ +static inline void +AssertWALPermitted(void) +{ + /* + * Recovery in the startup process is never in the WAL prohibited state. + */ + Assert(InRecovery || XLogInsertAllowed()); + +#ifdef USE_ASSERT_CHECKING + walpermit_checked_state = WALPERMIT_CHECKED; +#endif +} + +/* + * XID-bearing transactions are killed off by "ALTER SYSTEM READ ONLY", so any + * part of the code that can only be reached with an XID assigned is never + * reached when WAL is prohibited. + */ +static inline void +AssertWALPermittedHaveXID(void) +{ + /* Must be performing an INSERT, UPDATE or DELETE, so we'll have an XID */ + Assert(FullTransactionIdIsValid(GetTopFullTransactionIdIfAny())); + AssertWALPermitted(); +} + +/* + * Unlike the assertions above, a transaction that has no valid XID (e.g. + * VACUUM) is not killed when the system state changes to WAL prohibited. + * Therefore, we need to explicitly error out before entering the + * critical section. 
+ */ +static inline void +CheckWALPermitted(void) +{ + if (!XLogInsertAllowed()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("system is now read only"))); + +#ifdef USE_ASSERT_CHECKING + walpermit_checked_state = WALPERMIT_CHECKED; +#endif +} + +#endif /* WALPROHIBIT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 221af87e715..2bcd37894f9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -306,6 +306,7 @@ extern RecoveryState GetRecoveryState(void); extern bool HotStandbyActive(void); extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); +extern void ResetLocalXLogInsertAllowed(void); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern XLogRecPtr GetXLogInsertRecPtr(void); @@ -314,6 +315,8 @@ extern bool RecoveryIsPaused(void); extern void SetRecoveryPause(bool recoveryPause); extern TimestampTz GetLatestXTime(void); extern TimestampTz GetCurrentChunkReplayStartTime(void); +extern void SetLastCheckPointSkipped(bool ChkptSkip); +extern bool LastCheckPointIsSkipped(void); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); @@ -325,6 +328,7 @@ extern void XLOGShmemInit(void); extern void BootStrapXLOG(void); extern void LocalProcessControlFile(bool reset); extern void StartupXLOG(void); +extern void SetControlFileWALProhibitFlag(bool wal_prohibited); extern void ShutdownXLOG(int code, Datum arg); extern void InitXLOGAccess(void); extern void CreateCheckPoint(int flags); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06bed90c5e9..f4dc5412ee6 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -182,6 +182,9 @@ typedef struct ControlFileData int max_locks_per_xact; bool track_commit_timestamp; + /* wal_prohibited tells whether WAL insertion is prohibited. 
*/ + bool wal_prohibited; + /* * This data is used to check for hardware-architecture compatibility of * the database and the backend executable. We need not check endianness diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index f48f5fb4d99..41b8fe02b3e 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10964,6 +10964,10 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +{ oid => '4142', descr => 'alter system read only state', + proname => 'pg_alter_wal_prohibit_state', prorettype => 'void', + proargtypes => 'bool', prosrc => 'pg_alter_wal_prohibit_state' }, + { oid => '4350', descr => 'Unicode normalization', proname => 'normalize', prorettype => 'text', proargtypes => 'text text', prosrc => 'unicode_normalize_func' }, diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0dfbac46b4b..f9ff2360b35 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -956,6 +956,7 @@ typedef enum WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, WAIT_EVENT_REPLICATION_SLOT_SYNC, WAIT_EVENT_REPLICATION_SLOT_WRITE, + WAIT_EVENT_WALPROHIBIT_STATE_CHANGE, WAIT_EVENT_SLRU_FLUSH_SYNC, WAIT_EVENT_SLRU_READ, WAIT_EVENT_SLRU_SYNC, diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 0a5708b32e6..ad5e3ba5724 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -42,4 +42,6 @@ extern void CheckpointerShmemInit(void); extern bool FirstCallSinceLastCheckpoint(void); +extern void SendsSignalToCheckpointer(int signum); + #endif /* _BGWRITER_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 5cb39697f38..bae06202b4a 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,12 +48,7 @@ typedef enum typedef enum { - /* - * XXX. 
PROCSIGNAL_BARRIER_PLACEHOLDER should be replaced when the first - * real user of the ProcSignalBarrier mechanism is added. It's just here - * for now because we can't have an empty enum. - */ - PROCSIGNAL_BARRIER_PLACEHOLDER = 0 + PROCSIGNAL_BARRIER_WALPROHIBIT = 0 } ProcSignalBarrierType; /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 74af9665d00..f16efeb5d6a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2670,6 +2670,7 @@ WALAvailability WALInsertLock WALInsertLockPadded WALOpenSegment +WALProhibitStateData WALReadError WALSegmentCloseCB WALSegmentContext -- 2.22.0