From 187fd4476422e95f079bba69d1e1e047bc5edb79 Mon Sep 17 00:00:00 2001 From: Osumi Takamichi Date: Mon, 16 Nov 2020 05:03:19 +0000 Subject: [PATCH v02] new wal_level to disable WAL logging In order to speed up the performance, especially for bulk data loading or pg_dumpall, this feature turns off generation of WAL except for XLOG resources. This means we gain this speed-up even at the cost of crash recovery. During the operation of this new wal_level, an unexpected stoppage or shutdown of the server makes the whole cluster corrupted and unrecoverable. Therefore, taking a full backup before and after the operation is a must. Author: Takamichi Osumi Reviewed-by: Tsunakawa, Takayuki Reviewed-by: Fujii Masao Reviewed-by: Ashutosh Bapat Discussion: https://www.postgresql.org/message-id/TYAPR01MB29901EBE5A3ACCE55BA99186FE320%40TYAPR01MB2990.jpnprd01.prod.outlook.com --- doc/src/sgml/config.sgml | 20 ++++++++++++++++-- doc/src/sgml/perform.sgml | 13 +++++++++--- src/backend/access/rmgrdesc/xlogdesc.c | 1 + src/backend/access/transam/varsup.c | 2 +- src/backend/access/transam/xlog.c | 28 ++++++++++++++++++++++++-- src/backend/access/transam/xloginsert.c | 7 +++++++ src/backend/postmaster/postmaster.c | 6 +++--- src/backend/replication/logical/logicalfuncs.c | 4 ++++ src/backend/replication/logical/origin.c | 4 ++++ src/backend/tcop/utility.c | 3 +++ src/backend/utils/misc/postgresql.conf.sample | 6 ++++-- src/bin/pg_controldata/pg_controldata.c | 2 ++ src/include/access/xlog.h | 4 +++- src/include/access/xlogdefs.h | 2 +- src/include/utils/rel.h | 3 ++- 15 files changed, 89 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index f043433..3111a4d 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2591,7 +2591,15 @@ include_dir 'conf.d' data to support WAL archiving and replication, including running read-only queries on a standby server. 
minimal removes all logging except the information required to recover from a crash or - immediate shutdown. Finally, + immediate shutdown. none generates no WAL + except for records related to transaction resources, such as the indication of a checkpoint + or the end of recovery. This means that the amount of WAL during none + operation could be much less than that of minimal for ordinary operations. + Intrinsically, the purpose of none + is to accelerate data bulk loading at the expense of recovery. + Accordingly, note that a crash during none operation leaves + the whole cluster corrupted and unrecoverable. Therefore, never use this mode + unless the operation during the mode is repeatable and the cluster is backed up. Finally, logical adds information necessary to support logical decoding. Each level includes the information logged at all lower levels. This parameter can only be set at server start. @@ -2615,6 +2623,13 @@ include_dir 'conf.d' data from a base backup and the WAL logs, so replica or higher must be used to enable WAL archiving () and streaming replication. + In the same way, none does not create WAL logs in principle. + Therefore, this wal_level can be used to maximize the speed of data loading. + Examples are bulk data loading and version upgrades using pg_dumpall. + On the other hand, an unexpected crash of the server leaves the database cluster + inconsistent and unable to restart. For that reason, before utilizing this level, + take a full backup of the cluster and keep a record of all the operations + that are performed while wal_level is none. In logical level, the same information is logged as @@ -3372,7 +3387,8 @@ include_dir 'conf.d' changed without leaving archiving mode. This parameter can only be set at server start. archive_mode cannot be enabled when - wal_level is set to minimal. + wal_level is set to none or + minimal. 
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 117a1f7..07d47d4 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1741,10 +1741,17 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; new base backup after the load has completed than to process a large amount of incremental WAL data. To prevent incremental WAL logging while loading, disable archiving and streaming replication, by setting - to minimal, + to either none + or minimal, to off, and to zero. - But note that changing these settings requires a server restart. + Setting wal_level to none + is an extremely performance-oriented option. Therefore, keep + in mind that a crash during the data loading causes + corruption of the whole cluster. When that happens, + the server cannot be restarted and the administrator + needs to set up the cluster again from the full backup. + Also, note that changing these settings requires a server restart. @@ -1810,7 +1817,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; If using WAL archiving or streaming replication, consider disabling them during the restore. To do that, set archive_mode to off, - wal_level to minimal, and + wal_level to minimal or none, and max_wal_senders to zero before loading the dump. Afterwards, set them back to the right values and take a fresh base backup. 
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 3200f77..8293b3b 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -25,6 +25,7 @@ * GUC support */ const struct config_enum_entry wal_level_options[] = { + {"none", WAL_LEVEL_NONE, false}, {"minimal", WAL_LEVEL_MINIMAL, false}, {"replica", WAL_LEVEL_REPLICA, false}, {"archive", WAL_LEVEL_REPLICA, true}, /* deprecated */ diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index a4944fa..712943a 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -368,7 +368,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) * within 3M transactions of data loss. This leaves lots of room for the * DBA to fool around fixing things in a standalone backend, while not * being significant compared to total XID space. (VACUUM requires an XID - * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * if it truncates at wal_level>minimal. "VACUUM (ANALYZE)", which a DBA * might do by reflex, assigns an XID. Hence, we had better be sure * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two * completely-idle segments. In the event of edge-case bugs involving diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index aa63f37..559cfe1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4937,6 +4937,21 @@ DataChecksumsEnabled(void) } /* + * Returns the checkpoint location recorded in the control file. + */ +XLogRecPtr +GetLatestCheckPointLSN(void) +{ + XLogRecPtr latestLSN; + + LWLockAcquire(ControlFileLock, LW_SHARED); + latestLSN = ControlFile->checkPoint; + LWLockRelease(ControlFileLock); + + return latestLSN; +} + +/* * Returns a fake LSN for unlogged relations. 
* * Each call generates an LSN that is greater than any previous value @@ -6254,10 +6269,10 @@ CheckRequiredParameterValues(void) * For archive recovery, the WAL must be generated with at least 'replica' * wal_level. */ - if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL) + if (ArchiveRecoveryRequested && ControlFile->wal_level <= WAL_LEVEL_MINIMAL) { ereport(WARNING, - (errmsg("WAL was generated with wal_level=minimal, data may be missing"), + (errmsg("WAL was generated with wal_level<=minimal, data may be missing"), errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup."))); } @@ -6328,6 +6343,15 @@ StartupXLOG(void) CurrentResourceOwner = AuxProcessResourceOwner; /* + * Detect if the server previously crashed under wal_level='none' or not. + */ + if (ControlFile->wal_level == WAL_LEVEL_NONE && + (ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)) + ereport(ERROR, + (errmsg("detected an unexpected server shutdown when WAL logging was disabled"), + errhint("It looks like you need to deploy a new cluster from your full backup again."))); + + /* * Check that contents look valid. 
*/ if (!XRecOffIsValid(ControlFile->checkPoint)) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 1f0e4e0..8d6b875 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -449,6 +449,13 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + /* At wal_level=none, skip every record except those of the XLOG resource manager */ + if (wal_level == WAL_LEVEL_NONE && rmid != RM_XLOG_ID) + { + XLogResetInsertion(); + return GetLatestCheckPointLSN(); + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b7799ed..27e4475 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -903,10 +903,10 @@ PostmasterMain(int argc, char *argv[]) ReservedBackends, MaxConnections); ExitPostmaster(1); } - if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL) + if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level <= WAL_LEVEL_MINIMAL) ereport(ERROR, - (errmsg("WAL archival cannot be enabled when wal_level is \"minimal\""))); - if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL) + (errmsg("WAL archival cannot be enabled when wal_level is \"none\" or \"minimal\""))); + if (max_wal_senders > 0 && wal_level <= WAL_LEVEL_MINIMAL) ereport(ERROR, (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\""))); diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index b99c94e..a3151fa 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -404,6 +404,10 @@ pg_logical_emit_message_bytea(PG_FUNCTION_ARGS) bytea *data = PG_GETARG_BYTEA_PP(2); XLogRecPtr lsn; + if (wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errmsg("propagating a message requires wal_level \"replica\" or \"logical\"")); + lsn = LogLogicalMessage(prefix, VARDATA_ANY(data), 
VARSIZE_ANY_EXHDR(data), transactional); PG_RETURN_LSN(lsn); diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 1b22031..5e1f705 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -192,6 +192,10 @@ replorigin_check_prerequisites(bool check_slots, bool recoveryOK) (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), errmsg("cannot manipulate replication origins during recovery"))); + if (wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errmsg("creating replication origins requires wal_level \"replica\" or \"logical\"")); + } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f398027..0f7597b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -625,6 +625,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case TRANS_STMT_PREPARE: + if (wal_level == WAL_LEVEL_NONE) + ereport(ERROR, + errmsg("cannot execute PREPARE TRANSACTION when WAL logging is disabled")); if (!PrepareTransactionBlock(stmt->gid)) { /* report unsuccessful commit in qc */ diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9cb571f..9bf89e5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -196,8 +196,10 @@ # - Settings - -#wal_level = replica # minimal, replica, or logical - # (change requires restart) +#wal_level = replica # none, minimal, replica, or logical + # (change requires restart. 
+ # choosing the none wal_level + # can cause unrecoverable data corruption) #fsync = on # flush data to disk for crash safety # (turning this off can cause # unrecoverable data corruption) diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 3e00ac0..90ec0dc 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -74,6 +74,8 @@ wal_level_str(WalLevel wal_level) { switch (wal_level) { + case WAL_LEVEL_NONE: + return "none"; case WAL_LEVEL_MINIMAL: return "minimal"; case WAL_LEVEL_REPLICA: diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 221af87..b8cffec 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -161,7 +161,8 @@ extern int XLogArchiveMode; /* WAL levels */ typedef enum WalLevel { - WAL_LEVEL_MINIMAL = 0, + WAL_LEVEL_NONE = 0, + WAL_LEVEL_MINIMAL, WAL_LEVEL_REPLICA, WAL_LEVEL_LOGICAL } WalLevel; @@ -319,6 +320,7 @@ extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); extern bool DataChecksumsEnabled(void); +extern XLogRecPtr GetLatestCheckPointLSN(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index e1f5812..197ad06 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -59,7 +59,7 @@ typedef uint16 RepOriginId; /* * Because O_DIRECT bypasses the kernel buffers, and because we never - * read those buffers except during crash recovery or if wal_level != minimal, + * read those buffers except during crash recovery or if wal_level > minimal, * it is a win to use it in all cases where we sync on each write(). We could * allow O_DIRECT with fsync(), but it is unclear if fsync() could process * writes not buffered in the kernel. 
Also, O_DIRECT is never enough to force diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index c5ffea4..a9f6af1 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -561,10 +561,11 @@ typedef struct ViewOptions * RelFileNode" in src/backend/access/transam/README. */ #define RelationNeedsWAL(relation) \ + (wal_level != WAL_LEVEL_NONE && \ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \ (XLogIsNeeded() || \ (relation->rd_createSubid == InvalidSubTransactionId && \ - relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId))) + relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))) /* * RelationUsesLocalBuffers -- 1.8.3.1