From d12621c1e03abf8876d944ea3e831213111f2909 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 28 Jun 2018 14:38:24 +0900 Subject: [PATCH 1/2] Fail hard when facing corrupted two-phase state files When a corrupted two-phase state file is found by WAL replay, be it for crash recovery or archive recovery, the file is currently just skipped and a WARNING is logged to the user. Facing a corrupted on-disk two-phase state file is more likely to happen than facing corruption of its counterpart recorded in dedicated WAL records, but when it does happen the instance faces data loss: the transaction is no longer around, so it is not possible to commit it. Reported-by: Michael Paquier Author: Michael Paquier Discussion: https://postgr.es/m/20161216060832.GB17838@paquier.xyz --- src/backend/access/transam/twophase.c | 28 ++++++++++----------------- src/backend/access/transam/xlog.c | 10 +++++++--- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index a9ef1b3d73..2da3d93d87 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1873,6 +1873,10 @@ restoreTwoPhaseData(void) * write a WAL entry, and so there might be no evidence in WAL of those * subxact XIDs. * + * On corrupted two-phase files, fail immediately. Keeping around broken + * entries and letting replay continue causes harm to the system, and a new + * backup should be rolled in. + * * Our other responsibility is to determine and return the oldest valid XID * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. 
@@ -2165,13 +2169,9 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* Read and validate file */ buf = ReadTwoPhaseFile(xid, true); if (buf == NULL) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file for transaction %u", + ereport(FATAL, + (errmsg("corrupted two-phase state file for \"%u\"", xid))); - RemoveTwoPhaseFile(xid, true); - return NULL; - } } else { @@ -2184,21 +2184,13 @@ ProcessTwoPhaseBuffer(TransactionId xid, if (!TransactionIdEquals(hdr->xid, xid)) { if (fromdisk) - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state file for transaction %u", + ereport(FATAL, + (errmsg("corrupted two-phase state file for \"%u\"", xid))); - RemoveTwoPhaseFile(xid, true); - } else - { - ereport(WARNING, - (errmsg("removing corrupt two-phase state from memory for transaction %u", + ereport(FATAL, + (errmsg("corrupted two-phase state in memory for \"%u\"", xid))); - PrepareRedoRemove(xid, true); - } - pfree(buf); - return NULL; } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1a419aa49b..3695258e6f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7462,6 +7462,13 @@ StartupXLOG(void) } } + /* + * Pre-scan prepared transactions to find out the range of XIDs present. + * This information is not quite needed yet, but it is positioned here so + * that potential problems are detected before any on-disk change is made. + */ + oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + /* * Consider whether we need to assign a new timeline ID. * @@ -7585,9 +7592,6 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - /* Pre-scan prepared transactions to find out the range of XIDs present */ - oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); - /* * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE * record before resource manager writes cleanup WAL records or checkpoint -- 2.18.0