From da18ad25c2b0308ff6ac87a1a63933acda1907cc Mon Sep 17 00:00:00 2001 From: Wu Hao Date: Fri, 17 Jan 2020 18:14:44 +0530 Subject: [PATCH v1 3/3] Start WAL receiver when it is found not running Postmaster now starts WAL receiver from ServerLoop as soon as it is found not running. This helps to resume streaming replication sooner when a temporary network disruption causes the WAL receiver process to exit. As a consequence, the race condition addressed in e5d494d78cf is eliminated. Postmaster may start WAL receiver in states that allow a standby to operate, except PM_STARTUP. It is not possible to distinguish whether the postmaster is operating as master or standby in PM_STARTUP state. Postmaster attempts to start WAL receiver as long as a promote request is not received and the state permits. Co-authored-by: Asim R P --- src/backend/postmaster/postmaster.c | 43 +++++++++++++++++++++-------------- src/backend/replication/walreceiver.c | 40 ++++++++++++++++++++++++++++---- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7a92dac525..5d0f8b0ebb 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -360,8 +360,8 @@ static volatile sig_atomic_t start_autovac_launcher = false; /* the launcher needs to be signalled to communicate some condition */ static volatile bool avlauncher_needs_signal = false; -/* received START_WALRECEIVER signal */ -static volatile sig_atomic_t WalReceiverRequested = false; +/* set once a promote request is received; ServerLoop then stops starting WAL receiver */ +static volatile sig_atomic_t ReceivedPromoteRequest = false; /* set when there's a worker that needs to be started up */ static volatile bool StartWorkerNeeded = true; @@ -1795,8 +1795,11 @@ ServerLoop(void) kill(AutoVacPID, SIGUSR2); } - /* If we need to start a WAL receiver, try to do that now */ - if (WalReceiverRequested) + /* + * Start WAL receiver if it is not already running 
and standby mode + * (or archive recovery) is enabled. + */ + if (!ReceivedPromoteRequest) MaybeStartWalReceiver(); /* Get other worker processes running, if needed */ @@ -5282,8 +5285,6 @@ sigusr1_handler(SIGNAL_ARGS) if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) { /* Startup Process wants us to start the walreceiver process. */ - /* Start immediately if possible, else remember request for later. */ - WalReceiverRequested = true; MaybeStartWalReceiver(); } @@ -5309,6 +5310,11 @@ sigusr1_handler(SIGNAL_ARGS) { /* Tell startup process to finish recovery */ signal_child(StartupPID, SIGUSR2); + /* + * Do not attempt to restart wal receiver from now on. Note that this + * flag remains unchanged once set. + */ + ReceivedPromoteRequest = true; } #ifdef WIN32 @@ -5604,26 +5610,29 @@ StartAutovacuumWorker(void) * MaybeStartWalReceiver * Start the WAL receiver process, if not running and our state allows. * - * Note: if WalReceiverPID is already nonzero, it might seem that we should - * clear WalReceiverRequested. However, there's a race condition if the - * walreceiver terminates and the startup process immediately requests a new - * one: it's quite possible to get the signal for the request before reaping - * the dead walreceiver process. Better to risk launching an extra - * walreceiver than to miss launching one we need. (The walreceiver code - * has logic to recognize that it should go away if not needed.) + * Note: there is a race condition if the walreceiver terminates and the + * startup process immediately requests a new one: it's quite possible to get + * the signal for the request before reaping the dead walreceiver process. It + * is alright to risk launching an extra walreceiver because the walreceiver + * code has logic to recognize that it should go away if not needed. 
*/ static void MaybeStartWalReceiver(void) { if (WalReceiverPID == 0 && - (pmState == PM_STARTUP || pmState == PM_RECOVERY || + /* + * Cannot include PM_STARTUP here because it leads to starting WAL + * receiver even after a standby is promoted. The objective is to + * start WAL receiver only when standby mode is enabled. However, + * pmState is set to PM_RECOVERY when standby mode as well as archive + * recovery is enabled. That means, postmaster cannot distinguish + * between the two. TODO: if this is a problem, address it somehow! + */ + (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && Shutdown == NoShutdown) { WalReceiverPID = StartWalReceiver(); - if (WalReceiverPID != 0) - WalReceiverRequested = false; - /* else leave the flag set, so we'll try again later */ } } diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index c862b65cae..9d91d9ca28 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -241,21 +241,33 @@ WalReceiverMain(void) * waiting for us to start up, until it times out. */ SpinLockAcquire(&walrcv->mutex); + /* + * TODO: postmaster may start walreceiver from ServerLoop. Startup + * process requests postmaster to start walreceiver on a couple of + * occasions. The requests from startup process are handled inside + * SIGUSR1 handler. It is possible that more than one walreceiver + * process attempts to start nearly simultaneously. If this is indeed a + * possibility, one solution seems to be to not start walreceiver from the + * signal handler but only set a flag to do so. + */ Assert(walrcv->pid == 0); switch (walrcv->walRcvState) { case WALRCV_STOPPING: /* If we've already been requested to stop, don't start up. 
*/ walrcv->walRcvState = WALRCV_STOPPED; - /* fall through */ - - case WALRCV_STOPPED: SpinLockRelease(&walrcv->mutex); proc_exit(1); break; + case WALRCV_STOPPED: + /* + * Postmaster, upon noticing that WAL receiver is not running, + * starts us from ServerLoop. + */ + /* fall through */ case WALRCV_STARTING: - /* The usual case */ + /* The usual case - startup process requests WAL streaming. */ break; case WALRCV_WAITING: @@ -350,6 +362,26 @@ WalReceiverMain(void) ControlFile = ShmemInitStruct("Control File", sizeof(ControlFileData), &found); Assert(found); + XLogSegNo startseg; + XLByteToSeg(startpoint, startseg, wal_segment_size); + + LWLockAcquire(ControlFileLock, LW_SHARED); + if (startpointTLI == ControlFile->lastFlushedSegTLI && + startseg < ControlFile->lastFlushedSeg) + { + /* + * Advance startpoint to the flush point in control file. The + * startpoint may be behind like this when WAL receiver is started by + * postmaster upon noticing that an existing WAL receiver child + * process exited. Postmaster does not update WalRcv->startpoint, + * similar to how it's done in RequestXLogStreaming, because it should + * refrain from touching shared memory. + */ + XLogSegNoOffsetToRecPtr( + ControlFile->lastFlushedSeg, 0, wal_segment_size, startpoint); + } + LWLockRelease(ControlFileLock); + first_stream = true; for (;;) { -- 2.14.3 (Apple Git-98)