From 2075441bebc47d3dd5b6e0a76e16f5ebb12858af Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais Date: Fri, 10 Apr 2020 18:01:45 +0200 Subject: [PATCH] Demote PoC --- src/backend/access/transam/xlog.c | 3 +- src/backend/postmaster/postmaster.c | 206 ++++++++++++++++++------ src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_ctl/pg_ctl.c | 105 ++++++++++++ src/include/catalog/pg_control.h | 1 + src/include/libpq/libpq-be.h | 7 +- src/include/utils/pidfile.h | 1 + 7 files changed, 271 insertions(+), 54 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 55cac186dc..8a7f1a0855 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8493,6 +8493,7 @@ ShutdownXLOG(int code, Datum arg) CurrentResourceOwner = AuxProcessResourceOwner; /* Don't be chatty in standalone mode */ + // FIXME: what message when demoting? ereport(IsPostmasterEnvironment ? LOG : NOTICE, (errmsg("shutting down"))); @@ -8760,7 +8761,7 @@ CreateCheckPoint(int flags) if (shutdown) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->state = DB_SHUTDOWNING; + ControlFile->state = DB_SHUTDOWNING; // DEMOTING? 
ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); LWLockRelease(ControlFileLock); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b4d475bb0b..465d020f9d 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -150,6 +150,9 @@ #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER) +/* file to signal demotion from primary to standby */ +#define DEMOTE_SIGNAL_FILE "demote" + /* * List of active backends (or child processes anyway; we don't actually * know whether a given child has become a backend or is still in the @@ -269,18 +272,23 @@ typedef enum static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; /* Startup/shutdown state */ -#define NoShutdown 0 -#define SmartShutdown 1 -#define FastShutdown 2 -#define ImmediateShutdown 3 - -static int Shutdown = NoShutdown; +typedef enum StepDownState { + NoShutdown = 0, /* find better label? */ + SmartShutdown, + SmartDemote, + FastShutdown, + FastDemote, + ImmediateShutdown +} StepDownState; + +static StepDownState StepDown = NoShutdown; +static bool DemoteSignal = false; /* true on demote request */ static bool FatalError = false; /* T if recovering from backend crash */ /* - * We use a simple state machine to control startup, shutdown, and - * crash recovery (which is rather like shutdown followed by startup). + * We use a simple state machine to control startup, shutdown, demote and + * crash recovery (both are rather like shutdown followed by startup). * * After doing all the postmaster initialization work, we enter PM_STARTUP * state and the startup process is launched. The startup process begins by @@ -314,7 +322,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * will not be very long). 
* * Notice that this state variable does not distinguish *why* we entered - * states later than PM_RUN --- Shutdown and FatalError must be consulted + * states later than PM_RUN --- StepDown and FatalError must be consulted * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN * states, nor in PM_SHUTDOWN states (because we don't enter those states * when trying to recover from a crash). It can be true in PM_STARTUP state, @@ -414,6 +422,8 @@ static bool RandomCancelKey(int32 *cancel_key); static void signal_child(pid_t pid, int signal); static bool SignalSomeChildren(int signal, int targets); static void TerminateChildren(int signal); +static bool CheckDemoteSignal(void); + #define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL) @@ -1550,7 +1560,7 @@ DetermineSleepTime(struct timeval *timeout) * Normal case: either there are no background workers at all, or we're in * a shutdown sequence (during which we ignore bgworkers altogether). */ - if (Shutdown > NoShutdown || + if (StepDown > NoShutdown || (!StartWorkerNeeded && !HaveCrashedWorker)) { if (AbortStartTime != 0) @@ -1830,7 +1840,7 @@ ServerLoop(void) * * Note we also do this during recovery from a process crash. 
*/ - if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) && + if ((StepDown >= ImmediateShutdown || (FatalError && !SendStop)) && AbortStartTime != 0 && (now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS) { @@ -2305,6 +2315,11 @@ retry1: (errcode(ERRCODE_CANNOT_CONNECT_NOW), errmsg("the database system is starting up"))); break; + case CAC_DEMOTE: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is demoting"))); + break; case CAC_SHUTDOWN: ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), @@ -2436,7 +2451,7 @@ canAcceptConnections(int backend_type) CAC_state result = CAC_OK; /* - * Can't start backends when in startup/shutdown/inconsistent recovery + * Can't start backends when in startup/demote/shutdown/inconsistent recovery * state. We treat autovac workers the same as user backends for this * purpose. However, bgworkers are excluded from this test; we expect * bgworker_should_start_now() decided whether the DB state allows them. 
@@ -2452,7 +2467,9 @@ canAcceptConnections(int backend_type) { if (pmState == PM_WAIT_BACKUP) result = CAC_WAITBACKUP; /* allow superusers only */ - else if (Shutdown > NoShutdown) + else if (StepDown == SmartDemote || StepDown == FastDemote) + return CAC_DEMOTE; /* demote is pending */ + else if (StepDown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ else if (!FatalError && (pmState == PM_STARTUP || @@ -2683,7 +2700,8 @@ SIGHUP_handler(SIGNAL_ARGS) PG_SETMASK(&BlockSig); #endif - if (Shutdown <= SmartShutdown) + if (StepDown == NoShutdown || StepDown == SmartShutdown || + StepDown == SmartDemote) { ereport(LOG, (errmsg("received SIGHUP, reloading configuration files"))); @@ -2769,26 +2787,72 @@ pmdie(SIGNAL_ARGS) (errmsg_internal("postmaster received signal %d", postgres_signal_arg))); + if (CheckDemoteSignal()) + { + if (pmState != PM_RUN) + { + DemoteSignal = false; + unlink(DEMOTE_SIGNAL_FILE); + ereport(LOG, + (errmsg("ignoring demote signal because already in standby mode"))); + } + else if (postgres_signal_arg == SIGQUIT) { + DemoteSignal = false; + ereport(WARNING, + (errmsg("can not demote in immediate stop mode"))); + // FIXME: should we abort the shutdown process? + } + else + { + FILE *standby_file; + + DemoteSignal = true; + + /* create the standby signal file */ + standby_file = AllocateFile(STANDBY_SIGNAL_FILE, "w"); + if (!standby_file) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + STANDBY_SIGNAL_FILE))); + + if (FreeFile(standby_file)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + STANDBY_SIGNAL_FILE))); + } + + unlink(DEMOTE_SIGNAL_FILE); + } + switch (postgres_signal_arg) { case SIGTERM: /* - * Smart Shutdown: + * Smart Stepdown: * - * Wait for children to end their work, then shut down. + * Wait for children to end their work, then shut down or demote. 
*/ - if (Shutdown >= SmartShutdown) + if (StepDown >= SmartShutdown) break; - Shutdown = SmartShutdown; - ereport(LOG, - (errmsg("received smart shutdown request"))); - /* Report status */ - AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); + if (DemoteSignal) { + StepDown = SmartDemote; + ereport(LOG, (errmsg("received smart demote request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_DEMOTING); + } + else { + StepDown = SmartShutdown; + ereport(LOG, (errmsg("received smart shutdown request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD - sd_notify(0, "STOPPING=1"); + sd_notify(0, "STOPPING=1"); #endif + } if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) @@ -2831,22 +2895,29 @@ pmdie(SIGNAL_ARGS) case SIGINT: /* - * Fast Shutdown: + * Fast StepDown: * * Abort all children with SIGTERM (rollback active transactions - * and exit) and shut down when they are gone. + * and exit) and shut down or demote when they are gone. 
*/ - if (Shutdown >= FastShutdown) + if (StepDown >= FastShutdown) break; - Shutdown = FastShutdown; - ereport(LOG, - (errmsg("received fast shutdown request"))); - /* Report status */ - AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); + if (DemoteSignal) { + StepDown = FastDemote; + ereport(LOG, (errmsg("received fast demote request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_DEMOTING); + } + else { + StepDown = FastShutdown; + ereport(LOG, (errmsg("received fast shutdown request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD - sd_notify(0, "STOPPING=1"); + sd_notify(0, "STOPPING=1"); #endif + } if (StartupPID != 0) signal_child(StartupPID, SIGTERM); @@ -2903,9 +2974,9 @@ pmdie(SIGNAL_ARGS) * terminate remaining ones with SIGKILL, then exit without * attempt to properly shut down the data base system. */ - if (Shutdown >= ImmediateShutdown) + if (StepDown >= ImmediateShutdown) break; - Shutdown = ImmediateShutdown; + StepDown = ImmediateShutdown; ereport(LOG, (errmsg("received immediate shutdown request"))); @@ -2967,10 +3038,11 @@ reaper(SIGNAL_ARGS) StartupPID = 0; /* - * Startup process exited in response to a shutdown request (or it - * completed normally regardless of the shutdown request). + * Startup process exited in response to a shutdown or demote + * request (or it completed normally regardless of the shutdown + * request). 
*/ - if (Shutdown > NoShutdown && + if (StepDown > NoShutdown && (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) { StartupStatus = STARTUP_NOT_RUNNING; @@ -2984,7 +3056,7 @@ reaper(SIGNAL_ARGS) ereport(LOG, (errmsg("shutdown at recovery target"))); StartupStatus = STARTUP_NOT_RUNNING; - Shutdown = SmartShutdown; + StepDown = SmartShutdown; TerminateChildren(SIGTERM); pmState = PM_WAIT_BACKENDS; /* PostmasterStateMachine logic does the rest */ @@ -3124,7 +3196,7 @@ reaper(SIGNAL_ARGS) * archive cycle and quit. Likewise, if we have walsender * processes, tell them to send any remaining WAL and quit. */ - Assert(Shutdown > NoShutdown); + Assert(StepDown > NoShutdown); /* Waken archiver for the last time */ if (PgArchPID != 0) @@ -3484,7 +3556,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * signaled children, nonzero exit status is to be expected, so don't * clutter log. */ - take_action = !FatalError && Shutdown != ImmediateShutdown; + take_action = !FatalError && StepDown != ImmediateShutdown; if (take_action) { @@ -3702,7 +3774,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) /* We do NOT restart the syslogger */ - if (Shutdown != ImmediateShutdown) + if (StepDown != ImmediateShutdown) FatalError = true; /* We now transit into a state of waiting for children to die */ @@ -3845,11 +3917,11 @@ PostmasterStateMachine(void) WalReceiverPID == 0 && BgWriterPID == 0 && (CheckpointerPID == 0 || - (!FatalError && Shutdown < ImmediateShutdown)) && + (!FatalError && StepDown < ImmediateShutdown)) && WalWriterPID == 0 && AutoVacPID == 0) { - if (Shutdown >= ImmediateShutdown || FatalError) + if (StepDown >= ImmediateShutdown || FatalError) { /* * Start waiting for dead_end children to die. This state @@ -3870,7 +3942,7 @@ PostmasterStateMachine(void) * the regular children are gone, and it's time to tell the * checkpointer to do a shutdown checkpoint. 
*/ - Assert(Shutdown > NoShutdown); + Assert(StepDown > NoShutdown); /* Start the checkpointer if not running */ if (CheckpointerPID == 0) CheckpointerPID = StartCheckpointer(); @@ -3958,7 +4030,8 @@ PostmasterStateMachine(void) * EOF on its input pipe, which happens when there are no more upstream * processes. */ - if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) + if (pmState == PM_NO_CHILDREN && (StepDown == SmartShutdown || + StepDown == FastShutdown || StepDown == ImmediateShutdown)) { if (FatalError) { @@ -3991,15 +4064,29 @@ PostmasterStateMachine(void) * startup process fails, because more than likely it will just fail again * and we will keep trying forever. */ - if (pmState == PM_NO_CHILDREN && + if (pmState == PM_NO_CHILDREN && !DemoteSignal && (StartupStatus == STARTUP_CRASHED || !restart_after_crash)) ExitPostmaster(1); + /* Handle demote signal */ + if (DemoteSignal && pmState == PM_NO_CHILDREN) + { + ereport(LOG, (errmsg("all server processes terminated; demoting"))); + + // Signal bgworkers? + + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + pmState = PM_STARTUP; + StepDown = NoShutdown; + } + /* * If we need to recover from a crash, wait for all non-syslogger children * to exit, then reset shmem and StartupDataBase. */ - if (FatalError && pmState == PM_NO_CHILDREN) + else if (FatalError && pmState == PM_NO_CHILDREN) { ereport(LOG, (errmsg("all server processes terminated; reinitializing"))); @@ -5195,7 +5282,7 @@ sigusr1_handler(SIGNAL_ARGS) * first. We don't want to go back to recovery in that case. */ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) && - pmState == PM_STARTUP && Shutdown == NoShutdown) + pmState == PM_STARTUP && StepDown == NoShutdown) { /* WAL redo has started. We're out of reinitialization. 
*/ FatalError = false; @@ -5234,7 +5321,7 @@ sigusr1_handler(SIGNAL_ARGS) pmState = PM_RECOVERY; } if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && - pmState == PM_RECOVERY && Shutdown == NoShutdown) + pmState == PM_RECOVERY && StepDown == NoShutdown) { /* * Likewise, start other special children as needed. @@ -5284,7 +5371,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { /* * Start one iteration of the autovacuum daemon, even if autovacuuming @@ -5299,7 +5386,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { /* The autovacuum launcher wants us to start a worker process. */ StartAutovacuumWorker(); @@ -5644,7 +5731,7 @@ MaybeStartWalReceiver(void) if (WalReceiverPID == 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { WalReceiverPID = StartWalReceiver(); if (WalReceiverPID != 0) @@ -6647,3 +6734,18 @@ InitPostmasterDeathWatchHandle(void) GetLastError()))); #endif /* WIN32 */ } + +/* + * Check if a demote request appeared. Should be called by postmaster before + * shutting down. 
+ */ +bool +CheckDemoteSignal(void) +{ + struct stat stat_buf; + + if (stat(DEMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index e73639df74..c144cc35d3 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -57,6 +57,8 @@ dbState(DBState state) return _("shut down"); case DB_SHUTDOWNED_IN_RECOVERY: return _("shut down in recovery"); + case DB_DEMOTING: + return _("demoting"); case DB_SHUTDOWNING: return _("shutting down"); case DB_IN_CRASH_RECOVERY: diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 3c03ace7ed..0bb7d69682 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -62,6 +62,7 @@ typedef enum RESTART_COMMAND, RELOAD_COMMAND, STATUS_COMMAND, + DEMOTE_COMMAND, PROMOTE_COMMAND, LOGROTATE_COMMAND, KILL_COMMAND, @@ -103,6 +104,7 @@ static char version_file[MAXPGPATH]; static char pid_file[MAXPGPATH]; static char backup_file[MAXPGPATH]; static char promote_file[MAXPGPATH]; +static char demote_file[MAXPGPATH]; static char logrotate_file[MAXPGPATH]; static volatile pgpid_t postmasterPID = -1; @@ -129,6 +131,7 @@ static void do_stop(void); static void do_restart(void); static void do_reload(void); static void do_status(void); +static void do_demote(void); static void do_promote(void); static void do_logrotate(void); static void do_kill(pgpid_t pid); @@ -1029,6 +1032,103 @@ do_stop(void) } +static void +do_demote(void) +{ + int cnt; + FILE *dmtfile; + pgpid_t pid; + struct stat statbuf; + + pid = get_pgpid(false); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not postmaster */ + { + pid = -pid; + write_stderr(_("%s: cannot demote server; " + "single-user server is running (PID: %ld)\n"), + 
progname, pid); + exit(1); + } + + snprintf(demote_file, MAXPGPATH, "%s/demote", pg_data); + + if ((dmtfile = fopen(demote_file, "w")) == NULL) + { + write_stderr(_("%s: could not create demote signal file \"%s\": %s\n"), + progname, demote_file, strerror(errno)); + exit(1); + } + if (fclose(dmtfile)) + { + write_stderr(_("%s: could not write demote signal file \"%s\": %s\n"), + progname, demote_file, strerror(errno)); + exit(1); + } + + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + if (!do_wait) + { + print_msg(_("server demoting\n")); + return; + } + else + { + /* + * If backup_label exists, an online backup is running. Warn the user + * that smart demote will wait for it to finish. However, if the + * server is in archive recovery, we're recovering from an online + * backup instead of performing one. + */ + if (shutdown_mode == SMART_MODE && + stat(backup_file, &statbuf) == 0 && + get_control_dbstate() != DB_IN_ARCHIVE_RECOVERY) + { + print_msg(_("WARNING: online backup mode is active\n" + "Demote will not complete until pg_stop_backup() is called.\n\n")); + } + + print_msg(_("waiting for server to demote...")); + + for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++) + { + if (get_control_dbstate() == DB_IN_ARCHIVE_RECOVERY) + break; + + if (cnt % WAITS_PER_SEC == 0) + print_msg("."); + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); + } + + if (get_control_dbstate() != DB_IN_ARCHIVE_RECOVERY) + { + print_msg(_(" failed\n")); + + write_stderr(_("%s: server does not demote\n"), progname); + if (shutdown_mode == SMART_MODE) + write_stderr(_("HINT: The \"-m fast\" option immediately disconnects sessions rather than\n" + "waiting for session-initiated disconnection.\n")); + exit(1); + } + print_msg(_(" done\n")); + + print_msg(_("server demoted\n")); + } +} + + /* * restart/reload routines */ @@ -2452,6 +2552,8 @@ main(int argc, char **argv) ctl_command = 
RELOAD_COMMAND; else if (strcmp(argv[optind], "status") == 0) ctl_command = STATUS_COMMAND; + else if (strcmp(argv[optind], "demote") == 0) + ctl_command = DEMOTE_COMMAND; else if (strcmp(argv[optind], "promote") == 0) ctl_command = PROMOTE_COMMAND; else if (strcmp(argv[optind], "logrotate") == 0) @@ -2559,6 +2661,9 @@ main(int argc, char **argv) case RELOAD_COMMAND: do_reload(); break; + case DEMOTE_COMMAND: + do_demote(); + break; case PROMOTE_COMMAND: do_promote(); break; diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index de5670e538..f529f8c7bd 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -87,6 +87,7 @@ typedef enum DBState DB_STARTUP = 0, DB_SHUTDOWNED, DB_SHUTDOWNED_IN_RECOVERY, + DB_DEMOTING, DB_SHUTDOWNING, DB_IN_CRASH_RECOVERY, DB_IN_ARCHIVE_RECOVERY, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 179ebaa104..a9e27f009e 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -70,7 +70,12 @@ typedef struct typedef enum CAC_state { - CAC_OK, CAC_STARTUP, CAC_SHUTDOWN, CAC_RECOVERY, CAC_TOOMANY, + CAC_OK, + CAC_STARTUP, + CAC_DEMOTE, + CAC_SHUTDOWN, + CAC_RECOVERY, + CAC_TOOMANY, CAC_WAITBACKUP } CAC_state; diff --git a/src/include/utils/pidfile.h b/src/include/utils/pidfile.h index 63fefe5c4c..f761d2c4ef 100644 --- a/src/include/utils/pidfile.h +++ b/src/include/utils/pidfile.h @@ -50,6 +50,7 @@ */ #define PM_STATUS_STARTING "starting" /* still starting up */ #define PM_STATUS_STOPPING "stopping" /* in shutdown sequence */ +#define PM_STATUS_DEMOTING "demoting" /* demote sequence */ #define PM_STATUS_READY "ready " /* ready for connections */ #define PM_STATUS_STANDBY "standby " /* up, won't accept connections */ -- 2.20.1