From 8c76950d665cabd095b5eed34ad25854eb2dd5a0 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi Date: Thu, 16 Nov 2017 17:05:21 +0900 Subject: [PATCH 3/4] Add vacuum execution status in pg_stat_all_tables The main objective of this patch is showing how vacuuming is failing. It is sometimes very hard to diagnose since autovacuum stops silently in most cases. This patch leaves the reason for vacuum failure in pg_stat_all_tables and how many times it is continuing to fail. --- doc/src/sgml/monitoring.sgml | 10 +++++ src/backend/catalog/system_views.sql | 2 + src/backend/commands/vacuum.c | 33 ++++++++++++++ src/backend/commands/vacuumlazy.c | 72 ++++++++++++++++++++++++++++++- src/backend/postmaster/pgstat.c | 62 +++++++++++++++++++++------ src/backend/utils/adt/pgstatfuncs.c | 83 ++++++++++++++++++++++++++++++++++++ src/include/catalog/pg_proc.h | 4 ++ src/include/commands/vacuum.h | 1 + src/include/pgstat.h | 25 ++++++++++- src/test/regress/expected/rules.out | 6 +++ 10 files changed, 283 insertions(+), 15 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index e8a8f77..e2bf2d2 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2586,6 +2586,16 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i Number of splitted index scans performed during the last vacuum on this table + last_vacuum_status + text + The status of last autovacuum. + + + autovacuum_fail_count + integer + The number of continuously failed vacuum trials. Cleared to zero if completed. + + autovacuum_count bigint Number of times this table has been vacuumed by the autovacuum diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 97bafb8..cd0ea69 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -530,6 +530,8 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_last_autoanalyze_time(C.oid) as last_autoanalyze, pg_stat_get_vacuum_count(C.oid) AS vacuum_count, pg_stat_get_last_vacuum_index_scans(C.oid) AS last_vacuum_index_scans, + pg_stat_get_last_vacuum_status(C.oid) AS last_vacuum_status, + pg_stat_get_autovacuum_fail_count(C.oid) AS autovacuum_fail_count, pg_stat_get_autovacuum_count(C.oid) AS autovacuum_count, pg_stat_get_analyze_count(C.oid) AS analyze_count, pg_stat_get_autoanalyze_count(C.oid) AS autoanalyze_count diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f51dcdb..ac7c2ac 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -35,6 +35,7 @@ #include "catalog/pg_inherits_fn.h" #include "catalog/pg_namespace.h" #include "commands/cluster.h" +#include "commands/progress.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -367,6 +368,9 @@ vacuum(int options, List *relations, VacuumParams *params, } PG_CATCH(); { + /* report the final status of this vacuum */ + lazy_vacuum_cancel_handler(); + in_vacuum = false; VacuumCostActive = false; PG_RE_THROW(); @@ -1463,6 +1467,8 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) if (!onerel) { + pgstat_report_vacuum(relid, false, 0, 0, 0, + PGSTAT_VACUUM_SKIP_LOCK_FAILED, 0, 0); PopActiveSnapshot(); CommitTransactionCommand(); return false; @@ -1494,6 +1500,11 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", RelationGetRelationName(onerel)))); relation_close(onerel, lmode); + + pgstat_report_vacuum(RelationGetRelid(onerel), + onerel->rd_rel->relisshared, + 0, 0, 0, PGSTAT_VACUUM_SKIP_NONTARGET, 0, 0); + PopActiveSnapshot(); CommitTransactionCommand(); return false; @@ -1511,6 +1522,12 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables", RelationGetRelationName(onerel)))); relation_close(onerel, lmode); + + pgstat_report_vacuum(RelationGetRelid(onerel), + onerel->rd_rel->relisshared, + 0, 0, 0, + PGSTAT_VACUUM_SKIP_NONTARGET, 0, 0); + PopActiveSnapshot(); CommitTransactionCommand(); return false; @@ -1526,6 +1543,12 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) if (RELATION_IS_OTHER_TEMP(onerel)) { relation_close(onerel, lmode); + + pgstat_report_vacuum(RelationGetRelid(onerel), + onerel->rd_rel->relisshared, + 0, 0, 0, + PGSTAT_VACUUM_SKIP_NONTARGET, 0, 0); + PopActiveSnapshot(); CommitTransactionCommand(); return false; @@ -1539,6 +1562,12 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { relation_close(onerel, lmode); + + pgstat_report_vacuum(RelationGetRelid(onerel), + onerel->rd_rel->relisshared, + 0, 0, 0, + PGSTAT_VACUUM_SKIP_NONTARGET, 0, 0); + PopActiveSnapshot(); CommitTransactionCommand(); /* It's OK to proceed with ANALYZE on this table */ @@ -1584,6 +1613,8 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) */ if (options & VACOPT_FULL) { + bool isshared = onerel->rd_rel->relisshared; + /* close relation before vacuuming, but hold lock until commit */ relation_close(onerel, NoLock); onerel = NULL; @@ -1591,6 +1622,8 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ cluster_rel(relid, InvalidOid, false, (options & VACOPT_VERBOSE) != 0); + pgstat_report_vacuum(relid, isshared, 0, 0, 0, + PGSTAT_VACUUM_FULL_FINISHED, 0, 0); } else lazy_vacuum_rel(onerel, options, params, vac_strategy); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4274043..af38962 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -105,6 +105,8 @@ typedef struct LVRelStats { + Oid reloid; /* oid of the target relation */ + bool shared; /* is shared relation? */ /* hasindex = true means two-pass strategy; false means one-pass */ bool hasindex; /* Overall statistics about rel */ @@ -138,6 +140,7 @@ static int elevel = -1; static TransactionId OldestXmin; static TransactionId FreezeLimit; static MultiXactId MultiXactCutoff; +static LVRelStats *current_lvstats; static BufferAccessStrategy vac_strategy; @@ -216,6 +219,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, else elevel = DEBUG2; + current_lvstats = NULL; pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, RelationGetRelid(onerel)); @@ -236,12 +240,20 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); + vacrelstats->reloid = RelationGetRelid(onerel); + vacrelstats->shared = onerel->rd_rel->relisshared; vacrelstats->old_rel_pages = onerel->rd_rel->relpages; vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; vacrelstats->pages_removed = 0; vacrelstats->lock_waiter_detected = false; + /* + * Register current vacrelstats so that final status can be reported on + * interrupts + */ + current_lvstats = vacrelstats; + /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); @@ -331,11 +343,19 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, if (new_live_tuples < 0) new_live_tuples = 0; /* just in case */ - pgstat_report_vacuum(RelationGetRelid(onerel), + /* vacuum successfully finished. nothing to do on exit */ + current_lvstats = NULL; + + pgstat_report_vacuum(vacrelstats->reloid, onerel->rd_rel->relisshared, new_live_tuples, vacrelstats->new_dead_tuples, - vacrelstats->num_index_scans); + vacrelstats->num_index_scans, + aggressive ? + PGSTAT_VACUUM_AGGRESSIVE_FINISHED : + PGSTAT_VACUUM_FINISHED, + 0, 0); + pgstat_progress_end_command(); /* and log the action if appropriate */ @@ -2198,3 +2218,51 @@ heap_page_is_all_visible(Relation rel, Buffer buf, return all_visible; } + +/* + * lazy_vacuum_cancel_handler - report interrupted vacuum status + */ +void +lazy_vacuum_cancel_handler(void) +{ + LVRelStats *stats = current_lvstats; + LocalPgBackendStatus *local_beentry; + PgBackendStatus *beentry; + int phase; + int err; + + current_lvstats = NULL; + + /* we have nothing to report */ + if (!stats) + return; + + /* get vacuum progress stored in backend status */ + local_beentry = pgstat_fetch_stat_local_beentry(MyBackendId); + if (!local_beentry) + return; + + beentry = &local_beentry->backendStatus; + + Assert (beentry && beentry->st_progress_command == PROGRESS_COMMAND_VACUUM); + + phase = beentry->st_progress_param[PROGRESS_VACUUM_PHASE]; + + /* we can reach here both on interrupt and error */ + if (geterrcode() == ERRCODE_QUERY_CANCELED) + err = PGSTAT_VACUUM_CANCELED; + else + err = PGSTAT_VACUUM_ERROR; + + /* + * vacuum has been canceled, report stats numbers without normalization + * here. (But currently they are not used.) + */ + pgstat_report_vacuum(stats->reloid, + stats->shared, + stats->new_rel_tuples, + stats->new_dead_tuples, + stats->num_index_scans, + err, + phase, geterrcode()); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5f3fdf6..540c580 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1404,7 +1404,9 @@ pgstat_report_autovac(Oid dboid) void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples, - PgStat_Counter num_index_scans) + PgStat_Counter num_index_scans, + PgStat_Counter status, PgStat_Counter last_phase, + PgStat_Counter errcode) { PgStat_MsgVacuum msg; @@ -1419,6 +1421,9 @@ pgstat_report_vacuum(Oid tableoid, bool shared, msg.m_live_tuples = livetuples; msg.m_dead_tuples = deadtuples; msg.m_num_index_scans = num_index_scans; + msg.m_vacuum_status = status; + msg.m_vacuum_last_phase = last_phase; + msg.m_vacuum_errcode = errcode; pgstat_send(&msg, sizeof(msg)); } @@ -4598,6 +4603,11 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) result->analyze_count = 0; result->autovac_analyze_timestamp = 0; result->autovac_analyze_count = 0; + + result->vacuum_status = 0; + result->vacuum_last_phase = 0; + result->vacuum_errcode = 0; + result->vacuum_failcount = 0; } return result; @@ -5982,19 +5992,47 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; - tabentry->n_index_scans = msg->m_num_index_scans; + tabentry->vacuum_status = msg->m_vacuum_status; + tabentry->vacuum_last_phase = msg->m_vacuum_last_phase; + tabentry->vacuum_errcode = msg->m_vacuum_errcode; - if (msg->m_autovacuum) - { - tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; - tabentry->autovac_vacuum_count++; - } - else + /* + * We store the numbers only when the vacuum has been completed. They + * might be usable to find how much the stopped vacuum processed but we + * choose not to show them rather than show bogus numbers. + */ + switch ((StatVacuumStatus)msg->m_vacuum_status) { - tabentry->vacuum_timestamp = msg->m_vacuumtime; - tabentry->vacuum_count++; + case PGSTAT_VACUUM_FINISHED: + case PGSTAT_VACUUM_FULL_FINISHED: + case PGSTAT_VACUUM_AGGRESSIVE_FINISHED: + tabentry->n_live_tuples = msg->m_live_tuples; + tabentry->n_dead_tuples = msg->m_dead_tuples; + tabentry->n_index_scans = msg->m_num_index_scans; + tabentry->vacuum_failcount = 0; + + if (msg->m_autovacuum) + { + tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = msg->m_vacuumtime; + tabentry->vacuum_count++; + } + break; + + case PGSTAT_VACUUM_ERROR: + case PGSTAT_VACUUM_CANCELED: + case PGSTAT_VACUUM_SKIP_LOCK_FAILED: + tabentry->vacuum_failcount++; + break; + + case PGSTAT_VACUUM_SKIP_NONTARGET: + default: + /* don't increment failure count for non-target tables */ + break; } } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index ab80794..bc5d370 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -219,6 +219,89 @@ pg_stat_get_last_vacuum_index_scans(PG_FUNCTION_ARGS) } Datum +pg_stat_get_last_vacuum_status(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + char *result = "unknown"; + PgStat_StatTabEntry *tabentry; + + /* + * status string. this must be synced with the strings shown by the + * statistics view "pg_stat_progress_vacuum" + */ + static char *phasestr[] = + {"initialization", + "scanning heap", + "vacuuming indexes", + "vacuuming heap", + "cleaning up indexes", + "trucating heap", + "performing final cleanup"}; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) != NULL) + { + int phase; + StatVacuumStatus status; + + status = tabentry->vacuum_status; + switch (status) + { + case PGSTAT_VACUUM_FINISHED: + result = "completed"; + break; + case PGSTAT_VACUUM_ERROR: + case PGSTAT_VACUUM_CANCELED: + phase = tabentry->vacuum_last_phase; + /* number of elements of phasestr above */ + if (phase >= 0 && phase <= 7) + result = psprintf("%s while %s", + status == PGSTAT_VACUUM_CANCELED ? + "canceled" : "error", + phasestr[phase]); + else + result = psprintf("unknown vacuum phase: %d", phase); + break; + case PGSTAT_VACUUM_SKIP_LOCK_FAILED: + result = "skipped - lock unavailable"; + break; + + case PGSTAT_VACUUM_AGGRESSIVE_FINISHED: + result = "aggressive vacuum completed"; + break; + + case PGSTAT_VACUUM_FULL_FINISHED: + result = "vacuum full completed"; + break; + + case PGSTAT_VACUUM_SKIP_NONTARGET: + result = "unvacuumable"; + break; + + default: + result = "unknown status"; + break; + } + } + + PG_RETURN_TEXT_P(cstring_to_text(result)); +} + +Datum +pg_stat_get_autovacuum_fail_count(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int32) (tabentry->vacuum_failcount); + + PG_RETURN_INT32(result); +} + +Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 6b84c9a..a51e321 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2891,6 +2891,10 @@ DATA(insert OID = 2579 ( pg_stat_get_vacuum_necessity PGNSP PGUID 12 1 0 0 0 f DESCR("statistics: true if needs vacuum"); DATA(insert OID = 3281 ( pg_stat_get_last_vacuum_index_scans PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 23 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_last_vacuum_index_scans _null_ _null_ _null_ )); DESCR("statistics: number of index scans in the last vacuum"); +DATA(insert OID = 3420 ( pg_stat_get_last_vacuum_status PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 25 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_last_vacuum_status _null_ _null_ _null_ )); +DESCR("statistics: ending status of the last vacuum"); +DATA(insert OID = 3421 ( pg_stat_get_autovacuum_fail_count PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 23 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_autovacuum_fail_count _null_ _null_ _null_ )); +DESCR("statistics: number of successively failed vacuum trials"); DATA(insert OID = 2026 ( pg_backend_pid PGNSP PGUID 12 1 0 0 0 f f f f t f s r 0 0 23 "" _null_ _null_ _null_ _null_ _null_ pg_backend_pid _null_ _null_ _null_ )); DESCR("statistics: current backend PID"); DATA(insert OID = 1937 ( pg_stat_get_backend_pid PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 23 "23" _null_ _null_ _null_ _null_ _null_ pg_stat_get_backend_pid _null_ _null_ _null_ )); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 84bec74..da3107a 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -190,6 +190,7 @@ extern void vacuum_delay_point(void); /* in commands/vacuumlazy.c */ extern void lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, BufferAccessStrategy bstrategy); +extern void lazy_vacuum_cancel_handler(void); /* in commands/analyze.c */ extern void analyze_rel(Oid relid, RangeVar *relation, int options, diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 3ab5f4a..62c2369 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -67,6 +67,20 @@ typedef enum StatMsgType PGSTAT_MTYPE_DEADLOCK } StatMsgType; +/* + * The exit status stored in vacuum report. + */ +typedef enum StatVacuumStatus +{ + PGSTAT_VACUUM_FINISHED, + PGSTAT_VACUUM_CANCELED, + PGSTAT_VACUUM_ERROR, + PGSTAT_VACUUM_SKIP_LOCK_FAILED, + PGSTAT_VACUUM_SKIP_NONTARGET, + PGSTAT_VACUUM_AGGRESSIVE_FINISHED, + PGSTAT_VACUUM_FULL_FINISHED +} StatVacuumStatus; + /* ---------- * The data type used for counters. * ---------- @@ -370,6 +384,9 @@ typedef struct PgStat_MsgVacuum PgStat_Counter m_live_tuples; PgStat_Counter m_dead_tuples; PgStat_Counter m_num_index_scans; + PgStat_Counter m_vacuum_status; + PgStat_Counter m_vacuum_last_phase; + PgStat_Counter m_vacuum_errcode; } PgStat_MsgVacuum; @@ -643,6 +660,10 @@ typedef struct PgStat_StatTabEntry PgStat_Counter analyze_count; TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */ PgStat_Counter autovac_analyze_count; + PgStat_Counter vacuum_status; + PgStat_Counter vacuum_last_phase; + PgStat_Counter vacuum_errcode; + PgStat_Counter vacuum_failcount; } PgStat_StatTabEntry; @@ -1168,7 +1189,9 @@ extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type t extern void pgstat_report_autovac(Oid dboid); extern void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples, - PgStat_Counter num_index_scans); + PgStat_Counter num_index_scans, + PgStat_Counter status, PgStat_Counter last_phase, + PgStat_Counter errcode); extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 2144269..f0a8416 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1766,6 +1766,8 @@ pg_stat_all_tables| SELECT c.oid AS relid, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze, pg_stat_get_vacuum_count(c.oid) AS vacuum_count, pg_stat_get_last_vacuum_index_scans(c.oid) AS last_vacuum_index_scans, + pg_stat_get_last_vacuum_status(c.oid) AS last_vacuum_status, + pg_stat_get_autovacuum_fail_count(c.oid) AS autovacuum_fail_count, pg_stat_get_autovacuum_count(c.oid) AS autovacuum_count, pg_stat_get_analyze_count(c.oid) AS analyze_count, pg_stat_get_autoanalyze_count(c.oid) AS autoanalyze_count @@ -1915,6 +1917,8 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.last_autoanalyze, pg_stat_all_tables.vacuum_count, pg_stat_all_tables.last_vacuum_index_scans, + pg_stat_all_tables.last_vacuum_status, + pg_stat_all_tables.autovacuum_fail_count, pg_stat_all_tables.autovacuum_count, pg_stat_all_tables.analyze_count, pg_stat_all_tables.autoanalyze_count @@ -1960,6 +1964,8 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.last_autoanalyze, pg_stat_all_tables.vacuum_count, pg_stat_all_tables.last_vacuum_index_scans, + pg_stat_all_tables.last_vacuum_status, + pg_stat_all_tables.autovacuum_fail_count, pg_stat_all_tables.autovacuum_count, pg_stat_all_tables.analyze_count, pg_stat_all_tables.autoanalyze_count -- 2.9.2