From 315c5edbcde3160ee6d64ca74e5ae3c6c3ca070a Mon Sep 17 00:00:00 2001 From: Mark Dilger Date: Fri, 12 Jun 2020 13:21:52 -0700 Subject: [PATCH v7 1/2] Adding verify_heapam and pg_amcheck Adding new function verify_heapam for checking a heap relation and associated toast relation, if any, to contrib/amcheck. Adding new contrib module pg_amcheck, which is a command line interface for running amcheck's verifications against tables and indexes. Refactoring existing amcheck btree checking functions to optionally return corruption information rather than ereport'ing it. This is used by the new pg_amcheck command line tool for reporting back to the caller. --- contrib/Makefile | 1 + contrib/amcheck/Makefile | 7 +- contrib/amcheck/amcheck--1.2--1.3.sql | 54 + contrib/amcheck/amcheck.control | 2 +- contrib/amcheck/amcheck.h | 5 + contrib/amcheck/expected/check_btree.out | 31 + contrib/amcheck/expected/check_heap.out | 58 + .../amcheck/expected/disallowed_reltypes.out | 48 + contrib/amcheck/sql/check_btree.sql | 10 + contrib/amcheck/sql/check_heap.sql | 34 + contrib/amcheck/sql/disallowed_reltypes.sql | 48 + contrib/amcheck/t/skipping.pl | 101 ++ contrib/amcheck/verify_heapam.c | 1024 +++++++++++++++++ contrib/amcheck/verify_nbtree.c | 750 ++++++------ contrib/pg_amcheck/.gitignore | 3 + contrib/pg_amcheck/Makefile | 28 + contrib/pg_amcheck/pg_amcheck.c | 884 ++++++++++++++ contrib/pg_amcheck/t/001_basic.pl | 9 + contrib/pg_amcheck/t/002_nonesuch.pl | 55 + contrib/pg_amcheck/t/003_check.pl | 85 ++ contrib/pg_amcheck/t/004_verify_heapam.pl | 407 +++++++ doc/src/sgml/amcheck.sgml | 106 +- doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/pg_amcheck.sgml | 136 +++ 25 files changed, 3557 insertions(+), 331 deletions(-) create mode 100644 contrib/amcheck/amcheck--1.2--1.3.sql create mode 100644 contrib/amcheck/amcheck.h create mode 100644 contrib/amcheck/expected/check_heap.out create mode 100644 contrib/amcheck/expected/disallowed_reltypes.out create 
mode 100644 contrib/amcheck/sql/check_heap.sql create mode 100644 contrib/amcheck/sql/disallowed_reltypes.sql create mode 100644 contrib/amcheck/t/skipping.pl create mode 100644 contrib/amcheck/verify_heapam.c create mode 100644 contrib/pg_amcheck/.gitignore create mode 100644 contrib/pg_amcheck/Makefile create mode 100644 contrib/pg_amcheck/pg_amcheck.c create mode 100644 contrib/pg_amcheck/t/001_basic.pl create mode 100644 contrib/pg_amcheck/t/002_nonesuch.pl create mode 100644 contrib/pg_amcheck/t/003_check.pl create mode 100644 contrib/pg_amcheck/t/004_verify_heapam.pl create mode 100644 doc/src/sgml/pg_amcheck.sgml diff --git a/contrib/Makefile b/contrib/Makefile index 1846d415b6..c21c27cbeb 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -29,6 +29,7 @@ SUBDIRS = \ oid2name \ pageinspect \ passwordcheck \ + pg_amcheck \ pg_buffercache \ pg_freespacemap \ pg_prewarm \ diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index a2b1b1036b..27d38b2e86 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -3,13 +3,16 @@ MODULE_big = amcheck OBJS = \ $(WIN32RES) \ + verify_heapam.o \ verify_nbtree.o EXTENSION = amcheck -DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql +DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql PGFILEDESC = "amcheck - function for verifying relation integrity" -REGRESS = check check_btree +REGRESS = check check_btree check_heap disallowed_reltypes + +TAP_TESTS = 1 ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/amcheck/amcheck--1.2--1.3.sql b/contrib/amcheck/amcheck--1.2--1.3.sql new file mode 100644 index 0000000000..2ab7d8b0d2 --- /dev/null +++ b/contrib/amcheck/amcheck--1.2--1.3.sql @@ -0,0 +1,54 @@ +/* contrib/amcheck/amcheck--1.2--1.3.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.3'" to load this file. 
\quit + +-- The functions below are new in amcheck 1.3; they do not replace or +-- overload any function signature that existed in 1.2. + +-- +-- verify_heapam() +-- +CREATE FUNCTION verify_heapam(rel regclass, + on_error_stop boolean, + skip cstring, + startblock bigint, + endblock bigint, + blkno OUT bigint, + offnum OUT integer, + lp_off OUT smallint, + lp_flags OUT smallint, + lp_len OUT smallint, + attnum OUT integer, + chunk OUT integer, + msg OUT text + ) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'verify_heapam' +LANGUAGE C; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION verify_heapam(regclass, boolean, cstring, bigint, bigint) +FROM PUBLIC; + +-- +-- verify_btreeam() +-- +CREATE FUNCTION verify_btreeam(rel regclass, + blkno OUT bigint, + msg OUT text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'verify_btreeam' +LANGUAGE C; + +CREATE FUNCTION verify_btreeam(rel regclass, + on_error_stop boolean, + blkno OUT bigint, + msg OUT text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'verify_btreeam' +LANGUAGE C; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION verify_btreeam(regclass) FROM PUBLIC; +REVOKE ALL ON FUNCTION verify_btreeam(regclass, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index c6e310046d..ab50931f75 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/amcheck.h b/contrib/amcheck/amcheck.h new file mode 100644 index 0000000000..74edfc2f65 --- /dev/null +++ b/contrib/amcheck/amcheck.h @@ -0,0 +1,5 @@ +#include "postgres.h" + +Datum verify_heapam(PG_FUNCTION_ARGS); +Datum bt_index_check(PG_FUNCTION_ARGS); +Datum bt_index_parent_check(PG_FUNCTION_ARGS); diff --git
a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index f82f48d23b..c1acf238d7 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -21,6 +21,8 @@ SELECT bt_index_check('bttest_a_idx'::regclass); ERROR: permission denied for function bt_index_check SELECT bt_index_parent_check('bttest_a_idx'::regclass); ERROR: permission denied for function bt_index_parent_check +SELECT * FROM verify_btreeam('bttest_a_idx'::regclass); +ERROR: permission denied for function verify_btreeam RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful -- to run this cluster-wide with a restricted account, and as tested @@ -29,6 +31,7 @@ GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION verify_btreeam(regclass, boolean) TO regress_bttest_role; SET ROLE regress_bttest_role; SELECT bt_index_check('bttest_a_idx'); bt_index_check @@ -42,23 +45,31 @@ SELECT bt_index_parent_check('bttest_a_idx'); (1 row) +SELECT * FROM verify_btreeam('bttest_a_idx'); +ERROR: permission denied for function verify_btreeam RESET ROLE; -- verify plain tables are rejected (error) SELECT bt_index_check('bttest_a'); ERROR: "bttest_a" is not an index SELECT bt_index_parent_check('bttest_a'); ERROR: "bttest_a" is not an index +SELECT * FROM verify_btreeam('bttest_a'); +ERROR: "bttest_a" is not an index -- verify non-existing indexes are rejected (error) SELECT bt_index_check(17); ERROR: could not open relation with OID 17 SELECT bt_index_parent_check(17); ERROR: could not open relation with OID 17 +SELECT * FROM verify_btreeam(17); +ERROR: could not open relation with OID 17 -- verify wrong index types are 
rejected (error) BEGIN; CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); SELECT bt_index_parent_check('bttest_a_brin_idx'); ERROR: only B-Tree indexes are supported as targets for verification DETAIL: Relation "bttest_a_brin_idx" is not a B-Tree index. +SELECT * FROM verify_btreeam('bttest_a_brin_idx'); +ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); @@ -67,6 +78,11 @@ SELECT bt_index_check('bttest_a_idx'); (1 row) +SELECT * FROM verify_btreeam('bttest_a_idx'); + blkno | msg +-------+----- +(0 rows) + -- more expansive tests SELECT bt_index_check('bttest_a_idx', true); bt_index_check @@ -93,6 +109,11 @@ SELECT bt_index_parent_check('bttest_b_idx'); (1 row) +SELECT * FROM verify_btreeam('bttest_a_idx'); + blkno | msg +-------+----- +(0 rows) + -- make sure we don't have any leftover locks SELECT * FROM pg_locks WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) @@ -118,6 +139,11 @@ SELECT bt_index_check('bttest_multi_idx'); (1 row) +SELECT * FROM verify_btreeam('bttest_multi_idx'); + blkno | msg +-------+----- +(0 rows) + -- more expansive tests for index with included columns SELECT bt_index_parent_check('bttest_multi_idx', true, true); bt_index_parent_check @@ -134,6 +160,11 @@ SELECT bt_index_parent_check('bttest_multi_idx', true, true); (1 row) +SELECT * FROM verify_btreeam('bttest_multi_idx'); + blkno | msg +-------+----- +(0 rows) + -- -- Test for multilevel page deletion/downlink present checks, and rootdescend -- checks diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out new file mode 100644 index 0000000000..6d30ca8023 --- /dev/null +++ b/contrib/amcheck/expected/check_heap.out @@ -0,0 +1,58 @@ +CREATE TABLE heaptest (a integer, b text); +INSERT INTO heaptest (a, b) + (SELECT gs, repeat('x', gs) + FROM generate_series(1,10000) gs); +SELECT * 
FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := 'all frozen', + startblock := NULL, + endblock := NULL); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := 'all visible', + startblock := NULL, + endblock := NULL); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := 'all frozen', + startblock := 5, + endblock := NULL); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := 'all visible', + startblock := NULL, + endblock := 10); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := NULL, + startblock := 5, + endblock := 10); + blkno | offnum | lp_off | lp_flags | lp_len | attnum | chunk | msg +-------+--------+--------+----------+--------+--------+-------+----- +(0 rows) + diff --git a/contrib/amcheck/expected/disallowed_reltypes.out b/contrib/amcheck/expected/disallowed_reltypes.out new file mode 100644 index 0000000000..892ae89652 --- /dev/null +++ b/contrib/amcheck/expected/disallowed_reltypes.out @@ -0,0 +1,48 @@ +-- +-- check that using the 
module's functions with unsupported relations will fail +-- +-- partitioned tables (the parent ones) don't have visibility maps +create table test_partitioned (a int, b text default repeat('x', 5000)) + partition by list (a); +-- these should all fail +select * from verify_heapam('test_partitioned', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +ERROR: "test_partitioned" is not a table, materialized view, or TOAST table +create table test_partition partition of test_partitioned for values in (1); +create index test_index on test_partition (a); +-- indexes do not, so these all fail +select * from verify_heapam('test_index', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +ERROR: "test_index" is not a table, materialized view, or TOAST table +create view test_view as select 1; +-- views do not have vms, so these all fail +select * from verify_heapam('test_view', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +ERROR: "test_view" is not a table, materialized view, or TOAST table +create sequence test_sequence; +-- sequences do not have vms, so these all fail +select * from verify_heapam('test_sequence', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +ERROR: "test_sequence" is not a table, materialized view, or TOAST table +create foreign data wrapper dummy; +create server dummy_server foreign data wrapper dummy; +create foreign table test_foreign_table () server dummy_server; +-- foreign tables do not have vms, so these all fail +select * from verify_heapam('test_foreign_table', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +ERROR: "test_foreign_table" is not a table, materialized view, or TOAST table diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index a1fef644cb..f5d0f8c1f6 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ 
b/contrib/amcheck/sql/check_btree.sql @@ -24,6 +24,7 @@ CREATE ROLE regress_bttest_role; SET ROLE regress_bttest_role; SELECT bt_index_check('bttest_a_idx'::regclass); SELECT bt_index_parent_check('bttest_a_idx'::regclass); +SELECT * FROM verify_btreeam('bttest_a_idx'::regclass); RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful @@ -33,27 +34,33 @@ GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO regress_bttest_role; GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION verify_btreeam(regclass, boolean) TO regress_bttest_role; SET ROLE regress_bttest_role; SELECT bt_index_check('bttest_a_idx'); SELECT bt_index_parent_check('bttest_a_idx'); +SELECT * FROM verify_btreeam('bttest_a_idx'); RESET ROLE; -- verify plain tables are rejected (error) SELECT bt_index_check('bttest_a'); SELECT bt_index_parent_check('bttest_a'); +SELECT * FROM verify_btreeam('bttest_a'); -- verify non-existing indexes are rejected (error) SELECT bt_index_check(17); SELECT bt_index_parent_check(17); +SELECT * FROM verify_btreeam(17); -- verify wrong index types are rejected (error) BEGIN; CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); SELECT bt_index_parent_check('bttest_a_brin_idx'); +SELECT * FROM verify_btreeam('bttest_a_brin_idx'); ROLLBACK; -- normal check outside of xact SELECT bt_index_check('bttest_a_idx'); +SELECT * FROM verify_btreeam('bttest_a_idx'); -- more expansive tests SELECT bt_index_check('bttest_a_idx', true); SELECT bt_index_parent_check('bttest_b_idx', true); @@ -61,6 +68,7 @@ SELECT bt_index_parent_check('bttest_b_idx', true); BEGIN; SELECT bt_index_check('bttest_a_idx'); SELECT bt_index_parent_check('bttest_b_idx'); +SELECT * FROM verify_btreeam('bttest_a_idx'); -- make sure we don't have any 
leftover locks SELECT * FROM pg_locks WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) @@ -74,6 +82,7 @@ SELECT bt_index_check('bttest_a_idx', true); -- normal check outside of xact for index with included columns SELECT bt_index_check('bttest_multi_idx'); +SELECT * FROM verify_btreeam('bttest_multi_idx'); -- more expansive tests for index with included columns SELECT bt_index_parent_check('bttest_multi_idx', true, true); @@ -81,6 +90,7 @@ SELECT bt_index_parent_check('bttest_multi_idx', true, true); TRUNCATE bttest_multi; INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; SELECT bt_index_parent_check('bttest_multi_idx', true, true); +SELECT * FROM verify_btreeam('bttest_multi_idx'); -- -- Test for multilevel page deletion/downlink present checks, and rootdescend diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql new file mode 100644 index 0000000000..5759d5526e --- /dev/null +++ b/contrib/amcheck/sql/check_heap.sql @@ -0,0 +1,34 @@ +CREATE TABLE heaptest (a integer, b text); +INSERT INTO heaptest (a, b) + (SELECT gs, repeat('x', gs) + FROM generate_series(1,10000) gs); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := 'all frozen', + startblock := NULL, + endblock := NULL); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := 'all visible', + startblock := NULL, + endblock := NULL); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := 'all frozen', + startblock := 5, + endblock := NULL); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := false, + skip := 'all visible', + startblock := NULL, + endblock := 10); +SELECT * FROM verify_heapam(rel := 'heaptest', + on_error_stop := true, + skip := NULL, + startblock := 5, + 
endblock := 10); diff --git a/contrib/amcheck/sql/disallowed_reltypes.sql b/contrib/amcheck/sql/disallowed_reltypes.sql new file mode 100644 index 0000000000..fc90e6ca33 --- /dev/null +++ b/contrib/amcheck/sql/disallowed_reltypes.sql @@ -0,0 +1,48 @@ +-- +-- check that using the module's functions with unsupported relations will fail +-- + +-- partitioned tables (the parent ones) don't have visibility maps +create table test_partitioned (a int, b text default repeat('x', 5000)) + partition by list (a); +-- these should all fail +select * from verify_heapam('test_partitioned', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); + +create table test_partition partition of test_partitioned for values in (1); +create index test_index on test_partition (a); +-- indexes do not, so these all fail +select * from verify_heapam('test_index', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); + +create view test_view as select 1; +-- views do not have vms, so these all fail +select * from verify_heapam('test_view', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); + +create sequence test_sequence; +-- sequences do not have vms, so these all fail +select * from verify_heapam('test_sequence', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); + +create foreign data wrapper dummy; +create server dummy_server foreign data wrapper dummy; +create foreign table test_foreign_table () server dummy_server; +-- foreign tables do not have vms, so these all fail +select * from verify_heapam('test_foreign_table', + on_error_stop := false, + skip := NULL, + startblock := NULL, + endblock := NULL); diff --git a/contrib/amcheck/t/skipping.pl b/contrib/amcheck/t/skipping.pl new file mode 100644 index 0000000000..e716fc8c33 --- /dev/null +++ b/contrib/amcheck/t/skipping.pl @@ -0,0 +1,101 @@ +use strict; +use warnings; + +use PostgresNode; +use TestLib; + +use 
Test::More tests => 183;
+
+my ($node, $result);
+
+# Check various options are stable (don't abort) when running verify_heapam on
+# the test table.  For uncorrupted tables, there isn't anything to check except
+# that it runs without crashing.
+#
+# Every combination of on_error_stop, skip, startblock and endblock is run
+# (3 * 3 * 2 * 2 = 36 tests per call); "SELECT 1" is appended so that a
+# successful run always yields the same final output.
+sub check_all_options
+{
+	for my $stop (qw(NULL true false))
+	{
+		for my $skip ("NULL", "'all frozen'", "'all visible'")
+		{
+			for my $startblock (qw(NULL 5))
+			{
+				for my $endblock (qw(NULL 10))
+				{
+					my $check = "SELECT verify_heapam('test', $stop, $skip, " .
+								"$startblock, $endblock)";
+					$result = $node->safe_psql('postgres', "$check; SELECT 1");
+					is ($result, 1, "checked: $check");
+				}
+			}
+		}
+	}
+}
+
+# Stops the server and writes 16 NUL bytes into the first page of the table,
+# assuming page size is large enough for offset 1000..1016 to be
+# in the midst of the first page of data, then restarts the server.
+#
+# The write uses sysseek/syswrite throughout: syswrite bypasses perl's
+# buffered I/O layer, so it must not be combined with a buffered seek().
+# Any I/O failure aborts the test run rather than silently leaving the
+# relation uncorrupted.
+sub corrupt_first_page
+{
+	my $pgdata = $node->data_dir;
+	my $rel = $node->safe_psql('postgres',
+							   qq(SELECT pg_relation_filepath('test')));
+	my $relpath = "$pgdata/$rel";
+	$node->stop;
+
+	# A double-quoted "\0" is a real NUL byte; the single-quoted form would
+	# write literal backslash and zero characters instead.
+	my $nulls = "\0" x 16;
+
+	open(my $fh, '+<', $relpath)
+	  or BAIL_OUT("open failed: $!");
+	binmode $fh;
+	defined(sysseek($fh, 1000, 0))
+	  or BAIL_OUT("sysseek failed: $!");
+	my $written = syswrite($fh, $nulls, length($nulls));
+	(defined($written) && $written == length($nulls))
+	  or BAIL_OUT("syswrite failed: $!");
+	close($fh)
+	  or BAIL_OUT("close failed: $!");
+
+	$node->start;
+}
+
+# Test set-up
+$node = get_new_node('test');
+$node->init;
+$node->append_conf('postgresql.conf', 'autovacuum=off');
+$node->start;
+$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
+
+# Check empty table
+$node->safe_psql('postgres', q(
+	CREATE TABLE test (a integer);
+	ALTER TABLE public.test SET (autovacuum_enabled=false);
+));
+check_all_options();
+
+# Check table with trivial data
+$node->safe_psql('postgres', q(INSERT INTO test VALUES (0)));
+check_all_options();
+
+# Check table with non-trivial data (more than a page worth) but
+# without any all frozen or all visible
+$node->safe_psql('postgres', q(
+INSERT INTO test SELECT generate_series(1,10000)));
+check_all_options();
+
+# Check table with all-visible data
+$node->safe_psql('postgres', q(VACUUM test));
+check_all_options(); + +# Check table with all-frozen data +$node->safe_psql('postgres', q(VACUUM FREEZE test)); +check_all_options(); + +# Check table with corruption, no skipping +corrupt_first_page(); +$result = $node->safe_psql('postgres', q( +SELECT COUNT(*) > 0 FROM verify_heapam('test', on_error_stop := false, skip := NULL, startblock := NULL, endblock := NULL))); +is($result, 't', 'corruption detected on first page'); + +# Check table with corruption, skipping all visible blocks +$result = $node->safe_psql('postgres', q( +SELECT COUNT(*) > 0 FROM verify_heapam('test', on_error_stop := false, skip := 'all visible', startblock := NULL, endblock := NULL))); +is($result, 'f', 'skipping all visible first page'); + +# Check table with corruption, skipping all frozen blocks +$result = $node->safe_psql('postgres', q( +SELECT COUNT(*) > 0 FROM verify_heapam('test', on_error_stop := false, skip := 'all frozen', startblock := NULL, endblock := NULL))); +is($result, 'f', 'skipping all frozen first page'); diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c new file mode 100644 index 0000000000..1bddff7fc6 --- /dev/null +++ b/contrib/amcheck/verify_heapam.c @@ -0,0 +1,1024 @@ +/*------------------------------------------------------------------------- + * + * verify_heapam.c + * Functions to check postgresql heap relations for corruption + * + * Copyright (c) 2016-2020, PostgreSQL Global Development Group + * + * contrib/amcheck/verify_heapam.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/toast_internals.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_type.h" +#include "catalog/storage_xlog.h" +#include "funcapi.h" 
+#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "amcheck.h" + +PG_FUNCTION_INFO_V1(verify_heapam); + +/* + * Struct holding the running context information during + * a lifetime of a verify_heapam() execution. + */ +typedef struct HeapCheckContext +{ + TransactionId nextKnownValidXid; + TransactionId oldestValidXid; + + /* Values concerning the heap relation being checked */ + Relation rel; + TransactionId relfrozenxid; + TransactionId relminmxid; + Relation toastrel; + Relation *toast_indexes; + Relation valid_toast_index; + int num_toast_indexes; + + /* Values for iterating over pages in the relation */ + BlockNumber nblocks; + BlockNumber blkno; + BufferAccessStrategy bstrategy; + Buffer buffer; + Page page; + + /* Values for iterating over tuples within a page */ + OffsetNumber offnum; + ItemId itemid; + uint16 lp_len; + HeapTupleHeader tuphdr; + int natts; + + /* Values for iterating over attributes within the tuple */ + uint32 offset; /* offset in tuple data */ + AttrNumber attnum; + + /* Values for iterating over toast for the attribute */ + int32 chunkno; + int32 attrsize; + int32 endchunk; + int32 totalchunks; + + /* Values for returning tuples */ + bool is_corrupt; /* have we encountered any corruption? 
*/ + TupleDesc tupdesc; + Tuplestorestate *tupstore; +} HeapCheckContext; + +/* Internal implementation */ +static void check_relation_relkind_and_relam(Relation rel); + +static void confess(HeapCheckContext * ctx, char *msg); +static TupleDesc verify_heapam_tupdesc(void); + +static bool TransactionIdValidInRel(TransactionId xid, HeapCheckContext * ctx); +static bool check_tuphdr_xids(HeapTupleHeader tuphdr, HeapCheckContext * ctx); +static void check_toast_tuple(HeapTuple toasttup, HeapCheckContext * ctx); +static bool check_tuple_attribute(HeapCheckContext * ctx); +static void check_tuple(HeapCheckContext * ctx); + +/* + * verify_heapam + * + * Scan and report corruption in heap pages or in associated toast relation. + */ +Datum +verify_heapam(PG_FUNCTION_ARGS) +{ +#define HEAPCHECK_RELATION_COLS 8 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext oldcontext; + bool randomAccess; + HeapCheckContext ctx; + FullTransactionId nextFullXid; + Buffer vmbuffer = InvalidBuffer; + Oid relid; + bool on_error_stop; + bool skip_all_frozen = false; + bool skip_all_visible = false; + int64 startblock = -1; + int64 endblock = -1; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot " + "accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("materialize mode required, but it is not allowed " + "in this context"))); + + /* check supplied arguments */ + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing required parameter for 'rel'"))); + relid = PG_GETARG_OID(0); + on_error_stop = PG_ARGISNULL(1) ? 
false : PG_GETARG_BOOL(1); + if (!PG_ARGISNULL(2)) + { + const char *skip = PG_GETARG_CSTRING(2); + + if (pg_strcasecmp(skip, "all visible") == 0) + { + skip_all_visible = true; + } + else if (pg_strcasecmp(skip, "all frozen") == 0) + { + skip_all_visible = true; + skip_all_frozen = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized parameter for 'skip': %s", skip), + errhint("please choose from 'all visible', 'all frozen', " + "or NULL"))); + } + } + if (!PG_ARGISNULL(3)) + startblock = PG_GETARG_INT64(3); + if (!PG_ARGISNULL(4)) + endblock = PG_GETARG_INT64(4); + + memset(&ctx, 0, sizeof(HeapCheckContext)); + + /* The tupdesc and tuplestore must be created in ecxt_per_query_memory */ + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + randomAccess = (rsinfo->allowedModes & SFRM_Materialize_Random) != 0; + ctx.tupdesc = verify_heapam_tupdesc(); + ctx.tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = ctx.tupstore; + rsinfo->setDesc = ctx.tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* + * Open the relation. We use ShareUpdateExclusive to prevent concurrent + * vacuums from changing the relfrozenxid, relminmxid, or advancing the + * global oldestXid to be newer than those. This protection saves us from + * having to reacquire the locks and recheck those minimums for every + * tuple, which would be expensive. + */ + ctx.rel = relation_open(relid, ShareUpdateExclusiveLock); + check_relation_relkind_and_relam(ctx.rel); + + /* + * Open the toast relation, if any, also protected from concurrent + * vacuums. 
+ */ + if (ctx.rel->rd_rel->reltoastrelid) + { + int offset; + + /* Main relation has associated toast relation */ + ctx.toastrel = table_open(ctx.rel->rd_rel->reltoastrelid, + ShareUpdateExclusiveLock); + offset = toast_open_indexes(ctx.toastrel, + ShareUpdateExclusiveLock, + &(ctx.toast_indexes), + &(ctx.num_toast_indexes)); + ctx.valid_toast_index = ctx.toast_indexes[offset]; + } + else + { + /* Main relation has no associated toast relation */ + ctx.toast_indexes = NULL; + ctx.num_toast_indexes = 0; + } + + /* + * Now that we have our relation(s) locked, oldestXid cannot advance + * beyond the oldest valid xid in our table, nor can our relfrozenxid + * advance. We keep a cached copy of the oldest valid xid that we may + * encounter in the table, which is relfrozenxid if valid, and oldestXid + * otherwise. + */ + ctx.relfrozenxid = ctx.rel->rd_rel->relfrozenxid; + ctx.relminmxid = ctx.rel->rd_rel->relminmxid; + + LWLockAcquire(XidGenLock, LW_SHARED); + nextFullXid = ShmemVariableCache->nextFullXid; + ctx.oldestValidXid = ShmemVariableCache->oldestXid; + LWLockRelease(XidGenLock); + ctx.nextKnownValidXid = XidFromFullTransactionId(nextFullXid); + + if (TransactionIdIsNormal(ctx.relfrozenxid) && + TransactionIdPrecedes(ctx.relfrozenxid, ctx.oldestValidXid)) + { + confess(&ctx, psprintf("relfrozenxid %u precedes global " + "oldest valid xid %u ", + ctx.relfrozenxid, ctx.oldestValidXid)); + PG_RETURN_NULL(); + } + + if (TransactionIdIsNormal(ctx.relminmxid) && + TransactionIdPrecedes(ctx.relminmxid, ctx.oldestValidXid)) + { + confess(&ctx, psprintf("relfrozenxid %u precedes global " + "oldest valid xid %u ", + ctx.relfrozenxid, ctx.oldestValidXid)); + PG_RETURN_NULL(); + } + + if (TransactionIdIsNormal(ctx.relfrozenxid)) + ctx.oldestValidXid = ctx.relfrozenxid; + + /* check all blocks of the relation */ + ctx.nblocks = RelationGetNumberOfBlocks(ctx.rel); + ctx.bstrategy = GetAccessStrategy(BAS_BULKREAD); + ctx.buffer = InvalidBuffer; + ctx.page = NULL; + + if 
(startblock < 0) + startblock = 0; + if (endblock < 0 || endblock > ctx.nblocks) + endblock = ctx.nblocks; + + for (ctx.blkno = startblock; ctx.blkno < endblock; ctx.blkno++) + { + int32 mapbits; + OffsetNumber maxoff; + + /* Optionally skip over all-frozen or all-visible blocks */ + if (skip_all_frozen || skip_all_visible) + { + mapbits = (int32) visibilitymap_get_status(ctx.rel, ctx.blkno, + &vmbuffer); + if (skip_all_visible && (mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0) + continue; + if (skip_all_frozen && (mapbits & VISIBILITYMAP_ALL_FROZEN) != 0) + continue; + } + + /* Read and lock the next page. */ + ctx.buffer = ReadBufferExtended(ctx.rel, MAIN_FORKNUM, ctx.blkno, + RBM_NORMAL, ctx.bstrategy); + LockBuffer(ctx.buffer, BUFFER_LOCK_SHARE); + ctx.page = BufferGetPage(ctx.buffer); + + /* We must unlock the page from the prior iteration, if any */ + Assert(ctx.blkno == InvalidBlockNumber || ctx.buffer != InvalidBuffer); + + /* We rely on this math property for the first iteration */ + StaticAssertStmt(InvalidOffsetNumber + 1 == FirstOffsetNumber, + "InvalidOffsetNumber increments to FirstOffsetNumber"); + + ctx.offnum = InvalidOffsetNumber; + ctx.itemid = NULL; + ctx.lp_len = 0; + ctx.tuphdr = NULL; + ctx.natts = 0; + + /* Perform tuple checks */ + maxoff = PageGetMaxOffsetNumber(ctx.page); + for (ctx.offnum = 0; ctx.offnum <= maxoff; + ctx.offnum = OffsetNumberNext(ctx.offnum)) + { + ctx.itemid = PageGetItemId(ctx.page, ctx.offnum); + + /* Skip over unused/dead/redirected line pointers */ + if (!ItemIdIsUsed(ctx.itemid) || + ItemIdIsDead(ctx.itemid) || + ItemIdIsRedirected(ctx.itemid)) + continue; + + /* Set up context information about this next tuple */ + ctx.lp_len = ItemIdGetLength(ctx.itemid); + ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid); + ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr); + + /* + * Reset information about individual attributes and related toast + * values, so they show as NULL in the corruption report if we + * 
record a corruption before beginning to iterate over the + * attributes. + */ + ctx.attnum = -1; + ctx.chunkno = -1; + + /* Ok, ready to check this next tuple */ + check_tuple(&ctx); + } + + /* clean up */ + ctx.offnum = InvalidOffsetNumber; + ctx.itemid = NULL; + ctx.lp_len = 0; + UnlockReleaseBuffer(ctx.buffer); + + if (on_error_stop && ctx.is_corrupt) + break; + } + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* Close the associated toast table and indexes, if any. */ + if (ctx.rel->rd_rel->reltoastrelid) + { + toast_close_indexes(ctx.toast_indexes, ctx.num_toast_indexes, + ShareUpdateExclusiveLock); + table_close(ctx.toastrel, ShareUpdateExclusiveLock); + } + + /* Close the main relation */ + relation_close(ctx.rel, ShareUpdateExclusiveLock); + + PG_RETURN_NULL(); +} + +/* + * check_relation_relkind_and_relam + * + * convenience routine to check that relation is of a supported relkind. + */ +static void +check_relation_relkind_and_relam(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_MATVIEW && + rel->rd_rel->relkind != RELKIND_TOASTVALUE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a table, materialized view, " + "or TOAST table", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relam != HEAP_TABLE_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a heap AM", + RelationGetRelationName(rel)))); +} + +/* + * confess + * + * Return a message about corruption, including information + * about where in the relation the corruption was found. + * + * The msg argument is pfree'd by this function. 
+ */
+static void
+confess(HeapCheckContext * ctx, char *msg)
+{
+	Datum		values[HEAPCHECK_RELATION_COLS];
+	bool		nulls[HEAPCHECK_RELATION_COLS];
+	HeapTuple	tuple;
+
+	MemSet(values, 0, sizeof(values));
+	MemSet(nulls, 0, sizeof(nulls));
+
+	/* Column 0: the block being checked is always reported */
+	values[0] = Int64GetDatum(ctx->blkno);
+
+	/*
+	 * Column 1: offset number.  verify_heapam parks ctx->offnum at
+	 * InvalidOffsetNumber whenever no line pointer is being examined, so
+	 * report NULL rather than a meaningless zero in that case.
+	 */
+	values[1] = Int32GetDatum(ctx->offnum);
+	nulls[1] = (ctx->offnum == InvalidOffsetNumber);
+
+	/*
+	 * Columns 2-4: line pointer fields.  Relation-level problems are
+	 * reported before any line pointer has been selected, and
+	 * verify_heapam resets ctx->itemid to NULL between pages, so the item
+	 * id must not be dereferenced unconditionally.  The lp_* bit fields are
+	 * unsigned and at most 15 bits wide, so they always fit in an int16.
+	 */
+	if (ctx->itemid != NULL)
+	{
+		values[2] = Int16GetDatum((int16) ItemIdGetOffset(ctx->itemid));
+		values[3] = Int16GetDatum((int16) ItemIdGetFlags(ctx->itemid));
+		values[4] = Int16GetDatum((int16) ItemIdGetLength(ctx->itemid));
+	}
+	else
+	{
+		nulls[2] = true;
+		nulls[3] = true;
+		nulls[4] = true;
+	}
+
+	/* Columns 5-6: negative values mean "not applicable" and show as NULL */
+	values[5] = Int32GetDatum(ctx->attnum);
+	nulls[5] = (ctx->attnum < 0);
+	values[6] = Int32GetDatum(ctx->chunkno);
+	nulls[6] = (ctx->chunkno < 0);
+
+	/* Column 7: the message text, copied into a text datum */
+	values[7] = CStringGetTextDatum(msg);
+
+	/*
+	 * In principle, there is nothing to prevent a scan over a large, highly
+	 * corrupted table from using workmem worth of memory building up the
+	 * tuplestore.  Don't leak the msg argument memory.
+	 */
+	pfree(msg);
+
+	/*
+	 * tuplestore_puttuple stores its own copy of the tuple, and
+	 * heap_form_tuple copied the text datum into the tuple, so both
+	 * temporaries can be released right away instead of accumulating once
+	 * per reported corruption.
+	 */
+	tuple = heap_form_tuple(ctx->tupdesc, values, nulls);
+	tuplestore_puttuple(ctx->tupstore, tuple);
+	heap_freetuple(tuple);
+	pfree(DatumGetPointer(values[7]));
+
+	ctx->is_corrupt = true;
+}
+
+/*
+ * Helper function to construct the TupleDesc needed by verify_heapam.
+ */
+static TupleDesc
+verify_heapam_tupdesc(void)
+{
+	TupleDesc	tupdesc;
+	AttrNumber	a = 0;			/* last attribute number assigned so far */
+
+	tupdesc = CreateTemplateTupleDesc(HEAPCHECK_RELATION_COLS);
+
+	/*
+	 * One entry per output column.  NOTE(review): presumably this order must
+	 * match the values[] slots filled in by confess() -- confirm that
+	 * ctx->tupdesc is built by this function.
+	 */
+	TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "offnum", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "lp_off", INT2OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "lp_flags", INT2OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "lp_len", INT2OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "attnum", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "chunk", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, ++a, "msg", TEXTOID, -1, 0);
+	/* Catch a column being added above without updating the column count */
+	Assert(a == HEAPCHECK_RELATION_COLS);
+
+	return BlessTupleDesc(tupdesc);
+}
+
+/*
+ * Report whether xid lies in the range starting at ctx->oldestValidXid
+ * (inclusive) and ending at ctx->nextKnownValidXid (exclusive), as judged by
+ * TransactionIdPrecedesOrEquals and TransactionIdPrecedes.
+ */
+static inline bool
+XidInValidRange(TransactionId xid, HeapCheckContext * ctx)
+{
+	return (TransactionIdPrecedesOrEquals(ctx->oldestValidXid, xid) &&
+			TransactionIdPrecedes(xid, ctx->nextKnownValidXid));
+}
+
+/*
+ * Given a TransactionId, decide whether it is plausible for this relation,
+ * i.e. neither in the future nor overlong in the past.
+ *
+ * InvalidTransactionId is never valid; BootstrapTransactionId and
+ * FrozenTransactionId are always valid.  Any other xid is valid only if it
+ * falls between ctx->oldestValidXid (inclusive) and ctx->nextKnownValidXid
+ * (exclusive).
+ *
+ * Side effect: when the first range test fails, ctx->nextKnownValidXid is
+ * refreshed from ReadNextFullTransactionId() before testing again.
+ */
+static bool
+TransactionIdValidInRel(TransactionId xid, HeapCheckContext * ctx)
+{
+	/* Quick return for the special (non-normal) transaction ids */
+	switch (xid)
+	{
+		case InvalidTransactionId:
+			return false;
+		case BootstrapTransactionId:
+		case FrozenTransactionId:
+			return true;
+	}
+
+	/*
+	 * If this xid is within the last known valid range of xids, then it has
+	 * to be ok.  The oldest valid xid cannot advance, because we have too
+	 * strong a lock on the relation for that, and although the newest valid
+	 * xid may advance, that doesn't invalidate anything from the range we've
+	 * already identified.
+	 */
+	if (XidInValidRange(xid, ctx))
+		return true;
+
+	/* The latest valid xid may have advanced.  Recheck. */
+	ctx->nextKnownValidXid =
+		XidFromFullTransactionId(ReadNextFullTransactionId());
+	if (XidInValidRange(xid, ctx))
+		return true;
+
+	/* No good.  This xid is invalid. */
+	return false;
+}
+
+/*
+ * check_tuphdr_xids
+ *
+ * Determine whether tuples are visible for verification.  Similar to
+ * HeapTupleSatisfiesVacuum, but with critical differences.
+ *
+ * 1) Does not touch hint bits.  It seems imprudent to write hint bits
+ *    to a table during a corruption check.
+ * 2) Only makes a boolean determination of whether verification should
+ *    see the tuple, rather than doing extra work for vacuum-related
+ *    categorization.
+ *
+ * The caller should already have checked that xmin and xmax are not out of
+ * bounds for the relation.
+ *
+ * Returns true if the tuple's attributes should be checked, false if the
+ * tuple should be skipped.  An invalid xvac, or a multixact xmax without a
+ * valid update xid, is additionally reported through confess().
+ */
+static bool
+check_tuphdr_xids(HeapTupleHeader tuphdr, HeapCheckContext * ctx)
+{
+	uint16		infomask = tuphdr->t_infomask;
+
+	/* Without a "committed" hint on xmin, work out the inserter's fate */
+	if (!HeapTupleHeaderXminCommitted(tuphdr))
+	{
+		TransactionId raw_xmin = HeapTupleHeaderGetRawXmin(tuphdr);
+
+		if (HeapTupleHeaderXminInvalid(tuphdr))
+		{
+			return false;		/* HEAPTUPLE_DEAD */
+		}
+		/* Used by pre-9.0 binary upgrades */
+		else if (infomask & HEAP_MOVED_OFF ||
+				 infomask & HEAP_MOVED_IN)
+		{
+			TransactionId xvac = HeapTupleHeaderGetXvac(tuphdr);
+
+			if (TransactionIdIsCurrentTransactionId(xvac))
+				return false;	/* HEAPTUPLE_DELETE_IN_PROGRESS */
+			if (TransactionIdIsInProgress(xvac))
+				return false;	/* HEAPTUPLE_DELETE_IN_PROGRESS */
+
+			/* Validate xvac's range before asking for its commit status */
+			if (!TransactionIdValidInRel(xvac, ctx))
+			{
+				confess(ctx, psprintf("tuple xvac = %u invalid", xvac));
+				return false;
+			}
+			else if (TransactionIdDidCommit(xvac))
+				return false;	/* HEAPTUPLE_DEAD */
+			/* otherwise fall through to the xmax checks below */
+		}
+		else if (TransactionIdIsCurrentTransactionId(raw_xmin))
+			return false;		/* insert or delete in progress */
+		else if (TransactionIdIsInProgress(raw_xmin))
+			return false;		/* HEAPTUPLE_INSERT_IN_PROGRESS */
+		else if (!TransactionIdDidCommit(raw_xmin))
+		{
+			return false;		/* HEAPTUPLE_DEAD */
+		}
+	}
+
+	/* The inserter is acceptable; now see whether the tuple was deleted */
+	if (!(infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+	{
+		if (infomask & HEAP_XMAX_IS_MULTI)
+		{
+			TransactionId xmax = HeapTupleGetUpdateXid(tuphdr);
+
+			/* not LOCKED_ONLY, so it has to have an xmax */
+			if (!TransactionIdIsValid(xmax))
+			{
+				confess(ctx,
+						pstrdup("heap tuple with XMAX_IS_MULTI is "
+								"neither LOCKED_ONLY nor has a "
+								"valid xmax"));
+				return false;
+			}
+			if (TransactionIdIsInProgress(xmax))
+				return false;	/* HEAPTUPLE_DELETE_IN_PROGRESS */
+
+			else if (TransactionIdDidCommit(xmax))
+			{
+				return false;	/* HEAPTUPLE_RECENTLY_DEAD or HEAPTUPLE_DEAD */
+			}
+			/* Ok, the tuple is live */
+		}
+		else if (!(infomask & HEAP_XMAX_COMMITTED))
+		{
+			/*
+			 * NOTE(review): unlike the multixact branch above, this branch
+			 * does not call TransactionIdDidCommit, so a tuple whose deleter
+			 * committed without the hint bit being set is treated as live.
+			 * Confirm whether that is intended.
+			 */
+			if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuphdr)))
+				return false;	/* HEAPTUPLE_DELETE_IN_PROGRESS */
+			/* Ok, the tuple is live */
+		}
+		else
+			return false;		/* HEAPTUPLE_RECENTLY_DEAD or HEAPTUPLE_DEAD */
+	}
+	return true;
+}
+
+/*
+ * check_toast_tuple
+ *
+ * Checks the current toast tuple as tracked in ctx for corruption.  Records
+ * any corruption found in ctx->corruption.
+ */
+static void
+check_toast_tuple(HeapTuple toasttup, HeapCheckContext * ctx)
+{
+	int32		curchunk;
+	Pointer		chunk;
+	bool		isnull;
+	int32		chunksize;
+	int32		expected_size;
+
+	/*
+	 * Each problem found is reported through confess() and ends the check of
+	 * this chunk.  ctx->chunkno is advanced only when the chunk passes every
+	 * check, so it always holds the next expected sequence number.
+	 */
+
+	/*
+	 * Have a chunk, extract the sequence number and the data
+	 */
+	curchunk = DatumGetInt32(fastgetattr(toasttup, 2,
+										 ctx->toastrel->rd_att, &isnull));
+	if (isnull)
+	{
+		confess(ctx,
+				pstrdup("toast chunk sequencenumber is null"));
+		return;
+	}
+	chunk = DatumGetPointer(fastgetattr(toasttup, 3,
+										ctx->toastrel->rd_att, &isnull));
+	if (isnull)
+	{
+		confess(ctx, pstrdup("toast chunk data is null"));
+		return;
+	}
+
+	/* Only the payload size is needed; the chunk data itself is not read */
+	if (!VARATT_IS_EXTENDED(chunk))
+	{
+		/* plain varlena with a 4-byte header */
+		chunksize = VARSIZE(chunk) - VARHDRSZ;
+	}
+	else if (VARATT_IS_SHORT(chunk))
+	{
+		/*
+		 * could happen due to heap_form_tuple doing its thing
+		 */
+		chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
+	}
+	else
+	{
+		/*
+		 * should never happen: the chunk is extended but not short, i.e. it
+		 * is itself compressed or external
+		 */
+		confess(ctx,
+				pstrdup("toast chunk is neither plain nor short "
+						"(unexpected compressed or external varlena)"));
+		return;
+	}
+
+	/*
+	 * Some checks on the data we've found.  curchunk and the sizes are
+	 * signed, so print them with %d; a corrupt negative sequence number
+	 * would otherwise show up as a huge unsigned value.
+	 */
+	if (curchunk != ctx->chunkno)
+	{
+		confess(ctx, psprintf("toast chunk sequence number %d "
+							  "not the expected sequence number %u",
+							  curchunk, ctx->chunkno));
+		return;
+	}
+	if (curchunk > ctx->endchunk)
+	{
+		confess(ctx, psprintf("toast chunk sequence number %d "
+							  "exceeds the end chunk sequence "
+							  "number %u",
+							  curchunk, ctx->endchunk));
+		return;
+	}
+
+	/* Every chunk but the last is full; the last holds the remainder */
+	expected_size = curchunk < ctx->totalchunks - 1 ? TOAST_MAX_CHUNK_SIZE
+		: ctx->attrsize - ((ctx->totalchunks - 1) * TOAST_MAX_CHUNK_SIZE);
+	if (chunksize != expected_size)
+	{
+		confess(ctx, psprintf("chunk size %d differs from "
+							  "expected size %d",
+							  chunksize, expected_size));
+		return;
+	}
+
+	ctx->chunkno++;
+}
+
+/*
+ * check_tuple_attribute
+ *
+ * Checks the current attribute as tracked in ctx for corruption.  Records
+ * any corruption found in ctx->corruption.
+ *
+ * The caller (check_tuple) sets ctx->attnum to the attribute to examine and
+ * ctx->offset to the not-yet-aligned offset of that attribute within the
+ * tuple's data area.  For non-null attributes ctx->offset is advanced past
+ * the attribute before returning.
+ *
+ * Returns false when the position of the following attribute cannot be
+ * determined, in which case the caller must stop iterating over this tuple.
+ * Returns true otherwise, whether or not corruption was reported.
+ */
+static bool
+check_tuple_attribute(HeapCheckContext * ctx)
+{
+	Datum		attdatum;
+	struct varlena *attr;
+	char	   *tp;				/* pointer to the tuple data */
+	uint16		infomask = ctx->tuphdr->t_infomask;
+	Form_pg_attribute thisatt = TupleDescAttr(RelationGetDescr(ctx->rel),
+											  ctx->attnum);
+	struct varatt_external toast_pointer;
+	ScanKeyData toastkey;
+	SysScanDesc toastscan;
+	SnapshotData SnapshotToast;
+	HeapTuple	toasttup;
+	bool		found_toasttup;
+
+	tp = (char *) ctx->tuphdr + ctx->tuphdr->t_hoff;
+
+	/* The attribute must start within the bounds of the line pointer */
+	if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len)
+	{
+		confess(ctx, psprintf("t_hoff + offset > lp_len (%u + %u > %u)",
+							  ctx->tuphdr->t_hoff, ctx->offset,
+							  ctx->lp_len));
+		return false;
+	}
+
+	/* Skip null values */
+	if (infomask & HEAP_HASNULL && att_isnull(ctx->attnum, ctx->tuphdr->t_bits))
+		return true;
+
+	/* Skip non-varlena values, but update offset first */
+	if (thisatt->attlen != -1)
+	{
+		ctx->offset = att_align_nominal(ctx->offset, thisatt->attalign);
+		ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen,
+											tp + ctx->offset);
+		return true;
+	}
+
+	/* Ok, we're looking at a varlena attribute. */
+	ctx->offset = att_align_pointer(ctx->offset, thisatt->attalign, -1,
+									tp + ctx->offset);
+
+	/* Get the (possibly corrupt) varlena datum */
+	attdatum = fetchatt(thisatt, tp + ctx->offset);
+
+	/*
+	 * We have the datum, but we cannot decode it carelessly, as it may still
+	 * be corrupt.
+	 */
+
+	/*
+	 * Check that VARTAG_SIZE won't hit a TrapMacro on a corrupt va_tag before
+	 * risking a call into att_addlength_pointer
+	 */
+	if (VARATT_IS_1B_E(tp + ctx->offset))
+	{
+		uint8		va_tag = VARTAG_EXTERNAL(tp + ctx->offset);
+
+		if (va_tag != VARTAG_ONDISK)
+		{
+			confess(ctx, psprintf("unexpected TOAST vartag %u for "
+								  "attribute #%u at t_hoff = %u, "
+								  "offset = %u",
+								  va_tag, ctx->attnum,
+								  ctx->tuphdr->t_hoff, ctx->offset));
+			return false;		/* We can't know where the next attribute
+								 * begins */
+		}
+	}
+
+	/* Ok, should be safe now */
+	ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen,
+										tp + ctx->offset);
+
+	/*
+	 * heap_deform_tuple would be done with this attribute at this point,
+	 * having stored it in values[], and would continue to the next attribute.
+	 * We go further, because we need to check if the toast datum is corrupt.
+	 */
+
+	attr = (struct varlena *) DatumGetPointer(attdatum);
+
+	/*
+	 * Now we follow the logic of detoast_external_attr(), with the same
+	 * caveats about being paranoid about corruption.
+	 */
+
+	/* Skip values that are not external */
+	if (!VARATT_IS_EXTERNAL(attr))
+		return true;
+
+	/*
+	 * It is external, and we're looking at a page on disk, so the only
+	 * legitimate kind of external pointer is an on-disk toast pointer.  An
+	 * indirect pointer holds a memory address and is never dereferenced
+	 * here: anything that is not on-disk is rejected by this guard.
+	 */
+	if (!VARATT_IS_EXTERNAL_ONDISK(attr))
+	{
+		confess(ctx,
+				pstrdup("attribute is external but not marked as on disk"));
+		return true;
+	}
+
+	/* The tuple header better claim to contain toasted values */
+	if (!(infomask & HEAP_HASEXTERNAL))
+	{
+		confess(ctx, pstrdup("attribute is external but tuple header "
+							 "flag HEAP_HASEXTERNAL not set"));
+		return true;
+	}
+
+	/* The relation better have a toast table */
+	if (!ctx->rel->rd_rel->reltoastrelid)
+	{
+		confess(ctx, pstrdup("attribute is external but relation has "
+							 "no toast relation"));
+		return true;
+	}
+
+	/*
+	 * Must copy attr into toast_pointer for alignment considerations
+	 */
+	VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
+
+	ctx->attrsize = toast_pointer.va_extsize;
+	ctx->endchunk = (ctx->attrsize - 1) / TOAST_MAX_CHUNK_SIZE;
+	ctx->totalchunks = ctx->endchunk + 1;
+
+	/*
+	 * Setup a scan key to find chunks in toast table with matching
+	 * va_valueid
+	 */
+	ScanKeyInit(&toastkey,
+				(AttrNumber) 1,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(toast_pointer.va_valueid));
+
+	/*
+	 * Check if any chunks for this toasted object exist in the toast
+	 * table, accessible via the index.
+	 */
+	init_toast_snapshot(&SnapshotToast);
+	toastscan = systable_beginscan_ordered(ctx->toastrel,
+										   ctx->valid_toast_index,
+										   &SnapshotToast, 1,
+										   &toastkey);
+	ctx->chunkno = 0;
+
+	found_toasttup = false;
+	while ((toasttup =
+			systable_getnext_ordered(toastscan,
+									 ForwardScanDirection)) != NULL)
+	{
+		found_toasttup = true;
+		check_toast_tuple(toasttup, ctx);
+	}
+	if (ctx->chunkno != (ctx->endchunk + 1))
+		confess(ctx, psprintf("final chunk number differs from "
+							  "expected (%u vs. %u)",
+							  ctx->chunkno, (ctx->endchunk + 1)));
+	if (!found_toasttup)
+		confess(ctx, pstrdup("toasted value missing from "
+							 "toast table"));
+	systable_endscan_ordered(toastscan);
+
+	/*
+	 * Done with this toast value.  Reset the chunk number so that problems
+	 * reported for later attributes of the same tuple show a NULL chunk
+	 * instead of a stale count left over from this scan.
+	 */
+	ctx->chunkno = -1;
+
+	return true;
+}
+
+/*
+ * check_tuple
+ *
+ * Checks the current tuple as tracked in ctx for corruption.  Every problem
+ * found is reported through confess().
+ *
+ * The xids and the tuple header are checked first.  The attributes are
+ * examined only if those checks pass and check_tuphdr_xids() says the tuple
+ * should be looked at.
+ */
+static void
+check_tuple(HeapCheckContext * ctx)
+{
+	TransactionId xmin;
+	TransactionId xmax;
+	bool		fatal = false;
+	uint16		infomask = ctx->tuphdr->t_infomask;
+
+	/* Check relminmxid against mxid, if any */
+	xmax = HeapTupleHeaderGetRawXmax(ctx->tuphdr);
+	if ((infomask & HEAP_XMAX_IS_MULTI) &&
+		MultiXactIdPrecedes(xmax, ctx->relminmxid))
+	{
+		confess(ctx, psprintf("tuple xmax = %u precedes relation "
+							  "relminmxid = %u",
+							  xmax, ctx->relminmxid));
+		fatal = true;
+	}
+
+	/* Check xmin against relfrozenxid */
+	xmin = HeapTupleHeaderGetXmin(ctx->tuphdr);
+	if (TransactionIdIsNormal(ctx->relfrozenxid) &&
+		TransactionIdIsNormal(xmin))
+	{
+		if (TransactionIdPrecedes(xmin, ctx->relfrozenxid))
+		{
+			confess(ctx, psprintf("tuple xmin = %u precedes relation "
+								  "relfrozenxid = %u",
+								  xmin, ctx->relfrozenxid));
+			fatal = true;
+		}
+		else if (!TransactionIdValidInRel(xmin, ctx))
+		{
+			confess(ctx, psprintf("tuple xmin = %u is in the future",
+								  xmin));
+			fatal = true;
+		}
+	}
+
+	/*
+	 * Check xmax against relfrozenxid.  When HEAP_XMAX_IS_MULTI is set the
+	 * raw xmax is a MultiXactId, which lives in a separate id space and was
+	 * already checked against relminmxid above; comparing it with
+	 * transaction ids would produce bogus corruption reports.
+	 */
+	if (!(infomask & HEAP_XMAX_IS_MULTI) &&
+		TransactionIdIsNormal(ctx->relfrozenxid) &&
+		TransactionIdIsNormal(xmax))
+	{
+		if (TransactionIdPrecedes(xmax, ctx->relfrozenxid))
+		{
+			confess(ctx, psprintf("tuple xmax = %u precedes relation "
+								  "relfrozenxid = %u",
+								  xmax, ctx->relfrozenxid));
+			fatal = true;
+		}
+		else if (!TransactionIdValidInRel(xmax, ctx))
+		{
+			confess(ctx, psprintf("tuple xmax = %u is in the future",
+								  xmax));
+			fatal = true;
+		}
+	}
+
+	/* Check for tuple header corruption */
+	if (ctx->tuphdr->t_hoff < SizeofHeapTupleHeader)
+	{
+		confess(ctx,
+				psprintf("t_hoff < SizeofHeapTupleHeader (%u < %u)",
+						 ctx->tuphdr->t_hoff,
+						 (unsigned) SizeofHeapTupleHeader));
+		fatal = true;
+	}
+	if (ctx->tuphdr->t_hoff > ctx->lp_len)
+	{
+		confess(ctx, psprintf("t_hoff > lp_len (%u > %u)",
+							  ctx->tuphdr->t_hoff, ctx->lp_len));
+		fatal = true;
+	}
+	if (ctx->tuphdr->t_hoff != MAXALIGN(ctx->tuphdr->t_hoff))
+	{
+		confess(ctx, psprintf("t_hoff not max-aligned (%u)",
+							  ctx->tuphdr->t_hoff));
+		fatal = true;
+	}
+
+	/*
+	 * If the tuple has nulls, check that the implied length of the variable
+	 * length nulls bitmap field t_bits does not overflow the allowed space.
+	 * We don't know if the corruption is in the natts field or the infomask
+	 * bit HEAP_HASNULL.
+	 */
+	if (infomask & HEAP_HASNULL &&
+		SizeofHeapTupleHeader + BITMAPLEN(ctx->natts) > ctx->tuphdr->t_hoff)
+	{
+		confess(ctx, psprintf("SizeofHeapTupleHeader + "
+							  "BITMAPLEN(natts) > t_hoff "
+							  "(%u + %u > %u)",
+							  (unsigned) SizeofHeapTupleHeader,
+							  BITMAPLEN(ctx->natts),
+							  ctx->tuphdr->t_hoff));
+		fatal = true;
+	}
+
+	/*
+	 * Cannot process tuple data if tuple header was corrupt, as the offsets
+	 * within the page cannot be trusted, leaving too much risk of reading
+	 * garbage if we continue.
+	 *
+	 * We also cannot process the tuple if the xmin or xmax were invalid
+	 * relative to relfrozenxid or relminmxid, as clog entries for the xids
+	 * may already be gone.
+	 */
+	if (fatal)
+		return;
+
+	/*
+	 * Skip tuples that are invisible, as we cannot assume the TupleDesc we
+	 * are using is appropriate.
+	 */
+	if (!check_tuphdr_xids(ctx->tuphdr, ctx))
+		return;
+
+	/*
+	 * If we get this far, the tuple is visible to us, so it must not be
+	 * incompatible with our relDesc.  The natts field could be legitimately
+	 * shorter than rel's natts, but it cannot be longer than rel's natts.
+	 */
+	if (RelationGetDescr(ctx->rel)->natts < ctx->natts)
+	{
+		confess(ctx,
+				psprintf("relation natts < tuple natts (%u < %u)",
+						 RelationGetDescr(ctx->rel)->natts,
+						 ctx->natts));
+		return;
+	}
+
+	/*
+	 * Iterate over the attributes looking for broken toast values.  This
+	 * roughly follows the logic of heap_deform_tuple, except that it doesn't
+	 * bother building up isnull[] and values[] arrays, since nobody wants
+	 * them, and it unrolls anything that might trip over an Assert when
+	 * processing corrupt data.
+	 */
+	ctx->offset = 0;
+	for (ctx->attnum = 0; ctx->attnum < ctx->natts; ctx->attnum++)
+	{
+		if (!check_tuple_attribute(ctx))
+			break;
+	}
+
+	/* Leave the per-attribute state marked as "not applicable" */
+	ctx->offset = -1;
+	ctx->attnum = -1;
+}
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index e4d501a85d..bf68b554a8 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -32,16 +32,22 @@
 #include "catalog/index.h"
 #include "catalog/pg_am.h"
 #include "commands/tablecmds.h"
+#include "funcapi.h"
 #include "lib/bloomfilter.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
+#include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
-
+#include "amcheck.h"
 
 PG_MODULE_MAGIC;
 
+PG_FUNCTION_INFO_V1(bt_index_check);
+PG_FUNCTION_INFO_V1(bt_index_parent_check);
+PG_FUNCTION_INFO_V1(verify_btreeam);
+
 /*
  * A B-Tree cannot possibly have this many levels, since there must be one
  * block per level, which is bound by the range of BlockNumber:
@@ -50,6 +56,20 @@ PG_MODULE_MAGIC;
 #define BTreeTupleGetNKeyAtts(itup, rel)	\
 	Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))
 
+/*
+ * Context for use within
verify_btreeam() + */ +typedef struct BtreeCheckContext +{ + TupleDesc tupdesc; + Tuplestorestate *tupstore; + bool is_corrupt; + bool on_error_stop; +} BtreeCheckContext; + +#define CONTINUE_CHECKING(ctx) \ + (ctx == NULL || !((ctx)->is_corrupt && (ctx)->on_error_stop)) + /* * State associated with verifying a B-Tree index * @@ -116,6 +136,9 @@ typedef struct BtreeCheckState bloom_filter *filter; /* Debug counter */ int64 heaptuplespresent; + + /* Error reporting context */ + BtreeCheckContext *ctx; } BtreeCheckState; /* @@ -133,16 +156,14 @@ typedef struct BtreeLevel bool istruerootlevel; } BtreeLevel; -PG_FUNCTION_INFO_V1(bt_index_check); -PG_FUNCTION_INFO_V1(bt_index_parent_check); - static void bt_index_check_internal(Oid indrelid, bool parentcheck, - bool heapallindexed, bool rootdescend); + bool heapallindexed, bool rootdescend, + BtreeCheckContext * ctx); static inline void btree_index_checkable(Relation rel); static inline bool btree_index_mainfork_expected(Relation rel); static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, - bool rootdescend); + bool rootdescend, BtreeCheckContext * ctx); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); @@ -185,6 +206,26 @@ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot); static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); +static TupleDesc verify_btreeam_tupdesc(void); +static void confess(BtreeCheckContext * ctx, BlockNumber blkno, char *msg); + +/* + * Macro for either calling ereport(...) or confess(...) depending on whether + * a context for returning the error message exists. 
Prior to version 1.3, + * all functions reported any detected corruption via ereport, but starting in + * 1.3, the new function verify_btreeam reports detected corruption back to + * the caller as a set of rows, and pre-existing functions continue to report + * corruption via ereport. This macro allows the shared implementation to + * to do the right thing depending on context. + */ +#define econfess(ctx, blkno, code, ...) \ + do { \ + if (ctx) \ + confess(ctx, blkno, psprintf(__VA_ARGS__)); \ + else \ + ereport(ERROR, (errcode(code), errmsg(__VA_ARGS__))); \ + } while(0) + /* * bt_index_check(index regclass, heapallindexed boolean) * @@ -203,7 +244,7 @@ bt_index_check(PG_FUNCTION_ARGS) if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); - bt_index_check_internal(indrelid, false, heapallindexed, false); + bt_index_check_internal(indrelid, false, heapallindexed, false, NULL); PG_RETURN_VOID(); } @@ -229,17 +270,66 @@ bt_index_parent_check(PG_FUNCTION_ARGS) if (PG_NARGS() == 3) rootdescend = PG_GETARG_BOOL(2); - bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); + bt_index_check_internal(indrelid, true, heapallindexed, rootdescend, NULL); PG_RETURN_VOID(); } +Datum +verify_btreeam(PG_FUNCTION_ARGS) +{ +#define BTREECHECK_RELATION_COLS 2 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext oldcontext; + BtreeCheckContext ctx; + bool randomAccess; + Oid indrelid; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot " + "accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("materialize mode required, but it is not allowed " + "in this context"))); + + /* check supplied arguments */ + if (PG_ARGISNULL(0)) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing required parameter for 'rel'"))); + indrelid = PG_GETARG_OID(0); + + memset(&ctx, 0, sizeof(BtreeCheckContext)); + + ctx.on_error_stop = PG_ARGISNULL(1) ? false : PG_GETARG_BOOL(1); + + /* The tupdesc and tuplestore must be created in ecxt_per_query_memory */ + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + randomAccess = (rsinfo->allowedModes & SFRM_Materialize_Random) != 0; + ctx.tupdesc = verify_btreeam_tupdesc(); + ctx.tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = ctx.tupstore; + rsinfo->setDesc = ctx.tupdesc; + + MemoryContextSwitchTo(oldcontext); + + bt_index_check_internal(indrelid, true, true, true, &ctx); + + PG_RETURN_NULL(); +} + /* * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ static void bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, - bool rootdescend) + bool rootdescend, BtreeCheckContext * ctx) { Oid heapid; Relation indrel; @@ -300,15 +390,16 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, RelationOpenSmgr(indrel); if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" lacks a main relation fork", - RelationGetRelationName(indrel)))); + econfess(ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "index \"%s\" lacks a main relation fork", + RelationGetRelationName(indrel)); /* Check index, possibly against table it is an index on */ - _bt_metaversion(indrel, &heapkeyspace, &allequalimage); - bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, - heapallindexed, rootdescend); + if (CONTINUE_CHECKING(ctx)) + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (CONTINUE_CHECKING(ctx)) + bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, + heapallindexed, rootdescend, ctx); } /* @@ 
-402,7 +493,8 @@ btree_index_mainfork_expected(Relation rel) */ static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, - bool readonly, bool heapallindexed, bool rootdescend) + bool readonly, bool heapallindexed, bool rootdescend, + BtreeCheckContext * ctx) { BtreeCheckState *state; Page metapage; @@ -434,6 +526,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->readonly = readonly; state->heapallindexed = heapallindexed; state->rootdescend = rootdescend; + state->ctx = ctx; if (state->heapallindexed) { @@ -535,7 +628,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, current.level = metad->btm_level; current.leftmost = metad->btm_root; current.istruerootlevel = true; - while (current.leftmost != P_NONE) + while (CONTINUE_CHECKING(state->ctx) && current.leftmost != P_NONE) { /* * Verify this level, and get left most page for next level down, if @@ -544,10 +637,9 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, current = bt_check_level_from_leftmost(state, current); if (current.leftmost == InvalidBlockNumber) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" has no valid pages on level below %u or first level", - RelationGetRelationName(rel), previouslevel))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "index \"%s\" has no valid pages on level below %u or first level", + RelationGetRelationName(rel), previouslevel); previouslevel = current.level; } @@ -555,7 +647,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * * Check whether heap contains unindexed/malformed tuples * */ - if (state->heapallindexed) + if (CONTINUE_CHECKING(state->ctx) && state->heapallindexed) { IndexInfo *indexinfo = BuildIndexInfo(state->rel); TableScanDesc scan; @@ -691,18 +783,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) * checked. 
*/ if (state->readonly && P_ISDELETED(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("downlink or sibling link points to deleted block in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "downlink or sibling link points to deleted block in index \"%s\" " + "(Block=%u left block=%u left link from block=%u)", + RelationGetRelationName(state->rel), + current, leftcurrent, opaque->btpo_prev); if (P_RIGHTMOST(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("block %u fell off the end of index \"%s\"", - current, RelationGetRelationName(state->rel)))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "block %u fell off the end of index \"%s\"", + current, RelationGetRelationName(state->rel)); else ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), @@ -722,16 +812,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) if (state->readonly) { if (!P_LEFTMOST(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("block %u is not leftmost in index \"%s\"", - current, RelationGetRelationName(state->rel)))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "block %u is not leftmost in index \"%s\"", + current, RelationGetRelationName(state->rel)); if (level.istruerootlevel && !P_ISROOT(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("block %u is not true root in index \"%s\"", - current, RelationGetRelationName(state->rel)))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "block %u is not true root in index \"%s\"", + current, RelationGetRelationName(state->rel)); } /* @@ -780,21 +868,19 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) * so sibling pointers should always be in mutual agreement */ if (state->readonly && opaque->btpo_prev != 
leftcurrent) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("left link/right link pair in index \"%s\" not in agreement", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "left link/right link pair in index \"%s\" not in agreement " + "(Block=%u left block=%u left link from block=%u)", + RelationGetRelationName(state->rel), + current, leftcurrent, opaque->btpo_prev); /* Check level, which must be valid for non-ignorable page */ if (level.level != opaque->btpo.level) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", - RelationGetRelationName(state->rel)), - errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - current, level.level, opaque->btpo.level))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "leftmost down link for level points to block in index \"%s\" whose level is not one level down " + "(Block pointed to=%u expected level=%u level in pointed to block=%u)", + RelationGetRelationName(state->rel), + current, level.level, opaque->btpo.level); /* Verify invariants for page */ bt_target_page_check(state); @@ -803,10 +889,9 @@ nextpage: /* Try to detect circular links */ if (current == leftcurrent || current == opaque->btpo_prev) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("circular link chain found in block %u of index \"%s\"", - current, RelationGetRelationName(state->rel)))); + econfess(state->ctx, current, ERRCODE_INDEX_CORRUPTED, + "circular link chain found in block %u of index \"%s\"", + current, RelationGetRelationName(state->rel)); leftcurrent = current; current = opaque->btpo_next; @@ -850,7 +935,7 @@ nextpage: /* Free page and associated memory for this iteration */ 
MemoryContextReset(state->targetcontext); } - while (current != P_NONE); + while (CONTINUE_CHECKING(state->ctx) && current != P_NONE); if (state->lowkey) { @@ -930,16 +1015,15 @@ bt_target_page_check(BtreeCheckState *state) P_HIKEY)) { itup = (IndexTuple) PageGetItem(state->target, itemid); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("wrong number of high key index tuple attributes in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.", - state->targetblock, - BTreeTupleGetNAtts(itup, state->rel), - P_ISLEAF(topaque) ? "heap" : "index", - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "wrong number of high key index tuple attributes in index \"%s\" " + "(Index block=%u natts=%u block type=%s page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? "heap" : "index", + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } } @@ -949,7 +1033,7 @@ bt_target_page_check(BtreeCheckState *state) * real item (if any). */ for (offset = P_FIRSTDATAKEY(topaque); - offset <= max; + offset <= max && CONTINUE_CHECKING(state->ctx); offset = OffsetNumberNext(offset)) { ItemId itemid; @@ -973,16 +1057,15 @@ bt_target_page_check(BtreeCheckState *state) * frequently, and is surprisingly tolerant of corrupt lp_len fields. 
*/ if (tupsize != ItemIdGetLength(itemid)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index tuple size does not equal lp_len in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.", - state->targetblock, offset, - tupsize, ItemIdGetLength(itemid), - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn), - errhint("This could be a torn page problem."))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "index tuple size does not equal lp_len in index \"%s\" " + "(Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X) " + "(This could be a torn page problem)", + RelationGetRelationName(state->rel), + state->targetblock, offset, + tupsize, ItemIdGetLength(itemid), + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); /* Check the number of index tuple attributes */ if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, @@ -998,17 +1081,16 @@ bt_target_page_check(BtreeCheckState *state) ItemPointerGetBlockNumberNoCheck(tid), ItemPointerGetOffsetNumberNoCheck(tid)); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("wrong number of index tuple attributes in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.", - itid, - BTreeTupleGetNAtts(itup, state->rel), - P_ISLEAF(topaque) ? "heap" : "index", - htid, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "wrong number of index tuple attributes in index \"%s\" " + "(Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X)", + RelationGetRelationName(state->rel), + itid, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? 
"heap" : "index", + htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } /* @@ -1049,14 +1131,13 @@ bt_target_page_check(BtreeCheckState *state) htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), ItemPointerGetOffsetNumber(tid)); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("could not find tuple using search from root page in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", - itid, htid, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "could not find tuple using search from root page in index \"%s\" " + "(Index tid=%s points to heap tid=%s page lsn=%X/%X)", + RelationGetRelationName(state->rel), + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } /* @@ -1079,14 +1160,13 @@ bt_target_page_check(BtreeCheckState *state) { char *itid = psprintf("(%u,%u)", state->targetblock, offset); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("posting list contains misplaced TID in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", - itid, i, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "posting list contains misplaced TID in index \"%s\" " + "(Index tid=%s posting list offset=%d page lsn=%X/%X)", + RelationGetRelationName(state->rel), + itid, i, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } ItemPointerCopy(current, &last); @@ -1134,16 +1214,15 @@ bt_target_page_check(BtreeCheckState *state) ItemPointerGetBlockNumberNoCheck(tid), ItemPointerGetOffsetNumberNoCheck(tid)); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index row size %zu exceeds maximum for index \"%s\"", - tupsize, 
RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", - itid, - P_ISLEAF(topaque) ? "heap" : "index", - htid, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "index row size %zu exceeds maximum for index \"%s\" " + "(Index tid=%s points to %s tid=%s page lsn=%X/%X)", + tupsize, RelationGetRelationName(state->rel), + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } /* Fingerprint leaf page tuples (those that point to the heap) */ @@ -1242,16 +1321,15 @@ bt_target_page_check(BtreeCheckState *state) ItemPointerGetBlockNumberNoCheck(tid), ItemPointerGetOffsetNumberNoCheck(tid)); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("high key invariant violated for index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", - itid, - P_ISLEAF(topaque) ? "heap" : "index", - htid, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "high key invariant violated for index \"%s\" " + "(Index tid=%s points to %s tid=%s page lsn=%X/%X)", + RelationGetRelationName(state->rel), + itid, + P_ISLEAF(topaque) ? 
"heap" : "index", + htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } /* Reset, in case scantid was set to (itup) posting tuple's max TID */ skey->scantid = scantid; @@ -1289,21 +1367,20 @@ bt_target_page_check(BtreeCheckState *state) ItemPointerGetBlockNumberNoCheck(tid), ItemPointerGetOffsetNumberNoCheck(tid)); - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("item order invariant violated for index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Lower index tid=%s (points to %s tid=%s) " - "higher index tid=%s (points to %s tid=%s) " - "page lsn=%X/%X.", - itid, - P_ISLEAF(topaque) ? "heap" : "index", - htid, - nitid, - P_ISLEAF(topaque) ? "heap" : "index", - nhtid, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "item order invariant violated for index \"%s\" " + "(Lower index tid=%s (points to %s tid=%s) " + "higher index tid=%s (points to %s tid=%s) " + "page lsn=%X/%X)", + RelationGetRelationName(state->rel), + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + nitid, + P_ISLEAF(topaque) ? 
"heap" : "index", + nhtid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } /* @@ -1354,14 +1431,13 @@ bt_target_page_check(BtreeCheckState *state) return; } - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("cross page item order invariant violated for index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.", - state->targetblock, offset, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "cross page item order invariant violated for index \"%s\" " + "(Last item on page tid=(%u,%u) page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, offset, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } } @@ -1386,7 +1462,8 @@ bt_target_page_check(BtreeCheckState *state) * right of the child page pointer to by our rightmost downlink. And they * might have missing downlinks. This final call checks for them. */ - if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + if (CONTINUE_CHECKING(state->ctx) && + !P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) { bt_child_highkey_check(state, InvalidOffsetNumber, NULL, topaque->btpo.level); @@ -1708,7 +1785,7 @@ bt_child_highkey_check(BtreeCheckState *state, } /* Move to the right on the child level */ - while (true) + while (CONTINUE_CHECKING(state->ctx)) { /* * Did we traverse the whole tree level and this is check for pages to @@ -1723,11 +1800,10 @@ bt_child_highkey_check(BtreeCheckState *state, /* Did we traverse the whole tree level and don't find next downlink? 
*/ if (blkno == P_NONE) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", - state->prevrightlink, downlink, - RelationGetRelationName(state->rel)))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevrightlink, downlink, + RelationGetRelationName(state->rel)); /* Load page contents */ if (blkno == downlink && loaded_child) @@ -1739,30 +1815,27 @@ bt_child_highkey_check(BtreeCheckState *state, /* The first page we visit at the level should be leftmost */ if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", - state->targetblock, blkno, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "the first child of leftmost target page is not leftmost of its level in index \"%s\" " + "(Target block=%u child block=%u target page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); /* Check level for non-ignorable page */ if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", - RelationGetRelationName(state->rel)), - errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - blkno, target_level - 1, opaque->btpo.level))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "block found while following rightlinks from child of index \"%s\" has invalid 
level " + "(Block pointed to=%u expected level=%u level in pointed to block=%u)", + RelationGetRelationName(state->rel), + blkno, target_level - 1, opaque->btpo.level); /* Try to detect circular links */ if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("circular link chain found in block %u of index \"%s\"", - blkno, RelationGetRelationName(state->rel)))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)); if (blkno != downlink && !P_IGNORE(opaque)) { @@ -1825,14 +1898,13 @@ bt_child_highkey_check(BtreeCheckState *state, if (pivotkey_offset > PageGetMaxOffsetNumber(state->target)) { if (P_RIGHTMOST(topaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", - state->targetblock, blkno, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "child high key is greater than rightmost pivot key on target level in index \"%s\" " + "(Target block=%u child block=%u target page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); pivotkey_offset = P_HIKEY; } itemid = PageGetItemIdCareful(state, state->targetblock, @@ -1856,27 +1928,25 @@ bt_child_highkey_check(BtreeCheckState *state, * page. 
*/ if (!state->lowkey) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("can't find left sibling high key in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", - state->targetblock, blkno, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "can't find left sibling high key in index \"%s\" " + "(Target block=%u child block=%u target page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); itup = state->lowkey; } if (!bt_pivot_tuple_identical(highkey, itup)) { - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("mismatch between parent key and child high key in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", - state->targetblock, blkno, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "mismatch between parent key and child high key in index \"%s\" " + "(Target block=%u child block=%u target page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } } @@ -2014,17 +2084,16 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * to test. 
*/ if (P_ISDELETED(copaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("downlink to deleted page found in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.", - state->targetblock, childblock, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "downlink to deleted page found in index \"%s\" " + "(Parent block=%u child block=%u parent page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, childblock, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); for (offset = P_FIRSTDATAKEY(copaque); - offset <= maxoffset; + offset <= maxoffset && CONTINUE_CHECKING(state->ctx); offset = OffsetNumberNext(offset)) { /* @@ -2056,14 +2125,13 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, if (!invariant_l_nontarget_offset(state, targetkey, childblock, child, offset)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("down-link lower bound invariant violated for index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.", - state->targetblock, childblock, offset, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "down-link lower bound invariant violated for index \"%s\" " + "(Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X)", + RelationGetRelationName(state->rel), + state->targetblock, childblock, offset, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn); } pfree(child); @@ -2150,14 +2218,13 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, * inconsistencies anywhere else. 
*/ if (P_ISLEAF(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("leaf index block lacks downlink in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u page lsn=%X/%X.", - blkno, - (uint32) (pagelsn >> 32), - (uint32) pagelsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "leaf index block lacks downlink in index \"%s\" " + "(Block=%u page lsn=%X/%X)", + RelationGetRelationName(state->rel), + blkno, + (uint32) (pagelsn >> 32), + (uint32) pagelsn); /* Descend from the given page, which is an internal page */ elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", @@ -2167,7 +2234,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); - for (;;) + while (CONTINUE_CHECKING(state->ctx)) { CHECK_FOR_INTERRUPTS(); @@ -2179,13 +2246,12 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, /* Do an extra sanity check in passing on internal pages */ if (copaque->btpo.level != level - 1) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", - RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", - blkno, childblk, - level - 1, copaque->btpo.level))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "downlink points to block in index \"%s\" whose level is not one level down " + "(Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u)", + RelationGetRelationName(state->rel), + blkno, childblk, + level - 1, copaque->btpo.level); level = copaque->btpo.level; itemid = PageGetItemIdCareful(state, childblk, child, @@ 
-2217,14 +2283,13 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, * parent/ancestor page) lacked a downlink is incidental. */ if (P_ISDELETED(copaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("downlink to deleted leaf page found in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", - blkno, childblk, - (uint32) (pagelsn >> 32), - (uint32) pagelsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "downlink to deleted leaf page found in index \"%s\" " + "(Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X)", + RelationGetRelationName(state->rel), + blkno, childblk, + (uint32) (pagelsn >> 32), + (uint32) pagelsn); /* * Iff leaf page is half-dead, its high key top parent link should point @@ -2244,14 +2309,13 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, return; } - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("internal index block lacks downlink in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, - (uint32) (pagelsn >> 32), - (uint32) pagelsn))); + econfess(state->ctx, blkno, ERRCODE_INDEX_CORRUPTED, + "internal index block lacks downlink in index \"%s\" " + "(Block=%u level=%u page lsn=%X/%X)", + RelationGetRelationName(state->rel), + blkno, opaque->btpo.level, + (uint32) (pagelsn >> 32), + (uint32) pagelsn); } /* @@ -2327,16 +2391,12 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, /* Probe Bloom filter -- tuple should be present */ if (bloom_lacks_element(state->filter, (unsigned char *) norm, IndexTupleSize(norm))) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"", - ItemPointerGetBlockNumber(&(itup->t_tid)), - 
ItemPointerGetOffsetNumber(&(itup->t_tid)), - RelationGetRelationName(state->heaprel), - RelationGetRelationName(state->rel)), - !state->readonly - ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.") - : 0)); + econfess(state->ctx, ItemPointerGetBlockNumber(&(itup->t_tid)), ERRCODE_DATA_CORRUPTED, + "heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->heaprel), + RelationGetRelationName(state->rel)); state->heaptuplespresent++; pfree(itup); @@ -2395,7 +2455,7 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) if (!IndexTupleHasVarwidths(itup)) return itup; - for (i = 0; i < tupleDescriptor->natts; i++) + for (i = 0; CONTINUE_CHECKING(state->ctx) && i < tupleDescriptor->natts; i++) { Form_pg_attribute att; @@ -2415,12 +2475,11 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) * should never be encountered here */ if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i]))) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid)), - RelationGetRelationName(state->rel)))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->rel)); else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i]))) { formnewtup = true; @@ -2810,10 +2869,9 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE) - ereport(ERROR, - 
(errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid meta page found at block %u in index \"%s\"", - blocknum, RelationGetRelationName(state->rel)))); + econfess(state->ctx, blocknum, ERRCODE_INDEX_CORRUPTED, + "invalid meta page found at block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)); /* Check page from block that ought to be meta page */ if (blocknum == BTREE_METAPAGE) @@ -2822,20 +2880,18 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) if (!P_ISMETA(opaque) || metad->btm_magic != BTREE_MAGIC) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" meta page is corrupt", - RelationGetRelationName(state->rel)))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "index \"%s\" meta page is corrupt", + RelationGetRelationName(state->rel)); if (metad->btm_version < BTREE_MIN_VERSION || metad->btm_version > BTREE_VERSION) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, " - "current version %d, minimum supported version %d", - RelationGetRelationName(state->rel), - metad->btm_version, BTREE_VERSION, - BTREE_MIN_VERSION))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "version mismatch in index \"%s\": file version %d, " + "current version %d, minimum supported version %d", + RelationGetRelationName(state->rel), + metad->btm_version, BTREE_VERSION, + BTREE_MIN_VERSION); /* Finished with metapage checks */ return page; @@ -2846,17 +2902,15 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * page level */ if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid leaf page level %u for block %u in index \"%s\"", - opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)))); + econfess(state->ctx, InvalidBlockNumber, ERRCODE_INDEX_CORRUPTED, + "invalid leaf page level %u for block %u 
in index \"%s\"", + opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)); if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid internal page level 0 for block %u in index \"%s\"", - blocknum, RelationGetRelationName(state->rel)))); + econfess(state->ctx, blocknum, ERRCODE_INDEX_CORRUPTED, + "invalid internal page level 0 for block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)); /* * Sanity checks for number of items on page. @@ -2910,17 +2964,15 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * Internal pages should never have garbage items, either. */ if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("internal page block %u in index \"%s\" is half-dead", - blocknum, RelationGetRelationName(state->rel)), - errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + econfess(state->ctx, blocknum, ERRCODE_INDEX_CORRUPTED, + "internal page block %u in index \"%s\" is half-dead " + "(This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. 
Please REINDEX it)", + blocknum, RelationGetRelationName(state->rel)); if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("internal page block %u in index \"%s\" has garbage items", - blocknum, RelationGetRelationName(state->rel)))); + econfess(state->ctx, blocknum, ERRCODE_INDEX_CORRUPTED, + "internal page block %u in index \"%s\" has garbage items", + blocknum, RelationGetRelationName(state->rel)); return page; } @@ -2971,14 +3023,13 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > BLCKSZ - sizeof(BTPageOpaqueData)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("line pointer points past end of tuple space in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", - block, offset, ItemIdGetOffset(itemid), - ItemIdGetLength(itemid), - ItemIdGetFlags(itemid)))); + econfess(state->ctx, block, ERRCODE_INDEX_CORRUPTED, + "line pointer points past end of tuple space in index \"%s\" " + "(Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u)", + RelationGetRelationName(state->rel), + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)); /* * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree @@ -2987,14 +3038,13 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, */ if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || ItemIdGetLength(itemid) == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid line pointer storage in index \"%s\"", - RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", - block, offset, ItemIdGetOffset(itemid), - ItemIdGetLength(itemid), - ItemIdGetFlags(itemid)))); + econfess(state->ctx, block, ERRCODE_INDEX_CORRUPTED, + "invalid line pointer storage 
in index \"%s\" " + "(Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u)", + RelationGetRelationName(state->rel), + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)); return itemid; } @@ -3016,26 +3066,23 @@ BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, */ Assert(state->heapkeyspace); if (BTreeTupleIsPivot(itup) && nonpivot) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", - state->targetblock, - RelationGetRelationName(state->rel)))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)); if (!BTreeTupleIsPivot(itup) && !nonpivot) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", - state->targetblock, - RelationGetRelationName(state->rel)))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)); htid = BTreeTupleGetHeapTID(itup); if (!ItemPointerIsValid(htid) && nonpivot) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", - state->targetblock, - RelationGetRelationName(state->rel)))); + econfess(state->ctx, state->targetblock, ERRCODE_INDEX_CORRUPTED, + "block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", + state->targetblock, + RelationGetRelationName(state->rel)); return htid; } @@ -3066,3 +3113,52 @@ 
BTreeTupleGetPointsToTID(IndexTuple itup) /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */ return &itup->t_tid; } + +/* + * Helper function to construct the TupleDesc for btree check results. + */ +static TupleDesc +verify_btreeam_tupdesc(void) +{ + TupleDesc tupdesc; + AttrNumber a = 0; + + tupdesc = CreateTemplateTupleDesc(BTREECHECK_RELATION_COLS); + TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, ++a, "msg", TEXTOID, -1, 0); + Assert(a == BTREECHECK_RELATION_COLS); + + return BlessTupleDesc(tupdesc); +} + +/* + * confess + * + * Return a message about index corruption + * + * The msg argument is pfree'd by this function. + */ +static void +confess(BtreeCheckContext * ctx, BlockNumber blkno, char *msg) +{ + Datum values[BTREECHECK_RELATION_COLS]; + bool nulls[BTREECHECK_RELATION_COLS]; + HeapTuple tuple; + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + values[0] = Int64GetDatum(blkno); + nulls[0] = (blkno == InvalidBlockNumber); + values[1] = CStringGetTextDatum(msg); + + /* + * In principle, there is nothing to prevent a scan over a large, highly + * corrupted table from using workmem worth of memory building up the + * tuplestore. Don't leak the msg argument memory. 
+ */ + pfree(msg); + + tuple = heap_form_tuple(ctx->tupdesc, values, nulls); + tuplestore_puttuple(ctx->tupstore, tuple); + ctx->is_corrupt = true; +} diff --git a/contrib/pg_amcheck/.gitignore b/contrib/pg_amcheck/.gitignore new file mode 100644 index 0000000000..07ad380105 --- /dev/null +++ b/contrib/pg_amcheck/.gitignore @@ -0,0 +1,3 @@ +/pg_amcheck + +/tmp_check/ diff --git a/contrib/pg_amcheck/Makefile b/contrib/pg_amcheck/Makefile new file mode 100644 index 0000000000..74554b9e8d --- /dev/null +++ b/contrib/pg_amcheck/Makefile @@ -0,0 +1,28 @@ +# contrib/pg_amcheck/Makefile + +PGFILEDESC = "pg_amcheck - detects corruption within database relations" +PGAPPICON = win32 + +PROGRAM = pg_amcheck +OBJS = \ + $(WIN32RES) \ + pg_amcheck.o + +REGRESS_OPTS += --load-extension=amcheck --load-extension=pageinspect +EXTRA_INSTALL += contrib/amcheck contrib/pageinspect + +TAP_TESTS = 1 + +PG_CPPFLAGS = -I$(libpq_srcdir) +PG_LIBS_INTERNAL = -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_amcheck +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_amcheck/pg_amcheck.c b/contrib/pg_amcheck/pg_amcheck.c new file mode 100644 index 0000000000..3e47b717f1 --- /dev/null +++ b/contrib/pg_amcheck/pg_amcheck.c @@ -0,0 +1,884 @@ +/*------------------------------------------------------------------------- + * + * pg_amcheck.c + * Detects corruption within database relations. 
+ * + * Copyright (c) 2017-2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_amcheck/pg_amcheck.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "common/logging.h" +#include "common/username.h" +#include "fe_utils/connect.h" +#include "fe_utils/print.h" +#include "fe_utils/simple_list.h" +#include "fe_utils/string_utils.h" +#include "pg_getopt.h" + +const char *usage_text[] = { + "pg_amcheck is the PostgreSQL command line database corruption checker.", + "", + "Usage:", + " pg_amcheck [OPTION]... [DBNAME [USERNAME]]", + "", + "General options:", + " -V, --version output version information, then exit", + " -?, --help show this help, then exit", + " -s, --schema=PATTERN check all relations in the specified schema(s)", + " -N, --exclude-schema=PATTERN do NOT check relations in the specified " + "schema(s)", + " -t, --table=PATTERN check the specified table(s) only", + " -T, --exclude-table=PATTERN do NOT check the specified table(s)", + " -i, --check-indexes check associated btree indexes, if any", + " -I, --exclude-indexes do NOT check associated btree indexes", + " --strict-names require table and/or schema include patterns " + "to match at least one entity each", + " -b, --startblock check relations beginning at the given " + "starting block number", + " -e, --endblock check relations only up to the given ending " + "block number", + " -f, --skip-all-frozen do not check blocks marked as all frozen", + " -v, --skip-all-visible do not check blocks marked as all visible", + "", + "Connection options:", + " -d, --dbname=DBNAME database name to connect to", + " -h, --host=HOSTNAME database server host or socket directory", + " -p, --port=PORT database server port", + " -U, --username=USERNAME database user name", + " -w, --no-password never prompt for password", + " -W, --password force password prompt 
(should happen " + "automatically)", + "", + NULL /* sentinel */ +}; + +typedef struct +{ + char *dbname; + char *host; + char *port; + char *username; +} ConnectOptions; + +typedef enum trivalue +{ + TRI_DEFAULT, + TRI_NO, + TRI_YES +} trivalue; + +typedef struct +{ + PGconn *db; /* connection to backend */ + bool notty; /* stdin or stdout is not a tty (as determined + * on startup) */ + trivalue getPassword; /* prompt for a username and password */ + const char *progname; /* in case you renamed pg_amcheck */ + bool strict_names; /* The specified names/patterns should to + * match at least one entity */ + bool on_error_stop; /* The checking of each table should stop + * after the first corrupt page is found. */ + bool skip_frozen; /* Do not check pages marked all frozen */ + bool skip_visible; /* Do not check pages marked all visible */ + bool check_indexes; /* Check btree indexes for tables */ + char *startblock; /* Block number where checking begins */ + char *endblock; /* Block number where checking ends */ +} AmCheckSettings; + +static AmCheckSettings settings; + +/* + * Object inclusion/exclusion lists + * + * The string lists record the patterns given by command-line switches, + * which we then convert to lists of OIDs of matching objects. + */ +static SimpleStringList schema_include_patterns = {NULL, NULL}; +static SimpleOidList schema_include_oids = {NULL, NULL}; +static SimpleStringList schema_exclude_patterns = {NULL, NULL}; +static SimpleOidList schema_exclude_oids = {NULL, NULL}; + +static SimpleStringList table_include_patterns = {NULL, NULL}; +static SimpleOidList table_include_oids = {NULL, NULL}; +static SimpleStringList table_exclude_patterns = {NULL, NULL}; +static SimpleOidList table_exclude_oids = {NULL, NULL}; + +/* + * List of tables to be checked, compiled from above lists. 
+ */
+static SimpleOidList checklist = {NULL, NULL};
+
+
+/* Corruption checking, driven from main() */
+static void check_tables(SimpleOidList *checklist);
+static void check_table(Oid tbloid);
+static void check_indexes(Oid tbloid);
+static void check_index(Oid tbloid, Oid idxoid);
+
+/* Command line handling */
+static void parse_cli_options(int argc, char *argv[],
+							  ConnectOptions * connOpts);
+static void usage(void);
+static void showVersion(void);
+
+static void NoticeProcessor(void *arg, const char *message);
+
+/* Conversion of name patterns into OID lists */
+static void expand_schema_name_patterns(SimpleStringList *patterns,
+										SimpleOidList *oids,
+										bool strict_names);
+static void expand_table_name_patterns(SimpleStringList *patterns,
+									   SimpleOidList *oids,
+									   bool strict_names);
+static void get_table_check_list(SimpleOidList *include_nsp,
+								 SimpleOidList *exclude_nsp,
+								 SimpleOidList *include_tbl,
+								 SimpleOidList *exclude_tbl,
+								 SimpleOidList *checklist);
+
+/* Query execution helpers; these exit the program on failure */
+static void die_on_query_failure(const char *query);
+static void ExecuteSqlStatement(const char *query);
+static PGresult *ExecuteSqlQuery(const char *query, ExecStatusType status);
+static PGresult *ExecuteSqlQueryForSingleRow(const char *query);
+
+/* Log an error message through pg_log_error() and exit with status 1. */
+#define fatal(...) do { pg_log_error(__VA_ARGS__); exit(1); } while(0)
+
+#define NOPAGER			0
+/* Exit status used when no usable server connection could be made. */
+#define EXIT_BADCONN	2
+
+/*
+ * main
+ *
+ * Handles --help/--version given as the first argument, parses the remaining
+ * options, connects to the server (prompting for a password when required),
+ * expands the schema and table patterns into OID lists, and then checks every
+ * selected table.
+ */
+int
+main(int argc, char **argv)
+{
+	ConnectOptions connOpts;
+	bool		have_password = false;
+	char		password[100];
+	bool		new_pass;
+
+	pg_logging_init(argv[0]);
+	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_amcheck"));
+
+	/*
+	 * Handle help and version requests before anything else.  Note that
+	 * --help is only recognized here when it is the sole argument.
+	 */
+	if (argc > 1)
+	{
+		if ((strcmp(argv[1], "-?") == 0) ||
+			(argc == 2 && (strcmp(argv[1], "--help") == 0)))
+		{
+			usage();
+			exit(EXIT_SUCCESS);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			showVersion();
+			exit(EXIT_SUCCESS);
+		}
+	}
+
+	/* Start from all-zero settings; parse_cli_options() fills in the rest */
+	memset(&settings, 0, sizeof(settings));
+	settings.progname = get_progname(argv[0]);
+
+	settings.db = NULL;
+	setDecimalLocale();
+
+	settings.notty = (!isatty(fileno(stdin)) || !isatty(fileno(stdout)));
+
+	settings.getPassword = TRI_DEFAULT;
+
+	parse_cli_options(argc, argv, &connOpts);
+
+	if (settings.getPassword == TRI_YES)
+	{
+		/*
+		 * We can't be sure yet of the username that will be used, so don't
+		 * offer a potentially wrong one.  Typical uses of this option are
+		 * noninteractive anyway.
+		 */
+		simple_prompt("Password: ", password, sizeof(password), false);
+		have_password = true;
+	}
+
+	/* loop until we have a password if requested by backend */
+	do
+	{
+		/* Seven keyword/value pairs plus the terminating NULL entry */
+#define ARRAY_SIZE	8
+		const char **keywords = pg_malloc(ARRAY_SIZE * sizeof(*keywords));
+		const char **values = pg_malloc(ARRAY_SIZE * sizeof(*values));
+
+		keywords[0] = "host";
+		values[0] = connOpts.host;
+		keywords[1] = "port";
+		values[1] = connOpts.port;
+		keywords[2] = "user";
+		values[2] = connOpts.username;
+		keywords[3] = "password";
+		values[3] = have_password ? password : NULL;
+		keywords[4] = "dbname"; /* see do_connect() */
+		/* Connect to "postgres" when no database name was given */
+		values[4] = (connOpts.dbname == NULL) ? "postgres" : connOpts.dbname;
+		keywords[5] = "fallback_application_name";
+		values[5] = settings.progname;
+		keywords[6] = "client_encoding";
+		values[6] = (settings.notty ||
+					 getenv("PGCLIENTENCODING")) ? NULL : "auto";
+		keywords[7] = NULL;
+		values[7] = NULL;
+
+		new_pass = false;
+		settings.db = PQconnectdbParams(keywords, values, true);
+		if (settings.db == NULL)
+		{
+			pg_log_error("no connection to server after initial attempt");
+			exit(EXIT_BADCONN);
+		}
+
+		free(keywords);
+		free(values);
+
+		/*
+		 * If the attempt failed only because a password is needed, and we
+		 * are allowed to prompt, ask for one and go around again.
+		 */
+		if (PQstatus(settings.db) == CONNECTION_BAD &&
+			PQconnectionNeedsPassword(settings.db) &&
+			!have_password &&
+			settings.getPassword != TRI_NO)
+		{
+			/*
+			 * Before closing the old PGconn, extract the user name that was
+			 * actually connected with.
+			 */
+			const char *realusername = PQuser(settings.db);
+			char	   *password_prompt;
+
+			if (realusername && realusername[0])
+				password_prompt = psprintf(_("Password for user %s: "),
+										   realusername);
+			else
+				password_prompt = pg_strdup(_("Password: "));
+			PQfinish(settings.db);
+
+			simple_prompt(password_prompt, password, sizeof(password), false);
+			free(password_prompt);
+			have_password = true;
+			new_pass = true;
+		}
+	} while (new_pass);
+
+	if (!settings.db)
+	{
+		pg_log_error("no connection to server");
+		exit(EXIT_BADCONN);
+	}
+
+	if (PQstatus(settings.db) == CONNECTION_BAD)
+	{
+		pg_log_error("could not connect to server: %s",
+					 PQerrorMessage(settings.db));
+		PQfinish(settings.db);
+		exit(EXIT_BADCONN);
+	}
+
+	/* Expand schema selection patterns into OID lists */
+	if (schema_include_patterns.head != NULL)
+	{
+		expand_schema_name_patterns(&schema_include_patterns,
+									&schema_include_oids,
+									settings.strict_names);
+		if (schema_include_oids.head == NULL)
+			fatal("no matching schemas were found");
+	}
+	expand_schema_name_patterns(&schema_exclude_patterns,
+								&schema_exclude_oids,
+								false);
+	/* non-matching exclusion patterns aren't an error */
+
+	/* Expand table selection patterns into OID lists */
+	if (table_include_patterns.head != NULL)
+	{
+		expand_table_name_patterns(&table_include_patterns,
+								   &table_include_oids,
+								   settings.strict_names);
+		if (table_include_oids.head == NULL)
+			fatal("no matching tables were 
found"); + } + expand_table_name_patterns(&table_exclude_patterns, + &table_exclude_oids, + false); + + /* + * Compile list of all tables to be checked based on namespace and table + * includes and excludes. + */ + get_table_check_list(&schema_include_oids, &schema_exclude_oids, + &table_include_oids, &table_exclude_oids, &checklist); + + PQsetNoticeProcessor(settings.db, NoticeProcessor, NULL); + + check_tables(&checklist); + + return 0; +} + +static void +check_tables(SimpleOidList *checklist) +{ + const SimpleOidListCell *cell; + + for (cell = checklist->head; cell; cell = cell->next) + { + check_table(cell->val); + if (settings.check_indexes) + check_indexes(cell->val); + } +} + +static void +check_table(Oid tbloid) +{ + PQExpBuffer query; + PGresult *res; + int i; + char *skip; + const char *stop; + + if (settings.db == NULL) + fatal("no connection on entry to expand_table_name_patterns"); + + if (settings.startblock == NULL) + settings.startblock = pg_strdup("NULL"); + if (settings.endblock == NULL) + settings.endblock = pg_strdup("NULL"); + if (settings.skip_frozen) + skip = pg_strdup("'all frozen'"); + else if (settings.skip_visible) + skip = pg_strdup("'all visible'"); + else + skip = pg_strdup("NULL"); + stop = (settings.on_error_stop) ? 
"true" : "false"; + + query = createPQExpBuffer(); + + appendPQExpBuffer(query, + "SELECT c.relname, v.blkno, v.offnum, v.lp_off, " + "v.lp_flags, v.lp_len, v.attnum, v.chunk, v.msg" + "\nFROM verify_heapam(rel := %u, on_error_stop := %s, " + "skip := %s, startblock := %s, endblock := %s) v, " + "pg_class c" + "\nWHERE c.oid = %u", + tbloid, stop, skip, settings.startblock, + settings.endblock, tbloid); + + ExecuteSqlStatement("RESET search_path"); + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + PQclear(ExecuteSqlQueryForSingleRow(ALWAYS_SECURE_SEARCH_PATH_SQL)); + + if (PQntuples(res) > 0) + { + int lines = PQntuples(res) * 2; + FILE *output = PageOutput(lines, NULL); + + for (i = 0; i < PQntuples(res); i++) + { + fprintf(output, + "(relname=%s,blkno=%s,offnum=%s,lp_off=%s,lp_flags=%s," + "lp_len=%s,attnum=%s,chunk=%s)\n%s\n", + PQgetvalue(res, i, 0), /* relname */ + PQgetvalue(res, i, 1), /* blkno */ + PQgetvalue(res, i, 2), /* offnum */ + PQgetvalue(res, i, 3), /* lp_off */ + PQgetvalue(res, i, 4), /* lp_flags */ + PQgetvalue(res, i, 5), /* lp_len */ + PQgetvalue(res, i, 6), /* attnum */ + PQgetvalue(res, i, 7), /* chunk */ + PQgetvalue(res, i, 8)); /* msg */ + } + } + + PQclear(res); + resetPQExpBuffer(query); + destroyPQExpBuffer(query); +} + +static void +check_indexes(Oid tbloid) +{ + PQExpBuffer query; + PGresult *res; + int i; + + query = createPQExpBuffer(); + appendPQExpBuffer(query, + "SELECT i.indexrelid" + "\nFROM pg_catalog.pg_index i, pg_catalog.pg_class c" + "\nWHERE i.indexrelid = c.oid" + "\n AND c.relam = %u" + "\n AND i.indrelid = %u", + BTREE_AM_OID, tbloid); + + ExecuteSqlStatement("RESET search_path"); + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + PQclear(ExecuteSqlQueryForSingleRow(ALWAYS_SECURE_SEARCH_PATH_SQL)); + + for (i = 0; i < PQntuples(res); i++) + check_index(tbloid, atooid(PQgetvalue(res, i, 0))); + + PQclear(res); + resetPQExpBuffer(query); + destroyPQExpBuffer(query); +} + +static void +check_index(Oid 
tbloid, Oid idxoid) +{ + PQExpBuffer query; + PGresult *res; + int i; + + query = createPQExpBuffer(); + + appendPQExpBuffer(query, + "SELECT ct.relname, ci.relname, blkno, msg" + "\nFROM verify_btreeam(%u,%s)," + "\n pg_catalog.pg_class ci," + "\n pg_catalog.pg_class ct" + "\nWHERE ci.oid = %u" + "\n AND ct.oid = %u", + idxoid, + settings.on_error_stop ? "true" : "false", + idxoid, tbloid); + + ExecuteSqlStatement("RESET search_path"); + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + PQclear(ExecuteSqlQueryForSingleRow(ALWAYS_SECURE_SEARCH_PATH_SQL)); + + if (PQntuples(res) > 0) + { + int lines = PQntuples(res) * 2; + FILE *output = PageOutput(lines, NULL); + + for (i = 0; i < PQntuples(res); i++) + { + fprintf(output, + "(table=%s,index=%s,blkno=%s)" + "\n%s\n", + PQgetvalue(res, i, 0), /* table relname */ + PQgetvalue(res, i, 1), /* index relname */ + PQgetvalue(res, i, 2), /* index blkno */ + PQgetvalue(res, i, 3)); /* msg */ + } + } + + PQclear(res); + resetPQExpBuffer(query); + destroyPQExpBuffer(query); +} + +static void +parse_cli_options(int argc, char *argv[], ConnectOptions * connOpts) +{ + static struct option long_options[] = + { + {"startblock", required_argument, NULL, 'b'}, + {"dbname", required_argument, NULL, 'd'}, + {"endblock", required_argument, NULL, 'e'}, + {"host", required_argument, NULL, 'h'}, + {"check-indexes", no_argument, NULL, 'i'}, + {"exclude-indexes", no_argument, NULL, 'I'}, + {"skip-all-visible", no_argument, NULL, 'v'}, + {"skip-all-frozen", no_argument, NULL, 'f'}, + {"schema", required_argument, NULL, 'n'}, + {"exclude-schema", required_argument, NULL, 'N'}, + {"on-error-stop", no_argument, NULL, 'o'}, + {"port", required_argument, NULL, 'p'}, + {"strict-names", no_argument, NULL, 's'}, + {"table", required_argument, NULL, 't'}, + {"exclude-table", required_argument, NULL, 'T'}, + {"username", required_argument, NULL, 'U'}, + {"version", no_argument, NULL, 'V'}, + {"no-password", no_argument, NULL, 'w'}, + {"password", 
no_argument, NULL, 'W'}, + {"help", optional_argument, NULL, '?'}, + {NULL, 0, NULL, 0} + }; + + int optindex; + int c; + + memset(connOpts, 0, sizeof *connOpts); + + while ((c = getopt_long(argc, argv, "b:d:e:fh:iIn:N:op:st:T:U:vVwW?1", + long_options, &optindex)) != -1) + { + switch (c) + { + case 'b': + settings.startblock = pg_strdup(optarg); + break; + case 'd': + connOpts->dbname = pg_strdup(optarg); + break; + case 'e': + settings.endblock = pg_strdup(optarg); + break; + case 'f': + settings.skip_frozen = true; + break; + case 'h': + connOpts->host = pg_strdup(optarg); + break; + case 'i': + settings.check_indexes = true; + break; + case 'I': + settings.check_indexes = false; + break; + case 'n': /* include schema(s) */ + simple_string_list_append(&schema_include_patterns, optarg); + break; + case 'N': /* exclude schema(s) */ + simple_string_list_append(&schema_exclude_patterns, optarg); + break; + case 'o': + settings.on_error_stop = true; + break; + case 'p': + connOpts->port = pg_strdup(optarg); + break; + case 's': + settings.strict_names = true; + break; + case 't': /* include table(s) */ + simple_string_list_append(&table_include_patterns, optarg); + break; + case 'T': /* exclude table(s) */ + simple_string_list_append(&table_exclude_patterns, optarg); + break; + case 'U': + connOpts->username = pg_strdup(optarg); + break; + case 'v': + settings.skip_visible = true; + break; + case 'V': + showVersion(); + exit(EXIT_SUCCESS); + case 'w': + settings.getPassword = TRI_NO; + break; + case 'W': + settings.getPassword = TRI_YES; + break; + case '?': + if (optind <= argc && + strcmp(argv[optind - 1], "-?") == 0) + { + /* actual help option given */ + usage(); + exit(EXIT_SUCCESS); + } + else + { + /* getopt error (unknown option or missing argument) */ + goto unknown_option; + } + break; + case 1: + { + if (!optarg || strcmp(optarg, "options") == 0) + usage(); + else + goto unknown_option; + + exit(EXIT_SUCCESS); + } + break; + default: + unknown_option: + 
fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + settings.progname); + exit(EXIT_FAILURE); + break; + } + } + + /* + * if we still have arguments, use it as the database name and username + */ + while (argc - optind >= 1) + { + if (!connOpts->dbname) + connOpts->dbname = argv[optind]; + else if (!connOpts->username) + connOpts->username = argv[optind]; + else + pg_log_warning("extra command-line argument \"%s\" ignored", + argv[optind]); + + optind++; + } + +} + +/* + * usage + * + * print out command line arguments + */ +static void +usage(void) +{ + FILE *output; + int lines; + int lineno; + + for (lines = 0; usage_text[lines]; lines++) + ; + output = PageOutput(lines + 2, NULL); + for (lineno = 0; usage_text[lineno]; lineno++) + fprintf(output, "%s\n", usage_text[lineno]); + fprintf(output, "Report bugs to <%s>.\n", PACKAGE_BUGREPORT); + fprintf(output, "%s home page: <%s>\n", PACKAGE_NAME, PACKAGE_URL); + + ClosePager(output); +} + +static void +showVersion(void) +{ + puts("pg_amcheck (PostgreSQL) " PG_VERSION); +} + +/* + * for backend Notice messages (INFO, WARNING, etc) + */ +static void +NoticeProcessor(void *arg, const char *message) +{ + (void) arg; /* not used */ + pg_log_info("%s", message); +} + +/* + * Find the OIDs of all schemas matching the given list of patterns, + * and append them to the given OID list. + */ +static void +expand_schema_name_patterns(SimpleStringList *patterns, + SimpleOidList *oids, + bool strict_names) +{ + PQExpBuffer query; + PGresult *res; + SimpleStringListCell *cell; + int i; + + if (settings.db == NULL) + fatal("no connection on entry to expand_schema_name_patterns"); + + if (patterns->head == NULL) + return; /* nothing to do */ + + query = createPQExpBuffer(); + + /* + * The loop below runs multiple SELECTs might sometimes result in + * duplicate entries in the OID list, but we don't care. 
+ */ + + for (cell = patterns->head; cell; cell = cell->next) + { + appendPQExpBufferStr(query, + "SELECT oid FROM pg_catalog.pg_namespace n\n"); + processSQLNamePattern(settings.db, query, cell->val, false, + false, NULL, "n.nspname", NULL, NULL); + + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + if (strict_names && PQntuples(res) == 0) + fatal("no matching schemas were found for pattern \"%s\"", + cell->val); + + for (i = 0; i < PQntuples(res); i++) + { + simple_oid_list_append(oids, atooid(PQgetvalue(res, i, 0))); + } + + PQclear(res); + resetPQExpBuffer(query); + } + + destroyPQExpBuffer(query); +} + +/* + * Find the OIDs of all tables matching the given list of patterns, + * and append them to the given OID list. See also expand_dbname_patterns() + * in pg_dumpall.c + */ +static void +expand_table_name_patterns(SimpleStringList *patterns, SimpleOidList *oids, + bool strict_names) +{ + PQExpBuffer query; + PGresult *res; + SimpleStringListCell *cell; + int i; + + if (settings.db == NULL) + fatal("no connection on entry to expand_table_name_patterns"); + + if (patterns->head == NULL) + return; /* nothing to do */ + + query = createPQExpBuffer(); + + /* + * this might sometimes result in duplicate entries in the OID list, but + * we don't care. + */ + + for (cell = patterns->head; cell; cell = cell->next) + { + /* + * Query must remain ABSOLUTELY devoid of unqualified names. This + * would be unnecessary given a pg_table_is_visible() variant taking a + * search_path argument. 
+ */ + appendPQExpBuffer(query, + "SELECT c.oid" + "\nFROM pg_catalog.pg_class c" + "\n LEFT JOIN pg_catalog.pg_namespace n" + "\n ON n.oid OPERATOR(pg_catalog.=) c.relnamespace" + "\nWHERE c.relkind OPERATOR(pg_catalog.=) ANY" + "\n (array['%c', '%c', '%c'])\n", + RELKIND_RELATION, RELKIND_MATVIEW, + RELKIND_PARTITIONED_TABLE); + processSQLNamePattern(settings.db, query, cell->val, true, + false, "n.nspname", "c.relname", NULL, NULL); + ExecuteSqlStatement("RESET search_path"); + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + PQclear(ExecuteSqlQueryForSingleRow(ALWAYS_SECURE_SEARCH_PATH_SQL)); + if (strict_names && PQntuples(res) == 0) + fatal("no matching tables were found for pattern \"%s\"", + cell->val); + + for (i = 0; i < PQntuples(res); i++) + simple_oid_list_append(oids, atooid(PQgetvalue(res, i, 0))); + + PQclear(res); + resetPQExpBuffer(query); + } + + destroyPQExpBuffer(query); +} + +static void +append_csv_oids(PQExpBuffer query, const SimpleOidList *oids) +{ + const SimpleOidListCell *cell; + const char *comma; + + for (comma = "", cell = oids->head; cell; comma = ", ", cell = cell->next) + appendPQExpBuffer(query, "%s%u", comma, cell->val); +} + +static bool +append_filter(PQExpBuffer query, const char *lval, const char *operator, + const SimpleOidList *oids) +{ + if (!oids->head) + return false; + appendPQExpBuffer(query, "\nAND %s %s ANY(array[\n", lval, operator); + append_csv_oids(query, oids); + appendPQExpBuffer(query, "\n])"); + return true; +} + +static void +get_table_check_list(SimpleOidList *include_nsp, SimpleOidList *exclude_nsp, + SimpleOidList *include_tbl, SimpleOidList *exclude_tbl, + SimpleOidList *checklist) +{ + PQExpBuffer query; + PGresult *res; + int i; + + if (settings.db == NULL) + fatal("no connection on entry to expand_table_name_patterns"); + + query = createPQExpBuffer(); + + appendPQExpBuffer(query, + "SELECT c.oid" + "\nFROM pg_catalog.pg_class c" + "\n LEFT JOIN pg_catalog.pg_namespace n" + "\n ON n.oid 
OPERATOR(pg_catalog.=) c.relnamespace" + "\nWHERE c.relkind OPERATOR(pg_catalog.=) ANY" + "\n (array['%c', '%c', '%c'])\n", + RELKIND_RELATION, RELKIND_MATVIEW, + RELKIND_PARTITIONED_TABLE); + append_filter(query, "n.oid", "OPERATOR(pg_catalog.=)", include_nsp); + append_filter(query, "n.oid", "OPERATOR(pg_catalog.!=)", exclude_nsp); + append_filter(query, "c.oid", "OPERATOR(pg_catalog.=)", include_tbl); + append_filter(query, "c.oid", "OPERATOR(pg_catalog.!=)", exclude_tbl); + + ExecuteSqlStatement("RESET search_path"); + res = ExecuteSqlQuery(query->data, PGRES_TUPLES_OK); + PQclear(ExecuteSqlQueryForSingleRow(ALWAYS_SECURE_SEARCH_PATH_SQL)); + + for (i = 0; i < PQntuples(res); i++) + simple_oid_list_append(checklist, atooid(PQgetvalue(res, i, 0))); + + PQclear(res); + resetPQExpBuffer(query); + destroyPQExpBuffer(query); +} + +/* Like fatal(), but with a complaint about a particular query. */ +static void +die_on_query_failure(const char *query) +{ + pg_log_error("query failed: %s", + PQerrorMessage(settings.db)); + fatal("query was: %s", query); +} + +static void +ExecuteSqlStatement(const char *query) +{ + PGresult *res; + + res = PQexec(settings.db, query); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + die_on_query_failure(query); + PQclear(res); +} + +static PGresult * +ExecuteSqlQuery(const char *query, ExecStatusType status) +{ + PGresult *res; + + res = PQexec(settings.db, query); + if (PQresultStatus(res) != status) + die_on_query_failure(query); + return res; +} + +/* + * Execute an SQL query and verify that we got exactly one row back. 
+ */ +static PGresult * +ExecuteSqlQueryForSingleRow(const char *query) +{ + PGresult *res; + int ntups; + + res = ExecuteSqlQuery(query, PGRES_TUPLES_OK); + + /* Expecting a single result only */ + ntups = PQntuples(res); + if (ntups != 1) + fatal(ngettext("query returned %d row instead of one: %s", + "query returned %d rows instead of one: %s", + ntups), + ntups, query); + + return res; +} diff --git a/contrib/pg_amcheck/t/001_basic.pl b/contrib/pg_amcheck/t/001_basic.pl new file mode 100644 index 0000000000..dfa0ae9e06 --- /dev/null +++ b/contrib/pg_amcheck/t/001_basic.pl @@ -0,0 +1,9 @@ +use strict; +use warnings; + +use TestLib; +use Test::More tests => 8; + +program_help_ok('pg_amcheck'); +program_version_ok('pg_amcheck'); +program_options_handling_ok('pg_amcheck'); diff --git a/contrib/pg_amcheck/t/002_nonesuch.pl b/contrib/pg_amcheck/t/002_nonesuch.pl new file mode 100644 index 0000000000..c63ba4452e --- /dev/null +++ b/contrib/pg_amcheck/t/002_nonesuch.pl @@ -0,0 +1,55 @@ +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 12; + +# Test set-up +my ($node, $port); +$node = get_new_node('test'); +$node->init; +$node->start; +$port = $node->port; + +# Load the amcheck extension, upon which pg_amcheck depends +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); + +######################################### +# Test connecting to a non-existent database + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", 'qqq' ], + qr/\Qpg_amcheck: error: could not connect to server: FATAL: database "qqq" does not exist\E/, + 'connecting to a non-existent database'); + +######################################### +# Test connecting with a non-existent user + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", '-U=no_such_user' ], + qr/\Qpg_amcheck: error: could not connect to server: FATAL: role "=no_such_user" does not exist\E/, + 'connecting with a non-existent user'); + +######################################### +# Test checking a 
non-existent schema, table, and patterns with --strict-names + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", '-n', 'nonexistent' ], + qr/\Qpg_amcheck: error: no matching schemas were found\E/, + 'checking a non-existent schema'); + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", '-t', 'nonexistent' ], + qr/\Qpg_amcheck: error: no matching tables were found\E/, + 'checking a non-existent table'); + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", '--strict-names', '-n', 'nonexistent*' ], + qr/\Qpg_amcheck: error: no matching schemas were found for pattern\E/, + 'no matching schemas'); + +command_fails_like( + [ 'pg_amcheck', '-p', "$port", '--strict-names', '-t', 'nonexistent*' ], + qr/\Qpg_amcheck: error: no matching tables were found for pattern\E/, + 'no matching tables'); diff --git a/contrib/pg_amcheck/t/003_check.pl b/contrib/pg_amcheck/t/003_check.pl new file mode 100644 index 0000000000..de3ce54e8e --- /dev/null +++ b/contrib/pg_amcheck/t/003_check.pl @@ -0,0 +1,85 @@ +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 7; + +# Test set-up +my ($node, $port); +$node = get_new_node('test'); +$node->init; +$node->start; +$port = $node->port; + +# Load the amcheck extension, upon which pg_amcheck depends +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); + +# Create schemas and tables for checking pg_amcheck's include +# and exclude schema and table command line options +$node->safe_psql('postgres', q( +CREATE SCHEMA s1; +CREATE SCHEMA s2; +CREATE SCHEMA s3; +CREATE TABLE s1.t1 (a TEXT); +CREATE TABLE s1.t2 (a TEXT); +CREATE TABLE s1.t3 (a TEXT); +CREATE TABLE s2.t1 (a TEXT); +CREATE TABLE s2.t2 (a TEXT); +CREATE TABLE s2.t3 (a TEXT); +CREATE TABLE s3.t1 (a TEXT); +CREATE TABLE s3.t2 (a TEXT); +CREATE TABLE s3.t3 (a TEXT); +CREATE INDEX i1 ON s1.t1(a); +CREATE INDEX i2 ON s1.t2(a); +CREATE INDEX i3 ON s1.t3(a); +CREATE INDEX i1 ON s2.t1(a); +CREATE INDEX i2 ON s2.t2(a); +CREATE INDEX i3 ON s2.t3(a); 
+CREATE INDEX i1 ON s3.t1(a); +CREATE INDEX i2 ON s3.t2(a); +CREATE INDEX i3 ON s3.t3(a); +INSERT INTO s1.t1 (a) (SELECT gs::TEXT FROM generate_series(1,10000) AS gs); +)); + +$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres' + ], + 'pg_amcheck all schemas and tables implicitly'); + +$node->command_ok( + [ + 'pg_amcheck', '-i', '-p', $port, 'postgres' + ], + 'pg_amcheck all schemas, tables and indexes'); + +;$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres', '-n', 's1' + ], + 'pg_amcheck all tables in schema s1'); + +$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres', '-N', 's1' + ], + 'pg_amcheck all tables not in schema s1'); + +$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres', '-i', '-n', 's*', '-t', 't1' + ], + 'pg_amcheck all tables named t1 and their indexes'); + +$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres', '-T', 't1' + ], + 'pg_amcheck all tables not named t1'); + +$node->command_ok( + [ + 'pg_amcheck', '-p', $port, 'postgres', '-N', 's1', '-T', 't1' + ], + 'pg_amcheck all tables not named t1 nor in schema s1'); diff --git a/contrib/pg_amcheck/t/004_verify_heapam.pl b/contrib/pg_amcheck/t/004_verify_heapam.pl new file mode 100644 index 0000000000..a96b763886 --- /dev/null +++ b/contrib/pg_amcheck/t/004_verify_heapam.pl @@ -0,0 +1,407 @@ +use strict; +use warnings; + +use PostgresNode; +use TestLib; + +use Test::More tests => 36; + +# This regression test demonstrates that the verify_heapam() function supplied +# with the amcheck contrib module and depended upon by this pg_amcheck contrib +# module correctly identifies specific kinds of corruption within pages. To +# test this, we need a mechanism to create corrupt pages with predictable, +# repeatable corruption. The postgres backend cannot be expected to help us +# with this, as its design is not consistent with the goal of intentionally +# corrupting pages. 
+# +# Instead, we create a table to corrupt, and with careful consideration of how +# postgresql lays out heap pages, we seek to offsets within the page and +# overwrite deliberately chosen bytes with specific values calculated to +# corrupt the page in expected ways. We then verify that verify_heapam +# reports the corruption, and that it runs without crashing. Note that the +# backend cannot simply be started to run queries against the corrupt table, as +# the backend will crash, at least for some of the corruption types we +# generate. +# +# Autovacuum potentially touching the table in the background makes the exact +# behavior of this test harder to reason about. We turn it off to keep things +# simpler. We use a "belt and suspenders" approach, turning it off for the +# system generally in postgresql.conf, and turning it off specifically for the +# test table. +# +# This test depends on the table being written to the heap file exactly as we +# expect it to be, so we take care to arrange the columns of the table, and +# insert rows of the table, that give predictable sizes and locations within +# the table page. +# +# The HeapTupleHeaderData has 23 bytes of fixed size fields before the variable +# length t_bits[] array. We have exactly 3 columns in the table, so natts = 3, +# t_bits is 1 byte long, and t_hoff = MAXALIGN(23 + 1) = 24. +# +# We're not too fussy about which datatypes we use for the test, but we do care +# about some specific properties. We'd like to test both fixed size and +# varlena types. We'd like some varlena data inline and some toasted. And +# we'd like the layout of the table such that the datums land at predictable +# offsets within the tuple. We choose a structure without padding on all +# supported architectures: +# +# a BIGINT +# b TEXT +# c TEXT +# +# We always insert a 7-ascii character string into field 'b', which with a +# 1-byte varlena header gives an 8 byte inline value. 
We always insert a long +# text string in field 'c', long enough to force toast storage. +# +# +# We choose to read and write binary copies of our table's tuples, using perl's +# pack() and unpack() functions. Perl uses a packing code system in which: +# +# L = "Unsigned 32-bit Long", +# S = "Unsigned 16-bit Short", +# C = "Unsigned 8-bit Octet", +# c = "signed 8-bit octet", +# q = "signed 64-bit quadword" +# +# Each tuple in our table has a layout as follows: +# +# xx xx xx xx t_xmin: xxxx offset = 0 L +# xx xx xx xx t_xmax: xxxx offset = 4 L +# xx xx xx xx t_field3: xxxx offset = 8 L +# xx xx bi_hi: xx offset = 12 S +# xx xx bi_lo: xx offset = 14 S +# xx xx ip_posid: xx offset = 16 S +# xx xx t_infomask2: xx offset = 18 S +# xx xx t_infomask: xx offset = 20 S +# xx t_hoff: x offset = 22 C +# xx t_bits: x offset = 23 C +# xx xx xx xx xx xx xx xx 'a': xxxxxxxx offset = 24 q +# xx xx xx xx xx xx xx xx 'b': xxxxxxxx offset = 32 Cccccccc +# xx xx xx xx xx xx xx xx 'c': xxxxxxxx offset = 40 SSSS +# xx xx xx xx xx xx xx xx : xxxxxxxx ...continued SSSS +# xx xx : xx ...continued S +# +# We could choose to read and write columns 'b' and 'c' in other ways, but +# it is convenient enough to do it this way. We define packing code +# constants here, where they can be compared easily against the layout. + +use constant HEAPTUPLE_PACK_CODE => 'LLLSSSSSCCqCcccccccSSSSSSSSS'; +use constant HEAPTUPLE_PACK_LENGTH => 58; # Total size + +# Read a tuple of our table from a heap page. +# +# Takes an open filehandle to the heap file, and the offset of the tuple. +# +# Rather than returning the binary data from the file, unpacks the data into a +# perl hash with named fields. These fields exactly match the ones understood +# by write_tuple(), below. Returns a reference to this hash. 
+# +sub read_tuple ($$) +{ + my ($fh, $offset) = @_; + my ($buffer, %tup); + seek($fh, $offset, 0); + sysread($fh, $buffer, HEAPTUPLE_PACK_LENGTH); + + @_ = unpack(HEAPTUPLE_PACK_CODE, $buffer); + %tup = (t_xmin => shift, + t_xmax => shift, + t_field3 => shift, + bi_hi => shift, + bi_lo => shift, + ip_posid => shift, + t_infomask2 => shift, + t_infomask => shift, + t_hoff => shift, + t_bits => shift, + a => shift, + b_header => shift, + b_body1 => shift, + b_body2 => shift, + b_body3 => shift, + b_body4 => shift, + b_body5 => shift, + b_body6 => shift, + b_body7 => shift, + c1 => shift, + c2 => shift, + c3 => shift, + c4 => shift, + c5 => shift, + c6 => shift, + c7 => shift, + c8 => shift, + c9 => shift); + # Stitch together the text for column 'b' + $tup{b} = join('', map { chr($tup{"b_body$_"}) } (1..7)); + return \%tup; +} + +# Write a tuple of our table to a heap page. +# +# Takes an open filehandle to the heap file, the offset of the tuple, and a +# reference to a hash with the tuple values, as returned by read_tuple(). +# Writes the tuple fields from the hash into the heap file. +# +# The purpose of this function is to write a tuple back to disk with some +# subset of fields modified. The function does no error checking. Use +# cautiously. 
+#
+sub write_tuple($$$)
+{
+    my ($fh, $offset, $tup) = @_;
+
+    # Pack the fields in exactly the order described by HEAPTUPLE_PACK_CODE.
+    # The tuple contents are deliberately not validated: callers use this to
+    # write corrupt values on purpose.
+    my $buffer = pack(
+        HEAPTUPLE_PACK_CODE,
+        @{$tup}{
+            qw(
+              t_xmin t_xmax t_field3
+              bi_hi bi_lo ip_posid
+              t_infomask2 t_infomask
+              t_hoff t_bits
+              a
+              b_header b_body1 b_body2 b_body3 b_body4 b_body5 b_body6 b_body7
+              c1 c2 c3 c4 c5 c6 c7 c8 c9)
+        });
+
+    # syswrite() bypasses perl's buffered I/O layer, so position the handle
+    # with sysseek() rather than the buffered seek(). I/O failures abort the
+    # test run, since a partially written tuple would invalidate every later
+    # expectation.
+    sysseek($fh, $offset, 0)
+      or BAIL_OUT("sysseek to offset $offset failed: $!");
+    my $nwritten = syswrite($fh, $buffer, HEAPTUPLE_PACK_LENGTH);
+    defined($nwritten)
+      or BAIL_OUT("syswrite at offset $offset failed: $!");
+    $nwritten == HEAPTUPLE_PACK_LENGTH
+      or BAIL_OUT("short write at offset $offset: wrote $nwritten of "
+          . HEAPTUPLE_PACK_LENGTH . " bytes");
+    return;
+}
+
+# Set umask so test directories and files are created with default permissions
+umask(0077);
+
+# Set up the node. Once we create and corrupt the table,
+# autovacuum workers visiting the table could crash the backend.
+# Disable autovacuum so that won't happen.
+my $node = get_new_node('test');
+$node->init;
+$node->append_conf('postgresql.conf', 'autovacuum=off');
+
+# Start the node and load the extensions. We depend on both
+# amcheck and pageinspect for this test.
+$node->start;
+my $port = $node->port;
+my $pgdata = $node->data_dir;
+$node->safe_psql('postgres', "CREATE EXTENSION amcheck");
+$node->safe_psql('postgres', "CREATE EXTENSION pageinspect");
+
+# Create the test table with precisely the schema that our
+# corruption function expects.
+$node->safe_psql(
+    'postgres', qq(
+        CREATE TABLE public.test (a BIGINT, b TEXT, c TEXT);
+        ALTER TABLE public.test SET (autovacuum_enabled=false);
+        ALTER TABLE public.test ALTER COLUMN c SET STORAGE EXTERNAL;
+        CREATE INDEX test_idx ON public.test(a, b);
+    ));
+
+# Locate the heap file on disk so that it can be edited while the server is
+# stopped.
+my $rel = $node->safe_psql('postgres', qq(SELECT pg_relation_filepath('public.test')));
+my $relpath = "$pgdata/$rel";
+
+# One row per corruption case exercised further down. Each row is inserted
+# and frozen separately.
+use constant ROWCOUNT => 12;
+$node->safe_psql('postgres', qq(
+    INSERT INTO public.test (a, b, c)
+        VALUES (
+            12345678,
+            'abcdefg',
+            repeat('w', 10000)
+        );
+    VACUUM FREEZE public.test
+    )) for (1..ROWCOUNT);
+
+# Look the relation up by regclass rather than by bare relname, so that a
+# same-named relation in another schema cannot make the query return more
+# than one row.
+my $relfrozenxid = $node->safe_psql('postgres',
+    q(SELECT relfrozenxid FROM pg_class WHERE oid = 'public.test'::regclass));
+
+# Find where each of the tuples is located on the page. Fetch every line
+# pointer in a single query, ordered by line pointer number so the result
+# does not depend on the order in which the function happens to emit rows.
+# safe_psql returns one row per line.
+my $nrows = ROWCOUNT;
+my @lp_off = split /\n/, $node->safe_psql('postgres', qq(
+    SELECT lp_off
+    FROM heap_page_items(get_raw_page('test', 'main', 0))
+    ORDER BY lp
+    LIMIT $nrows));
+scalar(@lp_off) == ROWCOUNT
+  or BAIL_OUT("expected $nrows line pointers on page 0, found " . scalar(@lp_off));
+
+# Check that pg_amcheck runs against the uncorrupted table without error.
+$node->command_ok(['pg_amcheck', '-p', $port, 'postgres'],
+                  'pg_amcheck test table, prior to corruption');
+
+# Check that pg_amcheck runs against the uncorrupted table and index without error.
+$node->command_ok(['pg_amcheck', '--check-indexes', '-p', $port, 'postgres'],
+                  'pg_amcheck test table and index, prior to corruption');
+
+$node->stop;
+
+# Some #define constants from access/htup_details.h for use while corrupting.
+use constant HEAP_HASNULL => 0x0001;
+use constant HEAP_XMIN_COMMITTED => 0x0100;
+use constant HEAP_XMIN_INVALID => 0x0200;
+use constant HEAP_XMAX_INVALID => 0x0800;
+use constant HEAP_NATTS_MASK => 0x07FF;
+
+# Corrupt the tuples, one type of corruption per tuple. Some types of
+# corruption cause verify_heapam to skip to the next tuple without
+# performing any remaining checks, so we can't exercise the system properly if
+# we focus all our corruption on a single tuple.
+#
+my $file;
+# A failed open would otherwise go unnoticed until the first read, so abort
+# the test run immediately with the OS error.
+open($file, '+<', $relpath)
+  or BAIL_OUT("open of $relpath failed: $!");
+binmode $file;
+
+for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++)
+{
+    my $offset = $lp_off[$tupidx];
+    my $tup = read_tuple($file, $offset);
+
+    # Sanity-check that the data appears on the page where we expect.
+    if ($tup->{a} ne '12345678' || $tup->{b} ne 'abcdefg')
+    {
+        fail('Page layout differs from our expectations');
+        $node->clean_node;
+        exit;
+    }
+
+    if ($tupidx == 0)
+    {
+        # Corruptly set xmin < relfrozenxid
+        $tup->{t_xmin} = 3;
+        $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
+        $tup->{t_infomask} &= ~HEAP_XMIN_INVALID;
+    }
+    elsif ($tupidx == 1)
+    {
+        # Corruptly set xmin < relfrozenxid, further back
+        $tup->{t_xmin} = 4026531839; # Note circularity of xid comparison
+        $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED;
+        $tup->{t_infomask} &= ~HEAP_XMIN_INVALID;
+    }
+    elsif ($tupidx == 2)
+    {
+        # Corruptly set xmax < relfrozenxid. (The expected report below
+        # compares this xmax against relfrozenxid, not relminmxid.)
+        $tup->{t_xmax} = 4026531839; # Note circularity of xid comparison
+        $tup->{t_infomask} &= ~HEAP_XMAX_INVALID;
+    }
+    elsif ($tupidx == 3)
+    {
+        # Corrupt the tuple t_hoff, but keep it aligned properly
+        $tup->{t_hoff} += 128;
+    }
+    elsif ($tupidx == 4)
+    {
+        # Corrupt the tuple t_hoff, wrong alignment
+        $tup->{t_hoff} += 3;
+    }
+    elsif ($tupidx == 5)
+    {
+        # Corrupt the tuple t_hoff, underflow but correct alignment
+        $tup->{t_hoff} -= 8;
+    }
+    elsif ($tupidx == 6)
+    {
+        # Corrupt the tuple t_hoff, underflow and wrong alignment
+        $tup->{t_hoff} -= 3;
+    }
+    elsif ($tupidx == 7)
+    {
+        # Corrupt the tuple to look like it has lots of attributes, not just 3
+        $tup->{t_infomask2} |= HEAP_NATTS_MASK;
+    }
+    elsif ($tupidx == 8)
+    {
+        # Corrupt the tuple to look like it has lots of attributes, some of
+        # them null. This falsely creates the impression that the t_bits
+        # array is longer than just one byte, but t_hoff still says otherwise.
+        $tup->{t_infomask} |= HEAP_HASNULL;
+        $tup->{t_infomask2} |= HEAP_NATTS_MASK;
+        $tup->{t_bits} = 0xAA;
+    }
+    elsif ($tupidx == 9)
+    {
+        # Same as above, but this time t_hoff plays along
+        $tup->{t_infomask} |= HEAP_HASNULL;
+        $tup->{t_infomask2} |= (HEAP_NATTS_MASK & 0x40);
+        $tup->{t_bits} = 0xAA;
+        $tup->{t_hoff} = 32;
+    }
+    elsif ($tupidx == 10)
+    {
+        # Corrupt the bits in column 'b' 1-byte varlena header
+        $tup->{b_header} = 0x80;
+    }
+    elsif ($tupidx == 11)
+    {
+        # Corrupt the bits in column 'c' toast pointer
+        $tup->{c6} = 41;
+        $tup->{c7} = 41;
+    }
+    write_tuple($file, $offset, $tup);
+}
+close($file)
+  or BAIL_OUT("close of $relpath failed: $!");
+
+# Run verify_heapam on the corrupted file
+$node->start;
+
+my $result = $node->safe_psql(
+    'postgres',
+    q(SELECT * FROM verify_heapam('test', on_error_stop := false, skip := NULL, startblock := NULL, endblock := NULL)));
+is ($result,
+"0|1|8128|1|58|||tuple xmin = 3 precedes relation relfrozenxid = $relfrozenxid
+0|2|8064|1|58|||tuple xmin = 4026531839 precedes relation relfrozenxid = $relfrozenxid
+0|3|8000|1|58|||tuple xmax = 4026531839 precedes relation relfrozenxid = $relfrozenxid
+0|4|7936|1|58|||t_hoff > lp_len (152 > 58)
+0|5|7872|1|58|||t_hoff not max-aligned (27)
+0|6|7808|1|58|||t_hoff < SizeofHeapTupleHeader (16 < 23)
+0|7|7744|1|58|||t_hoff < SizeofHeapTupleHeader (21 < 23)
+0|7|7744|1|58|||t_hoff not max-aligned (21)
+0|8|7680|1|58|||relation natts < tuple natts (3 < 2047)
+0|9|7616|1|58|||SizeofHeapTupleHeader + BITMAPLEN(natts) > t_hoff (23 + 256 > 24)
+0|10|7552|1|58|||relation natts < tuple natts (3 < 67)
+0|11|7488|1|58|2||t_hoff + offset > lp_len (24 + 416847976 > 58)
+0|12|7424|1|58|2|0|final chunk number differs from expected (0 vs. 6)
+0|12|7424|1|58|2|0|toasted value missing from toast table",
+"Expected verify_heapam output");
+
+# Each table corruption message is returned with a standard header, and we can
+# check for those headers to verify that corruption is being reported.
We can
+# also check for each individual corruption that we would expect to see.
+# Each pattern is matched separately against pg_amcheck's output, so every
+# entry below produces one test result.
+my @corruption_re = (
+
+    # standard header
+    qr/relname=test,blkno=\d*,offnum=\d*,lp_off=\d*,lp_flags=\d*,lp_len=\d*,attnum=\d*,chunk=\d*/,
+
+    # individual detected corruptions
+    qr/tuple xmin = \d+ precedes relation relfrozenxid = \d+/,
+    qr/tuple xmax = \d+ precedes relation relfrozenxid = \d+/,
+    qr/t_hoff > lp_len/,
+    qr/t_hoff not max-aligned/,
+    qr/t_hoff < SizeofHeapTupleHeader/,
+    qr/relation natts < tuple natts/,
+    qr/SizeofHeapTupleHeader \+ BITMAPLEN\(natts\) > t_hoff/,
+    qr/t_hoff \+ offset > lp_len/,
+    qr/final chunk number differs from expected/,
+    qr/toasted value missing from toast table/,
+);
+
+$node->command_like(
+    ['pg_amcheck', '-p', $port, 'postgres'], $_,
+    "pg_amcheck reports: $_"
+    ) for(@corruption_re);
+
+$node->teardown_node;
+$node->clean_node;
diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 75518a7820..cc36d92f72 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -69,7 +69,7 @@ AND c.relpersistence != 't' -- Function may throw an error when this is omitted: AND c.relkind = 'i' AND i.indisready AND i.indisvalid ORDER BY c.relpages DESC LIMIT 10; - bt_index_check | relname | relpages + bt_index_check | relname | relpages ----------------+---------------------------------+---------- | pg_depend_reference_index | 43 | pg_depend_depender_index | 40 @@ -165,6 +165,110 @@ ORDER BY c.relpages DESC LIMIT 10; + + + + + verify_heapam(relation regclass, + on_error_stop boolean, + skip_all_frozen boolean, + skip_all_visible boolean, + blkno OUT bigint, + offnum OUT integer, + lp_off OUT smallint, + lp_flags OUT smallint, + lp_len OUT smallint, + attnum OUT integer, + chunk OUT integer, + msg OUT text) + returns record + + + + + Checks for "logical" corruption, where the page is valid but inconsistent + with the rest of the database cluster.
This can happen due to faulty or + ill-conceived backup and restore tools, or bad storage, or user error, or + bugs in the server itself. verify_heapam checks xmin and xmax values against + relfrozenxid and relminmxid, and also validates TOAST pointers. + + + + Returns one row for each corruption detected in each block of the + relation where corruption is found, or in just the first such block if + on_error_stop is true. Each row contains the following fields: + + + + blkno + + + The number of the block containing the corrupt page. + + + + + offnum + + + The OffsetNumber of the corrupt tuple. + + + + + lp_off + + + The offset of the corrupt tuple within the page, as recorded in the line pointer. + + + + + lp_flags + + + The flags in the line pointer for the corrupt tuple. + + + + + lp_len + + + The length of the corrupt tuple as recorded in the line pointer. + + + + + attnum + + + The attribute number of the corrupt column in the tuple, if the + corruption is specific to a column and not the tuple as a whole. + + + + + chunk + + + The chunk number of the corrupt toasted attribute, if the corruption + is specific to a toasted value. + + + + + msg + + + A human-readable message describing the corruption in the page.
+ + + + + + + diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 261a559e81..f606e42fb9 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -118,6 +118,7 @@ CREATE EXTENSION module_name; &ltree; &pageinspect; &passwordcheck; + &pg_amcheck; &pgbuffercache; &pgcrypto; &pgfreespacemap; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 64b5da0070..10e1ca9663 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -131,6 +131,7 @@ + diff --git a/doc/src/sgml/pg_amcheck.sgml b/doc/src/sgml/pg_amcheck.sgml new file mode 100644 index 0000000000..f379af2258 --- /dev/null +++ b/doc/src/sgml/pg_amcheck.sgml @@ -0,0 +1,136 @@ + + + + pg_amcheck + + + pg_amcheck + + + + The pg_amcheck module provides a command line interface + to the corruption checking functionality. + + + + pg_amcheck is a regular + PostgreSQL client application. You can perform + corruption checks from any remote host that has access to the database, + connecting as a user with sufficient privileges to check tables and indexes. + Currently, this requires superuser privileges. + + + + Options + + + To specify which database server pg_amcheck should + contact, use the command line options or + and or + . The default host is the local host + or whatever your PGHOST environment variable specifies. + Similarly, the default port is indicated by the PGPORT + environment variable or, failing that, by the compiled-in default. + + + + Like any other PostgreSQL client application, + pg_amcheck will by default connect with the + database user name that is equal to the current operating system user name. + To override this, either specify the option or set the + environment variable PGUSER. Remember that + pg_amcheck connections are subject to the normal + client authentication mechanisms (which are described in ). + + + + To restrict checking of tables and indexes to specific schemas, specify the + or option with a pattern.
+ To exclude checking of tables and indexes within specific schemas, specify + the or option with + a pattern. + + + + To specify which tables are checked, specify the + or option with a pattern. + To exclude checking of tables, specify the + or option with a + pattern. + + + + To check indexes associated with checked tables, specify the + or option. Only + indexes on tables which are being checked will themselves be checked. To + check all indexes in a database, all tables on which the indexes exist must + also be checked. This restriction may be relaxed in the future. + + + + To restrict the range of blocks within a table that are checked, specify the + or and/or + or options with numeric + values for the starting and ending block numbers. Although these options + make the most sense when applied to a single table, if specified along with + options that select multiple tables, each table check will be restricted to + the specified blocks. If is omitted, checking + begins with the first block. If is omitted, + checking continues to the end of the relation. + + + + Some users may wish to periodically check tables without incurring the cost + of rechecking older table blocks, presumably because those blocks have + already been checked in the past. There is at present no perfect way to do + this. Although the and + options can be used to restrict blocks, the user is not expected to have + perfect knowledge of which blocks have already been checked, and in any + event, some blocks that were previously checked may have been subject to + modification since the last check. As an approximation to the desired + functionality, one can specify the + or option, or + alternatively the + or option to skip + blocks marked all frozen or all visible, respectively. 
+ + + + + Example Usage + + + Checking an entire database that contains one corrupt table, "corrupted", + along with the output: + + + +% pg_amcheck -i test +(relname=corrupted,blkno=0,offnum=16,lp_off=7680,lp_flags=1,lp_len=31,attnum=,chunk=) +tuple xmin = 3289393 is in the future +(relname=corrupted,blkno=0,offnum=17,lp_off=7648,lp_flags=1,lp_len=31,attnum=,chunk=) +tuple xmax = 0 precedes relation relminmxid = 1 +(relname=corrupted,blkno=0,offnum=17,lp_off=7648,lp_flags=1,lp_len=31,attnum=,chunk=) +tuple xmin = 12593 is in the future + + + + .... many pages of output removed for brevity .... + + + +(relname=corrupted,blkno=107,offnum=22,lp_off=7312,lp_flags=1,lp_len=34,attnum=,chunk=) +tuple xmin = 305 precedes relation relfrozenxid = 487 +(relname=corrupted,blkno=107,offnum=22,lp_off=7312,lp_flags=1,lp_len=34,attnum=,chunk=) +t_hoff > lp_len (54 > 34) +(relname=corrupted,blkno=107,offnum=22,lp_off=7312,lp_flags=1,lp_len=34,attnum=,chunk=) +t_hoff not max-aligned (54) + + + + Each detected corruption is reported on two lines: the first line shows the + location and the second line shows a message describing the problem. + + + -- 2.21.1 (Apple Git-122.3)