From 7906c7391a9f52d334c2cbc7d3e245ff014629f2 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 2 May 2017 00:19:24 -0700 Subject: [PATCH 2/2] Add amcheck verification of indexes against heap. Add a new, optional capability to bt_index_check() and bt_index_parent_check(): callers can check that each heap tuple that ought to have an index entry does in fact have one. This happens at the end of the existing verification checks. This is implemented by using a Bloom filter data structure. The implementation performs set membership tests within a callback (the same type of callback that each index AM registers for CREATE INDEX). The Bloom filter is populated during the initial index verification scan. --- contrib/amcheck/Makefile | 2 +- contrib/amcheck/amcheck--1.0--1.1.sql | 28 +++ contrib/amcheck/amcheck.control | 2 +- contrib/amcheck/expected/check_btree.out | 14 +- contrib/amcheck/sql/check_btree.sql | 9 +- contrib/amcheck/verify_nbtree.c | 298 ++++++++++++++++++++++++++++--- doc/src/sgml/amcheck.sgml | 173 ++++++++++++++---- src/include/utils/snapmgr.h | 2 +- 8 files changed, 454 insertions(+), 74 deletions(-) create mode 100644 contrib/amcheck/amcheck--1.0--1.1.sql diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index 43bed91..c5764b5 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,7 +4,7 @@ MODULE_big = amcheck OBJS = verify_nbtree.o $(WIN32RES) EXTENSION = amcheck -DATA = amcheck--1.0.sql +DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree diff --git a/contrib/amcheck/amcheck--1.0--1.1.sql b/contrib/amcheck/amcheck--1.0--1.1.sql new file mode 100644 index 0000000..e6cca0a --- /dev/null +++ b/contrib/amcheck/amcheck--1.0--1.1.sql @@ -0,0 +1,28 @@ +/* contrib/amcheck/amcheck--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.1'" to load this file. 
\quit + +-- +-- bt_index_check() +-- +DROP FUNCTION bt_index_check(regclass); +CREATE FUNCTION bt_index_check(index regclass, + heapallindexed boolean DEFAULT false) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- +-- bt_index_parent_check() +-- +DROP FUNCTION bt_index_parent_check(regclass); +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean DEFAULT false) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want these to be available to public +REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean) FROM PUBLIC; +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index 05e2861..4690484 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index df3741e..42872b8 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -16,8 +16,8 @@ RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful -- to run this cluster-wide with a restricted account, and as tested -- above explicit permission has to be granted for that. -GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO bttest_role; -GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role; SET ROLE bttest_role; SELECT bt_index_check('bttest_a_idx'); bt_index_check @@ -56,8 +56,14 @@ SELECT bt_index_check('bttest_a_idx'); (1 row) --- more expansive test -SELECT bt_index_parent_check('bttest_b_idx'); +-- more expansive tests +SELECT bt_index_check('bttest_a_idx', true); + bt_index_check +---------------- + +(1 row) + +SELECT bt_index_parent_check('bttest_b_idx', true); bt_index_parent_check ----------------------- diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index fd90531..5d27969 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -19,8 +19,8 @@ RESET ROLE; -- we, intentionally, don't check relation permissions - it's useful -- to run this cluster-wide with a restricted account, and as tested -- above explicit permission has to be granted for that. 
-GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO bttest_role;
-GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO bttest_role;
+GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role;
+GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role;
 SET ROLE bttest_role;
 SELECT bt_index_check('bttest_a_idx');
 SELECT bt_index_parent_check('bttest_a_idx');
@@ -42,8 +42,9 @@ ROLLBACK;
 -- normal check outside of xact
 SELECT bt_index_check('bttest_a_idx');
 
--- more expansive test
-SELECT bt_index_parent_check('bttest_b_idx');
+-- more expansive tests
+SELECT bt_index_check('bttest_a_idx', true);
+SELECT bt_index_parent_check('bttest_b_idx', true);
 
 BEGIN;
 SELECT bt_index_check('bttest_a_idx');
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 868c14e..8e57d2e 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -8,6 +8,11 @@
  * (the insertion scankey sort-wise NULL semantics are needed for
  * verification).
  *
+ * When index-to-heap verification is requested, a Bloom filter is used to
+ * fingerprint all tuples in the target index, as the index is traversed to
+ * verify its structure.  A heap scan then verifies that each heap tuple that
+ * ought to have an index entry has a matching fingerprint in the filter.
+ *
  *
  * Copyright (c) 2017, PostgreSQL Global Development Group
  *
@@ -18,11 +23,13 @@
  */
 #include "postgres.h"
 
+#include "access/htup_details.h"
 #include "access/nbtree.h"
 #include "access/transam.h"
 #include "catalog/index.h"
 #include "catalog/pg_am.h"
 #include "commands/tablecmds.h"
+#include "lib/bloomfilter.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "utils/memutils.h"
@@ -43,9 +50,10 @@ PG_MODULE_MAGIC;
  * target is the point of reference for a verification operation.
  *
  * Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
- * they are current target's child pages).  Conceptually, problems are only
- * ever found in the current target page.  Each page found by verification's
- * left/right, top/bottom scan becomes the target exactly once.
+ * they are current target's child pages).  Conceptually, problems are only
+ * ever found in the current target page (or for a particular heap tuple during
+ * heapallindexed verification).  Each page found by verification's left/right,
+ * top/bottom scan becomes the target exactly once.
  */
 typedef struct BtreeCheckState
 {
@@ -53,10 +61,13 @@ typedef struct BtreeCheckState
 	 * Unchanging state, established at start of verification:
 	 */
 
-	/* B-Tree Index Relation */
+	/* B-Tree Index Relation and associated heap relation */
 	Relation	rel;
+	Relation	heaprel;
 	/* ShareLock held on heap/index, rather than AccessShareLock? */
 	bool		readonly;
+	/* Also verifying heap has no unindexed tuples?
*/ + bool heapallindexed; /* Per-page context */ MemoryContext targetcontext; /* Buffer access strategy */ @@ -72,6 +83,15 @@ typedef struct BtreeCheckState BlockNumber targetblock; /* Target page's LSN */ XLogRecPtr targetlsn; + + /* + * Mutable state, for optional heapallindexed verification: + */ + + /* Bloom filter fingerprints B-Tree index */ + bloom_filter *filter; + /* Debug counter */ + int64 heaptuplespresent; } BtreeCheckState; /* @@ -92,15 +112,20 @@ typedef struct BtreeLevel PG_FUNCTION_INFO_V1(bt_index_check); PG_FUNCTION_INFO_V1(bt_index_parent_check); -static void bt_index_check_internal(Oid indrelid, bool parentcheck); +static void bt_index_check_internal(Oid indrelid, bool parentcheck, + bool heapallindexed); static inline void btree_index_checkable(Relation rel); -static void bt_check_every_level(Relation rel, bool readonly); +static void bt_check_every_level(Relation rel, Relation heaprel, + bool readonly, bool heapallindexed); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); static ScanKey bt_right_page_check_scankey(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, ScanKey targetkey); +static void bt_tuple_present_callback(Relation index, HeapTuple htup, + Datum *values, bool *isnull, + bool tupleIsAlive, void *checkstate); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); static inline bool invariant_leq_offset(BtreeCheckState *state, @@ -116,37 +141,47 @@ static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state, static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); /* - * bt_index_check(index regclass) + * bt_index_check(index regclass, heapallindexed boolean) * * Verify integrity of B-Tree index. * * Acquires AccessShareLock on heap & index relations. Does not consider - * invariants that exist between parent/child pages. + * invariants that exist between parent/child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. */ Datum bt_index_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; - bt_index_check_internal(indrelid, false); + if (PG_NARGS() == 2) + heapallindexed = PG_GETARG_BOOL(1); + + bt_index_check_internal(indrelid, false, heapallindexed); PG_RETURN_VOID(); } /* - * bt_index_parent_check(index regclass) + * bt_index_parent_check(index regclass, heapallindexed boolean) * * Verify integrity of B-Tree index. * * Acquires ShareLock on heap & index relations. Verifies that downlinks in - * parent pages are valid lower bounds on child pages. + * parent pages are valid lower bounds on child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. */ Datum bt_index_parent_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; - bt_index_check_internal(indrelid, true); + if (PG_NARGS() == 2) + heapallindexed = PG_GETARG_BOOL(1); + + bt_index_check_internal(indrelid, true, heapallindexed); PG_RETURN_VOID(); } @@ -155,7 +190,7 @@ bt_index_parent_check(PG_FUNCTION_ARGS) * Helper for bt_index_[parent_]check, coordinating the bulk of the work. 
*/ static void -bt_index_check_internal(Oid indrelid, bool parentcheck) +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) { Oid heapid; Relation indrel; @@ -191,9 +226,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck) /* * Since we did the IndexGetRelation call above without any lock, it's * barely possible that a race against an index drop/recreation could have - * netted us the wrong table. Although the table itself won't actually be - * examined during verification currently, a recheck still seems like a - * good idea. + * netted us the wrong table. */ if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) ereport(ERROR, @@ -204,8 +237,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck) /* Relation suitable for checking as B-Tree? */ btree_index_checkable(indrel); - /* Check index */ - bt_check_every_level(indrel, parentcheck); + /* Check index, possibly against table it is an index on */ + bt_check_every_level(indrel, heaprel, parentcheck, heapallindexed); /* * Release locks early. That's ok here because nothing in the called @@ -253,11 +286,14 @@ btree_index_checkable(Relation rel) /* * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in - * logical order, verifying invariants as it goes. + * logical order, verifying invariants as it goes. Optionally, verification + * checks if the heap relation contains any tuples that are not represented in + * the index but should be. * * It is the caller's responsibility to acquire appropriate heavyweight lock on * the index relation, and advise us if extra checks are safe when a ShareLock - * is held. + * is held. (A lock of the same type must also have been acquired on the heap + * relation.) * * A ShareLock is generally assumed to prevent any kind of physical * modification to the index structure, including modifications that VACUUM may @@ -272,7 +308,8 @@ btree_index_checkable(Relation rel) * parent/child check cannot be affected.) */ static void -bt_check_every_level(Relation rel, bool readonly) +bt_check_every_level(Relation rel, Relation heaprel, bool readonly, + bool heapallindexed) { BtreeCheckState *state; Page metapage; @@ -283,15 +320,35 @@ bt_check_every_level(Relation rel, bool readonly) /* * RecentGlobalXmin assertion matches index_getnext_tid(). See note on * RecentGlobalXmin/B-Tree page deletion. + * + * We also rely on TransactionXmin having been initialized by now. 
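+	 * (TransactionXmin is set when the backend takes its first snapshot in
+	 * the current transaction, which must already have happened by the time
+	 * a SQL-callable function like this one is entered.)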
 	 */
 	Assert(TransactionIdIsValid(RecentGlobalXmin));
+	Assert(TransactionIdIsNormal(TransactionXmin));
 
 	/*
 	 * Initialize state for entire verification operation
 	 */
 	state = palloc(sizeof(BtreeCheckState));
 	state->rel = rel;
+	state->heaprel = heaprel;
 	state->readonly = readonly;
+	state->heapallindexed = heapallindexed;
+
+	if (state->heapallindexed)
+	{
+		int64		total_elems;
+		uint32		seed;
+
+		/* Size Bloom filter based on estimated number of tuples in index */
+		total_elems = (int64) state->rel->rd_rel->reltuples;
+		/* Random seed relies on backend srandom() call to avoid repetition */
+		seed = random();
+		/* Create Bloom filter to fingerprint index */
+		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
+		state->heaptuplespresent = 0;
+	}
+
 	/* Create context for page */
 	state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
 												 "amcheck context",
@@ -347,6 +404,61 @@ bt_check_every_level(Relation rel, bool readonly)
 		previouslevel = current.level;
 	}
 
+	/*
+	 * * Heap contains unindexed/malformed tuples check *
+	 */
+	if (state->heapallindexed)
+	{
+		IndexInfo  *indexinfo;
+
+		if (state->readonly)
+			elog(DEBUG1, "verifying presence of all required tuples in index \"%s\"",
+				 RelationGetRelationName(rel));
+		else
+			elog(DEBUG1, "verifying presence of required tuples in index \"%s\" with xmin before %u",
+				 RelationGetRelationName(rel), TransactionXmin);
+
+		indexinfo = BuildIndexInfo(state->rel);
+
+		/*
+		 * Force use of MVCC snapshot (reuse CONCURRENTLY infrastructure) when
+		 * only AccessShareLocks held.  It seems like a good idea to not
+		 * diverge from expected heap lock strength.
+		 */
+		indexinfo->ii_Concurrent = !state->readonly;
+
+		/*
+		 * Don't wait for uncommitted tuple xact commit/abort when index is a
+		 * unique index (or an index used by an exclusion constraint).  It is
+		 * supposed to be impossible to get duplicates with the already-defined
+		 * unique index in place.  Our relation-level locks prevent races
+		 * resulting in false positive corruption errors where an IndexTuple
+		 * insertion was just missed, but we still test its heap tuple.  (While
+		 * this would not be true for !readonly verification, it doesn't matter
+		 * because CREATE INDEX CONCURRENTLY index build heap scanning has no
+		 * special treatment for unique indexes to avoid.)
+		 *
+		 * Not waiting can only affect verification of indexes on system
+		 * catalogs, where heavyweight locks can be dropped before transaction
+		 * commit.  If anything, avoiding waiting slightly improves test
+		 * coverage.
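+		 * (With ii_Unique or the exclusion fields set, IndexBuildHeapScan()
+		 * may block on a concurrent inserter's transaction before deciding
+		 * what to do with an in-progress tuple; clearing all of them below
+		 * ensures that the scan never waits that way.)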
+ */ + indexinfo->ii_Unique = false; + indexinfo->ii_ExclusionOps = NULL; + indexinfo->ii_ExclusionProcs = NULL; + indexinfo->ii_ExclusionStrats = NULL; + + IndexBuildHeapScan(state->heaprel, state->rel, indexinfo, true, + bt_tuple_present_callback, (void *) state); + + ereport(DEBUG1, + (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples (proportion of bits set: %f) from table \"%s\"", + state->heaptuplespresent, bloom_prop_bits_set(state->filter), + RelationGetRelationName(heaprel)))); + + bloom_free(state->filter); + } + /* Be tidy: */ MemoryContextDelete(state->targetcontext); } @@ -499,7 +611,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", current, level.level, opaque->btpo.level))); - /* Verify invariants for page -- all important checks occur here */ + /* Verify invariants for page */ bt_target_page_check(state); nextpage: @@ -546,6 +658,9 @@ nextpage: * * - That all child pages respect downlinks lower bound. * + * This is also where heapallindexed callers use their Bloom filter to + * fingerprint IndexTuples. + * * Note: Memory allocated in this routine is expected to be released by caller * resetting state->targetcontext. */ @@ -589,6 +704,11 @@ bt_target_page_check(BtreeCheckState *state) itup = (IndexTuple) PageGetItem(state->target, itemid); skey = _bt_mkscankey(state->rel, itup); + /* Fingerprint leaf page tuples (those that point to the heap) */ + if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid)) + bloom_add_element(state->filter, (unsigned char *) itup, + IndexTupleSize(itup)); + /* * * High key check * * @@ -682,8 +802,10 @@ bt_target_page_check(BtreeCheckState *state) * * Last item check * * * Check last item against next/right page's first data item's when - * last item on page is reached. This additional check can detect - * transposed pages. + * last item on page is reached. This additional check will detect + * transposed pages iff the supposed right sibling page happens to + * belong before target in the key space. (Otherwise, a subsequent + * heap verification will probably detect the problem.) * * This check is similar to the item order check that will have * already been performed for every other "real" item on target page @@ -1062,6 +1184,134 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, } /* + * Per-tuple callback from IndexBuildHeapScan, used to determine if index has + * all the entries that definitely should have been observed in leaf pages of + * the target index (that is, all IndexTuples that were fingerprinted by our + * Bloom filter). All heapallindexed checks occur here. + * + * Theory of operation: + * + * The redundancy between an index and the table it indexes provides a good + * opportunity to detect corruption, especially corruption within the table. + * The high level principle behind the verification performed here is that any + * IndexTuple that should be in an index following a fresh CREATE INDEX (based + * on the same index definition) should also have been in the original, + * existing index, which should have used exactly the same representation + * (Index tuple formation is assumed to be deterministic, and IndexTuples are + * assumed immutable; while the LP_DEAD bit is mutable, that's ItemId metadata, + * which is not fingerprinted). 
There will often be some dead-to-everyone + * IndexTuples fingerprinted by the Bloom filter, but we only try to detect the + * *absence of needed tuples*, so that's okay. + * + * Since the overall structure of the index has already been verified, the most + * likely explanation for error here is a corrupt heap page (could be logical + * or physical corruption). Index corruption may still be detected here, + * though. Only readonly callers will have verified that left links and right + * links are in agreement, and so it's possible that a leaf page transposition + * within index is actually the source of corruption detected here (for + * !readonly callers). The checks performed only for readonly callers might + * more accurately frame the problem as a cross-page invariant issue (this + * could even be due to recovery not replaying all WAL records). The !readonly + * ERROR message raised here includes a HINT about retrying with readonly + * verification, just in case it's a cross-page invariant issue, though that + * isn't particularly likely. + * + * IndexBuildHeapScan() expects to be able to find the root tuple when a + * heap-only tuple (the live tuple at the end of some HOT chain) needs to be + * indexed, in order to replace the actual tuple's TID with the root tuple's + * TID (which is what we're actually passed back here). The index build heap + * scan code will raise an error when a tuple that claims to be the root of the + * heap-only tuple's HOT chain cannot be located. This catches cases where the + * original root item offset/root tuple for a HOT chain indicates (for whatever + * reason) that the entire HOT chain is dead, despite the fact that the latest + * heap-only tuple should be indexed. When this happens, sequential scans may + * always give correct answers, and all indexes may be considered structurally + * consistent (i.e. the nbtree structural checks would not detect corruption). + * It may be the case that only index scans give wrong answers, and yet heap or + * SLRU corruption is the real culprit. (While it's true that LP_DEAD bit + * setting will probably also leave the index in a corrupt state before too + * long, the problem is nonetheless that there is heap corruption.) + * + * Note also that heap-only tuple handling within IndexBuildHeapScan() detects + * index tuples that contain the wrong values. This can happen when there is + * no superseding index tuple due to a faulty assessment of HOT safety. + * Because the latest tuple's contents are used with the root TID, an error + * will be raised when a tuple with the same TID but different (correct) + * attribute values is passed back to us. (Faulty assessment of HOT-safety was + * behind the CREATE INDEX CONCURRENTLY bug that was fixed in February of + * 2017.) + */ +static void +bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, + bool *isnull, bool tupleIsAlive, void *checkstate) +{ + BtreeCheckState *state = (BtreeCheckState *) checkstate; + IndexTuple itup; + + Assert(state->heapallindexed); + + /* Must recheck visibility when only AccessShareLock held */ + if (!state->readonly) + { + TransactionId xmin; + + /* + * Don't test for presence in index where xmin not at least old enough + * that we know for sure that absence of index tuple wasn't just due to + * some transaction performing insertion after our verifying index + * traversal began. (Actually, the cut-off used is a point where + * preceding write transactions must have committed/aborted. 
We should
+		 * have already fingerprinted all index tuples for all such preceding
+		 * transactions, because the cut-off was established before our index
+		 * traversal even began.)
+		 *
+		 * You might think that the fact that an MVCC snapshot is used by the
+		 * heap scan (due to our indicating that this is the first scan of a
+		 * CREATE INDEX CONCURRENTLY index build) would make this test
+		 * redundant.  That's not quite true, because with the current
+		 * IndexBuildHeapScan() interface, the caller cannot do the MVCC
+		 * snapshot acquisition itself.  Heap tuple coverage is thereby
+		 * similar to the coverage we could get by using the earliest
+		 * transaction snapshot directly.  It's easier to do this than to
+		 * adapt the IndexBuildHeapScan() interface to our narrow requirements.
+		 */
+		Assert(tupleIsAlive);
+		xmin = HeapTupleHeaderGetXmin(htup->t_data);
+		if (!TransactionIdPrecedes(xmin, TransactionXmin))
+			return;
+	}
+
+	/*
+	 * Generate an index tuple.
+	 *
+	 * Note that we rely on deterministic index_form_tuple() TOAST compression.
+	 * If index_form_tuple() was ever enhanced to compress datums out-of-line,
+	 * or otherwise varied when or how compression was applied, our assumption
+	 * would break, leading to false positive reports of corruption.  For now,
+	 * we don't decompress/normalize toasted values as part of fingerprinting.
+	 */
+	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+	itup->t_tid = htup->t_self;
+
+	/* Probe Bloom filter -- tuple should be present */
+	if (bloom_lacks_element(state->filter, (unsigned char *) itup,
+							IndexTupleSize(itup)))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
+						ItemPointerGetBlockNumber(&(itup->t_tid)),
+						ItemPointerGetOffsetNumber(&(itup->t_tid)),
+						RelationGetRelationName(state->heaprel),
+						RelationGetRelationName(state->rel)),
+				 !state->readonly
+				 ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")
+				 : 0));
+
+	state->heaptuplespresent++;
+	pfree(itup);
+}
+
+/*
  * Is particular offset within page (whose special state is passed by caller)
  * the page negative-infinity item?
  *
diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml
index 0dd68f0..bff1116 100644
--- a/doc/src/sgml/amcheck.sgml
+++ b/doc/src/sgml/amcheck.sgml
@@ -44,7 +44,7 @@
-      bt_index_check(index regclass) returns void
+      bt_index_check(index regclass, heapallindexed boolean DEFAULT false) returns void
       bt_index_check
@@ -55,7 +55,9 @@
   bt_index_check tests that its target, a
   B-Tree index, respects a variety of invariants.  Example usage:
-test=# SELECT bt_index_check(c.oid), c.relname, c.relpages
+test=# SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique),
+               c.relname,
+               c.relpages
 FROM pg_index i
 JOIN pg_opclass op ON i.indclass[0] = op.oid
 JOIN pg_am am ON op.opcmethod = am.oid
@@ -83,9 +85,11 @@ ORDER BY c.relpages DESC LIMIT 10;
    This example shows a session that performs verification of every
    catalog index in the database test.  Details of just
-   the 10 largest indexes verified are displayed.  Since no error
-   is raised, all indexes tested appear to be logically consistent.
-   Naturally, this query could easily be changed to call
+   the 10 largest indexes verified are displayed.  Verification of
+   the presence of heap tuples as index tuples is requested for
+   unique indexes only.  Since no error is raised, all indexes
+   tested appear to be logically consistent.
Naturally, this query + could easily be changed to call bt_index_check for every index in the database where verification is supported. @@ -95,10 +99,11 @@ ORDER BY c.relpages DESC LIMIT 10; is the same lock mode acquired on relations by simple SELECT statements. bt_index_check does not verify invariants - that span child/parent relationships, nor does it verify that - the target index is consistent with its heap relation. When a - routine, lightweight test for corruption is required in a live - production environment, using + that span child/parent relationships, but will verify the + presence of all heap tuples as index tuples within the index + when heapallindexed is + true. When a routine, lightweight test for + corruption is required in a live production environment, using bt_index_check often provides the best trade-off between thoroughness of verification and limiting the impact on application performance and availability. @@ -108,7 +113,7 @@ ORDER BY c.relpages DESC LIMIT 10; - bt_index_parent_check(index regclass) returns void + bt_index_parent_check(index regclass, heapallindexed boolean DEFAULT false) returns void bt_index_parent_check @@ -117,30 +122,34 @@ ORDER BY c.relpages DESC LIMIT 10; bt_index_parent_check tests that its - target, a B-Tree index, respects a variety of invariants. The - checks performed by bt_index_parent_check - are a superset of the checks performed by - bt_index_check. + target, a B-Tree index, respects a variety of invariants. + Optionally, when the heapallindexed + argument is true, the function verifies the + presence of all heap tuples that should be found within the + index. The checks performed by + bt_index_parent_check are a superset of the + checks performed by bt_index_check when + called with the same options. bt_index_parent_check can be thought of as a more thorough variant of bt_index_check: unlike bt_index_check, bt_index_parent_check also checks - invariants that span parent/child relationships. However, it - does not verify that the target index is consistent with its - heap relation. bt_index_parent_check - follows the general convention of raising an error if it finds a - logical inconsistency or other problem. + invariants that span parent/child relationships. + bt_index_parent_check follows the general + convention of raising an error if it finds a logical + inconsistency or other problem. - A ShareLock is required on the target index by - bt_index_parent_check (a - ShareLock is also acquired on the heap relation). - These locks prevent concurrent data modification from - INSERT, UPDATE, and DELETE - commands. The locks also prevent the underlying relation from - being concurrently processed by VACUUM, as well as - all other utility commands. Note that the function holds locks - only while running, not for the entire transaction. + A ShareLock is required on the target index + by bt_index_parent_check (a + ShareLock is also acquired on the heap + relation). These locks prevent concurrent data modification + from INSERT, UPDATE, and + DELETE commands. The locks also prevent the + underlying relation from being concurrently processed by + VACUUM, as well as all other utility + commands. Note that the function holds locks only while + running, not for the entire transaction. 
bt_index_parent_check's additional @@ -159,6 +168,72 @@ ORDER BY c.relpages DESC LIMIT 10; + Optional <parameter>heapallindexed</parameter> verification + + When the heapallindexed argument to + verification functions is true, an additional + phase of verification is performed against the table associated with + the target index relation. This consists of a dummy + CREATE INDEX operation, which checks for the + presence of all would-be new index tuples against a temporary, + in-memory summarizing structure (this is built when needed during + the first, standard phase). The summarizing structure + fingerprints every tuple found within the target + index. The high level principle behind + heapallindexed verification is that a new + index that is equivalent to the existing, target index must only + have entries that can be found in the existing structure. + + + The additional heapallindexed phase adds + significant overhead: verification will typically take several times + longer than it would with only the standard consistency checking of + the target index's structure. However, verification will still take + significantly less time than an actual CREATE + INDEX. There is no change to the relation-level locks + acquired when heapallindexed verification is + performed. The summarizing structure is bound in size by + maintenance_work_mem. In order to ensure that + there is no more than a 2% probability of failure to detect the + absence of any particular index tuple, approximately 2 bytes of + memory are needed per index tuple. As less memory is made available + per index tuple, the probability of missing an inconsistency + increases. This is considered an acceptable trade-off, since it + limits the overhead of verification very significantly, while only + slightly reducing the probability of detecting a problem, especially + for installations where verification is treated as a routine + maintenance task. + + + With many databases, even the default + maintenance_work_mem setting of + 64MB is sufficient to have less than a 2% + probability of overlooking any single absent or corrupt tuple. This + will be the case when there are no indexes with more than about 30 + million distinct index tuples, regardless of the overall size of any + index, the total number of indexes, or anything else. False + positive candidate tuple membership tests within the summarizing + structure occur at random, and are very unlikely to be the same for + repeat verification operations. Furthermore, within a single + verification operation, each missing or malformed index tuple + independently has the same chance of being detected. If there is + any inconsistency at all, it isn't particularly likely to be limited + to a single tuple. All of these factors favor accepting a limited + per operation per tuple probability of missing corruption, in order + to enable performing more thorough index to heap verification more + frequently (practical concerns about the overhead of verification + are likely to limit the frequency of verification). In aggregate, + the probability of detecting a hardware fault or software defect + actually increases significantly with this + strategy in most real world cases. Moreover, frequent verification + allows problems to be caught earlier on average, which helps to + limit the overall impact of corruption, and often simplifies root + cause analysis. 
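
   To make the arithmetic above concrete: a standard Bloom filter holding n
   elements in m bits with k hash functions has a false positive rate of
   roughly (1 - e^(-kn/m))^k.  The figures quoted in this section (about 2
   bytes per index tuple for at most a 2% chance of missing any one absent
   tuple) are consistent with k = 2 hash functions; that value of k is an
   assumption of the following sketch, not something this patch states here.
   A small standalone C program (compile with -lm) reproduces the numbers:

	#include <math.h>
	#include <stdio.h>

	/* Standard Bloom filter false positive rate: (1 - e^(-k*n/m))^k */
	static double
	bloom_false_positive_rate(double bits_per_element, int nhashes)
	{
		return pow(1.0 - exp(-(double) nhashes / bits_per_element), nhashes);
	}

	int
	main(void)
	{
		/* 2 bytes (16 bits) per element, k = 2: ~1.4%, under the 2% bound */
		printf("16 bits/element, k=2: %.4f\n", bloom_false_positive_rate(16.0, 2));
		/* halving memory to 1 byte per element worsens this to roughly 5% */
		printf(" 8 bits/element, k=2: %.4f\n", bloom_false_positive_rate(8.0, 2));
		/* a 64MB filter at 2 bytes per element covers ~33.5 million tuples */
		printf("tuples covered by 64MB: %.0f\n", 64.0 * 1024.0 * 1024.0 / 2.0);
		return 0;
	}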
+ + + + + Using <filename>amcheck</filename> effectively @@ -199,18 +274,33 @@ ORDER BY c.relpages DESC LIMIT 10; + Structural inconsistencies between indexes and the heap relations + that are indexed (when heapallindexed + verification is performed). + + + There is no cross-checking of indexes against their heap relation + during normal operation. Symptoms of heap corruption can be very + subtle. + + + + Corruption caused by hypothetical undiscovered bugs in the - underlying PostgreSQL access method code or sort - code. + underlying PostgreSQL access method + code, sort code, or transaction management code. Automatic verification of the structural integrity of indexes plays a role in the general testing of new or proposed PostgreSQL features that could plausibly allow a - logical inconsistency to be introduced. One obvious testing - strategy is to call amcheck functions continuously + logical inconsistency to be introduced. Verification of table + structure and associated visibility and transaction status + information plays a similar role. One obvious testing strategy + is to call amcheck functions continuously when running the standard regression tests. See for details on running the tests. + linkend="regress-run"> for details on running + the tests. @@ -242,6 +332,12 @@ ORDER BY c.relpages DESC LIMIT 10; absolute protection against failures that result in memory corruption. + + When heapallindexed verification is + performed, there is generally a greatly increased chance of + detecting single-bit errors, since strict binary equality is + tested, and the indexed attributes within the heap are tested. + In general, amcheck can only prove the presence of @@ -252,12 +348,11 @@ ORDER BY c.relpages DESC LIMIT 10; Repairing corruption - No error concerning corruption raised by amcheck should - ever be a false positive. In practice, amcheck is more - likely to find software bugs than problems with hardware. - amcheck raises errors in the event of conditions that, - by definition, should never happen, and so careful analysis of - amcheck errors is often required. + No error concerning corruption raised by amcheck should + ever be a false positive. amcheck raises + errors in the event of conditions that, by definition, should never + happen, and so careful analysis of amcheck + errors is often required. There is no general method of repairing problems that diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index fc64153..565260f 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -56,7 +56,7 @@ extern TimestampTz GetOldSnapshotThresholdTimestamp(void); extern bool FirstSnapshotSet; -extern TransactionId TransactionXmin; +extern PGDLLIMPORT TransactionId TransactionXmin; extern TransactionId RecentXmin; extern PGDLLIMPORT TransactionId RecentGlobalXmin; extern TransactionId RecentGlobalDataXmin; -- 2.7.4
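
The two-phase flow this patch adds to amcheck -- fingerprint every index
tuple into a Bloom filter while walking the index, then re-form each heap
row's expected index tuple during a heap scan and report any row whose tuple
the filter cannot contain -- can be modeled with a toy filter.  The sketch
below is illustrative only, not PostgreSQL code: the toy_* routines loosely
imitate the bloom_create()/bloom_add_element()/bloom_lacks_element() API from
lib/bloomfilter.h that the patch uses, but the real filter is sized from
reltuples and maintenance_work_mem, and the second phase runs inside an
IndexBuildHeapScan() callback rather than a simple loop.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FILTER_BITS (1u << 16)	/* fixed toy size; the real code computes this */

typedef struct toy_bloom
{
	unsigned char bits[FILTER_BITS / 8];
	uint32_t	seed;
} toy_bloom;

/* Salted FNV-1a hash, reduced to a bit position (real filter hashes differ) */
static uint32_t
toy_hash(const unsigned char *data, size_t len, uint32_t salt)
{
	uint32_t	h = 2166136261u ^ salt;
	size_t		i;

	for (i = 0; i < len; i++)
	{
		h ^= data[i];
		h *= 16777619u;
	}
	return h % FILTER_BITS;
}

/* phase 1 primitive: set both hash bits for one fingerprinted element */
static void
toy_add(toy_bloom *f, const unsigned char *data, size_t len)
{
	uint32_t	h1 = toy_hash(data, len, f->seed);
	uint32_t	h2 = toy_hash(data, len, f->seed + 1);

	f->bits[h1 / 8] |= 1 << (h1 % 8);
	f->bits[h2 / 8] |= 1 << (h2 % 8);
}

/* phase 2 primitive: true only if the element was definitely never added */
static int
toy_lacks(const toy_bloom *f, const unsigned char *data, size_t len)
{
	uint32_t	h1 = toy_hash(data, len, f->seed);
	uint32_t	h2 = toy_hash(data, len, f->seed + 1);

	return !(f->bits[h1 / 8] & (1 << (h1 % 8))) ||
		!(f->bits[h2 / 8] & (1 << (h2 % 8)));
}

int
main(void)
{
	toy_bloom	filter = {{0}, 42};
	/* tuples actually present in the index (collected during the index walk) */
	const char *indexed[] = {"(0,1) alice", "(0,2) bob"};
	/* tuples the heap says ought to be indexed (seen during the heap scan) */
	const char *heap[] = {"(0,1) alice", "(0,2) bob", "(0,3) carol"};
	int			i;

	for (i = 0; i < 2; i++)
		toy_add(&filter, (const unsigned char *) indexed[i], strlen(indexed[i]));

	for (i = 0; i < 3; i++)
		if (toy_lacks(&filter, (const unsigned char *) heap[i], strlen(heap[i])))
			fprintf(stderr, "heap tuple %s lacks matching index tuple\n", heap[i]);
	return 0;
}

Run as-is, this flags only "(0,3) carol", the analogue of a heap tuple whose
index entry has gone missing.  A Bloom filter probe can wrongly pass a tuple
(missed corruption, at the roughly 2% rate the documentation discusses) but
can never wrongly fail one, which is why the documentation can still promise
that no error amcheck raises is ever a false positive.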