From 8f141b150a442f54a5d271e430e173f1942fbde6 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 5 Aug 2020 16:07:46 -0700 Subject: [PATCH v4] Add amcheck sibling link checks. Teach contrib/amcheck's bt_index_check() function to check agreement between left links and right links between siblings. This was always possible with bt_index_parent_check(), but the check tends to catch a lot of real world issues, and can only be used on replicas with the former function (since it only requires an AccessShareLock). --- contrib/amcheck/verify_nbtree.c | 123 ++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 13 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index c9f9e755dc..cf1860110f 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, @@ -775,17 +778,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ } - /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement - */ - if (state->readonly && opaque->btpo_prev != leftcurrent) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("left link/right link pair in index \"%s\" not in agreement", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + /* Sibling links should be in mutual agreement */ + if (opaque->btpo_prev != leftcurrent) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ if (level.level != opaque->btpo.level) @@ -865,6 +860,106 @@ nextpage: return nextleveldown; } +/* + * Raise an error when target page's left link does not match the right link + * from the page that was previously the target page (called leftcurrent + * here). Make sure that this condition has definitely been violated in the + * !readonly case, where the apparent violation of this invariant is probably + * just due to a harmless concurrent page split. + * + * This is the only place where amcheck will ever "couple" buffer locks (in + * the style of_bt_check_unique). This is only needed in !readonly mode. We + * generally prefer to avoid more thorough cross-page checks in !readonly + * mode, but this case is relatively important, and isn't very disruptive in + * practice. Also, we probably won't end up here all that often when the + * index isn't corrupt (that only happens in the event of a concurrent page + * split), so the performance impact of lock coupling here should be + * acceptable in practice. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + if (!state->readonly) + { + Buffer lbuffer; + Buffer rbuffer; + Page page; + BTPageOpaque opaque; + BlockNumber rightblock; + + /* + * Actual recheck is required when we only have an AccessShareLock on + * relation. Couple locks in the usual order for nbtree: Left to + * right. + */ + lbuffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuffer, BT_READ); + _bt_checkpage(state->rel, lbuffer); + page = BufferGetPage(lbuffer); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + rightblock = opaque->btpo_next; + + /* + * Avoid deadlock in rare cases where sibling link points to the page + * that currently contains it + */ + if (rightblock != leftcurrent) + { + rbuffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, rightblock, + RBM_NORMAL, state->checkstrategy); + LockBuffer(rbuffer, BT_READ); + _bt_checkpage(state->rel, rbuffer); + page = BufferGetPage(rbuffer); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * The "cross page item order invariant" check will generally + * catch this first, but don't rely on that from this distance + */ + rbuffer = InvalidBuffer; + btpo_prev_from_target = P_NONE; + } + + if (BufferIsValid(rbuffer)) + UnlockReleaseBuffer(rbuffer); + UnlockReleaseBuffer(lbuffer); + + if (btpo_prev_from_target == leftcurrent) + { + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg("harmless concurrent page split detected in index %s", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u right sibling=%u.", + leftcurrent, rightblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target. This + * theoretically could have changed in rare cases where there is index + * corruption as well as a concurrent page split. (Note that + * btpo_prev_from_target was already updated above.) + */ + state->targetblock = rightblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: @@ -1964,8 +2059,8 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * On the other hand, we'd need to lock down race conditions involving * deletion of child's left page, for long enough to read the child page * into memory (in other words, a scheme with concurrently held buffer - * locks on both child and left-of-child pages). That's unacceptable for - * amcheck functions on general principle, though. + * locks on both child and left-of-child pages). That hardly seems worth + * the trouble. */ Assert(state->readonly); @@ -2774,6 +2869,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, * There is never an attempt to get a consistent view of multiple pages using * multiple concurrent buffer locks; in general, we only acquire a single pin * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) * * Operating on a copy of the page is useful because it prevents control * getting stuck in an uninterruptible state when an underlying operator class -- 2.25.1