From b84934beb3fd08ab09fec2d2dd68168453e0b9b0 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Thu, 30 Apr 2020 12:31:42 -0700
Subject: [PATCH v3 2/2] Add nbtree Valgrind buffer lock checks.

Teach nbtree to use Valgrind memcheck client requests to mark pages as
NOACCESS at the point the buffer content lock is released.  Callers
usually also drop their pin at the same time, though there are some
exceptions.

This instrumentation makes Valgrind detect any page access that occurs
without a buffer lock, which is fundamentally unsafe with B-Tree index
pages.  Buffer pins only prevent VACUUM from concurrently recycling
heap TIDs from the pinned leaf page.  Our assumption is that they don't
make it safe to access the page in any way whatsoever.

We provide nbtree-specific wrappers for LockBuffer() and related
functions.  All nbtree code is required to use the wrappers.

These checks are a stricter version of the memcheck client requests in
bufmgr.c that detect buffer accesses that take place without a buffer
pin held.  They never make the generic buffer pin checking any less
strict.
---
 src/include/access/nbtree.h           |   4 +
 src/backend/access/nbtree/nbtinsert.c |   2 +-
 src/backend/access/nbtree/nbtpage.c   | 128 ++++++++++++++++++++++----
 src/backend/access/nbtree/nbtree.c    |   6 +-
 src/backend/access/nbtree/nbtsearch.c |  16 ++--
 src/backend/access/nbtree/nbtutils.c  |   4 +-
 6 files changed, 127 insertions(+), 33 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 79506c748b..f5274cc750 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1073,6 +1073,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b86c122763..e3a44bc09e 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 	{
 		/* Simulate a _bt_getbuf() call with conditional locking */
 		insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-		if (ConditionalLockBuffer(insertstate->buf))
+		if (_bt_conditionallockbuf(rel, insertstate->buf))
 		{
 			Page		page;
 			BTPageOpaque lpageop;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 75628e0eb9..a516600a85 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,6 +32,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memdebug.h"
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -198,8 +199,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	}
 
 	/* trade in our read lock for a write lock */
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(metabuf, BT_WRITE);
+	_bt_unlockbuf(rel, metabuf);
+	_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 	START_CRIT_SECTION();
 
@@ -340,8 +341,8 @@ _bt_getroot(Relation rel, int access)
 	}
 
 	/* trade in our read lock for a write lock */
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(metabuf, BT_WRITE);
+	_bt_unlockbuf(rel, metabuf);
+	_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 	/*
 	 * Race condition:	if someone else initialized the metadata between
@@ -434,8 +435,8 @@ _bt_getroot(Relation rel, int access)
 	 * else accessing the new root page while it's unlocked, since no one
 	 * else knows where it is yet.
 	 */
-	LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(rootbuf, BT_READ);
+	_bt_unlockbuf(rel, rootbuf);
+	_bt_lockbuf(rel, rootbuf, BT_READ);
 
 	/* okay, metadata is correct, release lock on it without caching */
 	_bt_relbuf(rel, metabuf);
@@ -786,7 +787,26 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
  *		When this routine returns, the appropriate lock is set on the
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").  Also, we apply
- *		_bt_checkpage to sanity-check the page (except in P_NEW case).
+ *		_bt_checkpage to sanity-check the page (except in P_NEW case),
+ *		and perform Valgrind client requests that indicate whether or
+ *		not it's currently safe to access the buffer (i.e. whether or
+ *		not a lock is held on the buffer).
+ *
+ *		The general rule in nbtree is that it's never okay to access a
+ *		page without holding both a buffer pin and a buffer lock on
+ *		the page's buffer.  This is slightly stricter than necessary,
+ *		but it makes the Valgrind checks simpler.
+ *
+ *		The Valgrind checks are a strict superset of the generic
+ *		buffer pin checks in bufmgr.c.  Raw LockBuffer() calls are
+ *		disallowed in nbtree; all buffer lock requests need to go
+ *		through the wrapper functions.  The client requests here rely
+ *		on the generic bufmgr.c client requests to mark memory as
+ *		defined when a shared buffer is reused for another page in the
+ *		event of an error.  That's why we don't mark pages as NOACCESS
+ *		when they're in a local buffer: it can lead to false positives
+ *		from Valgrind.  (Besides, local buffers are protected by the
+ *		generic aset.c client requests.)
  */
 Buffer
 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
@@ -797,7 +817,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	{
 		/* Read an existing block of the relation */
 		buf = ReadBuffer(rel, blkno);
-		LockBuffer(buf, access);
+		_bt_lockbuf(rel, buf, access);
 		_bt_checkpage(rel, buf);
 	}
 	else
@@ -837,7 +857,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			if (blkno == InvalidBlockNumber)
 				break;
 			buf = ReadBuffer(rel, blkno);
-			if (ConditionalLockBuffer(buf))
+			if (_bt_conditionallockbuf(rel, buf))
 			{
 				page = BufferGetPage(buf);
 				if (_bt_page_recyclable(page))
@@ -890,7 +910,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		buf = ReadBuffer(rel, P_NEW);
 
 		/* Acquire buffer lock on new page */
-		LockBuffer(buf, BT_WRITE);
+		_bt_lockbuf(rel, buf, BT_WRITE);
 
 		/*
 		 * Release the file-extension lock; it's now OK for someone else to
@@ -931,9 +951,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
 	Assert(blkno != P_NEW);
 	if (BufferIsValid(obuf))
-		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, obuf);
 	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
-	LockBuffer(buf, access);
+	_bt_lockbuf(rel, buf, access);
+	_bt_checkpage(rel, buf);
 	return buf;
 }
 
@@ -946,7 +967,76 @@
 void
 _bt_relbuf(Relation rel, Buffer buf)
 {
-	UnlockReleaseBuffer(buf);
+	_bt_unlockbuf(rel, buf);
+	ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin.  This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+	LockBuffer(buf, access);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to be
+	 * defined and addressable in general.  Make sure of it.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ * buffer.
+ *
+ * Note: Caller is responsible for calling _bt_checkpage() on success.
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+	if (!ConditionalLockBuffer(buf))
+		return false;
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	return true;
+}
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to be
+	 * defined and addressable in general.  Make sure of it.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	LockBufferForCleanup(buf);
 }
 
 /*
@@ -1580,7 +1670,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 			 * To avoid deadlocks, we'd better drop the leaf page lock
 			 * before going further.
 			 */
-			LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+			_bt_unlockbuf(rel, leafbuf);
 
 			/*
 			 * Check that the left sibling of leafbuf (if any) is not
@@ -1616,7 +1706,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 			 * (Page deletion can cope with the stack being to the left of
 			 * leafbuf, but not to the right of leafbuf.)
 			 */
-			LockBuffer(leafbuf, BT_WRITE);
+			_bt_lockbuf(rel, leafbuf, BT_WRITE);
 			continue;
 		}
 
@@ -1970,7 +2060,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	leafleftsib = opaque->btpo_prev;
 	leafrightsib = opaque->btpo_next;
 
-	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, leafbuf);
 
 	/*
 	 * Check here, as calling loops will have locks held, preventing
@@ -2005,7 +2095,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * To avoid deadlocks, we'd better drop the target page lock before
 	 * going further.
 	 */
-	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, buf);
 }
 
 /*
@@ -2022,7 +2112,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * table.)
 	 */
 	if (target != leafblkno)
-		LockBuffer(leafbuf, BT_WRITE);
+		_bt_lockbuf(rel, leafbuf, BT_WRITE);
 	if (leftsib != P_NONE)
 	{
 		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2072,7 +2162,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * rather than a superexclusive lock, since no scan will stop on an empty
 	 * page.
 	 */
-	LockBuffer(buf, BT_WRITE);
+	_bt_lockbuf(rel, buf, BT_WRITE);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e947addef6..cf409c4c8a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1115,7 +1115,8 @@ backtrack:
 	 */
 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 							 info->strategy);
-	LockBuffer(buf, BT_READ);
+	_bt_lockbuf(rel, buf, BT_READ);
+
 	page = BufferGetPage(buf);
 	opaque = NULL;
 	if (!PageIsNew(page))
@@ -1222,8 +1223,7 @@ backtrack:
 	 * course of the vacuum scan, whether or not it actually contains any
 	 * deletable tuples --- see nbtree/README.
 	 */
-	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	LockBufferForCleanup(buf);
+	_bt_upgradelockbufcleanup(rel, buf);
 
 	/*
 	 * Check whether we need to backtrack to earlier pages.  What we are
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 4534224812..b1aaa446f8 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
-	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
@@ -189,8 +189,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		if (access == BT_WRITE && page_access == BT_READ)
 		{
 			/* trade in our read lock for a write lock */
-			LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
-			LockBuffer(*bufP, BT_WRITE);
+			_bt_unlockbuf(rel, *bufP);
+			_bt_lockbuf(rel, *bufP, BT_WRITE);
 
 			/*
 			 * If the page was split between the time that we surrendered our read
@@ -292,8 +292,8 @@ _bt_moveright(Relation rel,
 			/* upgrade our lock if necessary */
 			if (access == BT_READ)
 			{
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-				LockBuffer(buf, BT_WRITE);
+				_bt_unlockbuf(rel, buf);
+				_bt_lockbuf(rel, buf, BT_WRITE);
 			}
 
 			if (P_INCOMPLETE_SPLIT(opaque))
@@ -1416,7 +1416,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
@@ -2064,7 +2064,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 		 * deleted.
 		 */
 		if (BTScanPosIsPinned(so->currPos))
-			LockBuffer(so->currPos.buf, BT_READ);
+			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 		else
 			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage,
 										 BT_READ);
@@ -2442,7 +2442,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7c33711a9f..81589b9056 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		LockBuffer(so->currPos.buf, BT_READ);
+		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
@@ -1885,7 +1885,7 @@ _bt_killitems(IndexScanDesc scan)
 			MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
-	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 }
-- 
2.25.1
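
Reviewer postscript (not part of the patch): the wrappers above lean
entirely on Valgrind's memcheck client requests, which may be
unfamiliar.  The standalone sketch below shows the same marking pattern
outside PostgreSQL.  lock_page(), unlock_page(), and PAGE_SZ are
invented here purely for illustration; the VALGRIND_* macros are the
real ones from <valgrind/memcheck.h> (nbtree reaches them through
utils/memdebug.h).  Assuming the Valgrind development headers are
installed, compile it normally and run it under "valgrind ./demo":
memcheck reports an invalid read for the access made after
unlock_page(), while the macros compile to no-ops when run outside
Valgrind.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <valgrind/memcheck.h>

#define PAGE_SZ 8192

/* Simulate acquiring a buffer content lock: page may now be accessed */
static void
lock_page(char *page)
{
	VALGRIND_MAKE_MEM_DEFINED(page, PAGE_SZ);
}

/* Simulate releasing the lock: any later access becomes an error */
static void
unlock_page(char *page)
{
	/* page must still be fully defined while the "lock" is held */
	VALGRIND_CHECK_MEM_IS_DEFINED(page, PAGE_SZ);
	VALGRIND_MAKE_MEM_NOACCESS(page, PAGE_SZ);
}

int
main(void)
{
	char	   *page = malloc(PAGE_SZ);

	memset(page, 0, PAGE_SZ);	/* initialize the "page" */

	lock_page(page);
	page[42] = 'x';				/* OK: "lock" held */
	unlock_page(page);

	printf("%c\n", page[42]);	/* memcheck: invalid read of size 1 */

	lock_page(page);			/* mark addressable again ... */
	free(page);					/* ... so the free itself is clean */
	return 0;
}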