diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 3e47c37..55c7833 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -214,6 +214,7 @@ bool btgettuple(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanState state = &so->state; bool res; /* btree indexes are never lossy */ @@ -224,7 +225,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * scan. We can't do this in btrescan because we don't know the scan * direction at that time. */ - if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) + if (so->numArrayKeys && !BTScanPosIsValid(state->currPos)) { /* punt if we have any unsatisfiable array keys */ if (so->numArrayKeys < 0) @@ -241,7 +242,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * the appropriate direction. If we haven't done so yet, we call * _bt_first() to get the first item in the scan. */ - if (!BTScanPosIsValid(so->currPos)) + if (!BTScanPosIsValid(state->currPos)) res = _bt_first(scan, dir); else { @@ -259,11 +260,11 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * trying to optimize that, so we don't detect it, but instead * just forget any excess entries. */ - if (so->killedItems == NULL) - so->killedItems = (int *) + if (state->killedItems == NULL) + state->killedItems = (int *) palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = so->currPos.itemIndex; + if (state->numKilled < MaxIndexTuplesPerPage) + state->killedItems[state->numKilled++] = state->currPos.itemIndex; } /* @@ -288,6 +289,7 @@ int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &so->state.currPos; int64 ntids = 0; ItemPointer heapTid; @@ -320,7 +322,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * Advance to next tuple within page. This is the same as the * easy case in _bt_next(). 
*/ - if (++so->currPos.itemIndex > so->currPos.lastItem) + if (++currPos->itemIndex > currPos->lastItem) { /* let _bt_next do the heavy lifting */ if (!_bt_next(scan, ForwardScanDirection)) @@ -328,7 +330,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) } /* Save tuple ID, and continue scanning */ - heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; + heapTid = &currPos->items[currPos->itemIndex].heapTid; tbm_add_tuples(tbm, heapTid, 1, false); ntids++; } @@ -356,8 +358,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) /* allocate private workspace */ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - BTScanPosInvalidate(so->currPos); - BTScanPosInvalidate(so->markPos); + BTScanPosInvalidate(so->state.currPos); + BTScanPosInvalidate(so->state.markPos); if (scan->numberOfKeys > 0) so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); else @@ -368,15 +370,15 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->arrayKeys = NULL; so->arrayContext = NULL; - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; + so->state.killedItems = NULL; /* until needed */ + so->state.numKilled = 0; /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. However, we set up * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. 
*/ - so->currTuples = so->markTuples = NULL; + so->state.currTuples = so->state.markTuples = NULL; scan->xs_itupdesc = RelationGetDescr(rel); @@ -385,6 +387,45 @@ btbeginscan(Relation rel, int nkeys, int norderbys) return scan; } +static void +_bt_release_current_position(BTScanState state, Relation indexRelation, + bool invalidate) +{ + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(state->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (state->numKilled > 0) + _bt_killitems(state, indexRelation); + + BTScanPosUnpinIfPinned(state->currPos); + + if (invalidate) + BTScanPosInvalidate(state->currPos); + } +} + +static void +_bt_release_scan_state(IndexScanDesc scan, BTScanState state, bool free) +{ + /* No need to invalidate positions, if the RAM is about to be freed. */ + _bt_release_current_position(state, scan->indexRelation, !free); + + state->markItemIndex = -1; + BTScanPosUnpinIfPinned(state->markPos); + + if (free) + { + if (state->killedItems != NULL) + pfree(state->killedItems); + if (state->currTuples != NULL) + pfree(state->currTuples); + /* markTuples should not be pfree'd (_bt_allocate_tuple_workspaces) */ + } + else + BTScanPosInvalidate(state->markPos); +} + /* * btrescan() -- rescan an index relation */ @@ -393,21 +434,11 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanState state = &so->state; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } + _bt_release_scan_state(scan, state, false); - so->markItemIndex = -1; - so->arrayKeyCount = 0; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); + 
so->arrayKeyCount = 0; /* FIXME in _bt_release_scan_state */ /* * Allocate tuple workspace arrays, if needed for an index-only scan and @@ -425,11 +456,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats * adding special-case treatment for name_ops elsewhere. */ - if (scan->xs_want_itup && so->currTuples == NULL) - { - so->currTuples = (char *) palloc(BLCKSZ * 2); - so->markTuples = so->currTuples + BLCKSZ; - } + if (scan->xs_want_itup && state->currTuples == NULL) + _bt_allocate_tuple_workspaces(state); /* * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. @@ -453,19 +481,7 @@ btendscan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - - /* No need to invalidate positions, the RAM is about to be freed. */ + _bt_release_scan_state(scan, &so->state, true); /* Release storage */ if (so->keyData != NULL) @@ -473,24 +489,15 @@ btendscan(IndexScanDesc scan) /* so->arrayKeyData and so->arrayKeys are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); - if (so->killedItems != NULL) - pfree(so->killedItems); - if (so->currTuples != NULL) - pfree(so->currTuples); - /* so->markTuples should not be pfree'd, see btrescan */ + pfree(so); } -/* - * btmarkpos() -- save current scan position - */ -void -btmarkpos(IndexScanDesc scan) +static void +_bt_mark_current_position(BTScanState state) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* There may be an old mark with a pin (but no lock). 
*/ - BTScanPosUnpinIfPinned(so->markPos); + BTScanPosUnpinIfPinned(state->markPos); /* * Just record the current itemIndex. If we later step to next page @@ -498,32 +505,34 @@ btmarkpos(IndexScanDesc scan) * the currPos struct in markPos. If (as often happens) the mark is moved * before we leave the page, we don't have to do that work. */ - if (BTScanPosIsValid(so->currPos)) - so->markItemIndex = so->currPos.itemIndex; + if (BTScanPosIsValid(state->currPos)) + state->markItemIndex = state->currPos.itemIndex; else { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; + BTScanPosInvalidate(state->markPos); + state->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* - * btrestrpos() -- restore scan to last saved position + * btmarkpos() -- save current scan position */ void -btrestrpos(IndexScanDesc scan) +btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ + _bt_mark_current_position(&so->state); + + /* Also record the current positions of any array keys */ if (so->numArrayKeys) - _bt_restore_array_keys(scan); + _bt_mark_array_keys(scan); +} - if (so->markItemIndex >= 0) +static void +_bt_restore_marked_position(IndexScanDesc scan, BTScanState state) +{ + if (state->markItemIndex >= 0) { /* * The scan has never moved to a new page since the last mark. Just @@ -532,7 +541,7 @@ btrestrpos(IndexScanDesc scan) * NB: In this case we can't count on anything in so->markPos to be * accurate. */ - so->currPos.itemIndex = so->markItemIndex; + state->currPos.itemIndex = state->markItemIndex; } else { @@ -542,28 +551,21 @@ btrestrpos(IndexScanDesc scan) * locks, but if we're still holding the pin for the current position, * we must drop it. 
*/ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } + _bt_release_current_position(state, scan->indexRelation, + !BTScanPosIsValid(state->markPos)); - if (BTScanPosIsValid(so->markPos)) + if (BTScanPosIsValid(state->markPos)) { /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); - memcpy(&so->currPos, &so->markPos, + if (BTScanPosIsPinned(state->markPos)) + IncrBufferRefCount(state->markPos.buf); + memcpy(&state->currPos, &state->markPos, offsetof(BTScanPosData, items[1]) + - so->markPos.lastItem * sizeof(BTScanPosItem)); - if (so->currTuples) - memcpy(so->currTuples, so->markTuples, - so->markPos.nextTupleOffset); + state->markPos.lastItem * sizeof(BTScanPosItem)); + if (state->currTuples) + memcpy(state->currTuples, state->markTuples, + state->markPos.nextTupleOffset); } - else - BTScanPosInvalidate(so->currPos); } } @@ -844,6 +846,21 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) } /* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* Restore the marked positions of any array keys */ + if (so->numArrayKeys) + _bt_restore_array_keys(scan); + + _bt_restore_marked_position(scan, &so->state); +} + +/* * Bulk deletion of all index entries pointing to a set of heap tuples. 
* The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d3700bd..2b63e0c 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,18 +25,19 @@ #include "utils/tqual.h" -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, +static bool _bt_readpage(IndexScanDesc scan, BTScanState state, ScanDirection dir, OffsetNumber offnum); -static void _bt_saveitem(BTScanOpaque so, int itemIndex, +static void _bt_saveitem(BTScanState state, int itemIndex, OffsetNumber offnum, IndexTuple itup); -static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +static bool _bt_steppage(IndexScanDesc scan, BTScanState state, ScanDirection dir); +static bool _bt_readnextpage(IndexScanDesc scan, BTScanState state, + BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); -static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +static inline void _bt_initialize_more_data(BTScanState state, ScanDirection dir); /* @@ -545,6 +546,58 @@ _bt_compare(Relation rel, } /* + * _bt_return_current_item() -- Prepare current scan state item for return. + * + * This function is used only in "return _bt_return_current_item();" statements + * and always returns true. 
+ */ +static inline bool +_bt_return_current_item(IndexScanDesc scan, BTScanState state) +{ + BTScanPosItem *currItem = &state->currPos.items[state->currPos.itemIndex]; + + scan->xs_ctup.t_self = currItem->heapTid; + + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (state->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_load_first_page() -- Load data from the first page of the scan. + * + * Caller must have pinned and read-locked state->currPos.buf. + * + * On success exit, state->currPos is updated to contain data from the next + * interesting page. For success on a scan using a non-MVCC snapshot we hold + * a pin, but not a read lock, on that page. If we do not hold the pin, we + * set state->currPos.buf to InvalidBuffer. We return true to indicate success. + * + * If there are no more matching records in the given direction at all, + * we drop all locks and pins, set state->currPos.buf to InvalidBuffer, + * and return false. + */ +static bool +_bt_load_first_page(IndexScanDesc scan, BTScanState state, ScanDirection dir, + OffsetNumber offnum) +{ + if (!_bt_readpage(scan, state, dir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + LockBuffer(state->currPos.buf, BUFFER_LOCK_UNLOCK); + return _bt_steppage(scan, state, dir); + } + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &state->currPos); + return true; +} + +/* * _bt_first() -- Find the first item in a scan. 
* * We need to be clever about the direction of scan, the search @@ -569,6 +622,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &so->state.currPos; Buffer buf; BTStack stack; OffsetNumber offnum; @@ -582,10 +636,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int i; bool status = true; StrategyNumber strat_total; - BTScanPosItem *currItem; BlockNumber blkno; - Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(*currPos)); pgstat_count_index_scan(rel); @@ -1076,7 +1129,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * their scan */ _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } @@ -1084,7 +1137,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - _bt_initialize_more_data(so, dir); + _bt_initialize_more_data(&so->state, dir); /* position to the precise item on the page */ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); @@ -1111,36 +1164,36 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) offnum = OffsetNumberPrev(offnum); /* remember which buffer we have pinned, if any */ - Assert(!BTScanPosIsValid(so->currPos)); - so->currPos.buf = buf; + Assert(!BTScanPosIsValid(*currPos)); + currPos->buf = buf; - /* - * Now load data from the first page of the scan. - */ - if (!_bt_readpage(scan, dir, offnum)) + if (!_bt_load_first_page(scan, &so->state, dir, offnum)) + return false; + +readcomplete: + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); +} + +/* + * Advance to next tuple on current page; or if there's no more, + * try to step to the next page with data. 
+ */ +static bool +_bt_next_item(IndexScanDesc scan, BTScanState state, ScanDirection dir) +{ + if (ScanDirectionIsForward(dir)) { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - if (!_bt_steppage(scan, dir)) - return false; + if (++state->currPos.itemIndex <= state->currPos.lastItem) + return true; } else { - /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + if (--state->currPos.itemIndex >= state->currPos.firstItem) + return true; } -readcomplete: - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - - return true; + return _bt_steppage(scan, state, dir); } /* @@ -1161,44 +1214,20 @@ bool _bt_next(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BTScanPosItem *currItem; - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. 
- */ - if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } - else - { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } + if (!_bt_next_item(scan, &so->state, dir)) + return false; /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - - return true; + return _bt_return_current_item(scan, &so->state); } /* * _bt_readpage() -- Load data from current index page into so->currPos * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are + * Caller must have pinned and read-locked pos->buf; the buffer's state + * is not changed here. Also, pos->moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of pos are * initialized from scratch here. * * We scan the current page starting at offnum and moving in the indicated @@ -1213,9 +1242,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * Returns true if any matching items found on the page, false if none. */ static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +_bt_readpage(IndexScanDesc scan, BTScanState state, ScanDirection dir, + OffsetNumber offnum) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = &state->currPos; Page page; BTPageOpaque opaque; OffsetNumber minoff; @@ -1228,9 +1258,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * We must have the buffer pinned and locked, but the usual macro can't be * used here; this function is what makes it good for currPos. 
*/ - Assert(BufferIsValid(so->currPos.buf)); + Assert(BufferIsValid(pos->buf)); - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(pos->buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* allow next page be processed by parallel worker */ @@ -1239,7 +1269,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (ScanDirectionIsForward(dir)) _bt_parallel_release(scan, opaque->btpo_next); else - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + _bt_parallel_release(scan, BufferGetBlockNumber(pos->buf)); } minoff = P_FIRSTDATAKEY(opaque); @@ -1249,30 +1279,30 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * We note the buffer's block number so that we can release the pin later. * This allows us to re-read the buffer if it is needed again for hinting. */ - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + pos->currPage = BufferGetBlockNumber(pos->buf); /* * We save the LSN of the page as we read it, so that we know whether it * safe to apply LP_DEAD hints to the page later. This allows us to drop * the pin for MVCC scans, which allows vacuum to avoid blocking. */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + pos->lsn = BufferGetLSNAtomic(pos->buf); /* * we must save the page's right-link while scanning it; this tells us * where to step right to after we're done with these items. There is no * corresponding need for the left-link, since splits always go right. */ - so->currPos.nextPage = opaque->btpo_next; + pos->nextPage = opaque->btpo_next; /* initialize tuple workspace to empty */ - so->currPos.nextTupleOffset = 0; + pos->nextTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be * good. 
*/ - Assert(BTScanPosIsPinned(so->currPos)); + Assert(BTScanPosIsPinned(*pos)); if (ScanDirectionIsForward(dir)) { @@ -1287,13 +1317,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (itup != NULL) { /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(state, itemIndex, offnum, itup); itemIndex++; } if (!continuescan) { /* there can't be any more matches, so stop */ - so->currPos.moreRight = false; + pos->moreRight = false; break; } @@ -1301,9 +1331,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } Assert(itemIndex <= MaxIndexTuplesPerPage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + pos->firstItem = 0; + pos->lastItem = itemIndex - 1; + pos->itemIndex = 0; } else { @@ -1319,12 +1349,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) { /* tuple passes all scan key conditions, so remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(state, itemIndex, offnum, itup); } if (!continuescan) { /* there can't be any more matches, so stop */ - so->currPos.moreLeft = false; + pos->moreLeft = false; break; } @@ -1332,30 +1362,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + pos->firstItem = itemIndex; + pos->lastItem = MaxIndexTuplesPerPage - 1; + pos->itemIndex = MaxIndexTuplesPerPage - 1; } - return (so->currPos.firstItem <= so->currPos.lastItem); + return (pos->firstItem <= pos->lastItem); } /* Save an index item into so->currPos.items[itemIndex] */ static void -_bt_saveitem(BTScanOpaque so, int itemIndex, +_bt_saveitem(BTScanState state, int itemIndex, OffsetNumber offnum, IndexTuple itup) { - BTScanPosItem *currItem = 
&so->currPos.items[itemIndex]; + BTScanPosItem *currItem = &state->currPos.items[itemIndex]; currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; - if (so->currTuples) + if (state->currTuples) { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + currItem->tupleOffset = state->currPos.nextTupleOffset; + memcpy(state->currTuples + state->currPos.nextTupleOffset, + itup, itupsz); + state->currPos.nextTupleOffset += MAXALIGN(itupsz); } } @@ -1371,35 +1402,36 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, * to InvalidBuffer. We return true to indicate success. */ static bool -_bt_steppage(IndexScanDesc scan, ScanDirection dir) +_bt_steppage(IndexScanDesc scan, BTScanState state, ScanDirection dir) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &state->currPos; + Relation rel = scan->indexRelation; BlockNumber blkno = InvalidBlockNumber; bool status = true; - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(*currPos)); /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); + if (state->numKilled > 0) + _bt_killitems(state, rel); /* * Before we modify currPos, make a copy of the page data if there was a * mark position that needs it. 
*/ - if (so->markItemIndex >= 0) + if (state->markItemIndex >= 0) { /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); - memcpy(&so->markPos, &so->currPos, + if (BTScanPosIsPinned(*currPos)) + IncrBufferRefCount(currPos->buf); + memcpy(&state->markPos, currPos, offsetof(BTScanPosData, items[1]) + - so->currPos.lastItem * sizeof(BTScanPosItem)); - if (so->markTuples) - memcpy(so->markTuples, so->currTuples, - so->currPos.nextTupleOffset); - so->markPos.itemIndex = so->markItemIndex; - so->markItemIndex = -1; + currPos->lastItem * sizeof(BTScanPosItem)); + if (state->markTuples) + memcpy(state->markTuples, state->currTuples, + currPos->nextTupleOffset); + state->markPos.itemIndex = state->markItemIndex; + state->markItemIndex = -1; } if (ScanDirectionIsForward(dir)) @@ -1415,27 +1447,27 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (!status) { /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); + BTScanPosUnpinIfPinned(*currPos); + BTScanPosInvalidate(*currPos); return false; } } else { /* Not parallel, so use the previously-saved nextPage link. */ - blkno = so->currPos.nextPage; + blkno = currPos->nextPage; } /* Remember we left a page with data */ - so->currPos.moreLeft = true; + currPos->moreLeft = true; /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); + BTScanPosUnpinIfPinned(*currPos); } else { /* Remember we left a page with data */ - so->currPos.moreRight = true; + currPos->moreRight = true; if (scan->parallel_scan != NULL) { @@ -1444,25 +1476,25 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * ended already, bail out. 
*/ status = _bt_parallel_seize(scan, &blkno); - BTScanPosUnpinIfPinned(so->currPos); + BTScanPosUnpinIfPinned(*currPos); if (!status) { - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } } else { /* Not parallel, so just use our own notion of the current page */ - blkno = so->currPos.currPage; + blkno = currPos->currPage; } } - if (!_bt_readnextpage(scan, blkno, dir)) + if (!_bt_readnextpage(scan, state, blkno, dir)) return false; /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, currPos); return true; } @@ -1478,9 +1510,10 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. */ static bool -_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +_bt_readnextpage(IndexScanDesc scan, BTScanState state, BlockNumber blkno, + ScanDirection dir) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &state->currPos; Relation rel; Page page; BTPageOpaque opaque; @@ -1496,17 +1529,17 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * if we're at end of scan, give up and mark parallel scan as * done, so that all the workers can finish their scan */ - if (blkno == P_NONE || !so->currPos.moreRight) + if (blkno == P_NONE || !currPos->moreRight) { _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); /* step right one page */ - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(so->currPos.buf); + currPos->buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(currPos->buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* check for deleted page */ @@ -1515,7 +1548,7 @@ 
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) PredicateLockPage(rel, blkno, scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + if (_bt_readpage(scan, state, dir, P_FIRSTDATAKEY(opaque))) break; } else if (scan->parallel_scan != NULL) @@ -1527,18 +1560,18 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) /* nope, keep going */ if (scan->parallel_scan != NULL) { - _bt_relbuf(rel, so->currPos.buf); + _bt_relbuf(rel, currPos->buf); status = _bt_parallel_seize(scan, &blkno); if (!status) { - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } } else { blkno = opaque->btpo_next; - _bt_relbuf(rel, so->currPos.buf); + _bt_relbuf(rel, currPos->buf); } } } @@ -1548,10 +1581,10 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * Should only happen in parallel cases, when some other backend * advanced the scan. */ - if (so->currPos.currPage != blkno) + if (currPos->currPage != blkno) { - BTScanPosUnpinIfPinned(so->currPos); - so->currPos.currPage = blkno; + BTScanPosUnpinIfPinned(*currPos); + currPos->currPage = blkno; } /* @@ -1576,31 +1609,30 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * is MVCC the page cannot move past the half-dead state to fully * deleted. 
*/ - if (BTScanPosIsPinned(so->currPos)) - LockBuffer(so->currPos.buf, BT_READ); + if (BTScanPosIsPinned(*currPos)) + LockBuffer(currPos->buf, BT_READ); else - so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + currPos->buf = _bt_getbuf(rel, currPos->currPage, BT_READ); for (;;) { /* Done if we know there are no matching keys to the left */ - if (!so->currPos.moreLeft) + if (!currPos->moreLeft) { - _bt_relbuf(rel, so->currPos.buf); + _bt_relbuf(rel, currPos->buf); _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, - scan->xs_snapshot); + currPos->buf = _bt_walk_left(rel, currPos->buf, scan->xs_snapshot); /* if we're physically at end of index, return failure */ - if (so->currPos.buf == InvalidBuffer) + if (currPos->buf == InvalidBuffer) { _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } @@ -1609,21 +1641,21 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * it's not half-dead and contains matching tuples. Else loop back * and do it all again. 
*/ - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(currPos->buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { - PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + PredicateLockPage(rel, BufferGetBlockNumber(currPos->buf), scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + if (_bt_readpage(scan, state, dir, PageGetMaxOffsetNumber(page))) break; } else if (scan->parallel_scan != NULL) { /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + _bt_parallel_release(scan, BufferGetBlockNumber(currPos->buf)); } /* @@ -1634,14 +1666,14 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) */ if (scan->parallel_scan != NULL) { - _bt_relbuf(rel, so->currPos.buf); + _bt_relbuf(rel, currPos->buf); status = _bt_parallel_seize(scan, &blkno); if (!status) { - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + currPos->buf = _bt_getbuf(rel, blkno, BT_READ); } } } @@ -1660,13 +1692,13 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - _bt_initialize_more_data(so, dir); + _bt_initialize_more_data(&so->state, dir); - if (!_bt_readnextpage(scan, blkno, dir)) + if (!_bt_readnextpage(scan, &so->state, blkno, dir)) return false; /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, &so->state.currPos); return true; } @@ -1891,11 +1923,11 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; 
+ BTScanPos currPos = &so->state.currPos; Buffer buf; Page page; BTPageOpaque opaque; OffsetNumber start; - BTScanPosItem *currItem; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified @@ -1911,7 +1943,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * exists. */ PredicateLockRelation(rel, scan->xs_snapshot); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } @@ -1940,36 +1972,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) } /* remember which buffer we have pinned */ - so->currPos.buf = buf; + currPos->buf = buf; - _bt_initialize_more_data(so, dir); + _bt_initialize_more_data(&so->state, dir); - /* - * Now load data from the first page of the scan. - */ - if (!_bt_readpage(scan, dir, start)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } - - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + if (!_bt_load_first_page(scan, &so->state, dir, start)) + return false; - return true; + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); } /* @@ -1977,19 +1988,19 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * for scan direction */ static inline void -_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) +_bt_initialize_more_data(BTScanState state, ScanDirection dir) { /* initialize moreLeft/moreRight appropriately for scan direction */ if (ScanDirectionIsForward(dir)) { - so->currPos.moreLeft = false; - 
so->currPos.moreRight = true; + state->currPos.moreLeft = false; + state->currPos.moreRight = true; } else { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; + state->currPos.moreLeft = true; + state->currPos.moreRight = false; } - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ + state->numKilled = 0; /* just paranoia */ + state->markItemIndex = -1; /* ditto */ } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 4528e87..9bf453c 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1741,26 +1741,26 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, * away and the TID was re-used by a completely different heap tuple. */ void -_bt_killitems(IndexScanDesc scan) +_bt_killitems(BTScanState state, Relation indexRelation) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = &state->currPos; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; int i; - int numKilled = so->numKilled; + int numKilled = state->numKilled; bool killedsomething = false; - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(state->currPos)); /* * Always reset the scan state, so we don't look for same items on other * pages. */ - so->numKilled = 0; + state->numKilled = 0; - if (BTScanPosIsPinned(so->currPos)) + if (BTScanPosIsPinned(*pos)) { /* * We have held the pin on this page since we read the index tuples, @@ -1768,44 +1768,42 @@ _bt_killitems(IndexScanDesc scan) * re-use of any TID on the page, so there is no need to check the * LSN. */ - LockBuffer(so->currPos.buf, BT_READ); - - page = BufferGetPage(so->currPos.buf); + LockBuffer(pos->buf, BT_READ); } else { Buffer buf; /* Attempt to re-read the buffer, getting pin and lock. 
*/ - buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + buf = _bt_getbuf(indexRelation, pos->currPage, BT_READ); /* It might not exist anymore; in which case we can't hint it. */ if (!BufferIsValid(buf)) return; - page = BufferGetPage(buf); - if (BufferGetLSNAtomic(buf) == so->currPos.lsn) - so->currPos.buf = buf; + if (BufferGetLSNAtomic(buf) == pos->lsn) + pos->buf = buf; else { /* Modified while not pinned means hinting is not safe. */ - _bt_relbuf(scan->indexRelation, buf); + _bt_relbuf(indexRelation, buf); return; } } + page = BufferGetPage(pos->buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); for (i = 0; i < numKilled; i++) { - int itemIndex = so->killedItems[i]; - BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + int itemIndex = state->killedItems[i]; + BTScanPosItem *kitem = &pos->items[itemIndex]; OffsetNumber offnum = kitem->indexOffset; - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); + Assert(itemIndex >= pos->firstItem && + itemIndex <= pos->lastItem); if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) @@ -1833,10 +1831,10 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(pos->buf, true); } - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + LockBuffer(pos->buf, BUFFER_LOCK_UNLOCK); } @@ -2214,3 +2212,14 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum) } } + +/* + * _bt_allocate_tuple_workspaces() -- Allocate buffers for saving index tuples + * in index-only scans. 
+ */ +void +_bt_allocate_tuple_workspaces(BTScanState state) +{ + state->currTuples = (char *) palloc(BLCKSZ * 2); + state->markTuples = state->currTuples + BLCKSZ; +} diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 04ecb4c..388b311 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -434,22 +434,8 @@ typedef struct BTArrayKeyInfo Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; -typedef struct BTScanOpaqueData +typedef struct BTScanStateData { - /* these fields are set by _bt_preprocess_keys(): */ - bool qual_ok; /* false if qual can never be satisfied */ - int numberOfKeys; /* number of preprocessed scan keys */ - ScanKey keyData; /* array of preprocessed scan keys */ - - /* workspace for SK_SEARCHARRAY support */ - ScanKey arrayKeyData; /* modified copy of scan->keyData */ - int numArrayKeys; /* number of equality-type array keys (-1 if - * there are any unsatisfiable array keys) */ - int arrayKeyCount; /* count indicating number of array scan keys - * processed */ - BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ - MemoryContext arrayContext; /* scan-lifespan context for array data */ - /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ @@ -474,6 +460,25 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ +} BTScanStateData, *BTScanState; + +typedef struct BTScanOpaqueData +{ + /* these fields are set by _bt_preprocess_keys(): */ + bool qual_ok; /* false if qual can never be satisfied */ + int numberOfKeys; /* number of preprocessed scan keys */ + ScanKey keyData; /* array of preprocessed scan keys */ + + /* workspace for SK_SEARCHARRAY support */ + ScanKey arrayKeyData; /* modified copy of 
scan->keyData */ + int numArrayKeys; /* number of equality-type array keys (-1 if + * there are any unsatisfiable array keys) */ + int arrayKeyCount; /* count indicating number of array scan keys + * processed */ + BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + MemoryContext arrayContext; /* scan-lifespan context for array data */ + + BTScanStateData state; } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -590,7 +595,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan); extern IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan); -extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems(BTScanState state, Relation indexRelation); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); @@ -603,6 +608,7 @@ extern bool btproperty(Oid index_oid, int attno, bool *res, bool *isnull); extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup); extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum); +extern void _bt_allocate_tuple_workspaces(BTScanState state); /* * prototypes for functions in nbtvalidate.c