diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index 4a6c99e090..634059b8ff 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -20,6 +20,6 @@ OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ tablecmds.o tablespace.o trigger.o tsearchcmds.o typecmds.o user.o \ - vacuum.o vacuumlazy.o variable.o view.o + vacuum.o vacuumlazy.o vacuumtidmap.o variable.o view.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index d2a006671a..580835d8c9 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -4,23 +4,21 @@ * Concurrent ("lazy") vacuuming. * * - * The major space usage for LAZY VACUUM is storage for the array of dead tuple - * TIDs. We want to ensure we can vacuum even the very largest relations with - * finite memory space usage. To do that, we set upper bounds on the number of - * tuples we will keep track of at once. + * The major space usage for LAZY VACUUM is storage of dead tuple TIDs. + * We want to ensure we can vacuum even the very largest relations with + * finite memory space usage. To do that, we set upper bounds on the amount + * of memory used to track the TIDs at any one time. * * We are willing to use at most maintenance_work_mem (or perhaps * autovacuum_work_mem) memory space to keep track of dead tuples. We - * initially allocate an array of TIDs of that size, with an upper limit that - * depends on table size (this limit ensures we don't allocate a huge area - * uselessly for vacuuming small tables). If the array threatens to overflow, - * we suspend the heap scan phase and perform a pass of index cleanup and page - * compaction, then resume the heap scan with an empty TID array. + * use a specialized data structure to hold the TIDs, which keeps track of + * the memory used. If the memory limit is about to be exceeded, we suspend + * the heap scan phase and perform a pass of index cleanup and page + * compaction, then resume the heap scan with an empty tid map. * * If we're processing a table with no indexes, we can just vacuum each page * as we go; there's no need to save up multiple tuples to minimize the number - * of index scans performed. So we don't use maintenance_work_mem memory for - * the TID array, just enough to hold as many heap tuples as fit on one page. + * of index scans performed. * * * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group @@ -94,13 +92,6 @@ ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) /* - * Guesstimation of number of dead tuples per page. This is used to - * provide an upper limit to memory allocated when vacuuming small - * tables. - */ -#define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage - -/* * Before we consider skipping a page that's marked as clean in * visibility map, we must've seen at least this many clean pages. 
*/ @@ -130,11 +121,7 @@ typedef struct LVRelStats BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ - /* List of TIDs of tuples we intend to delete */ - /* NB: this list is ordered by TID address */ - int num_dead_tuples; /* current # of entries */ - int max_dead_tuples; /* # slots allocated in array */ - ItemPointer dead_tuples; /* array of ItemPointerData */ + VacuumTidMap *dead_tuples; /* list of TIDs of tuples we intend to delete */ int num_index_scans; TransactionId latestRemovedXid; bool lock_waiter_detected; @@ -163,17 +150,17 @@ static void lazy_vacuum_index(Relation indrel, static void lazy_cleanup_index(Relation indrel, IndexBulkDeleteResult *stats, LVRelStats *vacrelstats); -static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, - int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); +static void lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, + OffsetNumber *dead_offsets, int num_dead_tuples, + LVRelStats *vacrelstats, Buffer *vmbuffer); static bool should_attempt_truncation(LVRelStats *vacrelstats); static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats); -static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); static void lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr); +static void lazy_clear_dead_tuples(LVRelStats *vacrelstats); static bool lazy_tid_reaped(ItemPointer itemptr, void *state); -static int vac_cmp_itemptr(const void *left, const void *right); static bool heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen); @@ -492,6 +479,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, PROGRESS_VACUUM_MAX_DEAD_TUPLES }; int64 initprog_val[3]; + int vac_work_mem; pg_rusage_init(&ru0); @@ -521,13 +509,23 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; - lazy_space_alloc(vacrelstats, nblocks); + vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + vacrelstats->dead_tuples = CreateVacuumTidMap(vac_work_mem); + frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage); /* Report that we're scanning the heap, advertising total # of blocks */ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; initprog_val[1] = nblocks; - initprog_val[2] = vacrelstats->max_dead_tuples; + /* + * XXX: The reported max number of tuples is just a rough estimate. It + * doesn't take into account the memory needed by internal nodes in the + * TID map. + */ + initprog_val[2] = ((uint64) vac_work_mem * 1024) / sizeof(ItemPointerData); pgstat_progress_update_multi_param(3, initprog_index, initprog_val); /* @@ -611,7 +609,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, maxoff; bool tupgone, hastup; - int prev_dead_count; + uint64 prev_dead_count; int nfrozen; Size freespace; bool all_visible_according_to_vm = false; @@ -705,8 +703,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page.
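+ * The tid map tracks its own memory usage, so this check is cheap: the + * map reports itself full when it might not have room for another heap + * page's worth of TIDs (see VacuumTidMapIsFull()).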
*/ - if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && - vacrelstats->num_dead_tuples > 0) + if (VacuumTidMapIsFull(vacrelstats->dead_tuples)) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -757,7 +754,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * not to reset latestRemovedXid since we want that value to be * valid. */ - vacrelstats->num_dead_tuples = 0; + lazy_clear_dead_tuples(vacrelstats); vacrelstats->num_index_scans++; /* @@ -947,7 +944,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, has_dead_tuples = false; nfrozen = 0; hastup = false; - prev_dead_count = vacrelstats->num_dead_tuples; + prev_dead_count = VacuumTidMapGetNumTuples(vacrelstats->dead_tuples); maxoff = PageGetMaxOffsetNumber(page); /* @@ -1202,10 +1199,24 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * instead of doing a second scan. */ if (nindexes == 0 && - vacrelstats->num_dead_tuples > 0) + !VacuumTidMapIsEmpty(vacrelstats->dead_tuples)) { + BlockNumber tblkno; + int ntuples; + OffsetNumber *dead_offsets; + /* Remove tuples from heap */ - lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); + VacuumTidMapBeginIterate(vacrelstats->dead_tuples); + if (!VacuumTidMapNext(vacrelstats->dead_tuples, &tblkno, &ntuples, &dead_offsets)) + elog(ERROR, "vacuum tid map was empty"); + if (tblkno != blkno) + elog(ERROR, "vacuum tid map contained wrong block"); + + lazy_vacuum_page(onerel, blkno, buf, dead_offsets, ntuples, vacrelstats, &vmbuffer); + + if (VacuumTidMapNext(vacrelstats->dead_tuples, &tblkno, &ntuples, &dead_offsets)) + elog(ERROR, "unexpected entries in vacuum tid map"); + has_dead_tuples = false; /* @@ -1213,7 +1224,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * not to reset latestRemovedXid since we want that value to be * valid. */ - vacrelstats->num_dead_tuples = 0; + lazy_clear_dead_tuples(vacrelstats); vacuumed_pages++; /* @@ -1329,7 +1340,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ - if (vacrelstats->num_dead_tuples == prev_dead_count) + if (VacuumTidMapGetNumTuples(vacrelstats->dead_tuples) == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } @@ -1363,7 +1374,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? 
*/ - if (vacrelstats->num_dead_tuples > 0) + if (!VacuumTidMapIsEmpty(vacrelstats->dead_tuples)) { const int hvp_index[] = { PROGRESS_VACUUM_PHASE, @@ -1467,25 +1478,32 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { - int tupindex; + int tottuples; int npages; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; + int num_dead_tuples; + BlockNumber tblk; + OffsetNumber *dead_offsets; + + if (VacuumTidMapIsEmpty(vacrelstats->dead_tuples)) + return; pg_rusage_init(&ru0); npages = 0; - tupindex = 0; - while (tupindex < vacrelstats->num_dead_tuples) + tottuples = 0; + + VacuumTidMapBeginIterate(vacrelstats->dead_tuples); + while (VacuumTidMapNext(vacrelstats->dead_tuples, &tblk, &num_dead_tuples, &dead_offsets)) { - BlockNumber tblk; Buffer buf; Page page; Size freespace; vacuum_delay_point(); - tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); if (!ConditionalLockBufferForCleanup(buf)) @@ -1494,8 +1512,9 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) - ++tupindex; continue; } - tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, - &vmbuffer); + lazy_vacuum_page(onerel, tblk, buf, + dead_offsets, num_dead_tuples, + vacrelstats, &vmbuffer); /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); @@ -1504,6 +1523,8 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, tblk, freespace); npages++; + + tottuples += num_dead_tuples; } if (BufferIsValid(vmbuffer)) @@ -1515,7 +1536,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) ereport(elevel, (errmsg("\"%s\": removed %d row versions in %d pages", RelationGetRelationName(onerel), - tupindex, npages), + tottuples, npages), errdetail_internal("%s", pg_rusage_show(&ru0)))); } @@ -1525,37 +1546,32 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) * * Caller must hold pin and buffer cleanup lock on the buffer. * - * tupindex is the index in vacrelstats->dead_tuples of the first dead - * tuple for this page. We assume the rest follow sequentially. - * The return value is the first tupindex after the tuples of this page. + * dead_offsets is an array of the offset numbers of the dead tuples on + * this page, in order, and num_dead_tuples is its length.
*/ -static int +static void lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, - int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer) + OffsetNumber *dead_offsets, int num_dead_tuples, + LVRelStats *vacrelstats, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxOffsetNumber]; - int uncnt = 0; TransactionId visibility_cutoff_xid; bool all_frozen; + int tupindex; pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); START_CRIT_SECTION(); - for (; tupindex < vacrelstats->num_dead_tuples; tupindex++) + for (tupindex = 0; tupindex < num_dead_tuples; tupindex++) { - BlockNumber tblk; OffsetNumber toff; ItemId itemid; - tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); - if (tblk != blkno) - break; /* past end of tuples for this block */ - toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); + toff = dead_offsets[tupindex]; itemid = PageGetItemId(page, toff); ItemIdSetUnused(itemid); - unused[uncnt++] = toff; } PageRepairFragmentation(page); @@ -1572,7 +1588,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, - unused, uncnt, + dead_offsets, num_dead_tuples, vacrelstats->latestRemovedXid); PageSetLSN(page, recptr); } @@ -1616,8 +1632,6 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer, visibility_cutoff_xid, flags); } - - return tupindex; } /* @@ -1697,14 +1711,18 @@ lazy_vacuum_index(Relation indrel, ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; ivinfo.strategy = vac_strategy; + /* If the tid map is empty, we have no tuples to delete from the indexes */ + if (VacuumTidMapIsEmpty(vacrelstats->dead_tuples)) + return; + /* Do bulk deletion */ *stats = index_bulk_delete(&ivinfo, *stats, lazy_tid_reaped, (void *) vacrelstats); ereport(elevel, - (errmsg("scanned index \"%s\" to remove %d row versions", + (errmsg("scanned index \"%s\" to remove " UINT64_FORMAT " row versions", RelationGetRelationName(indrel), - vacrelstats->num_dead_tuples), + VacuumTidMapGetNumTuples(vacrelstats->dead_tuples)), errdetail_internal("%s", pg_rusage_show(&ru0)))); } @@ -2071,113 +2089,37 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) } /* - * lazy_space_alloc - space allocation decisions for lazy vacuum - * - * See the comments at the head of this file for rationale. + * lazy_record_dead_tuple - remember one deletable tuple */ static void -lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) +lazy_record_dead_tuple(LVRelStats *vacrelstats, + ItemPointer itemptr) { - long maxtuples; - int vac_work_mem = IsAutoVacuumWorkerProcess() && - autovacuum_work_mem != -1 ?
- autovacuum_work_mem : maintenance_work_mem; - - if (vacrelstats->hasindex) - { - maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); - maxtuples = Min(maxtuples, INT_MAX); - maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); - - /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) - maxtuples = relblocks * LAZY_ALLOC_TUPLES; - - /* stay sane if small maintenance_work_mem */ - maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); - } - else - { - maxtuples = MaxHeapTuplesPerPage; - } - - vacrelstats->num_dead_tuples = 0; - vacrelstats->max_dead_tuples = (int) maxtuples; - vacrelstats->dead_tuples = (ItemPointer) - palloc(maxtuples * sizeof(ItemPointerData)); + VacuumTidMapRecordTid(vacrelstats->dead_tuples, itemptr); + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + VacuumTidMapGetNumTuples(vacrelstats->dead_tuples)); } /* - * lazy_record_dead_tuple - remember one deletable tuple + * lazy_clear_dead_tuples - reset the dead tuples map */ static void -lazy_record_dead_tuple(LVRelStats *vacrelstats, - ItemPointer itemptr) +lazy_clear_dead_tuples(LVRelStats *vacrelstats) { - /* - * The array shouldn't overflow under normal behavior, but perhaps it - * could if we are given a really small maintenance_work_mem. In that - * case, just forget the last few tuples (we'll get 'em next time). - */ - if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) - { - vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr; - vacrelstats->num_dead_tuples++; - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - vacrelstats->num_dead_tuples); - } + VacuumTidMapReset(vacrelstats->dead_tuples); } /* * lazy_tid_reaped() -- is a particular tid deletable? * * This has the right signature to be an IndexBulkDeleteCallback. - * - * Assumes dead_tuples array is in sorted order. */ static bool lazy_tid_reaped(ItemPointer itemptr, void *state) { LVRelStats *vacrelstats = (LVRelStats *) state; - ItemPointer res; - - res = (ItemPointer) bsearch((void *) itemptr, - (void *) vacrelstats->dead_tuples, - vacrelstats->num_dead_tuples, - sizeof(ItemPointerData), - vac_cmp_itemptr); - - return (res != NULL); -} - -/* - * Comparator routines for use with qsort() and bsearch(). - */ -static int -vac_cmp_itemptr(const void *left, const void *right) -{ - BlockNumber lblk, - rblk; - OffsetNumber loff, - roff; - - lblk = ItemPointerGetBlockNumber((ItemPointer) left); - rblk = ItemPointerGetBlockNumber((ItemPointer) right); - - if (lblk < rblk) - return -1; - if (lblk > rblk) - return 1; - - loff = ItemPointerGetOffsetNumber((ItemPointer) left); - roff = ItemPointerGetOffsetNumber((ItemPointer) right); - - if (loff < roff) - return -1; - if (loff > roff) - return 1; - return 0; + return VacuumTidMapContainsTid(vacrelstats->dead_tuples, itemptr); } /* diff --git a/src/backend/commands/vacuumtidmap.c b/src/backend/commands/vacuumtidmap.c new file mode 100644 index 0000000000..182951f2e6 --- /dev/null +++ b/src/backend/commands/vacuumtidmap.c @@ -0,0 +1,520 @@ +/*------------------------------------------------------------------------- + * + * vacuumtidmap.c + * Data structure to hold TIDs of dead tuples during vacuum. + * + * Vacuum Tid Map is used to hold the TIDs of dead tuples during VACUUM. + * The data structure is a fairly straightforward B-tree, but tailored for + * storing TIDs. + * + * Operations we need to support: + * + * - Adding new TIDs. 
TIDs are only added in increasing TID order. Thanks + * to that, we can fill each node completely, and never need to split + * existing nodes. + * + * - Random lookups by TID. This is done heavily while scanning indexes. + * + * - Scanning all TIDs in order, in the 2nd phase of vacuum. To make that + * simpler, we store a 'next' pointer on each leaf node. + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/vacuumtidmap.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "commands/vacuum.h" +#include "utils/memutils.h" + +/* + * Node structures, for the in-memory B-tree. + * + * Each leaf node stores a number of TIDs, in a sorted array. There is also + * a pointer to the next leaf node, so that the leaf nodes can be easily + * walked from beginning to end. + * + * An internal node holds a number of downlink pointers, either to leaf nodes + * or to internal nodes on the next lower level. For each downlink, the first + * TID of the corresponding lower-level node is stored. + * + * Note that because these form just an in-memory data structure, there is no + * need for the nodes to be of the same size. + */ + +#define MAX_LEAF_NODE_ITEMS 2048 +#define MAX_INTERNAL_NODE_ITEMS 64 + +/* Common structure of both leaf and internal nodes. */ +typedef struct +{ + int level; +} VacuumTidMapNode; + +typedef struct VacuumTidMapLeafNode VacuumTidMapLeafNode; + +struct VacuumTidMapLeafNode +{ + int level; /* always 0 on leaf nodes. */ + VacuumTidMapLeafNode *next; + int num_items; + ItemPointerData itemptrs[MAX_LEAF_NODE_ITEMS]; +}; + +typedef struct +{ + int level; /* always >= 1 on internal nodes */ + int num_items; + ItemPointerData itemptrs[MAX_INTERNAL_NODE_ITEMS]; + VacuumTidMapNode *downlinks[MAX_INTERNAL_NODE_ITEMS]; +} VacuumTidMapInternalNode; + +/* Maximum height of the tree. 10 should be plenty. */ +#define MAX_LEVELS 10 + +struct VacuumTidMap +{ + int num_levels; + uint64 num_entries; /* total # of tids in the tree */ + + /* Memory context, to hold all extra nodes */ + MemoryContext context; + + /* Memory tracking */ + uint64 mem_max; + uint64 mem_used; + + /* + * 'root' points to the root of the tree (or the only leaf node, if + * num_levels == 1). 'first_leaf' points to the leftmost leaf node. + */ + VacuumTidMapNode *root; + VacuumTidMapLeafNode *first_leaf; + + /* + * Pointer to the rightmost leaf node, and its parent, grandparent etc. + * all the way up to the root. + */ + VacuumTidMapLeafNode *last_leaf; + VacuumTidMapInternalNode *last_parents[MAX_LEVELS]; + + /* Iterator support */ + OffsetNumber *iter_offsets; + VacuumTidMapLeafNode *iter_node; + int iter_itemno; +}; + +static inline int vac_itemptr_binsrch(BlockNumber refblk, OffsetNumber refoff, ItemPointer arr, int arr_elems); +static void update_upper(VacuumTidMap *dt, int level, void *new_node, ItemPointer new_node_itemptr); + +/* + * Create a new, initially empty, tidmap. + * + * 'vac_work_mem' is the max amount of memory to be used. The tidmap doesn't + * actually limit the amount of memory used in any way, but it affects when + * VacuumTidMapIsFull() says that the tidmap is full. + */ +VacuumTidMap * +CreateVacuumTidMap(int vac_work_mem) +{ + VacuumTidMap *dt; + + /* + * Allocate the tid map struct, and the first leaf node in the current + * memory context.
Any additional nodes are allocated in a separate + * context, so that they can be freed easily in VacuumTidMapReset(). + */ + dt = (VacuumTidMap *) palloc(sizeof(VacuumTidMap)); + + dt->context = AllocSetContextCreate(CurrentMemoryContext, + "Vacuum TID map", + ALLOCSET_DEFAULT_SIZES); + dt->num_entries = 0; + + dt->first_leaf = (VacuumTidMapLeafNode *) + palloc(sizeof(VacuumTidMapLeafNode)); + dt->first_leaf->level = 0; + dt->first_leaf->next = NULL; + dt->first_leaf->num_items = 0; + + dt->last_leaf = dt->first_leaf; + dt->root = (VacuumTidMapNode *) dt->first_leaf; + dt->num_levels = 1; + + dt->mem_max = ((uint64) vac_work_mem) * 1024; + dt->mem_used = sizeof(VacuumTidMap) + sizeof(VacuumTidMapLeafNode); + + dt->iter_offsets = NULL; /* no iteration in progress */ + + return dt; +} + +/* + * Clear all items from the tid map. + */ +void +VacuumTidMapReset(VacuumTidMap *dt) +{ + dt->num_entries = 0; + + dt->first_leaf->next = NULL; + dt->first_leaf->num_items = 0; + dt->last_leaf = dt->first_leaf; + dt->root = (VacuumTidMapNode *) dt->first_leaf; + dt->num_levels = 1; + MemoryContextReset(dt->context); + + dt->mem_used = sizeof(VacuumTidMap) + sizeof(VacuumTidMapLeafNode); + dt->iter_offsets = NULL; +} + +bool +VacuumTidMapIsEmpty(VacuumTidMap *dt) +{ + return (dt->num_entries == 0); +} + +/* + * Returns true if inserting another heap page's worth of dead tuples would + * overrun the allocated memory. + */ +bool +VacuumTidMapIsFull(VacuumTidMap *dt) +{ + /* Can we fit a whole heap page's worth of items on this leaf node? */ + if (MAX_LEAF_NODE_ITEMS - dt->last_leaf->num_items >= MaxHeapTuplesPerPage) + return false; + + /* + * Do we have space to allocate another leaf node? (The check is written + * this way to avoid unsigned underflow, in case we have already overshot + * mem_max.) + * + * XXX: This doesn't take into account the possibility that allocating a + * new leaf page also requires allocating new internal pages, possibly all + * the way up to the root. So we might still overshoot if that happens. + */ + if (dt->mem_used + sizeof(VacuumTidMapLeafNode) <= dt->mem_max) + return false; + + return true; +} + +uint64 +VacuumTidMapGetNumTuples(VacuumTidMap *dt) +{ + return dt->num_entries; +} + +/* + * Begin in-order scan through all the TIDs. + */ +void +VacuumTidMapBeginIterate(VacuumTidMap *dt) +{ + if (dt->iter_offsets) + elog(ERROR, "iteration on vacuum tid map is already in progress"); + + dt->iter_offsets = MemoryContextAlloc(dt->context, + MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + dt->iter_node = dt->first_leaf; + dt->iter_itemno = 0; +} + +/* + * Returns the next batch of tuples. + * + * VacuumTidMapBeginIterate() must be called first. VacuumTidMapNext() returns + * TIDs one block number at a time, such that each call returns all the TIDs + * with the same block number. + * + * If there are any more entries, returns true, and 'blkno', 'ntuples' and + * 'offsets' are filled with the next batch of TIDs. + */ +bool +VacuumTidMapNext(VacuumTidMap *dt, BlockNumber *blkno, int *ntuples, OffsetNumber **offsets) +{ + int curr_itemno; + BlockNumber currblk; + VacuumTidMapLeafNode *curr_node; + int num_offsets; + + if (!dt->iter_offsets) + elog(ERROR, "vacuum tid map iteration is not in progress"); + + if (!dt->iter_node) + { + /* No more TIDs. End the iterator, and return false */ + pfree(dt->iter_offsets); + dt->iter_offsets = NULL; + return false; + } + + Assert(dt->iter_node->level == 0); + Assert(dt->iter_node->num_items > 0); + + /* + * There are more TIDs to return.
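+ * + * For example, if the map holds the TIDs (5,1), (5,3) and (7,2), the first + * call returns block 5 with offsets {1, 3}, and the second call returns + * block 7 with offset {2}.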
+ * + * Grab the block number from the next TID, and scan forward, collecting + * the offset numbers of all TIDs on the same block. + */ + curr_node = dt->iter_node; + curr_itemno = dt->iter_itemno; + + currblk = ItemPointerGetBlockNumber(&curr_node->itemptrs[curr_itemno]); + dt->iter_offsets[0] = ItemPointerGetOffsetNumber(&curr_node->itemptrs[curr_itemno]); + num_offsets = 1; + curr_itemno++; + + for (;;) + { + if (curr_itemno >= curr_node->num_items) + { + /* We have reached end of this node. Step to the next one. */ + curr_node = curr_node->next; + curr_itemno = 0; + + if (!curr_node) + { + /* Reached the very end. */ + break; + } + } + + if (ItemPointerGetBlockNumber(&curr_node->itemptrs[curr_itemno]) != currblk) + break; + + dt->iter_offsets[num_offsets] = + ItemPointerGetOffsetNumber(&curr_node->itemptrs[curr_itemno]); + num_offsets++; + curr_itemno++; + } + + /* Remember where we stopped, for the next call */ + dt->iter_node = curr_node; + dt->iter_itemno = curr_itemno; + + *blkno = currblk; + *ntuples = num_offsets; + *offsets = dt->iter_offsets; + return true; +} + +/* + * Record a TID in the map. + * + * TIDs must be recorded in order. + */ +void +VacuumTidMapRecordTid(VacuumTidMap *dt, ItemPointer itemptr) +{ + VacuumTidMapLeafNode *last_leaf; + + /* The new TID should be larger than the last one recorded */ + Assert(ItemPointerIsValid(itemptr)); + Assert(dt->num_entries == 0 || + ItemPointerCompare(itemptr, &dt->last_leaf->itemptrs[dt->last_leaf->num_items - 1]) > 0); + + /* Allocate a new leaf node if needed */ + if (dt->last_leaf->num_items == MAX_LEAF_NODE_ITEMS) + { + VacuumTidMapLeafNode *new_node; + + dt->mem_used += sizeof(VacuumTidMapLeafNode); + new_node = (VacuumTidMapLeafNode *) + MemoryContextAlloc(dt->context, sizeof(VacuumTidMapLeafNode)); + new_node->level = 0; + new_node->next = NULL; + new_node->num_items = 0; + + dt->last_leaf->next = new_node; + dt->last_leaf = new_node; + + update_upper(dt, 1, new_node, itemptr); + } + last_leaf = dt->last_leaf; + + last_leaf->itemptrs[last_leaf->num_items] = *itemptr; + last_leaf->num_items++; + + dt->num_entries++; +} + +/* + * Insert the downlink into parent node, after creating a new node. + * + * Recurses if the parent node is full, too. + */ +static void +update_upper(VacuumTidMap *dt, int level, void *new_node, + ItemPointer new_node_itemptr) +{ + VacuumTidMapInternalNode *parent; + + /* Append to the parent. */ + if (level >= dt->num_levels) + { + /* Create new root node. 
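+ * The old root becomes the leftmost child of the new root, and the first + * TID of the old root becomes the new root's first downlink key.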
*/ + ItemPointerData old_root_itemptr; + + /* MAX_LEVELS should be more than enough, so this shouldn't happen */ + if (dt->num_levels == MAX_LEVELS) + elog(ERROR, "could not expand vacuum tid map, maximum number of levels reached"); + + if (dt->num_levels == 1) + old_root_itemptr = ((VacuumTidMapLeafNode *) dt->root)->itemptrs[0]; + else + old_root_itemptr = ((VacuumTidMapInternalNode *) dt->root)->itemptrs[0]; + + dt->mem_used += sizeof(VacuumTidMapInternalNode); + parent = (VacuumTidMapInternalNode *) + MemoryContextAlloc(dt->context, sizeof(VacuumTidMapInternalNode)); + parent->level = level; + parent->itemptrs[0] = old_root_itemptr; + parent->downlinks[0] = dt->root; + parent->num_items = 1; + + dt->last_parents[level] = parent; + dt->root = (VacuumTidMapNode *) parent; + dt->num_levels++; + } + + parent = dt->last_parents[level]; + + if (parent->num_items < MAX_INTERNAL_NODE_ITEMS) + { + parent->itemptrs[parent->num_items] = *new_node_itemptr; + parent->downlinks[parent->num_items] = new_node; + parent->num_items++; + } + else + { + /* Doesn't fit on the parent. Allocate new parent. */ + dt->mem_used += sizeof(VacuumTidMapInternalNode); + parent = (VacuumTidMapInternalNode *) MemoryContextAlloc(dt->context, + sizeof(VacuumTidMapInternalNode)); + parent->level = level; + parent->itemptrs[0] = *new_node_itemptr; + parent->downlinks[0] = new_node; + parent->num_items = 1; + + dt->last_parents[level] = parent; + + /* Link this new parent to its parent */ + update_upper(dt, level + 1, parent, new_node_itemptr); + } +} + +/* + * Does the tid map contain the given TID? + */ +bool +VacuumTidMapContainsTid(VacuumTidMap *dt, ItemPointer itemptr) +{ + VacuumTidMapLeafNode *leaf_node; + VacuumTidMapInternalNode *node; + int level = dt->num_levels - 1; + int itemno; + BlockNumber refblk; + OffsetNumber refoff; + + if (dt->num_entries == 0 || !ItemPointerIsValid(itemptr)) + return false; + + refblk = ItemPointerGetBlockNumberNoCheck(itemptr); + refoff = ItemPointerGetOffsetNumberNoCheck(itemptr); + + /* + * Start from the root, and walk down the B-tree to find the right leaf + * node. + */ + node = (VacuumTidMapInternalNode *) dt->root; + while (level > 0) + { + itemno = vac_itemptr_binsrch(refblk, refoff, node->itemptrs, node->num_items); + + if (itemno < 0) + return false; + + node = (VacuumTidMapInternalNode *) node->downlinks[itemno]; + level--; + } + + leaf_node = (VacuumTidMapLeafNode *) node; + Assert(leaf_node->level == 0); + + /* Binary search in the leaf node */ + itemno = vac_itemptr_binsrch(refblk, refoff, leaf_node->itemptrs, leaf_node->num_items); + + if (itemno < 0) + return false; + else + { + ItemPointer titem = &leaf_node->itemptrs[itemno]; + + return (refblk == ItemPointerGetBlockNumberNoCheck(titem) && + refoff == ItemPointerGetOffsetNumberNoCheck(titem)); + } +} + + +/* + * vac_itemptr_binsrch() -- search a sorted array of item pointers + * + * Returns the index of the last item pointer less than or equal to + * 'refblk' and 'refoff', or -1 if there is no such item pointer.
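+ * + * For example, searching for (5,2) in the array [(3,1), (5,2), (8,1)] + * returns index 1, and so does searching for (5,3). Searching for (2,1) + * returns -1, because every element in the array is greater.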
+ * + * All item pointers in the array are assumed to be valid + */ +static inline int +vac_itemptr_binsrch(BlockNumber refblk, OffsetNumber refoff, + ItemPointer arr, int arr_elems) +{ + BlockNumber blk; + OffsetNumber off; + ItemPointer value; + int left, + right, + mid; + + left = 0; + right = arr_elems; + while (right > left) + { + /* + * We're dealing with indexes in the B-tree nodes, which are orders of + * magnitude smaller than the max range of int, so we don't need to + * worry about overflow here. + */ + mid = (left + right) / 2; + value = &arr[mid]; + + blk = ItemPointerGetBlockNumberNoCheck(value); + if (refblk < blk) + { + right = mid; + } + else if (refblk == blk) + { + off = ItemPointerGetOffsetNumberNoCheck(value); + if (refoff < off) + right = mid; + else if (refoff == off) + return mid; + else + left = mid + 1; + } + else + { + left = mid + 1; + } + } + + return left - 1; +} diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 85d472f0a5..f327f5a7a8 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -155,6 +155,25 @@ extern int vacuum_multixact_freeze_min_age; extern int vacuum_multixact_freeze_table_age; +/* + * prototypes for Vacuum Tid Map code, in vacuumtidmap.c + */ +typedef struct VacuumTidMap VacuumTidMap; + +extern VacuumTidMap *CreateVacuumTidMap(int vac_work_mem); +extern void VacuumTidMapReset(VacuumTidMap *dt); + +extern void VacuumTidMapRecordTid(VacuumTidMap *dt, ItemPointer itemptr); + +extern bool VacuumTidMapIsFull(VacuumTidMap *dt); +extern bool VacuumTidMapIsEmpty(VacuumTidMap *dt); +extern uint64 VacuumTidMapGetNumTuples(VacuumTidMap *dt); + +extern bool VacuumTidMapContainsTid(VacuumTidMap *dt, ItemPointer itemptr); +extern void VacuumTidMapBeginIterate(VacuumTidMap *dt); +extern bool VacuumTidMapNext(VacuumTidMap *dt, + BlockNumber *blkno, int *ntuples, OffsetNumber **offsets); + /* in commands/vacuum.c */ extern void ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel); extern void vacuum(int options, List *relations, VacuumParams *params, diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index d66e2aa3b7..d60523ba99 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -1,7 +1,17 @@ -- -- VACUUM -- -CREATE TABLE vactst (i INT); +CREATE FUNCTION wait_barrier() RETURNS void LANGUAGE plpgsql AS $$ +DECLARE vxids text[]; +BEGIN + -- wait for all transactions to commit to make deleted tuples vacuumable + SELECT array_agg(DISTINCT virtualtransaction) FROM pg_locks WHERE pid <> pg_backend_pid() INTO vxids; + WHILE (SELECT count(virtualtransaction) FROM pg_locks WHERE pid <> pg_backend_pid() AND virtualtransaction = ANY(vxids)) > 0 LOOP + PERFORM pg_sleep(0.1); + END LOOP; +END +$$; +CREATE TABLE vactst (i INT) WITH (autovacuum_enabled = false); INSERT INTO vactst VALUES (1); INSERT INTO vactst SELECT * FROM vactst; INSERT INTO vactst SELECT * FROM vactst; @@ -22,6 +32,12 @@ SELECT count(*) FROM vactst; (1 row) DELETE FROM vactst WHERE i != 0; +SELECT wait_barrier(); + wait_barrier +-------------- + +(1 row) + SELECT * FROM vactst; i --- @@ -49,8 +65,20 @@ SELECT count(*) FROM vactst; (1 row) DELETE FROM vactst WHERE i != 0; +SELECT wait_barrier(); + wait_barrier +-------------- + +(1 row) + VACUUM (FULL) vactst; DELETE FROM vactst; +SELECT wait_barrier(); + wait_barrier +-------------- + +(1 row) + SELECT * FROM vactst; i --- @@ -119,6 +147,48 @@ ANALYZE (nonexistant-arg) does_not_exist; ERROR: syntax error at or near 
"nonexistant" LINE 1: ANALYZE (nonexistant-arg) does_not_exist; ^ +-- large mwm vacuum runs +CREATE UNLOGGED TABLE vactst2 (i INT) WITH (autovacuum_enabled = false); +INSERT INTO vactst2 SELECT * from generate_series(1,300000); +CREATE INDEX ix_vactst2 ON vactst2 (i); +DELETE FROM vactst2 WHERE i % 4 != 1; +SET maintenance_work_mem = 1024; +SELECT wait_barrier(); + wait_barrier +-------------- + +(1 row) + +VACUUM vactst2; +SET maintenance_work_mem TO DEFAULT; +DROP INDEX ix_vactst2; +TRUNCATE TABLE vactst2; +INSERT INTO vactst2 SELECT * from generate_series(1,40); +CREATE INDEX ix_vactst2 ON vactst2 (i); +DELETE FROM vactst2; +SELECT wait_barrier(); + wait_barrier +-------------- + +(1 row) + +VACUUM vactst2; +EXPLAIN (ANALYZE, BUFFERS, COSTS off, TIMING off, SUMMARY off) SELECT * FROM vactst2; + QUERY PLAN +--------------------------------------------- + Seq Scan on vactst2 (actual rows=0 loops=1) +(1 row) + +SELECT count(*) FROM vactst2; + count +------- + 0 +(1 row) + +DROP INDEX ix_vactst2; +TRUNCATE TABLE vactst2; DROP TABLE vaccluster; +DROP TABLE vactst2; DROP TABLE vactst; DROP TABLE vacparted; +DROP FUNCTION wait_barrier(); diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql index 275ce2e270..87ea900686 100644 --- a/src/test/regress/sql/vacuum.sql +++ b/src/test/regress/sql/vacuum.sql @@ -2,7 +2,18 @@ -- VACUUM -- -CREATE TABLE vactst (i INT); +CREATE FUNCTION wait_barrier() RETURNS void LANGUAGE plpgsql AS $$ +DECLARE vxids text[]; +BEGIN + -- wait for all transactions to commit to make deleted tuples vacuumable + SELECT array_agg(DISTINCT virtualtransaction) FROM pg_locks WHERE pid <> pg_backend_pid() INTO vxids; + WHILE (SELECT count(virtualtransaction) FROM pg_locks WHERE pid <> pg_backend_pid() AND virtualtransaction = ANY(vxids)) > 0 LOOP + PERFORM pg_sleep(0.1); + END LOOP; +END +$$; + +CREATE TABLE vactst (i INT) WITH (autovacuum_enabled = false); INSERT INTO vactst VALUES (1); INSERT INTO vactst SELECT * FROM vactst; INSERT INTO vactst SELECT * FROM vactst; @@ -18,6 +29,7 @@ INSERT INTO vactst SELECT * FROM vactst; INSERT INTO vactst VALUES (0); SELECT count(*) FROM vactst; DELETE FROM vactst WHERE i != 0; +SELECT wait_barrier(); SELECT * FROM vactst; VACUUM FULL vactst; UPDATE vactst SET i = i + 1; @@ -35,8 +47,10 @@ INSERT INTO vactst SELECT * FROM vactst; INSERT INTO vactst VALUES (0); SELECT count(*) FROM vactst; DELETE FROM vactst WHERE i != 0; +SELECT wait_barrier(); VACUUM (FULL) vactst; DELETE FROM vactst; +SELECT wait_barrier(); SELECT * FROM vactst; VACUUM (FULL, FREEZE) vactst; @@ -93,6 +107,30 @@ ANALYZE vactst (i), vacparted (does_not_exist); ANALYZE (VERBOSE) does_not_exist; ANALYZE (nonexistant-arg) does_not_exist; +-- large mwm vacuum runs +CREATE UNLOGGED TABLE vactst2 (i INT) WITH (autovacuum_enabled = false); +INSERT INTO vactst2 SELECT * from generate_series(1,300000); +CREATE INDEX ix_vactst2 ON vactst2 (i); +DELETE FROM vactst2 WHERE i % 4 != 1; +SET maintenance_work_mem = 1024; +SELECT wait_barrier(); +VACUUM vactst2; +SET maintenance_work_mem TO DEFAULT; +DROP INDEX ix_vactst2; +TRUNCATE TABLE vactst2; + +INSERT INTO vactst2 SELECT * from generate_series(1,40); +CREATE INDEX ix_vactst2 ON vactst2 (i); +DELETE FROM vactst2; +SELECT wait_barrier(); +VACUUM vactst2; +EXPLAIN (ANALYZE, BUFFERS, COSTS off, TIMING off, SUMMARY off) SELECT * FROM vactst2; +SELECT count(*) FROM vactst2; +DROP INDEX ix_vactst2; +TRUNCATE TABLE vactst2; + DROP TABLE vaccluster; +DROP TABLE vactst2; DROP TABLE vactst; DROP TABLE vacparted; +DROP 
FUNCTION wait_barrier();