diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index f9da24c..d2a0066 100644 *** a/src/backend/commands/vacuumlazy.c --- b/src/backend/commands/vacuumlazy.c *************** *** 85,90 **** --- 85,99 ---- #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ /* + * When a table has no indexes, vacuum the FSM after every 8GB, approximately + * (it won't be exact because we only vacuum FSM after processing a heap page + * that has some removable tuples). When there are indexes, this is ignored, + * and we vacuum FSM after each index/heap cleaning pass. + */ + #define VACUUM_FSM_EVERY_PAGES \ + ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) + + /* * Guesstimation of number of dead tuples per page. This is used to * provide an upper limit to memory allocated when vacuuming small * tables. *************** lazy_vacuum_rel(Relation onerel, int opt *** 285,293 **** pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); - /* Vacuum the Free Space Map */ - FreeSpaceMapVacuum(onerel); - /* * Update statistics in pg_class. * --- 294,299 ---- *************** lazy_scan_heap(Relation onerel, int opti *** 465,471 **** TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; TransactionId relminmxid = onerel->rd_rel->relminmxid; BlockNumber empty_pages, ! vacuumed_pages; double num_tuples, /* total number of nonremovable tuples */ live_tuples, /* live tuples (reltuples estimate) */ tups_vacuumed, /* tuples cleaned up by vacuum */ --- 471,478 ---- TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; TransactionId relminmxid = onerel->rd_rel->relminmxid; BlockNumber empty_pages, ! vacuumed_pages, ! next_fsm_block_to_vacuum; double num_tuples, /* total number of nonremovable tuples */ live_tuples, /* live tuples (reltuples estimate) */ tups_vacuumed, /* tuples cleaned up by vacuum */ *************** lazy_scan_heap(Relation onerel, int opti *** 501,506 **** --- 508,514 ---- relname))); empty_pages = vacuumed_pages = 0; + next_fsm_block_to_vacuum = (BlockNumber) 0; num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) *************** lazy_scan_heap(Relation onerel, int opti *** 752,757 **** --- 760,772 ---- vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; + /* + * Vacuum the Free Space Map to make newly-freed space visible on + * upper-level FSM pages. Note we have not yet processed blkno. + */ + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); + next_fsm_block_to_vacuum = blkno; + /* Report that we are once again scanning the heap */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_SCAN_HEAP); *************** lazy_scan_heap(Relation onerel, int opti *** 1200,1205 **** --- 1215,1233 ---- */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; + + /* + * Periodically do incremental FSM vacuuming to make newly-freed + * space visible on upper FSM pages. Note: although we've cleaned + * the current block, we haven't yet updated its FSM entry (that + * happens further down), so passing end == blkno is correct. + */ + if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) + { + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + } } freespace = PageGetHeapFreeSpace(page); *************** lazy_scan_heap(Relation onerel, int opti *** 1368,1373 **** --- 1396,1408 ---- vacrelstats->num_index_scans++; } + /* + * Vacuum the remainder of the Free Space Map. We must do this whether or + * not there were indexes. + */ + if (blkno > next_fsm_block_to_vacuum) + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); + /* report all blocks vacuumed; and that we're cleaning up */ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README index bbd1b93..e7ff23b 100644 *** a/src/backend/storage/freespace/README --- b/src/backend/storage/freespace/README *************** have a corrupted page, with a parent som *** 180,192 **** Secondly, if we detect corrupted pages while we search, traversing down the tree. That check will notice if a parent node is set to too high a value. In both cases, the upper nodes on the page are immediately rebuilt, fixing ! the corruption. ! Vacuum updates all the bottom level pages with correct amount of free space ! on the heap pages, fixing any outdated values there. After the heap and ! index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is ! scanned in depth-first order. This fixes any discrepancies between upper ! and lower level FSM pages. TODO ---- --- 180,192 ---- Secondly, if we detect corrupted pages while we search, traversing down the tree. That check will notice if a parent node is set to too high a value. In both cases, the upper nodes on the page are immediately rebuilt, fixing ! the corruption so far as that page is concerned. ! VACUUM updates all the bottom-level FSM pages with the correct amount of free ! space on corresponding heap pages, as it proceeds through the heap. This ! goes through fsm_set_avail(), so that the upper nodes on those pages are ! immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range] ! to propagate the new free-space info into the upper pages of the FSM tree. TODO ---- diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index dd8ae52..8ae981f 100644 *** a/src/backend/storage/freespace/freespace.c --- b/src/backend/storage/freespace/freespace.c *************** static Size fsm_space_cat_to_avail(uint8 *** 108,114 **** static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue); static BlockNumber fsm_search(Relation rel, uint8 min_cat); ! static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof); static BlockNumber fsm_get_lastblckno(Relation rel, FSMAddress addr); static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat); --- 108,116 ---- static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue); static BlockNumber fsm_search(Relation rel, uint8 min_cat); ! static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, ! BlockNumber start, BlockNumber end, ! bool *eof); static BlockNumber fsm_get_lastblckno(Relation rel, FSMAddress addr); static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat); *************** FreeSpaceMapTruncateRel(Relation rel, Bl *** 370,390 **** */ if (rel->rd_smgr) rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; } /* ! * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM */ void FreeSpaceMapVacuum(Relation rel) { bool dummy; ! /* ! * Traverse the tree in depth-first order. The tree is stored physically ! * in depth-first order, so this should be pretty I/O efficient. ! */ ! fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy); } /******** Internal routines ********/ --- 372,418 ---- */ if (rel->rd_smgr) rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; + + /* + * Update upper-level FSM pages to account for the truncation. This is + * important because the just-truncated pages were likely marked as + * all-free, and would be preferentially selected. + */ + FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); } /* ! * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM ! * ! * We assume that the bottom-level pages have already been updated with ! * new free-space information. */ void FreeSpaceMapVacuum(Relation rel) { bool dummy; ! /* Recursively scan the tree, starting at the root */ ! (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, ! (BlockNumber) 0, InvalidBlockNumber, ! &dummy); ! } ! ! /* ! * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM ! * ! * As above, but assume that only heap pages between start and end-1 inclusive ! * have new free-space information, so update only the upper-level slots ! * covering that block range. end == InvalidBlockNumber is equivalent to ! * "all the rest of the relation". ! */ ! void ! FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end) ! { ! bool dummy; ! ! /* Recursively scan the tree, starting at the root */ ! (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy); } /******** Internal routines ********/ *************** fsm_search(Relation rel, uint8 min_cat) *** 783,791 **** /* * Recursive guts of FreeSpaceMapVacuum */ static uint8 ! fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) { Buffer buf; Page page; --- 811,831 ---- /* * Recursive guts of FreeSpaceMapVacuum + * + * Examine the FSM page indicated by addr, as well as its children, updating + * upper-level nodes that cover the heap block range from start to end-1. + * (It's okay if end is beyond the actual end of the map.) + * Return the maximum freespace value on this page. + * + * If addr is past the end of the FSM, set *eof_p to true and return 0. + * + * This traverses the tree in depth-first order. The tree is stored + * physically in depth-first order, so this should be pretty I/O efficient. */ static uint8 ! fsm_vacuum_page(Relation rel, FSMAddress addr, ! BlockNumber start, BlockNumber end, ! bool *eof_p) { Buffer buf; Page page; *************** fsm_vacuum_page(Relation rel, FSMAddress *** 804,818 **** page = BufferGetPage(buf); /* ! * Recurse into children, and fix the information stored about them at ! * this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { ! int slot; bool eof = false; ! for (slot = 0; slot < SlotsPerFSMPage; slot++) { int child_avail; --- 844,894 ---- page = BufferGetPage(buf); /* ! * If we're above the bottom level, recurse into children, and fix the ! * information stored about them at this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { ! FSMAddress fsm_start, ! fsm_end; ! uint16 fsm_start_slot, ! fsm_end_slot; ! int slot, ! start_slot, ! end_slot; bool eof = false; ! /* ! * Compute the range of slots we need to update on this page, given ! * the requested range of heap blocks to consider. (Some of this work ! * will be duplicated in each recursive call, but it's cheap enough to ! * not worry about.) ! */ ! fsm_start = fsm_get_location(start, &fsm_start_slot); ! fsm_end = fsm_get_location(end, &fsm_end_slot); ! ! while (fsm_start.level < addr.level) ! { ! fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot); ! fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot); ! } ! Assert(fsm_start.level == addr.level); ! ! if (fsm_start.logpageno == addr.logpageno) ! start_slot = fsm_start_slot; ! else if (fsm_start.logpageno > addr.logpageno) ! start_slot = SlotsPerFSMPage; ! else ! start_slot = 0; ! ! if (fsm_end.logpageno == addr.logpageno) ! end_slot = fsm_end_slot; ! else if (fsm_end.logpageno > addr.logpageno) ! end_slot = SlotsPerFSMPage; ! else ! end_slot = -1; ! ! for (slot = start_slot; slot < end_slot; slot++) { int child_avail; *************** fsm_vacuum_page(Relation rel, FSMAddress *** 820,826 **** /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) ! child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof); else child_avail = 0; --- 896,904 ---- /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) ! child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), ! start, end, ! &eof); else child_avail = 0; *************** fsm_vacuum_page(Relation rel, FSMAddress *** 835,840 **** --- 913,919 ---- } } + /* Now get the maximum value on the page, to return to caller */ max_avail = fsm_get_max_avail(BufferGetPage(buf)); /* diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index a517d7e..a203251 100644 *** a/src/include/storage/freespace.h --- b/src/include/storage/freespace.h *************** extern void XLogRecordPageWithFreeSpace( *** 32,37 **** --- 32,39 ---- extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks); extern void FreeSpaceMapVacuum(Relation rel); + extern void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, + BlockNumber end); extern void UpdateFreeSpaceMap(Relation rel, BlockNumber startBlkNum, BlockNumber endBlkNum,