diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index b68a0d1..b95275f 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -138,6 +138,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amendscan = blendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 1b45a4c..ba3fffb 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -111,6 +111,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amendscan = brinendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index b8aa9bc..491e411 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -88,6 +88,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amendscan = gistendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 6806e32..2026004 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -85,6 +85,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amendscan = hashendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = hashrecheck; PG_RETURN_POINTER(amroutine); } @@ -265,6 +266,8 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; ItemPointer current; bool res; + IndexTuple itup; + /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; @@ -302,8 +305,6 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { - IndexTuple itup; - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 8d43b38..05b078f 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -59,6 +59,8 @@ _hash_next(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; return true; } @@ -407,6 +409,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; + return true; } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index fa9cbdc..6897985 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -17,8 +17,12 @@ #include "access/hash.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" +#include "nodes/execnodes.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/datum.h" #define CALC_NEW_BUCKET(old_bucket, lowmask) \ old_bucket | (lowmask + 1) @@ -446,3 +450,109 @@ _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, return new_bucket; } + +/* + * Recheck if the heap tuple satisfies the key stored in the index tuple + */ +bool +hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, 
HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum values2[INDEX_MAX_KEYS]; + bool isnull2[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. + */ + if (indexInfo->ii_Expressions != NIL || + indexInfo->ii_ExclusionOps != NULL) + { + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + } + else + estate = NULL; + + /* + * Form the index values and isnull flags for the index entry that we need + * to check. + * + * Note: if the index uses functions that are not as immutable as they are + * supposed to be, this could produce an index tuple different from the + * original. The index AM can catch such errors by verifying that it + * finds a matching index entry with the tuple's TID. For exclusion + * constraints we check this in check_exclusion_constraint(). + */ + FormIndexDatum(indexInfo, slot, estate, values, isnull); + + /* + * HASH indexes compute a hash value of the key and store that in the + * index. So we must first obtain the hash of the value obtained from the + * heap and then do a comparison + */ + _hash_convert_tuple(indexRel, values, isnull, values2, isnull2); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL then they are equal + */ + if (isnull2[i - 1] && indxisnull) + continue; + + /* + * If either is NULL then they are not equal + */ + if (isnull2[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now do a raw memory comparison + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values2[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + if (estate != NULL) + FreeExecutorState(estate); + + ExecDropSingleTupleTableSlot(slot); + + return equal; +} diff --git a/src/backend/access/heap/README.WARM b/src/backend/access/heap/README.WARM new file mode 100644 index 0000000..f793570 --- /dev/null +++ b/src/backend/access/heap/README.WARM @@ -0,0 +1,271 @@ +src/backend/access/heap/README.WARM + +Write Amplification Reduction Method (WARM) +=========================================== + +The Heap Only Tuple (HOT) feature greatly eliminated redudant index +entries and allowed re-use of the dead space occupied by previously +updated or deleted tuples (see src/backend/access/heap/README.HOT) + +One of the necessary conditions for satisfying HOT update is that the +update must not change a column used in any of the indexes on the table. +The condition is sometimes hard to meet, especially for complex +workloads with several indexes on large yet frequently updated tables. +Worse, sometimes only one or two index columns may be updated, but the +regular non-HOT update will still insert a new index entry in every +index on the table, irrespective of whether the key pertaining to the +index changed or not. 
+
+WARM is a technique devised to address these problems.
+
+
+Update Chains With Multiple Index Entries Pointing to the Root
+--------------------------------------------------------------
+
+When a non-HOT update is caused by an index key change, a new index
+entry must be inserted for the changed index. But if the index key
+hasn't changed for the other indexes, we don't really need to insert a
+new entry in them. Even though the existing index entry points to the
+old tuple, the new tuple is reachable via the t_ctid chain. To keep
+things simple, a WARM update requires that the heap block have enough
+space to store the new version of the tuple; this is the same
+requirement as for HOT updates.
+
+In WARM, we ensure that every index entry always points to the root of
+the WARM chain. In fact, a WARM chain looks exactly like a HOT chain
+except that there can be multiple index entries pointing to the root of
+the chain. So when a new entry is inserted into an index for the
+updated tuple, and we are doing a WARM update, the new entry is made to
+point to the root of the WARM chain.
+
+For example, consider a table with two columns and one index on each
+column. When a tuple is first inserted into the table, we have exactly
+one index entry pointing to the tuple from each index.
+
+  lp [1]
+  [1111, aaaa]
+
+  Index1's entry (1111) points to 1
+  Index2's entry (aaaa) also points to 1
+
+Now if the tuple's second column is updated and there is room on the
+page, we perform a WARM update. Index1 does not get any new entry, and
+Index2's new entry will still point to the root tuple of the chain.
+
+  lp [1]           [2]
+  [1111, aaaa]->[1111, bbbb]
+
+  Index1's entry (1111) points to 1
+  Index2's old entry (aaaa) points to 1
+  Index2's new entry (bbbb) also points to 1
+
+"An update chain which has more than one index entry pointing to its
+root line pointer is called a WARM chain, and the action that creates a
+WARM chain is called a WARM update."
+
+Since all indexes always point to the root of the WARM chain, even when
+there is more than one index entry, WARM chains can be pruned and dead
+tuples can be removed without any corresponding index cleanup.
+
+While this solves the problem of pruning dead tuples from a HOT/WARM
+chain, it also opens up a new technical challenge, because now a heap
+tuple can be reachable from multiple index entries, each having a
+different index key. While MVCC still ensures that only valid tuples
+are returned, a tuple with a non-matching index key may be returned
+through the wrong index entry. In the above example, tuple [1111, bbbb]
+is reachable from both key (aaaa) and key (bbbb). For this reason,
+tuples returned from a WARM chain must always be rechecked for an index
+key match.
+
+Recheck Index Key Against Heap Tuple
+------------------------------------
+
+Since every index AM has its own notion of index tuples, each index AM
+must implement its own method to recheck heap tuples. For example, a
+hash index stores the hash value of the column, so the recheck routine
+for the hash AM must first compute the hash value of the heap attribute
+and then compare it against the value stored in the index tuple.
+
+The patch currently implements recheck routines for hash and btree
+indexes. If a table has an index whose AM does not provide a recheck
+routine, WARM updates are disabled on that table.
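[Editorial aside, not part of the patch: to make the recheck idea concrete, here is a condensed sketch of what such an amrecheck routine does, boiled down from the btrecheck()/hashrecheck() implementations added elsewhere in this patch. The function name is hypothetical, and the expression-index EState handling present in the real routines is omitted for brevity.]

    /*
     * Condensed illustration of an amrecheck routine: recompute the index
     * datums from the heap tuple and compare them, column by column, with
     * the values stored in the index tuple.  (The real routines also set
     * up an EState when the index has expressions; omitted here.)
     */
    bool
    recheck_sketch(Relation indexRel, IndexTuple indexTuple,
                   Relation heapRel, HeapTuple heapTuple)
    {
        IndexInfo  *indexInfo = BuildIndexInfo(indexRel);
        TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
        Datum       values[INDEX_MAX_KEYS];
        bool        isnull[INDEX_MAX_KEYS];
        int         natts = indexRel->rd_rel->relnatts;
        bool        equal = true;
        int         i;

        ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
        FormIndexDatum(indexInfo, slot, NULL, values, isnull);

        for (i = 1; i <= natts; i++)
        {
            bool        indxisnull;
            Datum       indxvalue = index_getattr(indexTuple, i,
                                                  indexRel->rd_att, &indxisnull);
            Form_pg_attribute att = indexRel->rd_att->attrs[i - 1];

            if (isnull[i - 1] && indxisnull)
                continue;           /* both NULL: considered equal */

            if (isnull[i - 1] || indxisnull ||
                !datumIsEqual(values[i - 1], indxvalue,
                              att->attbyval, att->attlen))
            {
                equal = false;
                break;
            }
        }

        ExecDropSingleTupleTableSlot(slot);
        return equal;
    }

[The hash variant additionally runs the computed datums through _hash_convert_tuple() before comparing, since a hash index stores only the hash code rather than the original key value.]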
+
+Problem With Duplicate (key, ctid) Index Entries
+------------------------------------------------
+
+The index-key recheck logic works as long as no two index entries with
+the same key point to the same WARM chain. If such duplicates exist,
+the same valid tuple is reachable via multiple index entries, and it
+satisfies the index key check for each of them. In the above example,
+if the tuple [1111, bbbb] is again updated to [1111, aaaa] and we insert
+a new index entry (aaaa) pointing to the root line pointer, we end up
+with the following structure:
+
+  lp [1]           [2]           [3]
+  [1111, aaaa]->[1111, bbbb]->[1111, aaaa]
+
+  Index1's entry (1111) points to 1
+  Index2's oldest entry (aaaa) points to 1
+  Index2's old entry (bbbb) also points to 1
+  Index2's new entry (aaaa) also points to 1
+
+We must solve this problem to ensure that the same tuple is not
+reachable via multiple index pointers. There are a couple of ways to
+address this issue:
+
+1. Do not allow a WARM update to a tuple that already belongs to a WARM
+chain. This guarantees that there can never be duplicate index entries
+pointing to the same root line pointer, because we must have checked the
+old and new index keys while doing the first WARM update.
+
+2. Do not allow duplicate (key, ctid) index pointers. In the above
+example, since (aaaa, 1) already exists in the index, we must not insert
+a duplicate index entry.
+
+The patch currently implements option 1, i.e. it does not WARM update a
+tuple that belongs to a WARM chain. HOT updates are still fine because
+they do not add a new index entry.
+
+Even with this restriction, WARM is a significant improvement because
+the number of regular (non-HOT, non-WARM) updates is roughly cut in
+half.
+
+Expression and Partial Indexes
+------------------------------
+
+Expressions may evaluate to the same value even if the underlying column
+values have changed. A simple example is an index on "lower(col)",
+which returns the same value if the new heap value differs only in case.
+So we cannot rely solely on the heap column check to decide whether or
+not to insert a new index entry for expression indexes. Similarly, for
+partial indexes, the predicate expression must be evaluated to decide
+whether or not a new index entry is needed when columns referenced in
+the predicate expression change.
+
+(Neither of these is currently handled; we simply disallow a WARM update
+if a column used in an expression index or an index predicate has
+changed.)
+
+
+Efficiently Finding the Root Line Pointer
+-----------------------------------------
+
+During a WARM update, we must be able to find the root line pointer of
+the tuple being updated. The t_ctid field in the heap tuple header is
+normally used to find the next tuple in the update chain. But the tuple
+that we are updating must be the last tuple in the update chain, and in
+that case t_ctid usually points to the tuple itself. So, in theory, we
+could use t_ctid to store additional information in the last tuple of
+the update chain, as long as the fact that this is the last tuple is
+recorded elsewhere.
+
+We therefore use another bit from t_infomask2 to explicitly identify the
+last tuple in the update chain:
+
+HEAP_LATEST_TUPLE - When this bit is set, the tuple is the last tuple in
+the update chain, and the OffsetNumber part of t_ctid points to the root
+line pointer of the chain.
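[Editorial aside, not part of the patch: the macro definitions for this flag are not included in the hunks shown here. As a rough illustration of the accessors referenced later in heapam.c, such as HeapTupleHeaderSetHeapLatest() and HeapTupleHeaderGetRootOffset(), they could look roughly like the sketch below; the bit value and exact encoding are assumptions, not taken from the patch.]

    /* Illustrative sketch only; the real definitions live in the patch's
     * header changes.  The bit value below is a placeholder. */
    #define HEAP_LATEST_TUPLE       0x4000  /* hypothetical t_infomask2 bit */

    #define HeapTupleHeaderSetHeapLatest(tup) \
        ((tup)->t_infomask2 |= HEAP_LATEST_TUPLE)

    #define HeapTupleHeaderHasRootOffset(tup) \
        (((tup)->t_infomask2 & HEAP_LATEST_TUPLE) != 0)

    /* valid only when HeapTupleHeaderHasRootOffset() is true */
    #define HeapTupleHeaderGetRootOffset(tup) \
        ItemPointerGetOffsetNumber(&(tup)->t_ctid)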
+
+If an UPDATE operation is aborted, the last tuple in the update chain
+becomes dead, and the root line pointer information stored in the tuple
+that remains the last valid tuple in the chain is lost with it. In such
+rare cases, the root line pointer must be found the hard way, by
+scanning the entire heap page.
+
+Tracking WARM Chains
+--------------------
+
+The old tuple and every subsequent tuple in the chain are marked with a
+special HEAP_WARM_TUPLE flag. We use the last remaining bit in
+t_infomask2 to store this information.
+
+When a tuple is returned from a WARM chain, the caller must do
+additional checks to ensure that the tuple matches the index key. Even
+if the tuple precedes the WARM update in the chain, it must still be
+rechecked for an index key match (this is the case where an old tuple is
+reached via the new index key). So we must always follow the update
+chain to its end to check whether it is a WARM chain.
+
+When the old updated tuple is retired and the root line pointer is
+converted into a redirected line pointer, we can copy the information
+about the WARM chain to the redirected line pointer by storing a special
+value in the lp_len field of the line pointer. This handles the most
+common case, where a WARM chain is replaced by a redirect line pointer
+and a single tuple in the chain.
+
+Converting WARM chains back to HOT chains (VACUUM ?)
+----------------------------------------------------
+
+The current implementation of WARM allows only one WARM update per
+chain. This simplifies the design and addresses certain issues around
+duplicate scans. But it also implies that the benefit of WARM can be no
+more than 50%, which is still significant; if we could return WARM
+chains to normal status, we could do far more WARM updates.
+
+A distinct property of a WARM chain is that at least one index has more
+than one live index entry pointing to the root of the chain. In other
+words, if we can remove the duplicate entry from every index, or
+conclusively prove that there are no duplicate index entries for the
+root line pointer, the chain can again be marked as HOT.
+
+Here is one idea:
+
+A WARM chain has two parts, separated by the tuple that caused the WARM
+update. All tuples within each part have matching index keys, but
+certain index keys may not match between the two parts. Let's say we
+mark the heap tuples in each part with a special Red/Blue flag, and the
+same flag is replicated in the index tuples. For example, when new rows
+are inserted into a table, they are marked with the Blue flag and the
+index entries associated with those rows are also marked with the Blue
+flag. When a row is WARM updated, the new version is marked with the
+Red flag and the new index entry created by the update is also marked
+with the Red flag.
+
+
+Heap chain: [1]              [2]              [3]              [4]
+            [aaaa, 1111]B -> [aaaa, 1111]B -> [bbbb, 1111]R -> [bbbb, 1111]R
+
+Index1: (aaaa)B points to 1 (satisfies only tuples marked with B)
+        (bbbb)R points to 1 (satisfies only tuples marked with R)
+
+Index2: (1111)B points to 1 (satisfies both B and R tuples)
+
+
+Clearly, for indexes with both Red and Blue pointers, a heap tuple with
+the Blue flag is reachable from the Blue pointer and one with the Red
+flag is reachable from the Red pointer. But for indexes which did not
+create a new entry, both Blue and Red tuples are reachable from the Blue
+pointer (there is no Red pointer in such indexes). So, as a side note,
+matching Red and Blue flags is not enough from the index scan
+perspective.
+
+During the first heap scan of VACUUM, we look for tuples with
+HEAP_WARM_TUPLE set.
If all live tuples in the chain are either marked +with Blue flag or Red flag (but no mix of Red and Blue), then the chain +is a candidate for HOT conversion. We remember the root line pointer +and Red-Blue flag of the WARM chain in a separate array. + +If we have a Red WARM chain, then our goal is to remove Blue pointers +and vice versa. But there is a catch. For Index2 above, there is only +Blue pointer and that must not be removed. IOW we should remove Blue +pointer iff a Red pointer exists. Since index vacuum may visit Red and +Blue pointers in any order, I think we will need another index pass to +remove dead index pointers. So in the first index pass we check which +WARM candidates have 2 index pointers. In the second pass, we remove the +dead pointer and reset Red flag is the surviving index pointer is Red. + +During the second heap scan, we fix WARM chain by clearing +HEAP_WARM_TUPLE flag and also reset Red flag to Blue. + +There are some more problems around aborted vacuums. For example, if +vacuum aborts after changing Red index flag to Blue but before removing +the other Blue pointer, we will end up with two Blue pointers to a Red +WARM chain. But since the HEAP_WARM_TUPLE flag on the heap tuple is +still set, further WARM updates to the chain will be blocked. I guess we +will need some special handling for case with multiple Blue pointers. We +can either leave these WARM chains alone and let them die with a +subsequent non-WARM update or must apply heap-recheck logic during index +vacuum to find the dead pointer. Given that vacuum-aborts are not +common, I am inclined to leave this case unhandled. We must still check +for presence of multiple Blue pointers and ensure that we don't +accidently remove either of the Blue pointers and not clear WARM chains +either. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index bef9c84..b3de79c 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -99,7 +99,10 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, static void HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, + Bitmapset *exprindx_attrs, + Bitmapset **updated_attrs, + bool *satisfies_hot, bool *satisfies_warm, + bool *satisfies_key, bool *satisfies_id, HeapTuple oldtup, HeapTuple newtup); static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, @@ -1960,6 +1963,76 @@ heap_fetch(Relation relation, } /* + * Check if the HOT chain is originating or continuing at tid ever became a + * WARM chain, even if the actual UPDATE operation finally aborted. + */ +static void +hot_check_warm_chain(Page dp, ItemPointer tid, bool *recheck) +{ + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + HeapTupleData heapTuple; + + if (*recheck == true) + return; + + offnum = ItemPointerGetOffsetNumber(tid); + heapTuple.t_self = *tid; + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + break; + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum); + + /* + * The xmin should match the previous xmax value, else chain is + * broken. 
+ */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + + /* + * Presence of either WARM or WARM updated tuple signals possible + * breakage and the caller must recheck tuple returned from this chain + * for index satisfaction + */ + if (HeapTupleHeaderIsHeapWarmTuple(heapTuple.t_data)) + { + *recheck = true; + break; + } + + /* + * Check to see if HOT chain continues past this tuple; if so fetch + * the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(&heapTuple)) + { + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data); + } + else + break; /* end of chain */ + } + +} + +/* * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot * * On entry, *tid is the TID of a tuple (either a simple tuple, or the root @@ -1979,11 +2052,14 @@ heap_fetch(Relation relation, * Unlike heap_fetch, the caller must already have pin and (at least) share * lock on the buffer; it is still pinned/locked at exit. Also unlike * heap_fetch, we do not report any pgstats count; caller may do so if wanted. + * + * recheck should be set false on entry by caller, will be set true on exit + * if a WARM tuple is encountered. */ bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call) + bool *all_dead, bool first_call, bool *recheck) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; @@ -2025,6 +2101,16 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, /* Follow the redirect */ offnum = ItemIdGetRedirect(lp); at_chain_start = false; + + /* Check if it's a WARM chain */ + if (recheck && *recheck == false) + { + if (ItemIdIsHeapWarm(lp)) + { + *recheck = true; + Assert(!IsSystemRelation(relation)); + } + } continue; } /* else must be end of chain */ @@ -2037,9 +2123,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum); /* - * Shouldn't see a HEAP_ONLY tuple at chain start. + * Shouldn't see a HEAP_ONLY tuple at chain start, unless we are + * dealing with a WARM updated tuple in which case deferred triggers + * may request to fetch a WARM tuple from middle of a chain. */ - if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple) && + !HeapTupleIsHeapWarmTuple(heapTuple)) break; /* @@ -2052,6 +2141,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, break; /* + * Check if there exists a WARM tuple somewhere down the chain and set + * recheck to TRUE. + * + * XXX This is not very efficient right now, and we should look for + * possible improvements here + */ + if (recheck && *recheck == false) + { + hot_check_warm_chain(dp, &heapTuple->t_self, recheck); + + /* WARM is not supported on system tables yet */ + if (*recheck == true) + Assert(!IsSystemRelation(relation)); + } + + /* * When first_call is true (and thus, skip is initially false) we'll * return the first tuple we find. But on later passes, heapTuple * will initially be pointing to the tuple we returned last time. 
@@ -2124,18 +2229,41 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, - bool *all_dead) + bool *all_dead, bool *recheck, Buffer *cbuffer, + HeapTuple heapTuple) { bool result; Buffer buffer; - HeapTupleData heapTuple; + ItemPointerData ret_tid = *tid; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); - result = heap_hot_search_buffer(tid, relation, buffer, snapshot, - &heapTuple, all_dead, true); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); + result = heap_hot_search_buffer(&ret_tid, relation, buffer, snapshot, + heapTuple, all_dead, true, recheck); + + /* + * If we are returning a potential candidate tuple from this chain and the + * caller has requested for "recheck" hint, keep the buffer locked and + * pinned. The caller must release the lock and pin on the buffer in all + * such cases + */ + if (!result || !recheck || !(*recheck)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* + * Set the caller supplied tid with the actual location of the tuple being + * returned + */ + if (result) + { + *tid = ret_tid; + if (cbuffer) + *cbuffer = buffer; + } + return result; } @@ -3442,13 +3570,15 @@ simple_heap_delete(Relation relation, ItemPointer tid) HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode) + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **updated_attrs, bool *warm_update) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; Bitmapset *key_attrs; Bitmapset *id_attrs; + Bitmapset *exprindx_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -3469,9 +3599,11 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool satisfies_hot; + bool satisfies_warm; bool satisfies_key; bool satisfies_id; bool use_hot_update = false; + bool use_warm_update = false; bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; @@ -3496,6 +3628,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation"))); + /* Assume no-warm update */ + if (warm_update) + *warm_update = false; + /* * Fetch the list of attributes to be checked for HOT update. This is * wasted effort if we fail to update or have to put the new tuple on a @@ -3512,6 +3648,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY); + exprindx_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_EXPR_PREDICATE); block = ItemPointerGetBlockNumber(otid); offnum = ItemPointerGetOffsetNumber(otid); @@ -3571,7 +3709,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * serendipitiously arrive at the same key values. 
*/ HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, id_attrs, - &satisfies_hot, &satisfies_key, + exprindx_attrs, + updated_attrs, + &satisfies_hot, &satisfies_warm, + &satisfies_key, &satisfies_id, &oldtup, newtup); if (satisfies_key) { @@ -4118,6 +4259,34 @@ l2: */ if (satisfies_hot) use_hot_update = true; + else + { + /* + * If no WARM updates yet on this chain, let this update be a WARM + * update. + * + * We check for both warm and warm updated tuples since if the + * previous WARM update aborted, we may still have added + * another index entry for this HOT chain. In such situations, we + * must not attempt a WARM update until duplicate (key, CTID) index + * entry issue is sorted out + * + * XXX Later we'll add more checks to ensure WARM chains can + * further be WARM updated. This is probably good to do first rounf + * of tests of remaining functionality + * + * XXX Disable WARM updates on system tables. There is nothing in + * principle that stops us from supporting this. But it would + * require API change to propogate the changed columns back to the + * caller so that CatalogUpdateIndexes() can avoid adding new + * entries to indexes that are not changed by update. This will be + * fixed once basic patch is tested. !!FIXME + */ + if (satisfies_warm && + !HeapTupleIsHeapWarmTuple(&oldtup) && + !IsSystemRelation(relation)) + use_warm_update = true; + } } else { @@ -4158,6 +4327,21 @@ l2: HeapTupleSetHeapOnly(heaptup); /* Mark the caller's copy too, in case different from heaptup */ HeapTupleSetHeapOnly(newtup); + + /* + * Even if we are doing a HOT update, we must carry forward the WARM + * flag because we may have already inserted another index entry + * pointing to our root and a third entry may create duplicates + * + * XXX This should be revisited if we get index (key, CTID) duplicate + * detection mechanism in place + */ + if (HeapTupleIsHeapWarmTuple(&oldtup)) + { + HeapTupleSetHeapWarmTuple(heaptup); + HeapTupleSetHeapWarmTuple(newtup); + } + /* * For HOT (or WARM) updated tuples, we store the offset of the root * line pointer of this chain in the ip_posid field of the new tuple. 
@@ -4173,12 +4357,38 @@ l2: ItemPointerGetOffsetNumber(&(oldtup.t_self)), &root_offnum); } + else if (use_warm_update) + { + Assert(!IsSystemRelation(relation)); + + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + HeapTupleSetHeapWarmTuple(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + HeapTupleSetHeapWarmTuple(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + HeapTupleSetHeapWarmTuple(newtup); + if (HeapTupleHeaderHasRootOffset(oldtup.t_data)) + root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data); + else + heap_get_root_tuple_one(page, + ItemPointerGetOffsetNumber(&(oldtup.t_self)), + &root_offnum); + + /* Let the caller know we did a WARM update */ + if (warm_update) + *warm_update = true; + } else { /* Make sure tuples are correctly marked as not-HOT */ HeapTupleClearHotUpdated(&oldtup); HeapTupleClearHeapOnly(heaptup); HeapTupleClearHeapOnly(newtup); + HeapTupleClearHeapWarmTuple(heaptup); + HeapTupleClearHeapWarmTuple(newtup); root_offnum = InvalidOffsetNumber; } @@ -4297,7 +4507,10 @@ l2: if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); - pgstat_count_heap_update(relation, use_hot_update); + /* + * Count HOT and WARM updates separately + */ + pgstat_count_heap_update(relation, use_hot_update, use_warm_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -4405,6 +4618,13 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, * will be checking very similar sets of columns, and doing the same tests on * them, it makes sense to optimize and do them together. * + * The exprindx_attrs designates the set of attributes used in expression or + * predicate indexes. Currently, we don't allow WARM updates if expression or + * predicate index column is updated + * + * If updated_attrs is not NULL, then the caller is always interested in + * knowing the list of changed attributes + * * We receive three bitmapsets comprising the three sets of columns we're * interested in. Note these are destructively modified; that is OK since * this is invoked at most once in heap_update. @@ -4417,7 +4637,11 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, static void HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, + Bitmapset *exprindx_attrs, + Bitmapset **updated_attrs, + bool *satisfies_hot, + bool *satisfies_warm, + bool *satisfies_key, bool *satisfies_id, HeapTuple oldtup, HeapTuple newtup) { @@ -4427,6 +4651,7 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, bool hot_result = true; bool key_result = true; bool id_result = true; + Bitmapset *hot_attrs_copy = bms_copy(hot_attrs); /* If REPLICA IDENTITY is set to FULL, id_attrs will be empty. */ Assert(bms_is_subset(id_attrs, key_attrs)); @@ -4454,8 +4679,11 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, * Since the HOT attributes are a superset of the key attributes and * the key attributes are a superset of the id attributes, this logic * is guaranteed to identify the next column that needs to be checked. 
+ * + * If the caller also wants to know the list of updated index + * attributes, we must scan through all the attributes */ - if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) + if ((hot_result || updated_attrs) && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) check_now = next_hot_attnum; else if (key_result && next_key_attnum > FirstLowInvalidHeapAttributeNumber) check_now = next_key_attnum; @@ -4476,8 +4704,16 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, if (check_now == next_id_attnum) id_result = false; + /* + * Add the changed attribute to updated_attrs if the caller has + * asked for it + */ + if (updated_attrs) + *updated_attrs = bms_add_member(*updated_attrs, check_now - + FirstLowInvalidHeapAttributeNumber); + /* if all are false now, we can stop checking */ - if (!hot_result && !key_result && !id_result) + if (!hot_result && !key_result && !id_result && !updated_attrs) break; } @@ -4488,7 +4724,7 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, * bms_first_member() will return -1 and the attribute number will end * up with a value less than FirstLowInvalidHeapAttributeNumber. */ - if (hot_result && check_now == next_hot_attnum) + if ((hot_result || updated_attrs) && check_now == next_hot_attnum) { next_hot_attnum = bms_first_member(hot_attrs); next_hot_attnum += FirstLowInvalidHeapAttributeNumber; @@ -4505,6 +4741,29 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, } } + /* + * If an attributed used in the expression of an expression index or + * predicate of a predicate index has changed, we don't yet support WARM + * update + */ + if (updated_attrs && bms_overlap(*updated_attrs, exprindx_attrs)) + *satisfies_warm = false; + /* If the table does not support WARM update, honour that */ + else if (!relation->rd_supportswarm) + *satisfies_warm = false; + /* + * If all index keys are being updated, there is hardly any point in doing + * a WARM update. + */ + else if (updated_attrs && bms_is_subset(hot_attrs_copy, *updated_attrs)) + *satisfies_warm = false; + /* + * XXX Should we handle some more cases, such as when an update will result + * in many or most indexes, should we fall back to a regular update? 
+ */ + else + *satisfies_warm = true; + *satisfies_hot = hot_result; *satisfies_key = key_result; *satisfies_id = id_result; @@ -4528,7 +4787,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, NULL, NULL); switch (result) { case HeapTupleSelfUpdated: @@ -7426,6 +7685,7 @@ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, + OffsetNumber *warm, int nwarm, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid) @@ -7439,6 +7699,7 @@ log_heap_clean(Relation reln, Buffer buffer, xlrec.latestRemovedXid = latestRemovedXid; xlrec.nredirected = nredirected; xlrec.ndead = ndead; + xlrec.nwarm = nwarm; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapClean); @@ -7461,6 +7722,10 @@ log_heap_clean(Relation reln, Buffer buffer, XLogRegisterBufData(0, (char *) nowdead, ndead * sizeof(OffsetNumber)); + if (nwarm > 0) + XLogRegisterBufData(0, (char *) warm, + nwarm * sizeof(OffsetNumber)); + if (nunused > 0) XLogRegisterBufData(0, (char *) nowunused, nunused * sizeof(OffsetNumber)); @@ -7566,6 +7831,7 @@ log_heap_update(Relation reln, Buffer oldbuf, bool need_tuple_data = RelationIsLogicallyLogged(reln); bool init; int bufflags; + bool warm_update; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); @@ -7577,6 +7843,9 @@ log_heap_update(Relation reln, Buffer oldbuf, else info = XLOG_HEAP_UPDATE; + if (HeapTupleIsHeapWarmTuple(newtup)) + warm_update = true; + /* * If the old and new tuple are on the same page, we only need to log the * parts of the new tuple that were changed. That saves on the amount of @@ -7650,6 +7919,8 @@ log_heap_update(Relation reln, Buffer oldbuf, xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; } } + if (warm_update) + xlrec.flags |= XLH_UPDATE_WARM_UPDATE; /* If new tuple is the single and first tuple on page... 
*/ if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && @@ -8017,24 +8288,38 @@ heap_xlog_clean(XLogReaderState *record) OffsetNumber *redirected; OffsetNumber *nowdead; OffsetNumber *nowunused; + OffsetNumber *warm; int nredirected; int ndead; int nunused; + int nwarm; + int i; Size datalen; + bool warmchain[MaxHeapTuplesPerPage + 1]; redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); nredirected = xlrec->nredirected; ndead = xlrec->ndead; + nwarm = xlrec->nwarm; + end = (OffsetNumber *) ((char *) redirected + datalen); nowdead = redirected + (nredirected * 2); - nowunused = nowdead + ndead; - nunused = (end - nowunused); + warm = nowdead + ndead; + nowunused = warm + nwarm; + + nunused = (end - warm); Assert(nunused >= 0); + memset(warmchain, 0, sizeof (warmchain)); + for (i = 0; i < nwarm; i++) + warmchain[warm[i]] = true; + + /* Update all item pointers per the record, and repair fragmentation */ heap_page_prune_execute(buffer, redirected, nredirected, + warmchain, nowdead, ndead, nowunused, nunused); @@ -8621,16 +8906,22 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) Size freespace = 0; XLogRedoAction oldaction; XLogRedoAction newaction; + bool warm_update = false; /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0; + if (xlrec->flags & XLH_UPDATE_WARM_UPDATE) + warm_update = true; + XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk); if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk)) { /* HOT updates are never done across pages */ Assert(!hot_update); + /* WARM updates are never done across pages */ + Assert(!warm_update); } else oldblk = newblk; @@ -8690,6 +8981,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the old tuple has a WARM tuple */ + if (warm_update) + HeapTupleHeaderSetHeapWarmTuple(htup); + /* Set forward chain link in t_ctid */ HeapTupleHeaderSetNextCtid(htup, ItemPointerGetBlockNumber(&newtid), ItemPointerGetOffsetNumber(&newtid)); @@ -8825,6 +9121,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + + /* Mark the new tuple has a WARM tuple */ + if (warm_update) + HeapTupleHeaderSetHeapWarmTuple(htup); + /* Make sure there is no forward chain link in t_ctid */ HeapTupleHeaderSetHeapLatest(htup); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 7c2231a..d71a297 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -36,12 +36,19 @@ typedef struct int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; + int nwarm; /* arrays that accumulate indexes of items to be changed */ OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; + OffsetNumber warm[MaxHeapTuplesPerPage]; /* marked[i] is TRUE if item i is entered in one of the above arrays */ bool marked[MaxHeapTuplesPerPage + 1]; + /* + * warmchain[i] is TRUE if item is becoming redirected lp and points a WARM + * chain + */ + bool warmchain[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ @@ -54,6 +61,8 @@ static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); static 
void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); +static void heap_prune_record_warmupdate(PruneState *prstate, + OffsetNumber offnum); static void heap_get_root_tuples_internal(Page page, OffsetNumber target_offnum, OffsetNumber *root_offsets); @@ -203,8 +212,9 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, */ prstate.new_prune_xid = InvalidTransactionId; prstate.latestRemovedXid = *latestRemovedXid; - prstate.nredirected = prstate.ndead = prstate.nunused = 0; + prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nwarm = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); + memset(prstate.warmchain, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); @@ -241,6 +251,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, */ heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, + prstate.warmchain, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); @@ -268,6 +279,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, + prstate.warm, prstate.nwarm, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, prstate.latestRemovedXid); @@ -479,6 +491,12 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; + if (HeapTupleHeaderIsHeapWarmTuple(htup)) + { + Assert(!IsSystemRelation(relation)); + heap_prune_record_warmupdate(prstate, rootoffnum); + } + /* * OK, this tuple is indeed a member of the chain. */ @@ -668,6 +686,18 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) prstate->marked[offnum] = true; } +/* Record item pointer which is a root of a WARM chain */ +static void +heap_prune_record_warmupdate(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->nwarm < MaxHeapTuplesPerPage); + if (prstate->warmchain[offnum]) + return; + prstate->warm[prstate->nwarm] = offnum; + prstate->nwarm++; + prstate->warmchain[offnum] = true; +} + /* * Perform the actual page changes needed by heap_page_prune. 
@@ -681,6 +711,7 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, + bool *warmchain, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused) { @@ -697,6 +728,12 @@ heap_page_prune_execute(Buffer buffer, ItemId fromlp = PageGetItemId(page, fromoff); ItemIdSetRedirect(fromlp, tooff); + + /* + * Save information about WARM chains in the item itself + */ + if (warmchain[fromoff]) + ItemIdSetHeapWarm(fromlp); } /* Update all now-dead line pointers */ diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 65c941d..4f9fb12 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -99,7 +99,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) else scan->orderByData = NULL; - scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_itup = true; /* hack for now to always get index tuple */ /* * During recovery we ignore killed tuples and don't bother to kill them diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 54b71cb..7632573 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -71,10 +71,12 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/index.h" +#include "executor/executor.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/datum.h" #include "utils/snapmgr.h" #include "utils/tqual.h" @@ -409,7 +411,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_ctup.t_self. It should also set - * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention + * scan->xs_tuple_recheck and possibly scan->xs_itup, though we pay no attention * to those fields here. */ found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction); @@ -448,7 +450,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan * call). * - * Note: caller must check scan->xs_recheck, and perform rechecking of the + * Note: caller must check scan->xs_tuple_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have * enough information to do it efficiently in the general case. * ---------------- @@ -475,6 +477,15 @@ index_fetch_heap(IndexScanDesc scan) */ if (prev_buf != scan->xs_cbuf) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf); + + /* + * If we're not always re-checking, reset recheck for this tuple + */ + if (!scan->xs_recheck) + scan->xs_tuple_recheck = false; + else + scan->xs_tuple_recheck = true; + } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -484,32 +495,63 @@ index_fetch_heap(IndexScanDesc scan) scan->xs_snapshot, &scan->xs_ctup, &all_dead, - !scan->xs_continue_hot); + !scan->xs_continue_hot, + &scan->xs_tuple_recheck); LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); if (got_heap_tuple) { + bool res = true; + + /* + * Ok we got a tuple which satisfies the snapshot, but if its part of a + * WARM chain, we must do additional checks to ensure that we are + * indeed returning a correct tuple. 
Note that if the index AM does not + * implement amrecheck method, then we don't any additional checks + * since WARM must have been disabled on such tables + * + * XXX What happens when a new index which does not support amcheck is + * added to the table? Do we need to handle this case or is CREATE + * INDEX and CREATE INDEX CONCURRENTLY smart enough to handle this + * issue? + */ + if (scan->xs_tuple_recheck && + scan->indexRelation->rd_amroutine->amrecheck) + { + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + res = scan->indexRelation->rd_amroutine->amrecheck( + scan->indexRelation, + scan->xs_itup, + scan->heapRelation, + &scan->xs_ctup); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + } + /* * Only in a non-MVCC snapshot can more than one member of the HOT * chain be visible. */ scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); pgstat_count_heap_fetch(scan->indexRelation); - return &scan->xs_ctup; - } - /* We've reached the end of the HOT chain. */ - scan->xs_continue_hot = false; + if (res) + return &scan->xs_ctup; + } + else + { + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + } return NULL; } diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ef69290..e0afffd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -19,11 +19,14 @@ #include "access/nbtree.h" #include "access/transam.h" #include "access/xloginsert.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "utils/tqual.h" - +#include "utils/datum.h" typedef struct { @@ -249,6 +252,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + Buffer buffer; + HeapTupleData heapTuple; + bool recheck = false; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -308,6 +314,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, curitup = (IndexTuple) PageGetItem(page, curitemid); htid = curitup->t_tid; + recheck = false; + /* * If we are doing a recheck, we expect to find the tuple we * are rechecking. It's not a duplicate, but we have to keep @@ -325,112 +333,153 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * have just a single index entry for the entire chain. */ else if (heap_hot_search(&htid, heapRel, &SnapshotDirty, - &all_dead)) + &all_dead, &recheck, &buffer, + &heapTuple)) { TransactionId xwait; + bool result = true; /* - * It is a duplicate. 
If we are only doing a partial - * check, then don't bother checking if the tuple is being - * updated in another transaction. Just return the fact - * that it is a potential conflict and leave the full - * check till later. + * If the tuple was WARM update, we may again see our own + * tuple. Since WARM updates don't create new index + * entries, our own tuple is only reachable via the old + * index pointer */ - if (checkUnique == UNIQUE_CHECK_PARTIAL) + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - *is_unique = false; - return InvalidTransactionId; + found = true; + result = false; + if (recheck) + UnlockReleaseBuffer(buffer); } - - /* - * If this tuple is being updated by other transaction - * then we have to wait for its commit/abort. - */ - xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? - SnapshotDirty.xmin : SnapshotDirty.xmax; - - if (TransactionIdIsValid(xwait)) + else if (recheck) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - /* Tell _bt_doinsert to wait... */ - *speculativeToken = SnapshotDirty.speculativeToken; - return xwait; + result = btrecheck(rel, curitup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); } - /* - * Otherwise we have a definite conflict. But before - * complaining, look to see if the tuple we want to insert - * is itself now committed dead --- if so, don't complain. - * This is a waste of time in normal scenarios but we must - * do it to support CREATE INDEX CONCURRENTLY. - * - * We must follow HOT-chains here because during - * concurrent index build, we insert the root TID though - * the actual tuple may be somewhere in the HOT-chain. - * While following the chain we might not stop at the - * exact tuple which triggered the insert, but that's OK - * because if we find a live tuple anywhere in this chain, - * we have a unique key conflict. The other live tuple is - * not part of this chain because it had a different index - * entry. - */ - htid = itup->t_tid; - if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) - { - /* Normal case --- it's still live */ - } - else + if (result) { /* - * It's been deleted, so no error, and no need to - * continue searching + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. */ - break; - } + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } - /* - * Check for a conflict-in as we would if we were going to - * write to this page. We aren't actually going to write, - * but we want a chance to report SSI conflicts that would - * otherwise be masked by this unique constraint - * violation. - */ - CheckForSerializableConflictIn(rel, NULL, buf); + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } - /* - * This is a definite conflict. Break the tuple down into - * datums and report the error. 
But first, make sure we - * release the buffer locks we're holding --- - * BuildIndexValueDescription could make catalog accesses, - * which in the worst case might touch this same index and - * cause deadlocks. - */ - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - _bt_relbuf(rel, buf); + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + recheck = false; + ItemPointerCopy(&itup->t_tid, &htid); + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL, + &recheck, &buffer, &heapTuple)) + { + bool result = true; + if (recheck) + { + /* + * Recheck if the tuple actually satisfies the + * index key. Otherwise, we might be following + * a wrong index pointer and mustn't entertain + * this tuple + */ + result = btrecheck(rel, itup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); + } + if (!result) + break; + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - index_deform_tuple(itup, RelationGetDescr(rel), - values, isnull); - - key_desc = BuildIndexValueDescription(rel, values, - isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("duplicate key value violates unique constraint \"%s\"", - RelationGetRelationName(rel)), - key_desc ? errdetail("Key %s already exists.", - key_desc) : 0, - errtableconstraint(heapRel, - RelationGetRelationName(rel)))); + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? 
errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } } } else if (all_dead) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 128744c..6b1236a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -23,6 +23,7 @@ #include "access/xlog.h" #include "catalog/index.h" #include "commands/vacuum.h" +#include "executor/nodeIndexscan.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" @@ -117,6 +118,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; amroutine->amrestrpos = btrestrpos; + amroutine->amrecheck = btrecheck; PG_RETURN_POINTER(amroutine); } @@ -292,8 +294,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; - /* btree indexes are never lossy */ - scan->xs_recheck = false; + /* btree indexes are never lossy, except for WARM tuples */ + scan->xs_recheck = indexscan_recheck; + scan->xs_tuple_recheck = indexscan_recheck; /* * If we have any array keys, initialize them during first call for a diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 063c988..c9c0501 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,11 +20,15 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/datum.h" typedef struct BTSortArrayContext @@ -2065,3 +2069,103 @@ btproperty(Oid index_oid, int attno, return false; /* punt to generic code */ } } + +/* + * Check if the index tuple's key matches the one computed from the given heap + * tuple's attribute + */ +bool +btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + /* Get IndexInfo for this index */ + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. + */ + if (indexInfo->ii_Expressions != NIL || + indexInfo->ii_ExclusionOps != NULL) + { + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + } + else + estate = NULL; + + /* + * Form the index values and isnull flags for the index entry that we need + * to check. + * + * Note: if the index uses functions that are not as immutable as they are + * supposed to be, this could produce an index tuple different from the + * original. The index AM can catch such errors by verifying that it + * finds a matching index entry with the tuple's TID. For exclusion + * constraints we check this in check_exclusion_constraint(). 
+ */ + FormIndexDatum(indexInfo, slot, estate, values, isnull); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL, then they are equal + */ + if (isnull[i - 1] && indxisnull) + continue; + + /* + * If just one is NULL, then they are not equal + */ + if (isnull[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now just do a raw memory comparison. If the index tuple was formed + * using this heap tuple, the computed index values must match + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + if (estate != NULL) + FreeExecutorState(estate); + + ExecDropSingleTupleTableSlot(slot); + + return equal; +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index d570ae5..813b5c3 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -67,6 +67,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amendscan = spgendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 08b646d..e76e928 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -54,6 +54,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/var.h" #include "parser/parser.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -1691,6 +1692,20 @@ BuildIndexInfo(Relation index) ii->ii_Concurrent = false; ii->ii_BrokenHotChain = false; + /* build a bitmap of all table attributes referred by this index */ + for (i = 0; i < ii->ii_NumIndexAttrs; i++) + { + AttrNumber attr = ii->ii_KeyAttrNumbers[i]; + ii->ii_indxattrs = bms_add_member(ii->ii_indxattrs, attr - + FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) ii->ii_Expressions, 1, &ii->ii_indxattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) ii->ii_Predicate, 1, &ii->ii_indxattrs); + return ii; } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e011af1..97672a9 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -472,6 +472,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_tuples_warm_updated(C.oid) AS n_tup_warm_upd, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_mod_since_analyze(C.oid) AS n_mod_since_analyze, @@ -502,7 +503,8 @@ CREATE VIEW pg_stat_xact_all_tables AS pg_stat_get_xact_tuples_inserted(C.oid) AS n_tup_ins, pg_stat_get_xact_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_xact_tuples_deleted(C.oid) AS n_tup_del, - pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd + pg_stat_get_xact_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_xact_tuples_warm_updated(C.oid) AS n_tup_warm_upd FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c 
index 26f9114..997c8f5 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -40,6 +40,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) TriggerData *trigdata = (TriggerData *) fcinfo->context; const char *funcname = "unique_key_recheck"; HeapTuple new_row; + HeapTupleData heapTuple; ItemPointerData tmptid; Relation indexRel; IndexInfo *indexInfo; @@ -102,7 +103,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) * removed. */ tmptid = new_row->t_self; - if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL)) + if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL, + NULL, NULL, &heapTuple)) { /* * All rows in the HOT chain are dead, so skip the check. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ec5d6f1..5e57cc9 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2551,6 +2551,8 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), + NULL, estate, false, NULL, @@ -2669,6 +2671,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); recheckIndexes = ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), + &(bufferedTuples[i]->t_self), NULL, estate, false, NULL, NIL); ExecARInsertTriggers(estate, resultRelInfo, bufferedTuples[i], diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index b5fb325..cd9b9a7 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -1468,6 +1468,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, + NULL, 0, unused, uncnt, vacrelstats->latestRemovedXid); PageSetLSN(page, recptr); @@ -2128,6 +2129,22 @@ heap_page_is_all_visible(Relation rel, Buffer buf, break; } + /* + * If this tuple was ever WARM updated or is a WARM tuple, + * there could be multiple index entries pointing to the + * root of this chain. We can't do index-only scans for + * such tuples without verifying index key check. So mark + * the page as !all_visible + * + * XXX Should we look at the root line pointer and check if + * WARM flag is set there or checking for tuples in the + * chain is good enough? + */ + if (HeapTupleHeaderIsHeapWarmTuple(tuple.t_data)) + { + all_visible = false; + } + /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) *visibility_cutoff_xid = xmin; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 009c1b7..03c6b62 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -270,6 +270,8 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) List * ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, + Bitmapset *updated_attrs, EState *estate, bool noDupErr, bool *specConflict, @@ -324,6 +326,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, if (!indexInfo->ii_ReadyForInserts) continue; + /* + * If updated_attrs is set, we only insert index entries for those + * indexes whose column has changed. 
All other indexes can use their + * existing index pointers to look up the new tuple + */ + if (updated_attrs) + { + if (!bms_overlap(updated_attrs, indexInfo->ii_indxattrs)) + continue; + } + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { @@ -389,7 +402,7 @@ ExecInsertIndexTuples(TupleTableSlot *slot, index_insert(indexRelation, /* index relation */ values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + root_tid, /* tid of heap or root tuple */ heapRelation, /* heap relation */ checkUnique); /* type of uniqueness check to do */ diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 449aacb..ff77349 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -37,6 +37,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/valid.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" @@ -362,11 +363,23 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) OffsetNumber offnum = tbmres->offsets[curslot]; ItemPointerData tid; HeapTupleData heapTuple; + bool recheck = false; ItemPointerSet(&tid, page, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + &heapTuple, NULL, true, &recheck)) + { + bool valid = true; + + if (scan->rs_key) + HeapKeyTest(&heapTuple, RelationGetDescr(scan->rs_rd), + scan->rs_nkeys, scan->rs_key, valid); + if (valid) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + + if (recheck) + tbmres->recheck = true; + } } } else diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 4f6f91c..49bda34 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -141,6 +141,26 @@ IndexOnlyNext(IndexOnlyScanState *node) * but it's not clear whether it's a win to do so. The next index * entry might require a visit to the same heap page. */ + + /* + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. + */ + if (scandesc->xs_tuple_recheck) + { + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->xs_cbuf, /* buffer containing tuple */ + false); /* don't pfree */ + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); + if (!ExecQual(node->indexqual, econtext, false)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } } /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 3143bd9..daa0826 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -39,6 +39,8 @@ #include "utils/memutils.h" #include "utils/rel.h" +bool indexscan_recheck = false; + /* * When an ordering operator is used, tuples fetched from the index that * need to be reordered are queued in a pairing heap, as ReorderTuples. @@ -115,10 +117,10 @@ IndexNext(IndexScanState *node) false); /* don't pfree */ /* - * If the index was lossy, we have to recheck the index quals using - * the fetched tuple. + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. 
*/ - if (scandesc->xs_recheck) + if (scandesc->xs_recheck || scandesc->xs_tuple_recheck) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index efb0c5e..0ba71a3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -448,6 +448,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, true, &specConflict, arbiterIndexes); @@ -494,6 +495,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, false, NULL, arbiterIndexes); } @@ -824,6 +826,9 @@ ExecUpdate(ItemPointer tupleid, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + Bitmapset *updated_attrs = NULL; + ItemPointerData root_tid; + bool warm_update; /* * abort the operation if not running transactions @@ -938,7 +943,7 @@ lreplace:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, &updated_attrs, &warm_update); switch (result) { case HeapTupleSelfUpdated: @@ -1025,10 +1030,28 @@ lreplace:; * the t_self field. * * If it's a HOT update, we mustn't insert new index entries. + * + * If it's a WARM update, then we must insert new entries with TID + * pointing to the root of the WARM chain. */ - if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) + if (resultRelInfo->ri_NumIndices > 0 && + (!HeapTupleIsHeapOnly(tuple) || warm_update)) + { + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, &root_tid); + bms_free(updated_attrs); + updated_attrs = NULL; + } recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + updated_attrs, estate, false, NULL, NIL); + } } if (canSetTag) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index c7584cb..d89d37b 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1823,7 +1823,7 @@ pgstat_count_heap_insert(Relation rel, int n) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel, bool hot) +pgstat_count_heap_update(Relation rel, bool hot, bool warm) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; @@ -1841,6 +1841,8 @@ pgstat_count_heap_update(Relation rel, bool hot) /* t_tuples_hot_updated is nontransactional, so just advance it */ if (hot) pgstat_info->t_counts.t_tuples_hot_updated++; + else if (warm) + pgstat_info->t_counts.t_tuples_warm_updated++; } } @@ -4083,6 +4085,7 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) result->tuples_updated = 0; result->tuples_deleted = 0; result->tuples_hot_updated = 0; + result->tuples_warm_updated = 0; result->n_live_tuples = 0; result->n_dead_tuples = 0; result->changes_since_analyze = 0; @@ -5192,6 +5195,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated = tabmsg->t_counts.t_tuples_warm_updated; 
tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; @@ -5219,6 +5223,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; + tabentry->tuples_warm_updated += tabmsg->t_counts.t_tuples_warm_updated; /* If table was truncated, first reset the live/dead counters */ if (tabmsg->t_counts.t_truncated) { diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 2d3cf9e..25752b0 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -115,6 +115,7 @@ extern Datum pg_stat_get_xact_tuples_inserted(PG_FUNCTION_ARGS); extern Datum pg_stat_get_xact_tuples_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_xact_tuples_deleted(PG_FUNCTION_ARGS); extern Datum pg_stat_get_xact_tuples_hot_updated(PG_FUNCTION_ARGS); +extern Datum pg_stat_get_xact_tuples_warm_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_xact_blocks_fetched(PG_FUNCTION_ARGS); extern Datum pg_stat_get_xact_blocks_hit(PG_FUNCTION_ARGS); @@ -245,6 +246,22 @@ pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS) Datum +pg_stat_get_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->tuples_warm_updated); + + PG_RETURN_INT64(result); +} + + +Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); @@ -1744,6 +1761,21 @@ pg_stat_get_xact_tuples_hot_updated(PG_FUNCTION_ARGS) } Datum +pg_stat_get_xact_tuples_warm_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_TableStatus *tabentry; + + if ((tabentry = find_tabstat_entry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->t_counts.t_tuples_warm_updated); + + PG_RETURN_INT64(result); +} + +Datum pg_stat_get_xact_blocks_fetched(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 79e0b1f..37874ca 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2030,6 +2030,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) list_free_deep(relation->rd_fkeylist); list_free(relation->rd_indexlist); bms_free(relation->rd_indexattr); + bms_free(relation->rd_exprindexattr); bms_free(relation->rd_keyattr); bms_free(relation->rd_idattr); if (relation->rd_options) @@ -4373,12 +4374,15 @@ Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) { Bitmapset *indexattrs; /* indexed columns */ + Bitmapset *exprindexattrs; /* indexed columns in expression/predicate + indexes */ Bitmapset *uindexattrs; /* columns in unique indexes */ Bitmapset *idindexattrs; /* columns in the replica identity */ List *indexoidlist; Oid relreplindex; ListCell *l; MemoryContext oldcxt; + bool supportswarm = true; /* True if the table can be WARM updated */ /* Quick exit if we already computed the result.
*/ if (relation->rd_indexattr != NULL) @@ -4391,6 +4395,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return bms_copy(relation->rd_keyattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return bms_copy(relation->rd_idattr); + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return bms_copy(relation->rd_exprindexattr); default: elog(ERROR, "unknown attrKind %u", attrKind); } @@ -4429,6 +4435,7 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + exprindexattrs = NULL; uindexattrs = NULL; idindexattrs = NULL; foreach(l, indexoidlist) @@ -4474,19 +4481,32 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &exprindexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &exprindexattrs); + + /* + * Check if the index has amrecheck method defined. If the method is + * not defined, the index does not support WARM update. Completely + * disable WARM updates on such tables + */ + if (!indexDesc->rd_amroutine->amrecheck) + supportswarm = false; index_close(indexDesc, AccessShareLock); } list_free(indexoidlist); + /* Remember if the table can do WARM updates */ + relation->rd_supportswarm = supportswarm; + /* Don't leak the old values of these bitmaps, if any */ bms_free(relation->rd_indexattr); relation->rd_indexattr = NULL; + bms_free(relation->rd_exprindexattr); + relation->rd_exprindexattr = NULL; bms_free(relation->rd_keyattr); relation->rd_keyattr = NULL; bms_free(relation->rd_idattr); @@ -4502,7 +4522,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_keyattr = bms_copy(uindexattrs); relation->rd_idattr = bms_copy(idindexattrs); - relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_exprindexattr = bms_copy(exprindexattrs); + relation->rd_indexattr = bms_copy(bms_union(indexattrs, exprindexattrs)); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ @@ -4514,6 +4535,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return uindexattrs; case INDEX_ATTR_BITMAP_IDENTITY_KEY: return idindexattrs; + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return exprindexattrs; default: elog(ERROR, "unknown attrKind %u", attrKind); return NULL; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 28ebcb6..2241ffb 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -112,6 +112,7 @@ extern char *default_tablespace; extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; +extern bool indexscan_recheck; #ifdef TRACE_SYNCSCAN extern bool trace_syncscan; @@ -1288,6 +1289,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"indexscan_recheck", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Recheck heap rows returned from an index scan."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &indexscan_recheck, + false, + NULL, NULL, NULL + }, + { {"debug_deadlocks", PGC_SUSET, DEVELOPER_OPTIONS, gettext_noop("Dumps information about all current locks when a deadlock timeout 
occurs."), NULL, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 1036cca..37eaf76 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -13,6 +13,7 @@ #define AMAPI_H #include "access/genam.h" +#include "access/itup.h" /* * We don't wish to include planner header files here, since most of an index @@ -137,6 +138,9 @@ typedef void (*ammarkpos_function) (IndexScanDesc scan); /* restore marked scan position */ typedef void (*amrestrpos_function) (IndexScanDesc scan); +/* recheck index tuple and heap tuple match */ +typedef bool (*amrecheck_function) (Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * API struct for an index AM. Note this must be stored in a single palloc'd @@ -196,6 +200,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amrecheck_function amrecheck; /* can be NULL */ } IndexAmRoutine; diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 6dfc41f..f1c73a0 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -389,4 +389,8 @@ extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, bool bucket_has_garbage, IndexBulkDeleteCallback callback, void *callback_state); +/* hash.c */ +extern bool hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); + #endif /* HASH_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 81f7982..78e16a9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -137,9 +137,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + bool *all_dead, bool first_call, bool *recheck); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead, + bool *recheck, Buffer *buffer, HeapTuple heapTuple); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -160,7 +161,8 @@ extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **updated_attrs, bool *warm_update); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_update, @@ -186,6 +188,7 @@ extern int heap_page_prune(Relation relation, Buffer buffer, bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, + bool *warmchain, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused); extern void heap_get_root_tuple_one(Page page, OffsetNumber target_offnum, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 5a04561..ddc3a7a 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -80,6 +80,7 @@ #define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) #define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) #define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) +#define 
XLH_UPDATE_WARM_UPDATE (1<<7) /* convenience macro for checking whether any form of old tuple was logged */ #define XLH_UPDATE_CONTAINS_OLD \ @@ -211,7 +212,9 @@ typedef struct xl_heap_update * * for each redirected item: the item offset, then the offset redirected to * * for each now-dead item: the item offset * * for each now-unused item: the item offset - * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * * for each now-warm item: the item offset + * The total number of OffsetNumbers is therefore + * 2*nredirected+ndead+nunused+nwarm. * Note that nunused is not explicitly stored, but may be found by reference * to the total record length. */ @@ -220,10 +223,11 @@ typedef struct xl_heap_clean TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; + uint16 nwarm; /* OFFSET NUMBERS are in the block reference 0 */ } xl_heap_clean; -#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) +#define SizeOfHeapClean (offsetof(xl_heap_clean, nwarm) + sizeof(uint16)) /* * Cleanup_info is required in some cases during a lazy VACUUM. @@ -384,6 +388,7 @@ extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, + OffsetNumber *warm, int nwarm, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 4313eb9..09246b2 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -260,7 +260,8 @@ struct HeapTupleHeaderData * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x0800 are available */ +#define HEAP_WARM_TUPLE 0x0800 /* This tuple is a part of a WARM chain + */ #define HEAP_LATEST_TUPLE 0x1000 /* * This is the last tuple in chain and * ip_posid points to the root line @@ -271,7 +272,7 @@ struct HeapTupleHeaderData #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xF000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xF800 /* visibility-related bits */ /* @@ -510,6 +511,21 @@ do { \ (tup)->t_infomask2 & HEAP_ONLY_TUPLE \ ) +#define HeapTupleHeaderSetHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 |= HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderClearHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 &= ~HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderIsHeapWarmTuple(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_WARM_TUPLE) \ +) + #define HeapTupleHeaderSetHeapLatest(tup) \ ( \ (tup)->t_infomask2 |= HEAP_LATEST_TUPLE \ @@ -771,6 +787,15 @@ struct MinimalTupleData #define HeapTupleClearHeapOnly(tuple) \ HeapTupleHeaderClearHeapOnly((tuple)->t_data) +#define HeapTupleIsHeapWarmTuple(tuple) \ + HeapTupleHeaderIsHeapWarmTuple((tuple)->t_data) + +#define HeapTupleSetHeapWarmTuple(tuple) \ + HeapTupleHeaderSetHeapWarmTuple((tuple)->t_data) + +#define HeapTupleClearHeapWarmTuple(tuple) \ + HeapTupleHeaderClearHeapWarmTuple((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index c580f51..83af072 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -751,6 +751,8 @@ extern bytea *btoptions(Datum reloptions, bool validate); 
extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); +extern bool btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index de98dd6..da7ec84 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -111,7 +111,8 @@ typedef struct IndexScanDescData HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ - bool xs_recheck; /* T means scan keys must be rechecked */ + bool xs_recheck; /* T means scan keys must be rechecked for each tuple */ + bool xs_tuple_recheck; /* T means scan keys must be rechecked for current tuple */ /* * When fetching with an ordering operator, the values of the ORDER BY diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 047a1ce..31f295f 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2734,6 +2734,8 @@ DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 0 0 f f DESCR("statistics: number of tuples deleted"); DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_hot_updated _null_ _null_ _null_ )); DESCR("statistics: number of tuples hot updated"); +DATA(insert OID = 3344 ( pg_stat_get_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_warm_updated _null_ _null_ _null_ )); +DESCR("statistics: number of tuples warm updated"); DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_live_tuples _null_ _null_ _null_ )); DESCR("statistics: number of live tuples"); DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_dead_tuples _null_ _null_ _null_ )); @@ -2884,6 +2886,8 @@ DATA(insert OID = 3042 ( pg_stat_get_xact_tuples_deleted PGNSP PGUID 12 1 0 0 DESCR("statistics: number of tuples deleted in current transaction"); DATA(insert OID = 3043 ( pg_stat_get_xact_tuples_hot_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_hot_updated _null_ _null_ _null_ )); DESCR("statistics: number of tuples hot updated in current transaction"); +DATA(insert OID = 3343 ( pg_stat_get_xact_tuples_warm_updated PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_tuples_warm_updated _null_ _null_ _null_ )); +DESCR("statistics: number of tuples warm updated in current transaction"); DATA(insert OID = 3044 ( pg_stat_get_xact_blocks_fetched PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_fetched _null_ _null_ _null_ )); DESCR("statistics: number of blocks fetched in current transaction"); DATA(insert OID = 3045 ( pg_stat_get_xact_blocks_hit PGNSP PGUID 12 1 0 0 0 f f f f t f v r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_xact_blocks_hit _null_ _null_ _null_ )); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 136276b..f463014 100644 --- 
a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -366,6 +366,7 @@ extern void UnregisterExprContextCallback(ExprContext *econtext, extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative); extern void ExecCloseIndices(ResultRelInfo *resultRelInfo); extern List *ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, Bitmapset *updated_attrs, EState *estate, bool noDupErr, bool *specConflict, List *arbiterIndexes); extern bool ExecCheckIndexConstraints(TupleTableSlot *slot, EState *estate, diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index 194fadb..fe9c78e 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -38,4 +38,5 @@ extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, IndexArrayKeyInfo *arrayKeys, int numArrayKeys); extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys); +extern bool indexscan_recheck; #endif /* NODEINDEXSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 8004d85..3bf4b5f 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -61,6 +61,7 @@ typedef struct IndexInfo NodeTag type; int ii_NumIndexAttrs; AttrNumber ii_KeyAttrNumbers[INDEX_MAX_KEYS]; + Bitmapset *ii_indxattrs; /* bitmap of all columns used in this index */ List *ii_Expressions; /* list of Expr */ List *ii_ExpressionsState; /* list of ExprState */ List *ii_Predicate; /* list of Expr */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 152ff06..e0c8a90 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -105,6 +105,7 @@ typedef struct PgStat_TableCounts PgStat_Counter t_tuples_updated; PgStat_Counter t_tuples_deleted; PgStat_Counter t_tuples_hot_updated; + PgStat_Counter t_tuples_warm_updated; bool t_truncated; PgStat_Counter t_delta_live_tuples; @@ -625,6 +626,7 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_updated; PgStat_Counter tuples_deleted; PgStat_Counter tuples_hot_updated; + PgStat_Counter tuples_warm_updated; PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; @@ -1176,7 +1178,7 @@ pgstat_report_wait_end(void) (pgStatBlockWriteTime += (n)) extern void pgstat_count_heap_insert(Relation rel, int n); -extern void pgstat_count_heap_update(Relation rel, bool hot); +extern void pgstat_count_heap_update(Relation rel, bool hot, bool warm); extern void pgstat_count_heap_delete(Relation rel); extern void pgstat_count_truncate(Relation rel); extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h index 509c577..8c9cc99 100644 --- a/src/include/storage/itemid.h +++ b/src/include/storage/itemid.h @@ -46,6 +46,12 @@ typedef ItemIdData *ItemId; typedef uint16 ItemOffset; typedef uint16 ItemLength; +/* + * Special value used in lp_len to indicate that the chain starting at line + * pointer may contain WARM tuples. This must only be interpreted along with + * LP_REDIRECT flag + */ +#define SpecHeapWarmLen 0x1ffb /* ---------------- * support macros @@ -112,12 +118,15 @@ typedef uint16 ItemLength; #define ItemIdIsDead(itemId) \ ((itemId)->lp_flags == LP_DEAD) +#define ItemIdIsHeapWarm(itemId) \ + (((itemId)->lp_flags == LP_REDIRECT) && \ + ((itemId)->lp_len == SpecHeapWarmLen)) /* * ItemIdHasStorage * True iff item identifier has associated storage. 
*/ #define ItemIdHasStorage(itemId) \ - ((itemId)->lp_len != 0) + (!ItemIdIsRedirected(itemId) && (itemId)->lp_len != 0) /* * ItemIdSetUnused @@ -168,6 +177,26 @@ typedef uint16 ItemLength; ) /* + * ItemIdSetHeapWarm + * Set the item identifier to mark the start of a WARM chain + * + * Note: Since all bits in lp_flags are currently used, we store a special + * value in the lp_len field to indicate this state. This is required only + * for LP_REDIRECT items, whose lp_len field is otherwise unused. + */ +#define ItemIdSetHeapWarm(itemId) \ +do { \ + AssertMacro((itemId)->lp_flags == LP_REDIRECT); \ + (itemId)->lp_len = SpecHeapWarmLen; \ +} while (0) + +#define ItemIdClearHeapWarm(itemId) \ +do { \ + AssertMacro((itemId)->lp_flags == LP_REDIRECT); \ + (itemId)->lp_len = 0; \ +} while (0) + +/* * ItemIdMarkDead * Set the item identifier to be DEAD, keeping its existing storage. * diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index fa15f28..982bf4c 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -101,8 +101,11 @@ typedef struct RelationData /* data managed by RelationGetIndexAttrBitmap: */ Bitmapset *rd_indexattr; /* identifies columns used in indexes */ + Bitmapset *rd_exprindexattr; /* identifies columns used in expression or + predicate indexes */ Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ Bitmapset *rd_idattr; /* included in replica identity index */ + bool rd_supportswarm; /* True if the table can be WARM updated */ /* * rd_options is set whenever rd_rel is loaded into the relcache entry. diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 6ea7dd2..290e9b7 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -48,7 +48,8 @@ typedef enum IndexAttrBitmapKind { INDEX_ATTR_BITMAP_ALL, INDEX_ATTR_BITMAP_KEY, - INDEX_ATTR_BITMAP_IDENTITY_KEY + INDEX_ATTR_BITMAP_IDENTITY_KEY, + INDEX_ATTR_BITMAP_EXPR_PREDICATE } IndexAttrBitmapKind; extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 031e8c2..c416fe6 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1705,6 +1705,7 @@ pg_stat_all_tables| SELECT c.oid AS relid, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, + pg_stat_get_tuples_warm_updated(c.oid) AS n_tup_warm_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_mod_since_analyze(c.oid) AS n_mod_since_analyze, @@ -1838,6 +1839,7 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, + pg_stat_all_tables.n_tup_warm_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.n_mod_since_analyze, @@ -1881,6 +1883,7 @@ pg_stat_user_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, + pg_stat_all_tables.n_tup_warm_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.n_mod_since_analyze, @@ -1918,7 +1921,8 @@ pg_stat_xact_all_tables| SELECT c.oid AS relid, pg_stat_get_xact_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_xact_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_xact_tuples_deleted(c.oid) AS n_tup_del, -
pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd + pg_stat_get_xact_tuples_hot_updated(c.oid) AS n_tup_hot_upd, + pg_stat_get_xact_tuples_warm_updated(c.oid) AS n_tup_warm_upd FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) @@ -1934,7 +1938,8 @@ pg_stat_xact_sys_tables| SELECT pg_stat_xact_all_tables.relid, pg_stat_xact_all_tables.n_tup_ins, pg_stat_xact_all_tables.n_tup_upd, pg_stat_xact_all_tables.n_tup_del, - pg_stat_xact_all_tables.n_tup_hot_upd + pg_stat_xact_all_tables.n_tup_hot_upd, + pg_stat_xact_all_tables.n_tup_warm_upd FROM pg_stat_xact_all_tables WHERE ((pg_stat_xact_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_xact_all_tables.schemaname ~ '^pg_toast'::text)); pg_stat_xact_user_functions| SELECT p.oid AS funcid, @@ -1956,7 +1961,8 @@ pg_stat_xact_user_tables| SELECT pg_stat_xact_all_tables.relid, pg_stat_xact_all_tables.n_tup_ins, pg_stat_xact_all_tables.n_tup_upd, pg_stat_xact_all_tables.n_tup_del, - pg_stat_xact_all_tables.n_tup_hot_upd + pg_stat_xact_all_tables.n_tup_hot_upd, + pg_stat_xact_all_tables.n_tup_warm_upd FROM pg_stat_xact_all_tables WHERE ((pg_stat_xact_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_xact_all_tables.schemaname !~ '^pg_toast'::text)); pg_statio_all_indexes| SELECT c.oid AS relid, diff --git a/src/test/regress/expected/warm.out b/src/test/regress/expected/warm.out new file mode 100644 index 0000000..0aa1b83 --- /dev/null +++ b/src/test/regress/expected/warm.out @@ -0,0 +1,51 @@ +CREATE TABLE test_warm (a text unique, b text); +CREATE INDEX test_warmindx ON test_warm (lower(a)); +INSERT INTO test_warm values ('test', 'foo'); +UPDATE test_warm SET a = 'TEST'; +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +explain select * from test_warm where lower(a) = 'test'; + QUERY PLAN +---------------------------------------------------------------------------- + Bitmap Heap Scan on test_warm (cost=4.18..12.65 rows=4 width=64) + Recheck Cond: (lower(a) = 'test'::text) + -> Bitmap Index Scan on test_warmindx (cost=0.00..4.18 rows=4 width=0) + Index Cond: (lower(a) = 'test'::text) +(4 rows) + +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +select *, ctid from test_warm where a = 'test'; + a | b | ctid +---+---+------ +(0 rows) + +select *, ctid from test_warm where a = 'TEST'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +set enable_bitmapscan TO false; +explain select * from test_warm where lower(a) = 'test'; + QUERY PLAN +--------------------------------------------------------------------------------- + Index Scan using test_warmindx on test_warm (cost=0.15..20.22 rows=4 width=64) + Index Cond: (lower(a) = 'test'::text) +(2 rows) + +select *, ctid from test_warm where lower(a) = 'test'; + a | b | ctid +------+-----+------- + TEST | foo | (0,2) +(1 row) + +DROP TABLE test_warm; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 8641769..a610039 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -42,6 +42,8 @@ test: create_type test: create_table test: create_function_2 +test: warm + # ---------- # Load huge amounts of data # We should split the data files into single files and then diff --git 
a/src/test/regress/sql/warm.sql b/src/test/regress/sql/warm.sql new file mode 100644 index 0000000..166ea37 --- /dev/null +++ b/src/test/regress/sql/warm.sql @@ -0,0 +1,15 @@ +CREATE TABLE test_warm (a text unique, b text); +CREATE INDEX test_warmindx ON test_warm (lower(a)); +INSERT INTO test_warm values ('test', 'foo'); +UPDATE test_warm SET a = 'TEST'; +select *, ctid from test_warm where lower(a) = 'test'; +explain select * from test_warm where lower(a) = 'test'; +select *, ctid from test_warm where lower(a) = 'test'; +select *, ctid from test_warm where a = 'test'; +select *, ctid from test_warm where a = 'TEST'; +set enable_bitmapscan TO false; +explain select * from test_warm where lower(a) = 'test'; +select *, ctid from test_warm where lower(a) = 'test'; +DROP TABLE test_warm; + +
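Not part of the patch: a minimal usage sketch, assuming a server built with this patch. It reruns the scenario from the regression test above and then reads the n_tup_warm_upd counter that the patch adds to pg_stat_all_tables (next to n_tup_hot_upd). The warm_demo names are hypothetical, and the statistics collector reports counts asynchronously, so the counter may lag briefly behind the UPDATE.

CREATE TABLE warm_demo (a text unique, b text);
CREATE INDEX warm_demo_indx ON warm_demo (lower(a));
INSERT INTO warm_demo VALUES ('test', 'foo');
-- updates an indexed column; with this patch the update may be performed as a WARM update
UPDATE warm_demo SET a = 'TEST';
-- per-table update counters; n_tup_warm_upd is the column added by this patch
SELECT n_tup_upd, n_tup_hot_upd, n_tup_warm_upd
  FROM pg_stat_all_tables
 WHERE relname = 'warm_demo';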