From 58e0f867076b3ea20cc4aab1c2bebb50e040a6b5 Mon Sep 17 00:00:00 2001
From: Hari Babu
Date: Thu, 7 Sep 2017 14:48:24 +1000
Subject: [PATCH 6/8] Tuple Insert API is added to Storage AM

heap_insert, heap_delete, heap_fetch, heap_update, heap_get_latest_tid,
heap_lock_tuple and heap_multi_insert functions are added to the storage AM.

Replaced the usage of HeapTuple with storageTuple in some places and
increased the use of slots.
---
 src/backend/access/common/heaptuple.c    |   24 +
 src/backend/access/heap/Makefile         |    2 +-
 src/backend/access/heap/heapam.c         | 2737 ++++-------------------------
 src/backend/access/heap/heapam_storage.c | 2153 ++++++++++++++++++++++-
 src/backend/access/heap/rewriteheap.c    |    5 +-
 src/backend/access/heap/storageam.c      |  306 ++++
 src/backend/access/heap/tuptoaster.c     |    8 +-
 src/backend/commands/copy.c              |   29 +-
 src/backend/commands/createas.c          |   18 +-
 src/backend/commands/matview.c           |   10 +-
 src/backend/commands/tablecmds.c         |    5 +-
 src/backend/commands/trigger.c           |   43 +-
 src/backend/executor/execMain.c          |  120 +-
 src/backend/executor/execReplication.c   |   35 +-
 src/backend/executor/nodeLockRows.c      |   39 +-
 src/backend/executor/nodeModifyTable.c   |  177 +-
 src/backend/executor/nodeTidscan.c       |   23 +-
 src/backend/utils/adt/tid.c              |    5 +-
 src/include/access/heapam.h              |   26 +-
 src/include/access/heapam_common.h       |  127 ++
 src/include/access/htup_details.h        |    1 +
 src/include/access/storageam.h           |   81 +
 src/include/access/storageamapi.h        |   21 +-
 src/include/commands/trigger.h           |    2 +-
 src/include/executor/executor.h          |    6 +-
 src/include/nodes/execnodes.h            |    4 +-
 26 files changed, 3290 insertions(+), 2717 deletions(-)
 create mode 100644 src/backend/access/heap/storageam.c
 create mode 100644 src/include/access/storageam.h

diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 5ed0f15ac4..714c4d862c 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -685,6 +685,30 @@ heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc)
 	return PointerGetDatum(td);
 }
 
+/*
+ * heap_form_tuple_by_datum
+ *		construct a tuple from the given datum
+ *
+ * The result is allocated in the current memory context.
+ */
+HeapTuple
+heap_form_tuple_by_datum(Datum data, Oid tableoid)
+{
+	HeapTuple	newTuple;
+	HeapTupleHeader td;
+
+	td = DatumGetHeapTupleHeader(data);
+
+	newTuple = (HeapTuple) palloc(HEAPTUPLESIZE + HeapTupleHeaderGetDatumLength(td));
+	newTuple->t_len = HeapTupleHeaderGetDatumLength(td);
+	newTuple->t_self = td->t_ctid;
+	newTuple->t_tableOid = tableoid;
+	newTuple->t_data = (HeapTupleHeader) ((char *) newTuple + HEAPTUPLESIZE);
+	memcpy((char *) newTuple->t_data, (char *) td, newTuple->t_len);
+
+	return newTuple;
+}
+
 /*
  * heap_form_tuple
  *		construct a tuple from the given values[] and isnull[] arrays,
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile
index e6bc18e5ea..162736ff15 100644
--- a/src/backend/access/heap/Makefile
+++ b/src/backend/access/heap/Makefile
@@ -13,7 +13,7 @@ top_builddir = ../../../..
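[Editor's aside] The new heap_form_tuple_by_datum() in the heaptuple.c hunk above pairs naturally with the existing heap_copy_tuple_as_datum(). A minimal caller-side sketch, not part of the patch; the helper and variable names are hypothetical, and it assumes the new function's declaration lands in htup_details.h, as the one-line diffstat entry for that header suggests:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/htup_details.h"
#include "utils/rel.h"

static HeapTuple
copy_tuple_via_datum(Relation rel, HeapTuple tup)
{
	/* Flatten the tuple into a self-contained composite Datum ... */
	Datum		flat = heap_copy_tuple_as_datum(tup, RelationGetDescr(rel));

	/* ... and rebuild a palloc'd HeapTuple from it, stamping t_tableOid. */
	return heap_form_tuple_by_datum(flat, RelationGetRelid(rel));
}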
include $(top_builddir)/src/Makefile.global OBJS = heapam.o heapam_common.o heapam_storage.o hio.o \ - pruneheap.o rewriteheap.o storageamapi.o \ + pruneheap.o rewriteheap.o storageam.o storageamapi.o \ syncscan.o tuptoaster.o visibilitymap.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c21d6f8559..d20f211a08 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -94,8 +94,6 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool temp_snap); static void heap_parallelscan_startblock_init(HeapScanDesc scan); static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan); -static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, - TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tup, @@ -103,108 +101,17 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, static Bitmapset *HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols, HeapTuple oldtup, HeapTuple newtup); -static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, - LockTupleMode mode, LockWaitPolicy wait_policy, - bool *have_tuple_lock); -static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, - uint16 old_infomask2, TransactionId add_to_xmax, - LockTupleMode mode, bool is_update, - TransactionId *result_xmax, uint16 *result_infomask, - uint16 *result_infomask2); -static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, - ItemPointer ctid, TransactionId xid, - LockTupleMode mode); static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2); static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask); -static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, - LockTupleMode lockmode); -static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, - Relation rel, ItemPointer ctid, XLTW_Oper oper, - int *remaining); -static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, - uint16 infomask, Relation rel, int *remaining); -static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); +static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, bool nowait, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified, bool *copy); - -/* - * Each tuple lock mode has a corresponding heavyweight lock, and one or two - * corresponding MultiXactStatuses (one to merely lock tuples, another one to - * update them). This table (and the macros below) helps us determine the - * heavyweight lock mode and MultiXactStatus values to use for any particular - * tuple lock strength. - * - * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock - * instead. 
- */ -static const struct -{ - LOCKMODE hwlock; - int lockstatus; - int updstatus; -} - - tupleLockExtraInfo[MaxLockTupleMode + 1] = -{ - { /* LockTupleKeyShare */ - AccessShareLock, - MultiXactStatusForKeyShare, - -1 /* KeyShare does not allow updating tuples */ - }, - { /* LockTupleShare */ - RowShareLock, - MultiXactStatusForShare, - -1 /* Share does not allow updating tuples */ - }, - { /* LockTupleNoKeyExclusive */ - ExclusiveLock, - MultiXactStatusForNoKeyUpdate, - MultiXactStatusNoKeyUpdate - }, - { /* LockTupleExclusive */ - AccessExclusiveLock, - MultiXactStatusForUpdate, - MultiXactStatusUpdate - } -}; - -/* Get the LOCKMODE for a given MultiXactStatus */ -#define LOCKMODE_from_mxstatus(status) \ - (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) - -/* - * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. - * This is more readable than having every caller translate it to lock.h's - * LOCKMODE. - */ -#define LockTupleTuplock(rel, tup, mode) \ - LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) -#define UnlockTupleTuplock(rel, tup, mode) \ - UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) -#define ConditionalLockTupleTuplock(rel, tup, mode) \ - ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) - -/* - * This table maps tuple lock strength values for each particular - * MultiXactStatus value. - */ -static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = -{ - LockTupleKeyShare, /* ForKeyShare */ - LockTupleShare, /* ForShare */ - LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ - LockTupleExclusive, /* ForUpdate */ - LockTupleNoKeyExclusive, /* NoKeyUpdate */ - LockTupleExclusive /* Update */ -}; - -/* Get the LockTupleMode for a given MultiXactStatus */ -#define TUPLOCK_from_mxstatus(status) \ - (MultiXactStatusLock[(status)]) - /* ---------------------------------------------------------------- * heap support routines * ---------------------------------------------------------------- @@ -1837,158 +1744,6 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) return &(scan->rs_ctup); } -/* - * heap_fetch - retrieve tuple with given tid - * - * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding - * the tuple, fill in the remaining fields of *tuple, and check the tuple - * against the specified snapshot. - * - * If successful (tuple found and passes snapshot time qual), then *userbuf - * is set to the buffer holding the tuple and TRUE is returned. The caller - * must unpin the buffer when done with the tuple. - * - * If the tuple is not found (ie, item number references a deleted slot), - * then tuple->t_data is set to NULL and FALSE is returned. - * - * If the tuple is found but fails the time qual check, then FALSE is returned - * but tuple->t_data is left pointing to the tuple. - * - * keep_buf determines what is done with the buffer in the FALSE-result cases. - * When the caller specifies keep_buf = true, we retain the pin on the buffer - * and return it in *userbuf (so the caller must eventually unpin it); when - * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer. - * - * stats_relation is the relation to charge the heap_fetch operation against - * for statistical purposes. (This could be the heap rel itself, an - * associated index, or NULL to not count the fetch at all.) - * - * heap_fetch does not follow HOT chains: only the exact TID requested will - * be fetched. 
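[Editor's aside] The heap_fetch() contract spelled out in the comment above (tuple->t_self set on entry, pinned buffer handed back on success) is easiest to see from the caller's side. A hedged sketch with a hypothetical helper name; with this patch such callers would normally go through the storage AM wrapper rather than calling heap_fetch() directly:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "storage/bufmgr.h"

static bool
fetch_and_inspect(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;		/* only t_self need be valid on entry */
	if (!heap_fetch(rel, snapshot, &tuple, &buffer, false, NULL))
		return false;			/* dead line pointer, or failed the snapshot */

	/* ... examine tuple.t_data while the buffer pin is held ... */

	ReleaseBuffer(buffer);		/* on success the caller must drop the pin */
	return true;
}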
- * - * It is somewhat inconsistent that we ereport() on invalid block number but - * return false on invalid item number. There are a couple of reasons though. - * One is that the caller can relatively easily check the block number for - * validity, but cannot check the item number without reading the page - * himself. Another is that when we are following a t_ctid link, we can be - * reasonably confident that the page number is valid (since VACUUM shouldn't - * truncate off the destination page without having killed the referencing - * tuple first), but the item number might well not be good. - */ -bool -heap_fetch(Relation relation, - Snapshot snapshot, - HeapTuple tuple, - Buffer *userbuf, - bool keep_buf, - Relation stats_relation) -{ - ItemPointer tid = &(tuple->t_self); - ItemId lp; - Buffer buffer; - Page page; - OffsetNumber offnum; - bool valid; - - /* - * Fetch and pin the appropriate page of the relation. - */ - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - - /* - * Need share lock on buffer to examine tuple commit status. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = BufferGetPage(buffer); - TestForOldSnapshot(snapshot, relation, page); - - /* - * We'd better check for out-of-range offnum in case of VACUUM since the - * TID was obtained. - */ - offnum = ItemPointerGetOffsetNumber(tid); - if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - if (keep_buf) - *userbuf = buffer; - else - { - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - } - tuple->t_data = NULL; - return false; - } - - /* - * get the item line pointer corresponding to the requested tid - */ - lp = PageGetItemId(page, offnum); - - /* - * Must check for deleted tuple. - */ - if (!ItemIdIsNormal(lp)) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - if (keep_buf) - *userbuf = buffer; - else - { - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - } - tuple->t_data = NULL; - return false; - } - - /* - * fill in *tuple fields - */ - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); - tuple->t_len = ItemIdGetLength(lp); - tuple->t_tableOid = RelationGetRelid(relation); - - /* - * check time qualification of tuple, then release lock - */ - valid = HeapTupleSatisfiesVisibility(relation->rd_stamroutine, tuple, snapshot, buffer); - - if (valid) - PredicateLockTuple(relation, tuple, snapshot); - - CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - if (valid) - { - /* - * All checks passed, so return the tuple as valid. Caller is now - * responsible for releasing the buffer. - */ - *userbuf = buffer; - - /* Count the successful fetch against appropriate rel, if any */ - if (stats_relation != NULL) - pgstat_count_heap_fetch(stats_relation); - - return true; - } - - /* Tuple failed time qual, but maybe caller wants to see it anyway. */ - if (keep_buf) - *userbuf = buffer; - else - { - ReleaseBuffer(buffer); - *userbuf = InvalidBuffer; - } - - return false; -} - /* * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot * @@ -2172,130 +1927,6 @@ heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, return result; } -/* - * heap_get_latest_tid - get the latest tid of a specified tuple - * - * Actually, this gets the latest version that is visible according to - * the passed snapshot. You can pass SnapshotDirty to get the very latest, - * possibly uncommitted version. 
- * - * *tid is both an input and an output parameter: it is updated to - * show the latest version of the row. Note that it will not be changed - * if no version of the row passes the snapshot test. - */ -void -heap_get_latest_tid(Relation relation, - Snapshot snapshot, - ItemPointer tid) -{ - BlockNumber blk; - ItemPointerData ctid; - TransactionId priorXmax; - - /* this is to avoid Assert failures on bad input */ - if (!ItemPointerIsValid(tid)) - return; - - /* - * Since this can be called with user-supplied TID, don't trust the input - * too much. (RelationGetNumberOfBlocks is an expensive check, so we - * don't check t_ctid links again this way. Note that it would not do to - * call it just once and save the result, either.) - */ - blk = ItemPointerGetBlockNumber(tid); - if (blk >= RelationGetNumberOfBlocks(relation)) - elog(ERROR, "block number %u is out of range for relation \"%s\"", - blk, RelationGetRelationName(relation)); - - /* - * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we - * need to examine, and *tid is the TID we will return if ctid turns out - * to be bogus. - * - * Note that we will loop until we reach the end of the t_ctid chain. - * Depending on the snapshot passed, there might be at most one visible - * version of the row, but we don't try to optimize for that. - */ - ctid = *tid; - priorXmax = InvalidTransactionId; /* cannot check first XMIN */ - for (;;) - { - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp; - HeapTupleData tp; - bool valid; - - /* - * Read, pin, and lock the page. - */ - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = BufferGetPage(buffer); - TestForOldSnapshot(snapshot, relation, page); - - /* - * Check for bogus item number. This is not treated as an error - * condition because it can happen while following a t_ctid link. We - * just assume that the prior tid is OK and return it unchanged. - */ - offnum = ItemPointerGetOffsetNumber(&ctid); - if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) - { - UnlockReleaseBuffer(buffer); - break; - } - lp = PageGetItemId(page, offnum); - if (!ItemIdIsNormal(lp)) - { - UnlockReleaseBuffer(buffer); - break; - } - - /* OK to access the tuple */ - tp.t_self = ctid; - tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_tableOid = RelationGetRelid(relation); - - /* - * After following a t_ctid link, we might arrive at an unrelated - * tuple. Check for XMIN match. - */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) - { - UnlockReleaseBuffer(buffer); - break; - } - - /* - * Check time qualification of tuple; if visible, set it as the new - * result candidate. - */ - valid = HeapTupleSatisfiesVisibility(relation->rd_stamroutine, &tp, snapshot, buffer); - CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); - if (valid) - *tid = ctid; - - /* - * If there's a valid t_ctid link, follow it, else we're done. 
- */ - if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || - HeapTupleHeaderIsOnlyLocked(tp.t_data) || - ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) - { - UnlockReleaseBuffer(buffer); - break; - } - - ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); - UnlockReleaseBuffer(buffer); - } /* end of loop */ -} - /* * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends @@ -2313,7 +1944,7 @@ heap_get_latest_tid(Relation relation, * * Note this is not allowed for tuples whose xmax is a multixact. */ -static void +void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) { Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); @@ -2596,7 +2227,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * tuple if not. Note that in any case, the header fields are also set in * the original tuple. */ -static HeapTuple +HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options) { @@ -2664,412 +2295,110 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, } /* - * heap_multi_insert - insert multiple tuple into a heap + * simple_heap_insert - insert a tuple + * + * Currently, this routine differs from heap_insert only in supplying + * a default command ID and not allowing access to the speedup options. * - * This is like heap_insert(), but inserts multiple tuples in one operation. - * That's faster than calling heap_insert() in a loop, because when multiple - * tuples can be inserted on a single page, we can write just a single WAL - * record covering all of them, and only need to lock/unlock the page once. + * This should be used rather than using heap_insert directly in most places + * where we are modifying system catalogs. + */ +Oid +simple_heap_insert(Relation relation, HeapTuple tup) +{ + return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); +} + +/* + * Given infomask/infomask2, compute the bits that must be saved in the + * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock, + * xl_heap_lock_updated WAL records. * - * Note: this leaks memory into the current memory context. You can create a - * temporary context before calling this, if that's a problem. + * See fix_infomask_from_infobits. */ -void -heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, - CommandId cid, int options, BulkInsertState bistate) +uint8 +compute_infobits(uint16 infomask, uint16 infomask2) +{ + return + ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | + ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | + ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | + /* note we ignore HEAP_XMAX_SHR_LOCK here */ + ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | + ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? + XLHL_KEYS_UPDATED : 0); +} + + + +/* + * heap_delete - delete a tuple + * + * NB: do not call this directly unless you are prepared to deal with + * concurrent-update conditions. Use simple_heap_delete instead. 
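[Editor's aside] The retained simple_heap_insert() wrapper above remains the path for catalog-style callers, as opposed to the storage AM entry points this patch introduces. A minimal usage sketch; the helper name is hypothetical and the example is not part of the patch:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "utils/rel.h"

static Oid
insert_one_row(Relation rel, Datum *values, bool *isnull)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);
	Oid			oid;

	/* default command ID, no speedup options (see simple_heap_insert) */
	oid = simple_heap_insert(rel, tup);
	heap_freetuple(tup);

	return oid;
}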
+ * + * relation - table to be modified (caller must hold suitable lock) + * tid - TID of tuple to be deleted + * cid - delete command ID (used for visibility test, and stored into + * cmax if successful) + * crosscheck - if not InvalidSnapshot, also check tuple against this + * wait - true if should wait for any conflicting update to commit/abort + * hufd - output parameter, filled in failure cases (see below) + * + * Normal, successful return value is HeapTupleMayBeUpdated, which + * actually means we did delete it. Failure return codes are + * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated + * (the last only possible if wait == false). + * + * In the failure cases, the routine fills *hufd with the tuple's t_ctid, + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax + * (the last only for HeapTupleSelfUpdated, since we + * cannot obtain cmax from a combocid generated by another transaction). + * See comments for struct HeapUpdateFailureData for additional info. + */ +HTSU_Result +heap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd) { + HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); - HeapTuple *heaptuples; - int i; - int ndone; - char *scratch = NULL; + ItemId lp; + HeapTupleData tp; Page page; - bool needwal; - Size saveFreeSpace; - bool need_tuple_data = RelationIsLogicallyLogged(relation); - bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); - - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); - saveFreeSpace = RelationGetTargetPageFreeSpace(relation, - HEAP_DEFAULT_FILLFACTOR); + BlockNumber block; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + bool have_tuple_lock = false; + bool iscombo; + bool all_visible_cleared = false; + HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ + bool old_key_copied = false; - /* Toast and set header data in all the tuples */ - heaptuples = palloc(ntuples * sizeof(HeapTuple)); - for (i = 0; i < ntuples; i++) - heaptuples[i] = heap_prepare_insert(relation, tuples[i], - xid, cid, options); + Assert(ItemPointerIsValid(tid)); /* - * Allocate some memory to use for constructing the WAL record. Using - * palloc() within a critical section is not safe, so we allocate this - * beforehand. + * Forbid this during a parallel operation, lest it allocate a combocid. + * Other workers might need that combocid for visibility checks, and we + * have no provision for broadcasting it to them. */ - if (needwal) - scratch = palloc(BLCKSZ); + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot delete tuples during a parallel operation"))); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); /* - * We're about to do the actual inserts -- but check for conflict first, - * to minimize the possibility of having to roll back work we've just - * done. - * - * A check here does not definitively prevent a serialization anomaly; - * that check MUST be done at least past the point of acquiring an - * exclusive buffer content lock on every buffer that will be affected, - * and MAY be done after all inserts are reflected in the buffers and - * those locks are released; otherwise there race condition. 
Since - * multiple buffers can be locked and unlocked in the loop below, and it - * would not be feasible to identify and lock all of those buffers before - * the loop, we must do a final check at the end. - * - * The check here could be omitted with no loss of correctness; it is - * present strictly as an optimization. - * - * For heap inserts, we only need to check for table-level SSI locks. Our - * new tuples can't possibly conflict with existing tuple locks, and heap - * page locks are only consolidated versions of tuple locks; they do not - * lock "gaps" as index page locks do. So we don't need to specify a - * buffer when making the call, which makes for a faster check. - */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); - - ndone = 0; - while (ndone < ntuples) - { - Buffer buffer; - Buffer vmbuffer = InvalidBuffer; - bool all_visible_cleared = false; - int nthispage; - - CHECK_FOR_INTERRUPTS(); - - /* - * Find buffer where at least the next tuple will fit. If the page is - * all-visible, this will also pin the requisite visibility map page. - */ - buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, - InvalidBuffer, options, bistate, - &vmbuffer, NULL); - page = BufferGetPage(buffer); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - /* - * RelationGetBufferForTuple has ensured that the first tuple fits. - * Put that on the page, and then as many other tuples as fit. - */ - RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); - for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) - { - HeapTuple heaptup = heaptuples[ndone + nthispage]; - - if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) - break; - - RelationPutHeapTuple(relation, buffer, heaptup, false); - - /* - * We don't use heap_multi_insert for catalog tuples yet, but - * better be prepared... - */ - if (needwal && need_cids) - log_heap_new_cid(relation, heaptup); - } - - if (PageIsAllVisible(page)) - { - all_visible_cleared = true; - PageClearAllVisible(page); - visibilitymap_clear(relation, - BufferGetBlockNumber(buffer), - vmbuffer, VISIBILITYMAP_VALID_BITS); - } - - /* - * XXX Should we set PageSetPrunable on this page ? See heap_insert() - */ - - MarkBufferDirty(buffer); - - /* XLOG stuff */ - if (needwal) - { - XLogRecPtr recptr; - xl_heap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; - char *tupledata; - int totaldatalen; - char *scratchptr = scratch; - bool init; - int bufflags = 0; - - /* - * If the page was previously empty, we can reinit the page - * instead of restoring the whole thing. - */ - init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber && - PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1); - - /* allocate xl_heap_multi_insert struct from the scratch area */ - xlrec = (xl_heap_multi_insert *) scratchptr; - scratchptr += SizeOfHeapMultiInsert; - - /* - * Allocate offsets array. Unless we're reinitializing the page, - * in that case the tuples are stored in order starting at - * FirstOffsetNumber and we don't need to store the offsets - * explicitly. - */ - if (!init) - scratchptr += nthispage * sizeof(OffsetNumber); - - /* the rest of the scratch space is used for tuple data */ - tupledata = scratchptr; - - xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0; - xlrec->ntuples = nthispage; - - /* - * Write out an xl_multi_insert_tuple and the tuple data itself - * for each tuple. 
- */ - for (i = 0; i < nthispage; i++) - { - HeapTuple heaptup = heaptuples[ndone + i]; - xl_multi_insert_tuple *tuphdr; - int datalen; - - if (!init) - xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); - /* xl_multi_insert_tuple needs two-byte alignment. */ - tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); - scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; - - tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; - tuphdr->t_infomask = heaptup->t_data->t_infomask; - tuphdr->t_hoff = heaptup->t_data->t_hoff; - - /* write bitmap [+ padding] [+ oid] + data */ - datalen = heaptup->t_len - SizeofHeapTupleHeader; - memcpy(scratchptr, - (char *) heaptup->t_data + SizeofHeapTupleHeader, - datalen); - tuphdr->datalen = datalen; - scratchptr += datalen; - } - totaldatalen = scratchptr - tupledata; - Assert((scratchptr - scratch) < BLCKSZ); - - if (need_tuple_data) - xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; - - /* - * Signal that this is the last xl_heap_multi_insert record - * emitted by this call to heap_multi_insert(). Needed for logical - * decoding so it knows when to cleanup temporary data. - */ - if (ndone + nthispage == ntuples) - xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; - - if (init) - { - info |= XLOG_HEAP_INIT_PAGE; - bufflags |= REGBUF_WILL_INIT; - } - - /* - * If we're doing logical decoding, include the new tuple data - * even if we take a full-page image of the page. - */ - if (need_tuple_data) - bufflags |= REGBUF_KEEP_DATA; - - XLogBeginInsert(); - XLogRegisterData((char *) xlrec, tupledata - scratch); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - - XLogRegisterBufData(0, tupledata, totaldatalen); - - /* filtering by origin on a row level is much more efficient */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - recptr = XLogInsert(RM_HEAP2_ID, info); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - ndone += nthispage; - } - - /* - * We're done with the actual inserts. Check for conflicts again, to - * ensure that all rw-conflicts in to these inserts are detected. Without - * this final check, a sequential scan of the heap may have locked the - * table after the "before" check, missing one opportunity to detect the - * conflict, and then scanned the table before the new tuples were there, - * missing the other chance to detect the conflict. - * - * For heap inserts, we only need to check for table-level SSI locks. Our - * new tuples can't possibly conflict with existing tuple locks, and heap - * page locks are only consolidated versions of tuple locks; they do not - * lock "gaps" as index page locks do. So we don't need to specify a - * buffer when making the call. - */ - CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); - - /* - * If tuples are cachable, mark them for invalidation from the caches in - * case we abort. Note it is OK to do this after releasing the buffer, - * because the heaptuples data structure is all in local memory, not in - * the shared buffer. - */ - if (IsCatalogRelation(relation)) - { - for (i = 0; i < ntuples; i++) - CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); - } - - /* - * Copy t_self fields back to the caller's original tuples. This does - * nothing for untoasted tuples (tuples[i] == heaptuples[i)], but it's - * probably faster to always copy than check. 
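[Editor's aside] For reference, this is the batched-insert pattern that heap_multi_insert() (being moved out of heapam.c here) serves, roughly what COPY does. A hedged sketch with a hypothetical helper name; with this patch the call would be reached through the storage AM wrapper:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "access/xact.h"

static void
insert_batch(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();

	/* one call per batch: one WAL record per filled page, page locked once */
	heap_multi_insert(rel, tuples, ntuples,
					  GetCurrentCommandId(true), 0, bistate);

	FreeBulkInsertState(bistate);
}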
- */ - for (i = 0; i < ntuples; i++) - tuples[i]->t_self = heaptuples[i]->t_self; - - pgstat_count_heap_insert(relation, ntuples); -} - -/* - * simple_heap_insert - insert a tuple - * - * Currently, this routine differs from heap_insert only in supplying - * a default command ID and not allowing access to the speedup options. - * - * This should be used rather than using heap_insert directly in most places - * where we are modifying system catalogs. - */ -Oid -simple_heap_insert(Relation relation, HeapTuple tup) -{ - return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); -} - -/* - * Given infomask/infomask2, compute the bits that must be saved in the - * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock, - * xl_heap_lock_updated WAL records. - * - * See fix_infomask_from_infobits. - */ -static uint8 -compute_infobits(uint16 infomask, uint16 infomask2) -{ - return - ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | - ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | - ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | - /* note we ignore HEAP_XMAX_SHR_LOCK here */ - ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | - ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? - XLHL_KEYS_UPDATED : 0); -} - -/* - * Given two versions of the same t_infomask for a tuple, compare them and - * return whether the relevant status for a tuple Xmax has changed. This is - * used after a buffer lock has been released and reacquired: we want to ensure - * that the tuple state continues to be the same it was when we previously - * examined it. - * - * Note the Xmax field itself must be compared separately. - */ -static inline bool -xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) -{ - const uint16 interesting = - HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; - - if ((new_infomask & interesting) != (old_infomask & interesting)) - return true; - - return false; -} - -/* - * heap_delete - delete a tuple - * - * NB: do not call this directly unless you are prepared to deal with - * concurrent-update conditions. Use simple_heap_delete instead. - * - * relation - table to be modified (caller must hold suitable lock) - * tid - TID of tuple to be deleted - * cid - delete command ID (used for visibility test, and stored into - * cmax if successful) - * crosscheck - if not InvalidSnapshot, also check tuple against this - * wait - true if should wait for any conflicting update to commit/abort - * hufd - output parameter, filled in failure cases (see below) - * - * Normal, successful return value is HeapTupleMayBeUpdated, which - * actually means we did delete it. Failure return codes are - * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated - * (the last only possible if wait == false). - * - * In the failure cases, the routine fills *hufd with the tuple's t_ctid, - * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax - * (the last only for HeapTupleSelfUpdated, since we - * cannot obtain cmax from a combocid generated by another transaction). - * See comments for struct HeapUpdateFailureData for additional info. 
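[Editor's aside] heap_delete()'s result codes, documented in the comment above, are typically handled the way simple_heap_delete() handles them. A hedged caller-side sketch; the helper name is hypothetical and the example is not part of the patch:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "access/xact.h"

static void
delete_or_die(Relation rel, ItemPointer tid)
{
	HeapUpdateFailureData hufd;
	HTSU_Result result;

	result = heap_delete(rel, tid,
						 GetCurrentCommandId(true),
						 InvalidSnapshot,
						 true,	/* wait for any conflicting update */
						 &hufd);
	switch (result)
	{
		case HeapTupleMayBeUpdated:
			break;				/* success: the tuple is deleted */
		case HeapTupleSelfUpdated:
			elog(ERROR, "tuple already updated by self");
			break;
		case HeapTupleUpdated:
			elog(ERROR, "tuple concurrently updated");
			break;
		default:
			elog(ERROR, "unrecognized heap_delete status: %u", result);
			break;
	}
}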
- */ -HTSU_Result -heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd) -{ - HTSU_Result result; - TransactionId xid = GetCurrentTransactionId(); - ItemId lp; - HeapTupleData tp; - Page page; - BlockNumber block; - Buffer buffer; - Buffer vmbuffer = InvalidBuffer; - TransactionId new_xmax; - uint16 new_infomask, - new_infomask2; - bool have_tuple_lock = false; - bool iscombo; - bool all_visible_cleared = false; - HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ - bool old_key_copied = false; - - Assert(ItemPointerIsValid(tid)); - - /* - * Forbid this during a parallel operation, lest it allocate a combocid. - * Other workers might need that combocid for visibility checks, and we - * have no provision for broadcasting it to them. - */ - if (IsInParallelMode()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot delete tuples during a parallel operation"))); - - block = ItemPointerGetBlockNumber(tid); - buffer = ReadBuffer(relation, block); - page = BufferGetPage(buffer); - - /* - * Before locking the buffer, pin the visibility map page if it appears to - * be necessary. Since we haven't got the lock yet, someone else might be - * in the middle of changing this, so we'll need to recheck after we have - * the lock. + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. */ if (PageIsAllVisible(page)) visibilitymap_pin(relation, block, &vmbuffer); @@ -4504,7 +3833,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) /* * Return the MultiXactStatus corresponding to the given tuple lock mode. */ -static MultiXactStatus +MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update) { int retval; @@ -4522,724 +3851,34 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) } /* - * heap_lock_tuple - lock a tuple in shared or exclusive mode - * - * Note that this acquires a buffer pin, which the caller must release. - * - * Input parameters: - * relation: relation containing tuple (caller must hold suitable lock) - * tuple->t_self: TID of tuple to lock (rest of struct need not be valid) - * cid: current command ID (used for visibility test, and stored into - * tuple's cmax if lock is successful) - * mode: indicates if shared or exclusive tuple lock is desired - * wait_policy: what to do if tuple lock is not available - * follow_updates: if true, follow the update chain to also lock descendant - * tuples. - * - * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) - * *hufd: filled in failure cases (see below) - * - * Function result may be: - * HeapTupleMayBeUpdated: lock was successfully acquired - * HeapTupleInvisible: lock failed because tuple was never visible to us - * HeapTupleSelfUpdated: lock failed because tuple updated by self - * HeapTupleUpdated: lock failed because tuple updated by other xact - * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip + * Acquire heavyweight lock on the given tuple, in preparation for acquiring + * its normal, Xmax-based tuple lock. 
* - * In the failure cases other than HeapTupleInvisible, the routine fills - * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, - * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated, - * since we cannot obtain cmax from a combocid generated by another - * transaction). - * See comments for struct HeapUpdateFailureData for additional info. + * have_tuple_lock is an input and output parameter: on input, it indicates + * whether the lock has previously been acquired (and this function does + * nothing in that case). If this function returns success, have_tuple_lock + * has been flipped to true. * - * See README.tuplock for a thorough explanation of this mechanism. + * Returns false if it was unable to obtain the lock; this can only happen if + * wait_policy is Skip. */ -HTSU_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, HeapUpdateFailureData *hufd) +bool +heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool *have_tuple_lock) { - HTSU_Result result; - ItemPointer tid = &(tuple->t_self); - ItemId lp; - Page page; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - TransactionId xid, - xmax; - uint16 old_infomask, - new_infomask, - new_infomask2; - bool first_time = true; - bool have_tuple_lock = false; - bool cleared_all_frozen = false; - - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - block = ItemPointerGetBlockNumber(tid); - - /* - * Before locking the buffer, pin the visibility map page if it appears to - * be necessary. Since we haven't got the lock yet, someone else might be - * in the middle of changing this, so we'll need to recheck after we have - * the lock. - */ - if (PageIsAllVisible(BufferGetPage(*buffer))) - visibilitymap_pin(relation, block, &vmbuffer); - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - page = BufferGetPage(*buffer); - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); - Assert(ItemIdIsNormal(lp)); + if (*have_tuple_lock) + return true; - tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); - tuple->t_len = ItemIdGetLength(lp); - tuple->t_tableOid = RelationGetRelid(relation); + switch (wait_policy) + { + case LockWaitBlock: + LockTupleTuplock(relation, tid, mode); + break; -l3: - result = relation->rd_stamroutine->snapshot_satisfiesUpdate(tuple, cid, *buffer); - - if (result == HeapTupleInvisible) - { - /* - * This is possible, but only when locking a tuple for ON CONFLICT - * UPDATE. We return this value here rather than throwing an error in - * order to give that case the opportunity to throw a more specific - * error. - */ - result = HeapTupleInvisible; - goto out_locked; - } - else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated) - { - TransactionId xwait; - uint16 infomask; - uint16 infomask2; - bool require_sleep; - ItemPointerData t_ctid; - - /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); - infomask = tuple->t_data->t_infomask; - infomask2 = tuple->t_data->t_infomask2; - ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - - /* - * If any subtransaction of the current top transaction already holds - * a lock as strong as or stronger than what we're requesting, we - * effectively hold the desired lock already. 
We *must* succeed - * without trying to take the tuple lock, else we will deadlock - * against anyone wanting to acquire a stronger lock. - * - * Note we only do this the first time we loop on the HTSU result; - * there is no point in testing in subsequent passes, because - * evidently our own transaction cannot have acquired a new lock after - * the first time we checked. - */ - if (first_time) - { - first_time = false; - - if (infomask & HEAP_XMAX_IS_MULTI) - { - int i; - int nmembers; - MultiXactMember *members; - - /* - * We don't need to allow old multixacts here; if that had - * been the case, HeapTupleSatisfiesUpdate would have returned - * MayBeUpdated and we wouldn't be here. - */ - nmembers = - GetMultiXactIdMembers(xwait, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(infomask)); - - for (i = 0; i < nmembers; i++) - { - /* only consider members of our own transaction */ - if (!TransactionIdIsCurrentTransactionId(members[i].xid)) - continue; - - if (TUPLOCK_from_mxstatus(members[i].status) >= mode) - { - pfree(members); - result = HeapTupleMayBeUpdated; - goto out_unlocked; - } - } - - if (members) - pfree(members); - } - else if (TransactionIdIsCurrentTransactionId(xwait)) - { - switch (mode) - { - case LockTupleKeyShare: - Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || - HEAP_XMAX_IS_SHR_LOCKED(infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(infomask)); - result = HeapTupleMayBeUpdated; - goto out_unlocked; - case LockTupleShare: - if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - result = HeapTupleMayBeUpdated; - goto out_unlocked; - } - break; - case LockTupleNoKeyExclusive: - if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - result = HeapTupleMayBeUpdated; - goto out_unlocked; - } - break; - case LockTupleExclusive: - if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && - infomask2 & HEAP_KEYS_UPDATED) - { - result = HeapTupleMayBeUpdated; - goto out_unlocked; - } - break; - } - } - } - - /* - * Initially assume that we will have to wait for the locking - * transaction(s) to finish. We check various cases below in which - * this can be turned off. - */ - require_sleep = true; - if (mode == LockTupleKeyShare) - { - /* - * If we're requesting KeyShare, and there's no update present, we - * don't need to wait. Even if there is an update, we can still - * continue if the key hasn't been modified. - * - * However, if there are updates, we need to walk the update chain - * to mark future versions of the row as locked, too. That way, - * if somebody deletes that future version, we're protected - * against the key going away. This locking of future versions - * could block momentarily, if a concurrent transaction is - * deleting a key; or it could return a value to the effect that - * the transaction deleting the key has already committed. So we - * do this before re-locking the buffer; otherwise this would be - * prone to deadlocks. - * - * Note that the TID we're locking was grabbed before we unlocked - * the buffer. For it to change while we're not looking, the - * other properties we're testing for below after re-locking the - * buffer would also change, in which case we would restart this - * loop above. - */ - if (!(infomask2 & HEAP_KEYS_UPDATED)) - { - bool updated; - - updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); - - /* - * If there are updates, follow the update chain; bail out if - * that cannot be done. 
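[Editor's aside] A caller-side sketch of the heap_lock_tuple() contract described at the top of this function: the buffer pin is returned in *buffer on every return path, and HeapTupleWouldBlock is only possible with LockWaitSkip. The helper name is hypothetical and the example is not part of the patch:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "storage/bufmgr.h"

static bool
try_lock_row(Relation rel, ItemPointer tid, CommandId cid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	HeapUpdateFailureData hufd;
	HTSU_Result test;

	tuple.t_self = *tid;
	test = heap_lock_tuple(rel, &tuple, cid,
						   LockTupleExclusive, LockWaitSkip,
						   true,	/* follow the update chain */
						   &buffer, &hufd);

	/* the buffer pin is returned in all cases and must be released */
	ReleaseBuffer(buffer);

	return (test == HeapTupleMayBeUpdated);
}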
- */ - if (follow_updates && updated) - { - HTSU_Result res; - - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, - GetCurrentTransactionId(), - mode); - if (res != HeapTupleMayBeUpdated) - { - result = res; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - } - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Make sure it's still an appropriate lock, else start over. - * Also, if it wasn't updated before we released the lock, but - * is updated now, we start over too; the reason is that we - * now need to follow the update chain to lock the new - * versions. - */ - if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && - ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || - !updated)) - goto l3; - - /* Things look okay, so we can skip sleeping */ - require_sleep = false; - - /* - * Note we allow Xmax to change here; other updaters/lockers - * could have modified it before we grabbed the buffer lock. - * However, this is not a problem, because with the recheck we - * just did we ensure that they still don't conflict with the - * lock we want. - */ - } - } - else if (mode == LockTupleShare) - { - /* - * If we're requesting Share, we can similarly avoid sleeping if - * there's no update and no exclusive lock present. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && - !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Make sure it's still an appropriate lock, else start over. - * See above about allowing xmax to change. - */ - if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || - HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask)) - goto l3; - require_sleep = false; - } - } - else if (mode == LockTupleNoKeyExclusive) - { - /* - * If we're requesting NoKeyExclusive, we might also be able to - * avoid sleeping; just ensure that there no conflicting lock - * already acquired. - */ - if (infomask & HEAP_XMAX_IS_MULTI) - { - if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask, - mode)) - { - /* - * No conflict, but if the xmax changed under us in the - * meantime, start over. - */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - - /* otherwise, we're good */ - require_sleep = false; - } - } - else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* if the xmax changed in the meantime, start over */ - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals( - HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - /* otherwise, we're good */ - require_sleep = false; - } - } - - /* - * As a check independent from those above, we can also avoid sleeping - * if the current transaction is the sole locker of the tuple. Note - * that the strength of the lock already held is irrelevant; this is - * not about recording the lock in Xmax (which will be done regardless - * of this optimization, below). Also, note that the cases where we - * hold a lock stronger than we are requesting are already handled - * above by not doing anything. - * - * Note we only deal with the non-multixact case here; MultiXactIdWait - * is well equipped to deal with this situation on its own. - */ - if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && - TransactionIdIsCurrentTransactionId(xwait)) - { - /* ... 
but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); - require_sleep = false; - } - - /* - * Time to sleep on the other transaction/multixact, if necessary. - * - * If the other transaction is an update that's already committed, - * then sleeping cannot possibly do any good: if we're required to - * sleep, get out to raise an error instead. - * - * By here, we either have already acquired the buffer exclusive lock, - * or we must wait for the locking transaction or multixact; so below - * we ensure that we grab buffer lock after the sleep. - */ - if (require_sleep && result == HeapTupleUpdated) - { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - else if (require_sleep) - { - /* - * Acquire tuple lock to establish our priority for the tuple, or - * die trying. LockTuple will release us when we are next-in-line - * for the tuple. We must do this even if we are share-locking. - * - * If we are forced to "start over" below, we keep the tuple lock; - * this arranges that we stay at the head of the line while - * rechecking tuple state. - */ - if (!heap_acquire_tuplock(relation, tid, mode, wait_policy, - &have_tuple_lock)) - { - /* - * This can only happen if wait_policy is Skip and the lock - * couldn't be obtained. - */ - result = HeapTupleWouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - - if (infomask & HEAP_XMAX_IS_MULTI) - { - MultiXactStatus status = get_mxact_status_for_lock(mode, false); - - /* We only ever lock tuples, never update them */ - if (status >= MultiXactStatusNoKeyUpdate) - elog(ERROR, "invalid lock mode in heap_lock_tuple"); - - /* wait for multixact to end, or die trying */ - switch (wait_policy) - { - case LockWaitBlock: - MultiXactIdWait((MultiXactId) xwait, status, infomask, - relation, &tuple->t_self, XLTW_Lock, NULL); - break; - case LockWaitSkip: - if (!ConditionalMultiXactIdWait((MultiXactId) xwait, - status, infomask, relation, - NULL)) - { - result = HeapTupleWouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - break; - case LockWaitError: - if (!ConditionalMultiXactIdWait((MultiXactId) xwait, - status, infomask, relation, - NULL)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - - break; - } - - /* - * Of course, the multixact might not be done here: if we're - * requesting a light lock mode, other transactions with light - * locks could still be alive, as well as locks owned by our - * own xact or other subxacts of this backend. We need to - * preserve the surviving MultiXact members. Note that it - * isn't absolutely necessary in the latter case, but doing so - * is simpler. 
- */ - } - else - { - /* wait for regular transaction to end, or die trying */ - switch (wait_policy) - { - case LockWaitBlock: - XactLockTableWait(xwait, relation, &tuple->t_self, - XLTW_Lock); - break; - case LockWaitSkip: - if (!ConditionalXactLockTableWait(xwait)) - { - result = HeapTupleWouldBlock; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - break; - case LockWaitError: - if (!ConditionalXactLockTableWait(xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - break; - } - } - - /* if there are updates, follow the update chain */ - if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) - { - HTSU_Result res; - - res = heap_lock_updated_tuple(relation, tuple, &t_ctid, - GetCurrentTransactionId(), - mode); - if (res != HeapTupleMayBeUpdated) - { - result = res; - /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto failed; - } - } - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - */ - if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), - xwait)) - goto l3; - - if (!(infomask & HEAP_XMAX_IS_MULTI)) - { - /* - * Otherwise check if it committed or aborted. Note we cannot - * be here if the tuple was only locked by somebody who didn't - * conflict with us; that would have been handled above. So - * that transaction must necessarily be gone by now. But - * don't check for this in the multixact case, because some - * locker transactions might still be running. - */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); - } - } - - /* By here, we're certain that we hold buffer exclusive lock again */ - - /* - * We may lock if previous xmax aborted, or if it committed but only - * locked the tuple without updating it; or if we didn't have to wait - * at all for whatever reason. - */ - if (!require_sleep || - (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || - HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple->t_data)) - result = HeapTupleMayBeUpdated; - else - result = HeapTupleUpdated; - } - -failed: - if (result != HeapTupleMayBeUpdated) - { - Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated || - result == HeapTupleWouldBlock); - Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); - hufd->ctid = tuple->t_data->t_ctid; - hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); - if (result == HeapTupleSelfUpdated) - hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); - else - hufd->cmax = InvalidCommandId; - goto out_locked; - } - - /* - * If we didn't pin the visibility map page and the page has become all - * visible while we were busy locking the buffer, or during some - * subsequent window during which we had it unlocked, we'll have to unlock - * and re-lock, to avoid holding the buffer lock across I/O. That's a bit - * unfortunate, especially since we'll now have to recheck whether the - * tuple has been locked or updated under us, but hopefully it won't - * happen very often. 
- */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) - { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - goto l3; - } - - xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); - old_infomask = tuple->t_data->t_infomask; - - /* - * If this is the first possibly-multixact-able operation in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can be - * certain that the transaction will never become a member of any older - * MultiXactIds than that. (We have to do this even if we end up just - * using our own TransactionId below, since some other backend could - * incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - - /* - * Compute the new xmax and infomask to store into the tuple. Note we do - * not modify the tuple just yet, because that would leave it in the wrong - * state if multixact.c elogs. - */ - compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, - GetCurrentTransactionId(), mode, false, - &xid, &new_infomask, &new_infomask2); - - START_CRIT_SECTION(); - - /* - * Store transaction information of xact locking the tuple. - * - * Note: Cmax is meaningless in this context, so don't set it; this avoids - * possibly generating a useless combo CID. Moreover, if we're locking a - * previously updated tuple, it's important to preserve the Cmax. - * - * Also reset the HOT UPDATE bit, but only if there's no update; otherwise - * we would break the HOT chain. - */ - tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; - tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - tuple->t_data->t_infomask |= new_infomask; - tuple->t_data->t_infomask2 |= new_infomask2; - if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) - HeapTupleHeaderClearHotUpdated(tuple->t_data); - HeapTupleHeaderSetXmax(tuple->t_data, xid); - - /* - * Make sure there is no forward chain link in t_ctid. Note that in the - * cases where the tuple has been updated, we must not overwrite t_ctid, - * because it was set by the updater. Moreover, if the tuple has been - * updated, we need to follow the update chain to lock the new versions of - * the tuple as well. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) - tuple->t_data->t_ctid = *tid; - - /* Clear only the all-frozen bit on visibility map if needed */ - if (PageIsAllVisible(page) && - visibilitymap_clear(relation, block, vmbuffer, - VISIBILITYMAP_ALL_FROZEN)) - cleared_all_frozen = true; - - - MarkBufferDirty(*buffer); - - /* - * XLOG stuff. You might think that we don't need an XLOG record because - * there is no state change worth restoring after a crash. You would be - * wrong however: we have just written either a TransactionId or a - * MultiXactId that may never have been seen on disk before, and we need - * to make sure that there are XLOG entries covering those ID numbers. - * Else the same IDs might be re-used after a crash, which would be - * disastrous if this page made it to disk before the crash. Essentially - * we have to enforce the WAL log-before-data rule even in this case. - * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG - * entries for everything anyway.) 
- */ - if (RelationNeedsWAL(relation)) - { - xl_heap_lock xlrec; - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); - - xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); - xlrec.locking_xid = xid; - xlrec.infobits_set = compute_infobits(new_infomask, - tuple->t_data->t_infomask2); - xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - - /* we don't decode row locks atm, so no need to log the origin */ - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - result = HeapTupleMayBeUpdated; - -out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - -out_unlocked: - if (BufferIsValid(vmbuffer)) - ReleaseBuffer(vmbuffer); - - /* - * Don't update the visibility map here. Locking a tuple doesn't change - * visibility info. - */ - - /* - * Now that we have successfully marked the tuple as locked, we can - * release the lmgr tuple lock, if we had it. - */ - if (have_tuple_lock) - UnlockTupleTuplock(relation, tid, mode); - - return result; -} - -/* - * Acquire heavyweight lock on the given tuple, in preparation for acquiring - * its normal, Xmax-based tuple lock. - * - * have_tuple_lock is an input and output parameter: on input, it indicates - * whether the lock has previously been acquired (and this function does - * nothing in that case). If this function returns success, have_tuple_lock - * has been flipped to true. - * - * Returns false if it was unable to obtain the lock; this can only happen if - * wait_policy is Skip. - */ -static bool -heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, - LockWaitPolicy wait_policy, bool *have_tuple_lock) -{ - if (*have_tuple_lock) - return true; - - switch (wait_policy) - { - case LockWaitBlock: - LockTupleTuplock(relation, tid, mode); - break; - - case LockWaitSkip: - if (!ConditionalLockTupleTuplock(relation, tid, mode)) - return false; - break; + case LockWaitSkip: + if (!ConditionalLockTupleTuplock(relation, tid, mode)) + return false; + break; case LockWaitError: if (!ConditionalLockTupleTuplock(relation, tid, mode)) @@ -5272,7 +3911,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, * window, but it's still possible to end up creating an unnecessary * MultiXactId. Fortunately this is harmless. */ -static void +void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, @@ -5319,916 +3958,226 @@ l5: break; case LockTupleNoKeyExclusive: new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_EXCL_LOCK; - break; - case LockTupleExclusive: - new_xmax = add_to_xmax; - new_infomask |= HEAP_XMAX_EXCL_LOCK; - new_infomask2 |= HEAP_KEYS_UPDATED; - break; - default: - new_xmax = InvalidTransactionId; /* silence compiler */ - elog(ERROR, "invalid lock mode"); - } - } - } - else if (old_infomask & HEAP_XMAX_IS_MULTI) - { - MultiXactStatus new_status; - - /* - * Currently we don't allow XMAX_COMMITTED to be set for multis, so - * cross-check. - */ - Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); - - /* - * A multixact together with LOCK_ONLY set but neither lock bit set - * (i.e. a pg_upgraded share locked tuple) cannot possibly be running - * anymore. This check is critical for databases upgraded by - * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume - * that such multis are never passed. 
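[Editor's aside] heap_acquire_tuplock() is exported by this patch; the acquire/unlock pairing mirrors the have_tuple_lock bookkeeping inside heap_lock_tuple() above. A hedged sketch with a hypothetical name; it assumes the UnlockTupleTuplock() macro (removed from heapam.c earlier in this patch) becomes visible to callers, e.g. via heapam_common.h:

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_common.h"	/* assumed home of UnlockTupleTuplock */

static void
lock_tuple_briefly(Relation rel, ItemPointer tid, LockTupleMode mode)
{
	bool		have_tuple_lock = false;

	/* LockWaitBlock: wait as long as needed for the heavyweight tuple lock */
	(void) heap_acquire_tuplock(rel, tid, mode, LockWaitBlock,
								&have_tuple_lock);

	/* ... record the lock in the tuple's xmax, as heap_lock_tuple does ... */

	if (have_tuple_lock)
		UnlockTupleTuplock(rel, tid, mode);
}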
- */ - if (HEAP_LOCKED_UPGRADED(old_infomask)) - { - old_infomask &= ~HEAP_XMAX_IS_MULTI; - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - /* - * If the XMAX is already a MultiXactId, then we need to expand it to - * include add_to_xmax; but if all the members were lockers and are - * all gone, we can do away with the IS_MULTI bit and just set - * add_to_xmax as the only locker/updater. If all lockers are gone - * and we have an updater that aborted, we can also do without a - * multi. - * - * The cost of doing GetMultiXactIdMembers would be paid by - * MultiXactIdExpand if we weren't to do this, so this check is not - * incurring extra work anyhow. - */ - if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) - { - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || - !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, - old_infomask))) - { - /* - * Reset these bits and restart; otherwise fall through to - * create a new multi below. - */ - old_infomask &= ~HEAP_XMAX_IS_MULTI; - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - } - - new_status = get_mxact_status_for_lock(mode, is_update); - - new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, - new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (old_infomask & HEAP_XMAX_COMMITTED) - { - /* - * It's a committed update, so we need to preserve him as updater of - * the tuple. - */ - MultiXactStatus status; - MultiXactStatus new_status; - - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - - new_status = get_mxact_status_for_lock(mode, is_update); - - /* - * since it's not running, it's obviously impossible for the old - * updater to be identical to the current one, so we need not check - * for that case as we do in the block above. - */ - new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (TransactionIdIsInProgress(xmax)) - { - /* - * If the XMAX is a valid, in-progress TransactionId, then we need to - * create a new MultiXactId that includes both the old locker or - * updater and our own TransactionId. - */ - MultiXactStatus new_status; - MultiXactStatus old_status; - LockTupleMode old_mode; - - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) - { - if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) - old_status = MultiXactStatusForKeyShare; - else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) - old_status = MultiXactStatusForShare; - else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) - { - if (old_infomask2 & HEAP_KEYS_UPDATED) - old_status = MultiXactStatusForUpdate; - else - old_status = MultiXactStatusForNoKeyUpdate; - } - else - { - /* - * LOCK_ONLY can be present alone only when a page has been - * upgraded by pg_upgrade. But in that case, - * TransactionIdIsInProgress() should have returned false. We - * assume it's no longer locked in this case. - */ - elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); - old_infomask |= HEAP_XMAX_INVALID; - old_infomask &= ~HEAP_XMAX_LOCK_ONLY; - goto l5; - } - } - else - { - /* it's an update, but which kind? 
*/ - if (old_infomask2 & HEAP_KEYS_UPDATED) - old_status = MultiXactStatusUpdate; - else - old_status = MultiXactStatusNoKeyUpdate; - } - - old_mode = TUPLOCK_from_mxstatus(old_status); - - /* - * If the lock to be acquired is for the same TransactionId as the - * existing lock, there's an optimization possible: consider only the - * strongest of both locks as the only one present, and restart. - */ - if (xmax == add_to_xmax) - { - /* - * Note that it's not possible for the original tuple to be - * updated: we wouldn't be here because the tuple would have been - * invisible and we wouldn't try to update it. As a subtlety, - * this code can also run when traversing an update chain to lock - * future versions of a tuple. But we wouldn't be here either, - * because the add_to_xmax would be different from the original - * updater. - */ - Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); - - /* acquire the strongest of both */ - if (mode < old_mode) - mode = old_mode; - /* mustn't touch is_update */ - - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - /* otherwise, just fall back to creating a new multixact */ - new_status = get_mxact_status_for_lock(mode, is_update); - new_xmax = MultiXactIdCreate(xmax, old_status, - add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && - TransactionIdDidCommit(xmax)) - { - /* - * It's a committed update, so we gotta preserve him as updater of the - * tuple. - */ - MultiXactStatus status; - MultiXactStatus new_status; - - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - - new_status = get_mxact_status_for_lock(mode, is_update); - - /* - * since it's not running, it's obviously impossible for the old - * updater to be identical to the current one, so we need not check - * for that case as we do in the block above. - */ - new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); - GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); - } - else - { - /* - * Can get here iff the locking/updating transaction was running when - * the infomask was extracted from the tuple, but finished before - * TransactionIdIsInProgress got to run. Deal with it as if there was - * no locker at all in the first place. - */ - old_infomask |= HEAP_XMAX_INVALID; - goto l5; - } - - *result_infomask = new_infomask; - *result_infomask2 = new_infomask2; - *result_xmax = new_xmax; -} - -/* - * Subroutine for heap_lock_updated_tuple_rec. - * - * Given a hypothetical multixact status held by the transaction identified - * with the given xid, does the current transaction need to wait, fail, or can - * it continue if it wanted to acquire a lock of the given mode? "needwait" - * is set to true if waiting is necessary; if it can continue, then - * HeapTupleMayBeUpdated is returned. If the lock is already held by the - * current transaction, return HeapTupleSelfUpdated. In case of a conflict - * with another transaction, a different HeapTupleSatisfiesUpdate return code - * is returned. - * - * The held status is said to be hypothetical because it might correspond to a - * lock held by a single Xid, i.e. not a real MultiXactId; we express it this - * way for simplicity of API. 
- */ -static HTSU_Result -test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, - LockTupleMode mode, bool *needwait) -{ - MultiXactStatus wantedstatus; - - *needwait = false; - wantedstatus = get_mxact_status_for_lock(mode, false); - - /* - * Note: we *must* check TransactionIdIsInProgress before - * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an - * explanation. - */ - if (TransactionIdIsCurrentTransactionId(xid)) - { - /* - * The tuple has already been locked by our own transaction. This is - * very rare but can happen if multiple transactions are trying to - * lock an ancient version of the same tuple. - */ - return HeapTupleSelfUpdated; - } - else if (TransactionIdIsInProgress(xid)) - { - /* - * If the locking transaction is running, what we do depends on - * whether the lock modes conflict: if they do, then we must wait for - * it to finish; otherwise we can fall through to lock this tuple - * version without waiting. - */ - if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), - LOCKMODE_from_mxstatus(wantedstatus))) - { - *needwait = true; - } - - /* - * If we set needwait above, then this value doesn't matter; - * otherwise, this value signals to caller that it's okay to proceed. - */ - return HeapTupleMayBeUpdated; - } - else if (TransactionIdDidAbort(xid)) - return HeapTupleMayBeUpdated; - else if (TransactionIdDidCommit(xid)) - { - /* - * The other transaction committed. If it was only a locker, then the - * lock is completely gone now and we can return success; but if it - * was an update, then what we do depends on whether the two lock - * modes conflict. If they conflict, then we must report error to - * caller. But if they don't, we can fall through to allow the current - * transaction to lock the tuple. - * - * Note: the reason we worry about ISUPDATE here is because as soon as - * a transaction ends, all its locks are gone and meaningless, and - * thus we can ignore them; whereas its updates persist. In the - * TransactionIdIsInProgress case, above, we don't need to check - * because we know the lock is still "alive" and thus a conflict needs - * always be checked. - */ - if (!ISUPDATE_from_mxstatus(status)) - return HeapTupleMayBeUpdated; - - if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), - LOCKMODE_from_mxstatus(wantedstatus))) - /* bummer */ - return HeapTupleUpdated; - - return HeapTupleMayBeUpdated; - } - - /* Not in progress, not aborted, not committed -- must have crashed */ - return HeapTupleMayBeUpdated; -} - - -/* - * Recursive part of heap_lock_updated_tuple - * - * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given - * xid with the given mode; if this tuple is updated, recurse to lock the new - * version as well. 
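/*
 * Illustrative sketch (not part of the patch): the decision pattern a
 * caller of test_lockmode_for_conflict() follows, mirroring the loop in
 * heap_lock_updated_tuple_rec().  The enum and helper name are invented
 * for the example; the real caller waits via XactLockTableWait() and then
 * re-examines the tuple.
 */
typedef enum
{
    EXAMPLE_PROCEED,            /* lock this tuple version */
    EXAMPLE_SKIP,               /* we already hold the lock; move on */
    EXAMPLE_RETRY_AFTER_WAIT,   /* sleep on the xid, then re-examine */
    EXAMPLE_GIVE_UP             /* a committed, conflicting update */
} ExampleAction;

static ExampleAction
example_check_locker(MultiXactStatus held, TransactionId xid,
                     LockTupleMode mode)
{
    bool        needwait;
    HTSU_Result res;

    res = test_lockmode_for_conflict(held, xid, mode, &needwait);

    if (res == HeapTupleSelfUpdated)
        return EXAMPLE_SKIP;
    if (needwait)
        return EXAMPLE_RETRY_AFTER_WAIT;
    if (res != HeapTupleMayBeUpdated)
        return EXAMPLE_GIVE_UP;
    return EXAMPLE_PROCEED;
}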
- */ -static HTSU_Result -heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, - LockTupleMode mode) -{ - HTSU_Result result; - ItemPointerData tupid; - HeapTupleData mytup; - Buffer buf; - uint16 new_infomask, - new_infomask2, - old_infomask, - old_infomask2; - TransactionId xmax, - new_xmax; - TransactionId priorXmax = InvalidTransactionId; - bool cleared_all_frozen = false; - Buffer vmbuffer = InvalidBuffer; - BlockNumber block; - - ItemPointerCopy(tid, &tupid); - - for (;;) - { - new_infomask = 0; - new_xmax = InvalidTransactionId; - block = ItemPointerGetBlockNumber(&tupid); - ItemPointerCopy(&tupid, &(mytup.t_self)); - - if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL)) - { - /* - * if we fail to find the updated version of the tuple, it's - * because it was vacuumed/pruned away after its creator - * transaction aborted. So behave as if we got to the end of the - * chain, and there's no further tuple to lock: return success to - * caller. - */ - return HeapTupleMayBeUpdated; - } - -l4: - CHECK_FOR_INTERRUPTS(); - - /* - * Before locking the buffer, pin the visibility map page if it - * appears to be necessary. Since we haven't got the lock yet, - * someone else might be in the middle of changing this, so we'll need - * to recheck after we have the lock. - */ - if (PageIsAllVisible(BufferGetPage(buf))) - visibilitymap_pin(rel, block, &vmbuffer); - else - vmbuffer = InvalidBuffer; - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - - /* - * If we didn't pin the visibility map page and the page has become - * all visible while we were busy locking the buffer, we'll have to - * unlock and re-lock, to avoid holding the buffer lock across I/O. - * That's a bit unfortunate, but hopefully shouldn't happen often. - */ - if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf))) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - visibilitymap_pin(rel, block, &vmbuffer); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - } - - /* - * Check the tuple XMIN against prior XMAX, if any. If we reached the - * end of the chain, we're done, so return success. - */ - if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), - priorXmax)) - { - result = HeapTupleMayBeUpdated; - goto out_locked; - } - - /* - * Also check Xmin: if this tuple was created by an aborted - * (sub)transaction, then we already locked the last live one in the - * chain, thus we're done, so return success. - */ - if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) - { - UnlockReleaseBuffer(buf); - return HeapTupleMayBeUpdated; - } - - old_infomask = mytup.t_data->t_infomask; - old_infomask2 = mytup.t_data->t_infomask2; - xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); - - /* - * If this tuple version has been updated or locked by some concurrent - * transaction(s), what we do depends on whether our lock mode - * conflicts with what those other transactions hold, and also on the - * status of them. - */ - if (!(old_infomask & HEAP_XMAX_INVALID)) - { - TransactionId rawxmax; - bool needwait; - - rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); - if (old_infomask & HEAP_XMAX_IS_MULTI) - { - int nmembers; - int i; - MultiXactMember *members; - - /* - * We don't need a test for pg_upgrade'd tuples: this is only - * applied to tuples after the first in an update chain. 
Said - * first tuple in the chain may well be locked-in-9.2-and- - * pg_upgraded, but that one was already locked by our caller, - * not us; and any subsequent ones cannot be because our - * caller must necessarily have obtained a snapshot later than - * the pg_upgrade itself. - */ - Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); - - nmembers = GetMultiXactIdMembers(rawxmax, &members, false, - HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); - for (i = 0; i < nmembers; i++) - { - result = test_lockmode_for_conflict(members[i].status, - members[i].xid, - mode, &needwait); - - /* - * If the tuple was already locked by ourselves in a - * previous iteration of this (say heap_lock_tuple was - * forced to restart the locking loop because of a change - * in xmax), then we hold the lock already on this tuple - * version and we don't need to do anything; and this is - * not an error condition either. We just need to skip - * this tuple and continue locking the next version in the - * update chain. - */ - if (result == HeapTupleSelfUpdated) - { - pfree(members); - goto next; - } - - if (needwait) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(members[i].xid, rel, - &mytup.t_self, - XLTW_LockUpdated); - pfree(members); - goto l4; - } - if (result != HeapTupleMayBeUpdated) - { - pfree(members); - goto out_locked; - } - } - if (members) - pfree(members); - } - else - { - MultiXactStatus status; - - /* - * For a non-multi Xmax, we first need to compute the - * corresponding MultiXactStatus by using the infomask bits. - */ - if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) - { - if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) - status = MultiXactStatusForKeyShare; - else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) - status = MultiXactStatusForShare; - else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) - { - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusForUpdate; - else - status = MultiXactStatusForNoKeyUpdate; - } - else - { - /* - * LOCK_ONLY present alone (a pg_upgraded tuple marked - * as share-locked in the old cluster) shouldn't be - * seen in the middle of an update chain. - */ - elog(ERROR, "invalid lock status in tuple"); - } - } - else - { - /* it's an update, but which kind? */ - if (old_infomask2 & HEAP_KEYS_UPDATED) - status = MultiXactStatusUpdate; - else - status = MultiXactStatusNoKeyUpdate; - } - - result = test_lockmode_for_conflict(status, rawxmax, mode, - &needwait); - - /* - * If the tuple was already locked by ourselves in a previous - * iteration of this (say heap_lock_tuple was forced to - * restart the locking loop because of a change in xmax), then - * we hold the lock already on this tuple version and we don't - * need to do anything; and this is not an error condition - * either. We just need to skip this tuple and continue - * locking the next version in the update chain. - */ - if (result == HeapTupleSelfUpdated) - goto next; - - if (needwait) - { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - XactLockTableWait(rawxmax, rel, &mytup.t_self, - XLTW_LockUpdated); - goto l4; - } - if (result != HeapTupleMayBeUpdated) - { - goto out_locked; - } - } - } - - /* compute the new Xmax and infomask values for the tuple ... */ - compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, - xid, mode, false, - &new_xmax, &new_infomask, &new_infomask2); - - if (PageIsAllVisible(BufferGetPage(buf)) && - visibilitymap_clear(rel, block, vmbuffer, - VISIBILITYMAP_ALL_FROZEN)) - cleared_all_frozen = true; - - START_CRIT_SECTION(); - - /* ... 
and set them */ - HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); - mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; - mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; - mytup.t_data->t_infomask |= new_infomask; - mytup.t_data->t_infomask2 |= new_infomask2; - - MarkBufferDirty(buf); - - /* XLOG stuff */ - if (RelationNeedsWAL(rel)) - { - xl_heap_lock_updated xlrec; - XLogRecPtr recptr; - Page page = BufferGetPage(buf); - - XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - - xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); - xlrec.xmax = new_xmax; - xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); - xlrec.flags = - cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - - XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - -next: - /* if we find the end of update chain, we're done. */ - if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || - ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || - HeapTupleHeaderIsOnlyLocked(mytup.t_data)) - { - result = HeapTupleMayBeUpdated; - goto out_locked; - } - - /* tail recursion */ - priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); - ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); - UnlockReleaseBuffer(buf); - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - } - - result = HeapTupleMayBeUpdated; - -out_locked: - UnlockReleaseBuffer(buf); - - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); - - return result; - -} - -/* - * heap_lock_updated_tuple - * Follow update chain when locking an updated tuple, acquiring locks (row - * marks) on the updated versions. - * - * The initial tuple is assumed to be already locked. - * - * This function doesn't check visibility, it just unconditionally marks the - * tuple(s) as locked. If any tuple in the updated chain is being deleted - * concurrently (or updated with the key being modified), sleep until the - * transaction doing it is finished. - * - * Note that we don't acquire heavyweight tuple locks on the tuples we walk - * when we have to wait for other transactions to release them, as opposed to - * what heap_lock_tuple does. The reason is that having more than one - * transaction walking the chain is probably uncommon enough that risk of - * starvation is not likely: one of the preconditions for being here is that - * the snapshot in use predates the update that created this tuple (because we - * started at an earlier version of the tuple), but at the same time such a - * transaction cannot be using repeatable read or serializable isolation - * levels, because that would lead to a serializability failure. - */ -static HTSU_Result -heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, - TransactionId xid, LockTupleMode mode) -{ - if (!ItemPointerEquals(&tuple->t_self, ctid)) - { - /* - * If this is the first possibly-multixact-able operation in the - * current transaction, set my per-backend OldestMemberMXactId - * setting. We can be certain that the transaction will never become a - * member of any older MultiXactIds than that. (We have to do this - * even if we end up just using our own TransactionId below, since - * some other backend could incorporate our XID into a MultiXact - * immediately afterwards.) 
- */ - MultiXactIdSetOldestMember(); - - return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); - } - - /* nothing to lock */ - return HeapTupleMayBeUpdated; -} - -/* - * heap_finish_speculative - mark speculative insertion as successful - * - * To successfully finish a speculative insertion we have to clear speculative - * token from tuple. To do so the t_ctid field, which will contain a - * speculative token value, is modified in place to point to the tuple itself, - * which is characteristic of a newly inserted ordinary tuple. - * - * NB: It is not ok to commit without either finishing or aborting a - * speculative insertion. We could treat speculative tuples of committed - * transactions implicitly as completed, but then we would have to be prepared - * to deal with speculative tokens on committed tuples. That wouldn't be - * difficult - no-one looks at the ctid field of a tuple with invalid xmax - - * but clearing the token at completion isn't very expensive either. - * An explicit confirmation WAL record also makes logical decoding simpler. - */ -void -heap_finish_speculative(Relation relation, HeapTuple tuple) -{ - Buffer buffer; - Page page; - OffsetNumber offnum; - ItemId lp = NULL; - HeapTupleHeader htup; - - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); - - offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); - - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) - elog(ERROR, "invalid lp"); - - htup = (HeapTupleHeader) PageGetItem(page, lp); - - /* SpecTokenOffsetNumber should be distinguishable from any real offset */ - StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber, - "invalid speculative token constant"); - - /* NO EREPORT(ERROR) from here till changes are logged */ - START_CRIT_SECTION(); - - Assert(HeapTupleHeaderIsSpeculative(tuple->t_data)); - - MarkBufferDirty(buffer); - - /* - * Replace the speculative insertion token with a real t_ctid, pointing to - * itself like it does on regular tuples. - */ - htup->t_ctid = tuple->t_self; - - /* XLOG stuff */ - if (RelationNeedsWAL(relation)) - { - xl_heap_confirm xlrec; - XLogRecPtr recptr; - - xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); - - XLogBeginInsert(); - - /* We want the same filtering on this as on a plain insert */ - XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - - XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); -} - -/* - * heap_abort_speculative - kill a speculatively inserted tuple - * - * Marks a tuple that was speculatively inserted in the same command as dead, - * by setting its xmin as invalid. That makes it immediately appear as dead - * to all transactions, including our own. In particular, it makes - * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend - * inserting a duplicate key value won't unnecessarily wait for our whole - * transaction to finish (it'll just wait for our speculative insertion to - * finish). - * - * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks - * that arise due to a mutual dependency that is not user visible. 
By - * definition, unprincipled deadlocks cannot be prevented by the user - * reordering lock acquisition in client code, because the implementation level - * lock acquisitions are not under the user's direct control. If speculative - * inserters did not take this precaution, then under high concurrency they - * could deadlock with each other, which would not be acceptable. - * - * This is somewhat redundant with heap_delete, but we prefer to have a - * dedicated routine with stripped down requirements. Note that this is also - * used to delete the TOAST tuples created during speculative insertion. - * - * This routine does not affect logical decoding as it only looks at - * confirmation records. - */ -void -heap_abort_speculative(Relation relation, HeapTuple tuple) -{ - TransactionId xid = GetCurrentTransactionId(); - ItemPointer tid = &(tuple->t_self); - ItemId lp; - HeapTupleData tp; - Page page; - BlockNumber block; - Buffer buffer; - - Assert(ItemPointerIsValid(tid)); - - block = ItemPointerGetBlockNumber(tid); - buffer = ReadBuffer(relation, block); - page = BufferGetPage(buffer); - - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Page can't be all visible, we just inserted into it, and are still - * running. - */ - Assert(!PageIsAllVisible(page)); - - lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); - Assert(ItemIdIsNormal(lp)); - - tp.t_tableOid = RelationGetRelid(relation); - tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_self = *tid; - - /* - * Sanity check that the tuple really is a speculatively inserted tuple, - * inserted by us. - */ - if (tp.t_data->t_choice.t_heap.t_xmin != xid) - elog(ERROR, "attempted to kill a tuple inserted by another transaction"); - if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) - elog(ERROR, "attempted to kill a non-speculative tuple"); - Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + case LockTupleExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + new_infomask2 |= HEAP_KEYS_UPDATED; + break; + default: + new_xmax = InvalidTransactionId; /* silence compiler */ + elog(ERROR, "invalid lock mode"); + } + } + } + else if (old_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus new_status; - /* - * No need to check for serializable conflicts here. There is never a - * need for a combocid, either. No need to extract replica identity, or - * do anything special with infomask bits. - */ + /* + * Currently we don't allow XMAX_COMMITTED to be set for multis, so + * cross-check. + */ + Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); - START_CRIT_SECTION(); + /* + * A multixact together with LOCK_ONLY set but neither lock bit set + * (i.e. a pg_upgraded share locked tuple) cannot possibly be running + * anymore. This check is critical for databases upgraded by + * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume + * that such multis are never passed. + */ + if (HEAP_LOCKED_UPGRADED(old_infomask)) + { + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } - /* - * The tuple will become DEAD immediately. Flag that this page - * immediately is a candidate for pruning by setting xmin to - * RecentGlobalXmin. That's not pretty, but it doesn't seem worth - * inventing a nicer API for this. 
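/*
 * Illustrative sketch (not part of the patch): the speculative-insertion
 * lifecycle that heap_finish_speculative()/heap_abort_speculative()
 * implement (the pre-patch entry points being removed here), roughly what
 * INSERT ... ON CONFLICT drives through the executor.  Token acquisition
 * and the conflict probe are elided; the helper name is invented.
 */
static void
example_speculative_insert(Relation rel, HeapTuple tup, uint32 spec_token,
                           bool conflicts)
{
    HeapTupleHeaderSetSpeculativeToken(tup->t_data, spec_token);

    heap_insert(rel, tup, GetCurrentCommandId(true),
                HEAP_INSERT_SPECULATIVE, NULL);

    if (!conflicts)
        heap_finish_speculative(rel, tup);  /* t_ctid now points to itself */
    else
        heap_abort_speculative(rel, tup);   /* tuple becomes dead at once */
}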
- */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - PageSetPrunable(page, RecentGlobalXmin); + /* + * If the XMAX is already a MultiXactId, then we need to expand it to + * include add_to_xmax; but if all the members were lockers and are + * all gone, we can do away with the IS_MULTI bit and just set + * add_to_xmax as the only locker/updater. If all lockers are gone + * and we have an updater that aborted, we can also do without a + * multi. + * + * The cost of doing GetMultiXactIdMembers would be paid by + * MultiXactIdExpand if we weren't to do this, so this check is not + * incurring extra work anyhow. + */ + if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || + !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, + old_infomask))) + { + /* + * Reset these bits and restart; otherwise fall through to + * create a new multi below. + */ + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } - /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); - tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + new_status = get_mxact_status_for_lock(mode, is_update); - /* - * Set the tuple header xmin to InvalidTransactionId. This makes the - * tuple immediately invisible everyone. (In particular, to any - * transactions waiting on the speculative token, woken up later.) - */ - HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); + new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, + new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (old_infomask & HEAP_XMAX_COMMITTED) + { + /* + * It's a committed update, so we need to preserve him as updater of + * the tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; - /* Clear the speculative insertion token too */ - tp.t_data->t_ctid = tp.t_self; + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; - MarkBufferDirty(buffer); + new_status = get_mxact_status_for_lock(mode, is_update); - /* - * XLOG stuff - * - * The WAL records generated here match heap_delete(). The same recovery - * routines are used. - */ - if (RelationNeedsWAL(relation)) + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (TransactionIdIsInProgress(xmax)) { - xl_heap_delete xlrec; - XLogRecPtr recptr; + /* + * If the XMAX is a valid, in-progress TransactionId, then we need to + * create a new MultiXactId that includes both the old locker or + * updater and our own TransactionId. 
+ */ + MultiXactStatus new_status; + MultiXactStatus old_status; + LockTupleMode old_mode; - xlrec.flags = XLH_DELETE_IS_SUPER; - xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, - tp.t_data->t_infomask2); - xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); - xlrec.xmax = xid; + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusForUpdate; + else + old_status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY can be present alone only when a page has been + * upgraded by pg_upgrade. But in that case, + * TransactionIdIsInProgress() should have returned false. We + * assume it's no longer locked in this case. + */ + elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); + old_infomask |= HEAP_XMAX_INVALID; + old_infomask &= ~HEAP_XMAX_LOCK_ONLY; + goto l5; + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + old_status = MultiXactStatusUpdate; + else + old_status = MultiXactStatusNoKeyUpdate; + } - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); - XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + old_mode = TUPLOCK_from_mxstatus(old_status); + + /* + * If the lock to be acquired is for the same TransactionId as the + * existing lock, there's an optimization possible: consider only the + * strongest of both locks as the only one present, and restart. + */ + if (xmax == add_to_xmax) + { + /* + * Note that it's not possible for the original tuple to be + * updated: we wouldn't be here because the tuple would have been + * invisible and we wouldn't try to update it. As a subtlety, + * this code can also run when traversing an update chain to lock + * future versions of a tuple. But we wouldn't be here either, + * because the add_to_xmax would be different from the original + * updater. + */ + Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); - /* No replica identity & replication origin logged */ + /* acquire the strongest of both */ + if (mode < old_mode) + mode = old_mode; + /* mustn't touch is_update */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } - PageSetLSN(page, recptr); + /* otherwise, just fall back to creating a new multixact */ + new_status = get_mxact_status_for_lock(mode, is_update); + new_xmax = MultiXactIdCreate(xmax, old_status, + add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); } + else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && + TransactionIdDidCommit(xmax)) + { + /* + * It's a committed update, so we gotta preserve him as updater of the + * tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; - END_CRIT_SECTION(); + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + new_status = get_mxact_status_for_lock(mode, is_update); - if (HeapTupleHasExternal(&tp)) + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. 
+ */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else { - Assert(!IsToastRelation(relation)); - toast_delete(relation, &tp, true); + /* + * Can get here iff the locking/updating transaction was running when + * the infomask was extracted from the tuple, but finished before + * TransactionIdIsInProgress got to run. Deal with it as if there was + * no locker at all in the first place. + */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; } - /* - * Never need to mark tuple for invalidation, since catalogs don't support - * speculative insertion - */ + *result_infomask = new_infomask; + *result_infomask2 = new_infomask2; + *result_xmax = new_xmax; +} - /* Now we can release the buffer */ - ReleaseBuffer(buffer); - /* count deletion, as we counted the insertion too */ - pgstat_count_heap_delete(relation); -} /* * heap_inplace_update - update a tuple "in place" (ie, overwrite it) @@ -6993,7 +4942,7 @@ HeapTupleGetUpdateXid(HeapTupleHeader tuple) * * The passed infomask pairs up with the given multixact in the tuple header. */ -static bool +bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode) { @@ -7160,7 +5109,7 @@ Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, * We return (in *remaining, if not NULL) the number of members that are still * running, including any (non-aborted) subtransactions of our own transaction. */ -static void +void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, ItemPointer ctid, XLTW_Oper oper, int *remaining) @@ -7182,7 +5131,7 @@ MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, * We return (in *remaining, if not NULL) the number of members that are still * running, including any (non-aborted) subtransactions of our own transaction. */ -static bool +bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining) { @@ -7744,7 +5693,7 @@ log_heap_update(Relation reln, Buffer oldbuf, * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog * tuples. */ -static XLogRecPtr +XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup) { xl_heap_new_cid xlrec; @@ -9120,46 +7069,6 @@ heap2_redo(XLogReaderState *record) } } -/* - * heap_sync - sync a heap, for use when no WAL has been written - * - * This forces the heap contents (including TOAST heap if any) down to disk. - * If we skipped using WAL, and WAL is otherwise needed, we must force the - * relation down to disk before it's safe to commit the transaction. This - * requires writing out any dirty buffers and then doing a forced fsync. - * - * Indexes are not touched. (Currently, index operations associated with - * the commands that use this are WAL-logged and so do not need fsync. - * That behavior might change someday, but in any case it's likely that - * any fsync decisions required would be per-index and hence not appropriate - * to be done here.) 
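/*
 * Illustrative sketch (not part of the patch): the pattern bulk loaders
 * such as COPY use with heap_sync() when WAL logging was skipped.
 * hi_options is assumed to carry the heap_insert options used for the
 * load; the helper name is invented.
 */
static void
example_end_of_bulk_load(Relation rel, int hi_options)
{
    if (hi_options & HEAP_INSERT_SKIP_WAL)
        heap_sync(rel);     /* force heap (and TOAST) to disk before commit */
}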
- */ -void -heap_sync(Relation rel) -{ - /* non-WAL-logged tables never need fsync */ - if (!RelationNeedsWAL(rel)) - return; - - /* main heap */ - FlushRelationBuffers(rel); - /* FlushRelationBuffers will have opened rd_smgr */ - smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM); - - /* FSM is not critical, don't bother syncing it */ - - /* toast heap, if any */ - if (OidIsValid(rel->rd_rel->reltoastrelid)) - { - Relation toastrel; - - toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock); - FlushRelationBuffers(toastrel); - smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM); - heap_close(toastrel, AccessShareLock); - } -} - /* * Mask a heap page before performing consistency checks on it. */ diff --git a/src/backend/access/heap/heapam_storage.c b/src/backend/access/heap/heapam_storage.c index 7d7ac759e3..a0e3272f67 100644 --- a/src/backend/access/heap/heapam_storage.c +++ b/src/backend/access/heap/heapam_storage.c @@ -50,7 +50,13 @@ /* local functions */ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); - +static bool heap_fetch(Relation relation, + ItemPointer tid, + Snapshot snapshot, + HeapTuple tuple, + Buffer *userbuf, + bool keep_buf, + Relation stats_relation); /*------------------------------------------------------------------------- * * POSTGRES "time qualification" code, ie, tuple visibility rules. @@ -1660,10 +1666,2149 @@ HeapTupleSatisfiesHistoricMVCC(StorageTuple stup, Snapshot snapshot, return true; } -Datum -heapam_storage_handler(PG_FUNCTION_ARGS) + +/* + * heap_fetch - retrieve tuple with given tid + * + * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding + * the tuple, fill in the remaining fields of *tuple, and check the tuple + * against the specified snapshot. + * + * If successful (tuple found and passes snapshot time qual), then *userbuf + * is set to the buffer holding the tuple and TRUE is returned. The caller + * must unpin the buffer when done with the tuple. + * + * If the tuple is not found (ie, item number references a deleted slot), + * then tuple->t_data is set to NULL and FALSE is returned. + * + * If the tuple is found but fails the time qual check, then FALSE is returned + * but tuple->t_data is left pointing to the tuple. + * + * keep_buf determines what is done with the buffer in the FALSE-result cases. + * When the caller specifies keep_buf = true, we retain the pin on the buffer + * and return it in *userbuf (so the caller must eventually unpin it); when + * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer. + * + * stats_relation is the relation to charge the heap_fetch operation against + * for statistical purposes. (This could be the heap rel itself, an + * associated index, or NULL to not count the fetch at all.) + * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * + * It is somewhat inconsistent that we ereport() on invalid block number but + * return false on invalid item number. There are a couple of reasons though. + * One is that the caller can relatively easily check the block number for + * validity, but cannot check the item number without reading the page + * himself. Another is that when we are following a t_ctid link, we can be + * reasonably confident that the page number is valid (since VACUUM shouldn't + * truncate off the destination page without having killed the referencing + * tuple first), but the item number might well not be good. 
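/*
 * Illustrative sketch (not part of the patch): the caller-side protocol
 * described above.  With keep_buf = false the failure paths leave
 * *userbuf set to InvalidBuffer, so only the success path releases the
 * pin.  The helper name is invented; GetActiveSnapshot() is just one
 * possible snapshot source.
 */
static bool
example_fetch_and_inspect(Relation rel, ItemPointer tid)
{
    HeapTupleData tuple;
    Buffer      buf;

    if (!heap_fetch(rel, tid, GetActiveSnapshot(), &tuple, &buf,
                    false /* keep_buf */ , NULL /* stats_relation */ ))
        return false;           /* missing, or failed the snapshot test */

    /* ... inspect tuple.t_data while the buffer pin is held ... */

    ReleaseBuffer(buf);
    return true;
}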
+ */ +static bool +heap_fetch(Relation relation, + ItemPointer tid, + Snapshot snapshot, + HeapTuple tuple, + Buffer *userbuf, + bool keep_buf, + Relation stats_relation) { - StorageAmRoutine *amroutine = makeNode(StorageAmRoutine); + ItemId lp; + Buffer buffer; + Page page; + OffsetNumber offnum; + bool valid; + + /* + * Fetch and pin the appropriate page of the relation. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + + /* + * Need share lock on buffer to examine tuple commit status. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * We'd better check for out-of-range offnum in case of VACUUM since the + * TID was obtained. + */ + offnum = ItemPointerGetOffsetNumber(tid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + if (keep_buf) + *userbuf = buffer; + else + { + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + } + return false; + } + + /* + * get the item line pointer corresponding to the requested tid + */ + lp = PageGetItemId(page, offnum); + + /* + * Must check for deleted tuple. + */ + if (!ItemIdIsNormal(lp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + if (keep_buf) + *userbuf = buffer; + else + { + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + } + return false; + } + + /* + * fill in tuple fields and place it in stuple + */ + ItemPointerCopy(tid, &(tuple->t_self)); + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); + + /* + * check time qualification of tuple, then release lock + */ + valid = HeapTupleSatisfiesVisibility(relation->rd_stamroutine, tuple, snapshot, buffer); + + if (valid) + PredicateLockTuple(relation, tuple, snapshot); + + CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (valid) + { + /* + * All checks passed, so return the tuple as valid. Caller is now + * responsible for releasing the buffer. + */ + *userbuf = buffer; + + /* Count the successful fetch against appropriate rel, if any */ + if (stats_relation != NULL) + pgstat_count_heap_fetch(stats_relation); + + return true; + } + + /* Tuple failed time qual, but maybe caller wants to see it anyway. */ + if (keep_buf) + *userbuf = buffer; + else + { + ReleaseBuffer(buffer); + *userbuf = InvalidBuffer; + } + + return false; +} + + + + + +/* ---------------------------------------------------------------- + * storage AM support routines for heapam + * ---------------------------------------------------------------- + */ + +static bool +heapam_fetch(Relation relation, + ItemPointer tid, + Snapshot snapshot, + StorageTuple *stuple, + Buffer *userbuf, + bool keep_buf, + Relation stats_relation) +{ + HeapTupleData tuple; + + *stuple = NULL; + if (heap_fetch(relation, tid, snapshot, &tuple, userbuf, keep_buf, stats_relation)) + { + *stuple = heap_copytuple(&tuple); + return true; + } + + return false; +} + +/* + * Insert a heap tuple from a slot, which may contain an OID and speculative + * insertion token. 
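/*
 * Illustrative sketch (not part of the patch): inserting through this
 * slot-based entry point with a "virtual" slot, so that the values/isnull
 * arrays drive the heap_form_tuple() path below.  In the patch the
 * function is reached via the storage AM routine table rather than called
 * directly; the direct call and the helper name are for illustration only.
 */
static void
example_insert_one_row(Relation rel, TupleDesc tupdesc,
                       Datum *values, bool *isnull)
{
    TupleTableSlot *slot = MakeSingleTupleTableSlot(tupdesc);

    ExecClearTuple(slot);
    memcpy(slot->tts_values, values, tupdesc->natts * sizeof(Datum));
    memcpy(slot->tts_isnull, isnull, tupdesc->natts * sizeof(bool));
    ExecStoreVirtualTuple(slot);

    (void) heapam_heap_insert(rel, slot, GetCurrentCommandId(true),
                              0 /* options */ , NULL /* bistate */ );

    ExecDropSingleTupleTableSlot(slot);
}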
+ */ +static Oid +heapam_heap_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate) +{ + Oid oid; + HeapTuple tuple = NULL; + + if (slot->tts_storage) + { + HeapamTuple *htuple = slot->tts_storage; + tuple = htuple->hst_heaptuple; + + if (relation->rd_rel->relhasoids) + HeapTupleSetOid(tuple, InvalidOid); + } + else + { + /* + * Obtain the physical tuple to insert, building from the slot values. + * XXX: maybe the slot already contains a physical tuple in the right + * format? In fact, if the slot isn't fully deformed, this is completely + * bogus ... + */ + tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + + /* Set the OID, if the slot has one */ + if (slot->tts_tupleOid != InvalidOid) + HeapTupleHeaderSetOid(tuple->t_data, slot->tts_tupleOid); + + /* Update the tuple with table oid */ + if (slot->tts_tableOid != InvalidOid) + tuple->t_tableOid = slot->tts_tableOid; + + /* Set the speculative insertion token, if the slot has one */ + if ((options & HEAP_INSERT_SPECULATIVE) && slot->tts_speculativeToken) + HeapTupleHeaderSetSpeculativeToken(tuple->t_data, slot->tts_speculativeToken); + + /* Perform the insertion, and copy the resulting ItemPointer */ + oid = heap_insert(relation, tuple, cid, options, bistate); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (slot->tts_storage == NULL) + ExecStoreTuple(tuple, slot, InvalidBuffer, true); + + return oid; +} + +static HTSU_Result +heapam_heap_delete(Relation relation, ItemPointer tid, CommandId cid, + Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd) +{ + return heap_delete(relation, tid, cid, crosscheck, wait, hufd); +} + +static HTSU_Result +heapam_heap_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode) +{ + HeapTuple tuple; + HTSU_Result result; + + if (slot->tts_storage) + { + HeapamTuple *htuple = slot->tts_storage; + tuple = htuple->hst_heaptuple; + } + else + { + tuple = heap_form_tuple(slot->tts_tupleDescriptor, + slot->tts_values, + slot->tts_isnull); + } + + /* Set the OID, if the slot has one */ + if (slot->tts_tupleOid != InvalidOid) + HeapTupleHeaderSetOid(tuple->t_data, slot->tts_tupleOid); + + /* Update the tuple with table oid */ + if (slot->tts_tableOid != InvalidOid) + tuple->t_tableOid = slot->tts_tableOid; + + result = heap_update(relation, otid, tuple, cid, crosscheck, wait, + hufd, lockmode); + ItemPointerCopy(&tuple->t_self, &slot->tts_tid); + + if (slot->tts_storage == NULL) + ExecStoreTuple(tuple, slot, InvalidBuffer, true); + + return result; +} + +static void +heapam_finish_speculative(Relation relation, TupleTableSlot *slot) +{ + HeapamTuple *stuple = (HeapamTuple *) slot->tts_storage; + HeapTuple tuple = stuple->hst_heaptuple; + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + Assert(slot->tts_speculativeToken != 0); + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + offnum = ItemPointerGetOffsetNumber(&(tuple->t_self)); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(ERROR, "invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* SpecTokenOffsetNumber should be distinguishable from any 
real offset */ + StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber, + "invalid speculative token constant"); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + Assert(HeapTupleHeaderIsSpeculative(tuple->t_data)); + + MarkBufferDirty(buffer); + + /* + * Replace the speculative insertion token with a real t_ctid, pointing to + * itself like it does on regular tuples. + */ + htup->t_ctid = tuple->t_self; + + /* XLOG stuff */ + if (RelationNeedsWAL(relation)) + { + xl_heap_confirm xlrec; + XLogRecPtr recptr; + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); + + XLogBeginInsert(); + + /* We want the same filtering on this as on a plain insert */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + slot->tts_speculativeToken = 0; +} + +static void +heapam_abort_speculative(Relation relation, TupleTableSlot *slot) +{ + HeapamTuple *stuple = (HeapamTuple *) slot->tts_storage; + HeapTuple tuple = stuple->hst_heaptuple; + TransactionId xid = GetCurrentTransactionId(); + ItemPointer tid = &(tuple->t_self); + ItemId lp; + HeapTupleData tp; + Page page; + BlockNumber block; + Buffer buffer; + + /*Assert(slot->tts_speculativeToken != 0); This needs some update in toast */ + Assert(ItemPointerIsValid(tid)); + + block = ItemPointerGetBlockNumber(tid); + buffer = ReadBuffer(relation, block); + page = BufferGetPage(buffer); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Page can't be all visible, we just inserted into it, and are still + * running. + */ + Assert(!PageIsAllVisible(page)); + + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tp.t_tableOid = RelationGetRelid(relation); + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_self = *tid; + + /* + * Sanity check that the tuple really is a speculatively inserted tuple, + * inserted by us. + */ + if (tp.t_data->t_choice.t_heap.t_xmin != xid) + elog(ERROR, "attempted to kill a tuple inserted by another transaction"); + if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) + elog(ERROR, "attempted to kill a non-speculative tuple"); + Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + + /* + * No need to check for serializable conflicts here. There is never a + * need for a combocid, either. No need to extract replica identity, or + * do anything special with infomask bits. + */ + + START_CRIT_SECTION(); + + /* + * The tuple will become DEAD immediately. Flag that this page + * immediately is a candidate for pruning by setting xmin to + * RecentGlobalXmin. That's not pretty, but it doesn't seem worth + * inventing a nicer API for this. + */ + Assert(TransactionIdIsValid(RecentGlobalXmin)); + PageSetPrunable(page, RecentGlobalXmin); + + /* store transaction information of xact deleting the tuple */ + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + + /* + * Set the tuple header xmin to InvalidTransactionId. This makes the + * tuple immediately invisible everyone. (In particular, to any + * transactions waiting on the speculative token, woken up later.) 
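/*
 * Illustrative sketch (not part of the patch): handling the HTSU_Result
 * codes returned by heapam_heap_delete() (heapam_heap_update() follows
 * the same pattern).  In the patch these callbacks are reached through
 * the storage AM dispatch layer; the direct call and the helper name are
 * for illustration only.
 */
static void
example_delete_row(Relation rel, ItemPointer tid)
{
    HeapUpdateFailureData hufd;
    HTSU_Result result;

    result = heapam_heap_delete(rel, tid, GetCurrentCommandId(true),
                                InvalidSnapshot, true /* wait */ , &hufd);

    switch (result)
    {
        case HeapTupleMayBeUpdated:
            break;              /* deleted */
        case HeapTupleSelfUpdated:
            break;              /* already touched by this command; skip */
        case HeapTupleUpdated:
            /* concurrently updated; hufd.ctid points at the newer version */
            break;
        default:
            elog(ERROR, "unexpected heap_delete result: %d", result);
            break;
    }
}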
+ */ + HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); + + /* Clear the speculative insertion token too */ + tp.t_data->t_ctid = tp.t_self; + + MarkBufferDirty(buffer); + + /* + * XLOG stuff + * + * The WAL records generated here match heap_delete(). The same recovery + * routines are used. + */ + if (RelationNeedsWAL(relation)) + { + xl_heap_delete xlrec; + XLogRecPtr recptr; + + xlrec.flags = XLH_DELETE_IS_SUPER; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = xid; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* No replica identity & replication origin logged */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + if (HeapTupleHasExternal(&tp)) + { + Assert(!IsToastRelation(relation)); + toast_delete(relation, &tp, true); + } + + /* + * Never need to mark tuple for invalidation, since catalogs don't support + * speculative insertion + */ + + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + + /* count deletion, as we counted the insertion too */ + pgstat_count_heap_delete(relation); + slot->tts_speculativeToken = 0; +} + +/* + * heapam_multi_insert - insert multiple tuple into a heap + * + * This is like heap_insert(), but inserts multiple tuples in one operation. + * That's faster than calling heap_insert() in a loop, because when multiple + * tuples can be inserted on a single page, we can write just a single WAL + * record covering all of them, and only need to lock/unlock the page once. + * + * Note: this leaks memory into the current memory context. You can create a + * temporary context before calling this, if that's a problem. + */ +static void +heapam_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + TransactionId xid = GetCurrentTransactionId(); + HeapTuple *heaptuples; + int i; + int ndone; + char *scratch = NULL; + Page page; + bool needwal; + Size saveFreeSpace; + bool need_tuple_data = RelationIsLogicallyLogged(relation); + bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); + + needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); + saveFreeSpace = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + + /* Toast and set header data in all the tuples */ + heaptuples = palloc(ntuples * sizeof(HeapTuple)); + for (i = 0; i < ntuples; i++) + heaptuples[i] = heap_prepare_insert(relation, tuples[i], + xid, cid, options); + + /* + * Allocate some memory to use for constructing the WAL record. Using + * palloc() within a critical section is not safe, so we allocate this + * beforehand. + */ + if (needwal) + scratch = palloc(BLCKSZ); + + /* + * We're about to do the actual inserts -- but check for conflict first, + * to minimize the possibility of having to roll back work we've just + * done. + * + * A check here does not definitively prevent a serialization anomaly; + * that check MUST be done at least past the point of acquiring an + * exclusive buffer content lock on every buffer that will be affected, + * and MAY be done after all inserts are reflected in the buffers and + * those locks are released; otherwise there race condition. 
Since + * multiple buffers can be locked and unlocked in the loop below, and it + * would not be feasible to identify and lock all of those buffers before + * the loop, we must do a final check at the end. + * + * The check here could be omitted with no loss of correctness; it is + * present strictly as an optimization. + * + * For heap inserts, we only need to check for table-level SSI locks. Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call, which makes for a faster check. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + + ndone = 0; + while (ndone < ntuples) + { + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + bool all_visible_cleared = false; + int nthispage; + + CHECK_FOR_INTERRUPTS(); + + /* + * Find buffer where at least the next tuple will fit. If the page is + * all-visible, this will also pin the requisite visibility map page. + */ + buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, + InvalidBuffer, options, bistate, + &vmbuffer, NULL); + page = BufferGetPage(buffer); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + /* + * RelationGetBufferForTuple has ensured that the first tuple fits. + * Put that on the page, and then as many other tuples as fit. + */ + RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); + for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) + { + HeapTuple heaptup = heaptuples[ndone + nthispage]; + + if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) + break; + + RelationPutHeapTuple(relation, buffer, heaptup, false); + + /* + * We don't use heap_multi_insert for catalog tuples yet, but + * better be prepared... + */ + if (needwal && need_cids) + log_heap_new_cid(relation, heaptup); + } + + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + visibilitymap_clear(relation, + BufferGetBlockNumber(buffer), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + /* + * XXX Should we set PageSetPrunable on this page ? See heap_insert() + */ + + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (needwal) + { + XLogRecPtr recptr; + xl_heap_multi_insert *xlrec; + uint8 info = XLOG_HEAP2_MULTI_INSERT; + char *tupledata; + int totaldatalen; + char *scratchptr = scratch; + bool init; + int bufflags = 0; + + /* + * If the page was previously empty, we can reinit the page + * instead of restoring the whole thing. + */ + init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber && + PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1); + + /* allocate xl_heap_multi_insert struct from the scratch area */ + xlrec = (xl_heap_multi_insert *) scratchptr; + scratchptr += SizeOfHeapMultiInsert; + + /* + * Allocate offsets array. Unless we're reinitializing the page, + * in that case the tuples are stored in order starting at + * FirstOffsetNumber and we don't need to store the offsets + * explicitly. + */ + if (!init) + scratchptr += nthispage * sizeof(OffsetNumber); + + /* the rest of the scratch space is used for tuple data */ + tupledata = scratchptr; + + xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0; + xlrec->ntuples = nthispage; + + /* + * Write out an xl_multi_insert_tuple and the tuple data itself + * for each tuple. 
+ */ + for (i = 0; i < nthispage; i++) + { + HeapTuple heaptup = heaptuples[ndone + i]; + xl_multi_insert_tuple *tuphdr; + int datalen; + + if (!init) + xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); + /* xl_multi_insert_tuple needs two-byte alignment. */ + tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + + tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; + tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_hoff = heaptup->t_data->t_hoff; + + /* write bitmap [+ padding] [+ oid] + data */ + datalen = heaptup->t_len - SizeofHeapTupleHeader; + memcpy(scratchptr, + (char *) heaptup->t_data + SizeofHeapTupleHeader, + datalen); + tuphdr->datalen = datalen; + scratchptr += datalen; + } + totaldatalen = scratchptr - tupledata; + Assert((scratchptr - scratch) < BLCKSZ); + + if (need_tuple_data) + xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; + + /* + * Signal that this is the last xl_heap_multi_insert record + * emitted by this call to heap_multi_insert(). Needed for logical + * decoding so it knows when to cleanup temporary data. + */ + if (ndone + nthispage == ntuples) + xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; + + if (init) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + /* + * If we're doing logical decoding, include the new tuple data + * even if we take a full-page image of the page. + */ + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogBeginInsert(); + XLogRegisterData((char *) xlrec, tupledata - scratch); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + + XLogRegisterBufData(0, tupledata, totaldatalen); + + /* filtering by origin on a row level is much more efficient */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + recptr = XLogInsert(RM_HEAP2_ID, info); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + ndone += nthispage; + } + + /* + * We're done with the actual inserts. Check for conflicts again, to + * ensure that all rw-conflicts in to these inserts are detected. Without + * this final check, a sequential scan of the heap may have locked the + * table after the "before" check, missing one opportunity to detect the + * conflict, and then scanned the table before the new tuples were there, + * missing the other chance to detect the conflict. + * + * For heap inserts, we only need to check for table-level SSI locks. Our + * new tuples can't possibly conflict with existing tuple locks, and heap + * page locks are only consolidated versions of tuple locks; they do not + * lock "gaps" as index page locks do. So we don't need to specify a + * buffer when making the call. + */ + CheckForSerializableConflictIn(relation, NULL, InvalidBuffer); + + /* + * If tuples are cachable, mark them for invalidation from the caches in + * case we abort. Note it is OK to do this after releasing the buffer, + * because the heaptuples data structure is all in local memory, not in + * the shared buffer. + */ + if (IsCatalogRelation(relation)) + { + for (i = 0; i < ntuples; i++) + CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); + } + + /* + * Copy t_self fields back to the caller's original tuples. This does + * nothing for untoasted tuples (tuples[i] == heaptuples[i)], but it's + * probably faster to always copy than check. 
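/*
 * Illustrative sketch (not part of the patch): batching rows through
 * heapam_multi_insert(), roughly what COPY does.  Construction of the
 * tuple array is elided; the helper name is invented and the callback is
 * normally reached through the storage AM layer rather than directly.
 */
static void
example_bulk_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
    BulkInsertState bistate = GetBulkInsertState();

    /* One call can cover many pages; WAL is emitted once per filled page. */
    heapam_multi_insert(rel, tuples, ntuples,
                        GetCurrentCommandId(true),
                        0 /* options */ , bistate);

    FreeBulkInsertState(bistate);
}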
+ */ + for (i = 0; i < ntuples; i++) + tuples[i]->t_self = heaptuples[i]->t_self; + + pgstat_count_heap_insert(relation, ntuples); +} + +/* + * Subroutine for heap_lock_updated_tuple_rec. + * + * Given a hypothetical multixact status held by the transaction identified + * with the given xid, does the current transaction need to wait, fail, or can + * it continue if it wanted to acquire a lock of the given mode? "needwait" + * is set to true if waiting is necessary; if it can continue, then + * HeapTupleMayBeUpdated is returned. If the lock is already held by the + * current transaction, return HeapTupleSelfUpdated. In case of a conflict + * with another transaction, a different HeapTupleSatisfiesUpdate return code + * is returned. + * + * The held status is said to be hypothetical because it might correspond to a + * lock held by a single Xid, i.e. not a real MultiXactId; we express it this + * way for simplicity of API. + */ +static HTSU_Result +test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, + LockTupleMode mode, bool *needwait) +{ + MultiXactStatus wantedstatus; + + *needwait = false; + wantedstatus = get_mxact_status_for_lock(mode, false); + + /* + * Note: we *must* check TransactionIdIsInProgress before + * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an + * explanation. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + /* + * The tuple has already been locked by our own transaction. This is + * very rare but can happen if multiple transactions are trying to + * lock an ancient version of the same tuple. + */ + return HeapTupleSelfUpdated; + } + else if (TransactionIdIsInProgress(xid)) + { + /* + * If the locking transaction is running, what we do depends on + * whether the lock modes conflict: if they do, then we must wait for + * it to finish; otherwise we can fall through to lock this tuple + * version without waiting. + */ + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + { + *needwait = true; + } + + /* + * If we set needwait above, then this value doesn't matter; + * otherwise, this value signals to caller that it's okay to proceed. + */ + return HeapTupleMayBeUpdated; + } + else if (TransactionIdDidAbort(xid)) + return HeapTupleMayBeUpdated; + else if (TransactionIdDidCommit(xid)) + { + /* + * The other transaction committed. If it was only a locker, then the + * lock is completely gone now and we can return success; but if it + * was an update, then what we do depends on whether the two lock + * modes conflict. If they conflict, then we must report error to + * caller. But if they don't, we can fall through to allow the current + * transaction to lock the tuple. + * + * Note: the reason we worry about ISUPDATE here is because as soon as + * a transaction ends, all its locks are gone and meaningless, and + * thus we can ignore them; whereas its updates persist. In the + * TransactionIdIsInProgress case, above, we don't need to check + * because we know the lock is still "alive" and thus a conflict needs + * always be checked. 
+ */ + if (!ISUPDATE_from_mxstatus(status)) + return HeapTupleMayBeUpdated; + + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + /* bummer */ + return HeapTupleUpdated; + + return HeapTupleMayBeUpdated; + } + + /* Not in progress, not aborted, not committed -- must have crashed */ + return HeapTupleMayBeUpdated; +} + + +/* + * Recursive part of heap_lock_updated_tuple + * + * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given + * xid with the given mode; if this tuple is updated, recurse to lock the new + * version as well. + */ +static HTSU_Result +heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, + LockTupleMode mode) +{ + HTSU_Result result; + ItemPointerData tupid; + HeapTupleData mytup; + Buffer buf; + uint16 new_infomask, + new_infomask2, + old_infomask, + old_infomask2; + TransactionId xmax, + new_xmax; + TransactionId priorXmax = InvalidTransactionId; + bool cleared_all_frozen = false; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + + ItemPointerCopy(tid, &tupid); + + for (;;) + { + new_infomask = 0; + new_xmax = InvalidTransactionId; + block = ItemPointerGetBlockNumber(&tupid); + + if (!heap_fetch(rel, &tupid, SnapshotAny, &mytup, &buf, false, NULL)) + { + /* + * if we fail to find the updated version of the tuple, it's + * because it was vacuumed/pruned away after its creator + * transaction aborted. So behave as if we got to the end of the + * chain, and there's no further tuple to lock: return success to + * caller. + */ + return HeapTupleMayBeUpdated; + } + +l4: + CHECK_FOR_INTERRUPTS(); + + /* + * Before locking the buffer, pin the visibility map page if it + * appears to be necessary. Since we haven't got the lock yet, + * someone else might be in the middle of changing this, so we'll need + * to recheck after we have the lock. + */ + if (PageIsAllVisible(BufferGetPage(buf))) + visibilitymap_pin(rel, block, &vmbuffer); + else + vmbuffer = InvalidBuffer; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * If we didn't pin the visibility map page and the page has become + * all visible while we were busy locking the buffer, we'll have to + * unlock and re-lock, to avoid holding the buffer lock across I/O. + * That's a bit unfortunate, but hopefully shouldn't happen often. + */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(rel, block, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Check the tuple XMIN against prior XMAX, if any. If we reached the + * end of the chain, we're done, so return success. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), + priorXmax)) + { + result = HeapTupleMayBeUpdated; + goto out_locked; + } + + /* + * Also check Xmin: if this tuple was created by an aborted + * (sub)transaction, then we already locked the last live one in the + * chain, thus we're done, so return success. 
+ */ + if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) + { + UnlockReleaseBuffer(buf); + return HeapTupleMayBeUpdated; + } + + old_infomask = mytup.t_data->t_infomask; + old_infomask2 = mytup.t_data->t_infomask2; + xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + + /* + * If this tuple version has been updated or locked by some concurrent + * transaction(s), what we do depends on whether our lock mode + * conflicts with what those other transactions hold, and also on the + * status of them. + */ + if (!(old_infomask & HEAP_XMAX_INVALID)) + { + TransactionId rawxmax; + bool needwait; + + rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + if (old_infomask & HEAP_XMAX_IS_MULTI) + { + int nmembers; + int i; + MultiXactMember *members; + + /* + * We don't need a test for pg_upgrade'd tuples: this is only + * applied to tuples after the first in an update chain. Said + * first tuple in the chain may well be locked-in-9.2-and- + * pg_upgraded, but that one was already locked by our caller, + * not us; and any subsequent ones cannot be because our + * caller must necessarily have obtained a snapshot later than + * the pg_upgrade itself. + */ + Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); + + nmembers = GetMultiXactIdMembers(rawxmax, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); + for (i = 0; i < nmembers; i++) + { + result = test_lockmode_for_conflict(members[i].status, + members[i].xid, + mode, &needwait); + + /* + * If the tuple was already locked by ourselves in a + * previous iteration of this (say heap_lock_tuple was + * forced to restart the locking loop because of a change + * in xmax), then we hold the lock already on this tuple + * version and we don't need to do anything; and this is + * not an error condition either. We just need to skip + * this tuple and continue locking the next version in the + * update chain. + */ + if (result == HeapTupleSelfUpdated) + { + pfree(members); + goto next; + } + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(members[i].xid, rel, + &mytup.t_self, + XLTW_LockUpdated); + pfree(members); + goto l4; + } + if (result != HeapTupleMayBeUpdated) + { + pfree(members); + goto out_locked; + } + } + if (members) + pfree(members); + } + else + { + MultiXactStatus status; + + /* + * For a non-multi Xmax, we first need to compute the + * corresponding MultiXactStatus by using the infomask bits. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusForUpdate; + else + status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY present alone (a pg_upgraded tuple marked + * as share-locked in the old cluster) shouldn't be + * seen in the middle of an update chain. + */ + elog(ERROR, "invalid lock status in tuple"); + } + } + else + { + /* it's an update, but which kind? 
*/ + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + } + + result = test_lockmode_for_conflict(status, rawxmax, mode, + &needwait); + + /* + * If the tuple was already locked by ourselves in a previous + * iteration of this (say heap_lock_tuple was forced to + * restart the locking loop because of a change in xmax), then + * we hold the lock already on this tuple version and we don't + * need to do anything; and this is not an error condition + * either. We just need to skip this tuple and continue + * locking the next version in the update chain. + */ + if (result == HeapTupleSelfUpdated) + goto next; + + if (needwait) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(rawxmax, rel, &mytup.t_self, + XLTW_LockUpdated); + goto l4; + } + if (result != HeapTupleMayBeUpdated) + { + goto out_locked; + } + } + } + + /* compute the new Xmax and infomask values for the tuple ... */ + compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, + xid, mode, false, + &new_xmax, &new_infomask, &new_infomask2); + + if (PageIsAllVisible(BufferGetPage(buf)) && + visibilitymap_clear(rel, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + START_CRIT_SECTION(); + + /* ... and set them */ + HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); + mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + mytup.t_data->t_infomask |= new_infomask; + mytup.t_data->t_infomask2 |= new_infomask2; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_heap_lock_updated xlrec; + XLogRecPtr recptr; + Page page = BufferGetPage(buf); + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); + xlrec.xmax = new_xmax; + xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); + xlrec.flags = + cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + + XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + +next: + /* if we find the end of update chain, we're done. */ + if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || + ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || + HeapTupleHeaderIsOnlyLocked(mytup.t_data)) + { + result = HeapTupleMayBeUpdated; + goto out_locked; + } + + /* tail recursion */ + priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); + ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); + UnlockReleaseBuffer(buf); + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + } + + result = HeapTupleMayBeUpdated; + +out_locked: + UnlockReleaseBuffer(buf); + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + return result; + +} + +/* + * heap_lock_updated_tuple + * Follow update chain when locking an updated tuple, acquiring locks (row + * marks) on the updated versions. + * + * The initial tuple is assumed to be already locked. + * + * This function doesn't check visibility, it just unconditionally marks the + * tuple(s) as locked. If any tuple in the updated chain is being deleted + * concurrently (or updated with the key being modified), sleep until the + * transaction doing it is finished. + * + * Note that we don't acquire heavyweight tuple locks on the tuples we walk + * when we have to wait for other transactions to release them, as opposed to + * what heap_lock_tuple does. 
The reason is that having more than one + * transaction walking the chain is probably uncommon enough that risk of + * starvation is not likely: one of the preconditions for being here is that + * the snapshot in use predates the update that created this tuple (because we + * started at an earlier version of the tuple), but at the same time such a + * transaction cannot be using repeatable read or serializable isolation + * levels, because that would lead to a serializability failure. + */ +static HTSU_Result +heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, + TransactionId xid, LockTupleMode mode) +{ + if (!ItemPointerEquals(&tuple->t_self, ctid)) + { + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId + * setting. We can be certain that the transaction will never become a + * member of any older MultiXactIds than that. (We have to do this + * even if we end up just using our own TransactionId below, since + * some other backend could incorporate our XID into a MultiXact + * immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); + } + + /* nothing to lock */ + return HeapTupleMayBeUpdated; +} + + +/* + * heapam_lock_tuple - lock a tuple in shared or exclusive mode + * + * Note that this acquires a buffer pin, which the caller must release. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tuple->t_self: TID of tuple to lock (rest of struct need not be valid) + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: indicates if shared or exclusive tuple lock is desired + * wait_policy: what to do if tuple lock is not available + * follow_updates: if true, follow the update chain to also lock descendant + * tuples. + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *hufd: filled in failure cases (see below) + * + * Function result may be: + * HeapTupleMayBeUpdated: lock was successfully acquired + * HeapTupleInvisible: lock failed because tuple was never visible to us + * HeapTupleSelfUpdated: lock failed because tuple updated by self + * HeapTupleUpdated: lock failed because tuple updated by other xact + * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip + * + * In the failure cases other than HeapTupleInvisible, the routine fills + * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, + * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated, + * since we cannot obtain cmax from a combocid generated by another + * transaction). + * See comments for struct HeapUpdateFailureData for additional info. + * + * See README.tuplock for a thorough explanation of this mechanism. 
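+ *
+ * Illustrative wiring sketch only, not part of this patch: callers do not
+ * invoke this function directly.  heapam_storage_handler() installs it as
+ * the tuple_lock callback, and callers reach it through the relation's
+ * storage AM routine table, roughly:
+ *
+ *		amroutine->tuple_lock = heapam_lock_tuple;
+ *		...
+ *		result = relation->rd_stamroutine->tuple_lock(relation, tid, &stuple,
+ *													  cid, mode, wait_policy,
+ *													  follow_updates,
+ *													  &buffer, &hufd);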
+ */ +static HTSU_Result +heapam_lock_tuple(Relation relation, ItemPointer tid, StorageTuple *stuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, Buffer *buffer, HeapUpdateFailureData *hufd) +{ + HTSU_Result result; + ItemId lp; + Page page; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + TransactionId xid, + xmax; + uint16 old_infomask, + new_infomask, + new_infomask2; + bool first_time = true; + bool have_tuple_lock = false; + bool cleared_all_frozen = false; + HeapTupleData tuple; + Buffer buf; + + Assert(stuple != NULL); + + buf = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + block = ItemPointerGetBlockNumber(tid); + *buffer = buf; + + /* + * Before locking the buffer, pin the visibility map page if it appears to + * be necessary. Since we haven't got the lock yet, someone else might be + * in the middle of changing this, so we'll need to recheck after we have + * the lock. + */ + if (PageIsAllVisible(BufferGetPage(buf))) + visibilitymap_pin(relation, block, &vmbuffer); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buf); + lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsNormal(lp)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple.t_len = ItemIdGetLength(lp); + tuple.t_tableOid = RelationGetRelid(relation); + ItemPointerCopy(tid, &tuple.t_self); + +l3: + result = HeapTupleSatisfiesUpdate(&tuple, cid, buf); + + if (result == HeapTupleInvisible) + { + /* + * This is possible, but only when locking a tuple for ON CONFLICT + * UPDATE. We return this value here rather than throwing an error in + * order to give that case the opportunity to throw a more specific + * error. + */ + result = HeapTupleInvisible; + goto out_locked; + } + else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated) + { + TransactionId xwait; + uint16 infomask; + uint16 infomask2; + bool require_sleep; + ItemPointerData t_ctid; + + /* must copy state data before unlocking buffer */ + xwait = HeapTupleHeaderGetRawXmax(tuple.t_data); + infomask = tuple.t_data->t_infomask; + infomask2 = tuple.t_data->t_infomask2; + ItemPointerCopy(&tuple.t_data->t_ctid, &t_ctid); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* + * If any subtransaction of the current top transaction already holds + * a lock as strong as or stronger than what we're requesting, we + * effectively hold the desired lock already. We *must* succeed + * without trying to take the tuple lock, else we will deadlock + * against anyone wanting to acquire a stronger lock. + * + * Note we only do this the first time we loop on the HTSU result; + * there is no point in testing in subsequent passes, because + * evidently our own transaction cannot have acquired a new lock after + * the first time we checked. + */ + if (first_time) + { + first_time = false; + + if (infomask & HEAP_XMAX_IS_MULTI) + { + int i; + int nmembers; + MultiXactMember *members; + + /* + * We don't need to allow old multixacts here; if that had + * been the case, HeapTupleSatisfiesUpdate would have returned + * MayBeUpdated and we wouldn't be here. 
+ */ + nmembers = + GetMultiXactIdMembers(xwait, &members, false, + HEAP_XMAX_IS_LOCKED_ONLY(infomask)); + + for (i = 0; i < nmembers; i++) + { + /* only consider members of our own transaction */ + if (!TransactionIdIsCurrentTransactionId(members[i].xid)) + continue; + + if (TUPLOCK_from_mxstatus(members[i].status) >= mode) + { + pfree(members); + result = HeapTupleMayBeUpdated; + goto out_unlocked; + } + } + + if (members) + pfree(members); + } + else if (TransactionIdIsCurrentTransactionId(xwait)) + { + switch (mode) + { + case LockTupleKeyShare: + Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || + HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)); + result = HeapTupleMayBeUpdated; + goto out_unlocked; + case LockTupleShare: + if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = HeapTupleMayBeUpdated; + goto out_unlocked; + } + break; + case LockTupleNoKeyExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + result = HeapTupleMayBeUpdated; + goto out_unlocked; + } + break; + case LockTupleExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && + infomask2 & HEAP_KEYS_UPDATED) + { + result = HeapTupleMayBeUpdated; + goto out_unlocked; + } + break; + } + } + } + + /* + * Initially assume that we will have to wait for the locking + * transaction(s) to finish. We check various cases below in which + * this can be turned off. + */ + require_sleep = true; + if (mode == LockTupleKeyShare) + { + /* + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait. Even if there is an update, we can still + * continue if the key hasn't been modified. + * + * However, if there are updates, we need to walk the update chain + * to mark future versions of the row as locked, too. That way, + * if somebody deletes that future version, we're protected + * against the key going away. This locking of future versions + * could block momentarily, if a concurrent transaction is + * deleting a key; or it could return a value to the effect that + * the transaction deleting the key has already committed. So we + * do this before re-locking the buffer; otherwise this would be + * prone to deadlocks. + * + * Note that the TID we're locking was grabbed before we unlocked + * the buffer. For it to change while we're not looking, the + * other properties we're testing for below after re-locking the + * buffer would also change, in which case we would restart this + * loop above. + */ + if (!(infomask2 & HEAP_KEYS_UPDATED)) + { + bool updated; + + updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); + + /* + * If there are updates, follow the update chain; bail out if + * that cannot be done. + */ + if (follow_updates && updated) + { + HTSU_Result res; + + res = heap_lock_updated_tuple(relation, &tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != HeapTupleMayBeUpdated) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. + * Also, if it wasn't updated before we released the lock, but + * is updated now, we start over too; the reason is that we + * now need to follow the update chain to lock the new + * versions. 
+ */ + if (!HeapTupleHeaderIsOnlyLocked(tuple.t_data) && + ((tuple.t_data->t_infomask2 & HEAP_KEYS_UPDATED) || + !updated)) + goto l3; + + /* Things look okay, so we can skip sleeping */ + require_sleep = false; + + /* + * Note we allow Xmax to change here; other updaters/lockers + * could have modified it before we grabbed the buffer lock. + * However, this is not a problem, because with the recheck we + * just did we ensure that they still don't conflict with the + * lock we want. + */ + } + } + else if (mode == LockTupleShare) + { + /* + * If we're requesting Share, we can similarly avoid sleeping if + * there's no update and no exclusive lock present. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && + !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. + * See above about allowing xmax to change. + */ + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple.t_data->t_infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(tuple.t_data->t_infomask)) + goto l3; + require_sleep = false; + } + } + else if (mode == LockTupleNoKeyExclusive) + { + /* + * If we're requesting NoKeyExclusive, we might also be able to + * avoid sleeping; just ensure that there no conflicting lock + * already acquired. + */ + if (infomask & HEAP_XMAX_IS_MULTI) + { + if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask, + mode)) + { + /* + * No conflict, but if the xmax changed under us in the + * meantime, start over. + */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple.t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple.t_data), + xwait)) + goto l3; + + /* otherwise, we're good */ + require_sleep = false; + } + } + else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* if the xmax changed in the meantime, start over */ + if (xmax_infomask_changed(tuple.t_data->t_infomask, infomask) || + !TransactionIdEquals( + HeapTupleHeaderGetRawXmax(tuple.t_data), + xwait)) + goto l3; + /* otherwise, we're good */ + require_sleep = false; + } + } + + /* + * As a check independent from those above, we can also avoid sleeping + * if the current transaction is the sole locker of the tuple. Note + * that the strength of the lock already held is irrelevant; this is + * not about recording the lock in Xmax (which will be done regardless + * of this optimization, below). Also, note that the cases where we + * hold a lock stronger than we are requesting are already handled + * above by not doing anything. + * + * Note we only deal with the non-multixact case here; MultiXactIdWait + * is well equipped to deal with this situation on its own. + */ + if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && + TransactionIdIsCurrentTransactionId(xwait)) + { + /* ... but if the xmax changed in the meantime, start over */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple.t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple.t_data), + xwait)) + goto l3; + Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple.t_data->t_infomask)); + require_sleep = false; + } + + /* + * Time to sleep on the other transaction/multixact, if necessary. + * + * If the other transaction is an update that's already committed, + * then sleeping cannot possibly do any good: if we're required to + * sleep, get out to raise an error instead. 
+ * + * By here, we either have already acquired the buffer exclusive lock, + * or we must wait for the locking transaction or multixact; so below + * we ensure that we grab buffer lock after the sleep. + */ + if (require_sleep && result == HeapTupleUpdated) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + else if (require_sleep) + { + /* + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. We must do this even if we are share-locking. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while + * rechecking tuple state. + */ + if (!heap_acquire_tuplock(relation, tid, mode, wait_policy, + &have_tuple_lock)) + { + /* + * This can only happen if wait_policy is Skip and the lock + * couldn't be obtained. + */ + result = HeapTupleWouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + + if (infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus status = get_mxact_status_for_lock(mode, false); + + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusNoKeyUpdate) + elog(ERROR, "invalid lock mode in heap_lock_tuple"); + + /* wait for multixact to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + MultiXactIdWait((MultiXactId) xwait, status, infomask, + relation, &tuple.t_self, XLTW_Lock, NULL); + break; + case LockWaitSkip: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + { + result = HeapTupleWouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, infomask, relation, + NULL)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + + break; + } + + /* + * Of course, the multixact might not be done here: if we're + * requesting a light lock mode, other transactions with light + * locks could still be alive, as well as locks owned by our + * own xact or other subxacts of this backend. We need to + * preserve the surviving MultiXact members. Note that it + * isn't absolutely necessary in the latter case, but doing so + * is simpler. 
+ */ + } + else + { + /* wait for regular transaction to end, or die trying */ + switch (wait_policy) + { + case LockWaitBlock: + XactLockTableWait(xwait, relation, &tuple.t_self, + XLTW_Lock); + break; + case LockWaitSkip: + if (!ConditionalXactLockTableWait(xwait)) + { + result = HeapTupleWouldBlock; + /* recovery code expects to have buffer lock held */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + break; + case LockWaitError: + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + break; + } + } + + /* if there are updates, follow the update chain */ + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + { + HTSU_Result res; + + res = heap_lock_updated_tuple(relation, &tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != HeapTupleMayBeUpdated) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if (xmax_infomask_changed(tuple.t_data->t_infomask, infomask) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple.t_data), + xwait)) + goto l3; + + if (!(infomask & HEAP_XMAX_IS_MULTI)) + { + /* + * Otherwise check if it committed or aborted. Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that would have been handled above. So + * that transaction must necessarily be gone by now. But + * don't check for this in the multixact case, because some + * locker transactions might still be running. + */ + UpdateXmaxHintBits(tuple.t_data, buf, xwait); + } + } + + /* By here, we're certain that we hold buffer exclusive lock again */ + + /* + * We may lock if previous xmax aborted, or if it committed but only + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. + */ + if (!require_sleep || + (tuple.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tuple.t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple.t_data)) + result = HeapTupleMayBeUpdated; + else + result = HeapTupleUpdated; + } + +failed: + if (result != HeapTupleMayBeUpdated) + { + Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated || + result == HeapTupleWouldBlock); + Assert(!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID)); + hufd->ctid = tuple.t_data->t_ctid; + hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple.t_data); + if (result == HeapTupleSelfUpdated) + hufd->cmax = HeapTupleHeaderGetCmax(tuple.t_data); + else + hufd->cmax = InvalidCommandId; + goto out_locked; + } + + /* + * If we didn't pin the visibility map page and the page has become all + * visible while we were busy locking the buffer, or during some + * subsequent window during which we had it unlocked, we'll have to unlock + * and re-lock, to avoid holding the buffer lock across I/O. That's a bit + * unfortunate, especially since we'll now have to recheck whether the + * tuple has been locked or updated under us, but hopefully it won't + * happen very often. 
+ */ + if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + visibilitymap_pin(relation, block, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + goto l3; + } + + xmax = HeapTupleHeaderGetRawXmax(tuple.t_data); + old_infomask = tuple.t_data->t_infomask; + + /* + * If this is the first possibly-multixact-able operation in the current + * transaction, set my per-backend OldestMemberMXactId setting. We can be + * certain that the transaction will never become a member of any older + * MultiXactIds than that. (We have to do this even if we end up just + * using our own TransactionId below, since some other backend could + * incorporate our XID into a MultiXact immediately afterwards.) + */ + MultiXactIdSetOldestMember(); + + /* + * Compute the new xmax and infomask to store into the tuple. Note we do + * not modify the tuple just yet, because that would leave it in the wrong + * state if multixact.c elogs. + */ + compute_new_xmax_infomask(xmax, old_infomask, tuple.t_data->t_infomask2, + GetCurrentTransactionId(), mode, false, + &xid, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* + * Store transaction information of xact locking the tuple. + * + * Note: Cmax is meaningless in this context, so don't set it; this avoids + * possibly generating a useless combo CID. Moreover, if we're locking a + * previously updated tuple, it's important to preserve the Cmax. + * + * Also reset the HOT UPDATE bit, but only if there's no update; otherwise + * we would break the HOT chain. + */ + tuple.t_data->t_infomask &= ~HEAP_XMAX_BITS; + tuple.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tuple.t_data->t_infomask |= new_infomask; + tuple.t_data->t_infomask2 |= new_infomask2; + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + HeapTupleHeaderClearHotUpdated(tuple.t_data); + HeapTupleHeaderSetXmax(tuple.t_data, xid); + + /* + * Make sure there is no forward chain link in t_ctid. Note that in the + * cases where the tuple has been updated, we must not overwrite t_ctid, + * because it was set by the updater. Moreover, if the tuple has been + * updated, we need to follow the update chain to lock the new versions of + * the tuple as well. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + tuple.t_data->t_ctid = *tid; + + /* Clear only the all-frozen bit on visibility map if needed */ + if (PageIsAllVisible(page) && + visibilitymap_clear(relation, block, vmbuffer, + VISIBILITYMAP_ALL_FROZEN)) + cleared_all_frozen = true; + + + MarkBufferDirty(buf); + + /* + * XLOG stuff. You might think that we don't need an XLOG record because + * there is no state change worth restoring after a crash. You would be + * wrong however: we have just written either a TransactionId or a + * MultiXactId that may never have been seen on disk before, and we need + * to make sure that there are XLOG entries covering those ID numbers. + * Else the same IDs might be re-used after a crash, which would be + * disastrous if this page made it to disk before the crash. Essentially + * we have to enforce the WAL log-before-data rule even in this case. + * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG + * entries for everything anyway.) 
+ */ + if (RelationNeedsWAL(relation)) + { + xl_heap_lock xlrec; + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple.t_self); + xlrec.locking_xid = xid; + xlrec.infobits_set = compute_infobits(new_infomask, + tuple.t_data->t_infomask2); + xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + + /* we don't decode row locks atm, so no need to log the origin */ + + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + result = HeapTupleMayBeUpdated; + +out_locked: + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + +out_unlocked: + if (BufferIsValid(vmbuffer)) + ReleaseBuffer(vmbuffer); + + /* + * Don't update the visibility map here. Locking a tuple doesn't change + * visibility info. + */ + + /* + * Now that we have successfully marked the tuple as locked, we can + * release the lmgr tuple lock, if we had it. + */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + + *stuple = heap_copytuple(&tuple); + return result; +} + +/* + * heapam_get_latest_tid - get the latest tid of a specified tuple + * + * Actually, this gets the latest version that is visible according to + * the passed snapshot. You can pass SnapshotDirty to get the very latest, + * possibly uncommitted version. + * + * *tid is both an input and an output parameter: it is updated to + * show the latest version of the row. Note that it will not be changed + * if no version of the row passes the snapshot test. + */ +static void +heapam_get_latest_tid(Relation relation, + Snapshot snapshot, + ItemPointer tid) +{ + BlockNumber blk; + ItemPointerData ctid; + TransactionId priorXmax; + + /* this is to avoid Assert failures on bad input */ + if (!ItemPointerIsValid(tid)) + return; + + /* + * Since this can be called with user-supplied TID, don't trust the input + * too much. (RelationGetNumberOfBlocks is an expensive check, so we + * don't check t_ctid links again this way. Note that it would not do to + * call it just once and save the result, either.) + */ + blk = ItemPointerGetBlockNumber(tid); + if (blk >= RelationGetNumberOfBlocks(relation)) + elog(ERROR, "block number %u is out of range for relation \"%s\"", + blk, RelationGetRelationName(relation)); + + /* + * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we + * need to examine, and *tid is the TID we will return if ctid turns out + * to be bogus. + * + * Note that we will loop until we reach the end of the t_ctid chain. + * Depending on the snapshot passed, there might be at most one visible + * version of the row, but we don't try to optimize for that. + */ + ctid = *tid; + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp; + HeapTupleData tp; + bool valid; + + /* + * Read, pin, and lock the page. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); + + /* + * Check for bogus item number. This is not treated as an error + * condition because it can happen while following a t_ctid link. We + * just assume that the prior tid is OK and return it unchanged. 
+ */ + offnum = ItemPointerGetOffsetNumber(&ctid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) + { + UnlockReleaseBuffer(buffer); + break; + } + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* OK to access the tuple */ + tp.t_self = ctid; + tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tp.t_len = ItemIdGetLength(lp); + tp.t_tableOid = RelationGetRelid(relation); + + /* + * After following a t_ctid link, we might arrive at an unrelated + * tuple. Check for XMIN match. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + { + UnlockReleaseBuffer(buffer); + break; + } + + /* + * Check time qualification of tuple; if visible, set it as the new + * result candidate. + */ + valid = HeapTupleSatisfiesVisibility(relation->rd_stamroutine, &tp, snapshot, buffer); + CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); + if (valid) + *tid = ctid; + + /* + * If there's a valid t_ctid link, follow it, else we're done. + */ + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(tp.t_data) || + ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + { + UnlockReleaseBuffer(buffer); + break; + } + + ctid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + UnlockReleaseBuffer(buffer); + } /* end of loop */ +} + + +/* + * heapam_sync - sync a heap, for use when no WAL has been written + * + * This forces the heap contents (including TOAST heap if any) down to disk. + * If we skipped using WAL, and WAL is otherwise needed, we must force the + * relation down to disk before it's safe to commit the transaction. This + * requires writing out any dirty buffers and then doing a forced fsync. + * + * Indexes are not touched. (Currently, index operations associated with + * the commands that use this are WAL-logged and so do not need fsync. + * That behavior might change someday, but in any case it's likely that + * any fsync decisions required would be per-index and hence not appropriate + * to be done here.) 
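+ *
+ * Generic code does not call this function directly: heapam_storage_handler()
+ * installs it as the relation_sync callback, and callers go through the
+ * storage_sync() wrapper.  A minimal sketch (illustrative only):
+ *
+ *		amroutine->relation_sync = heapam_sync;
+ *		...
+ *		storage_sync(rel);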
+ */
+static void
+heapam_sync(Relation rel)
+{
+	/* non-WAL-logged tables never need fsync */
+	if (!RelationNeedsWAL(rel))
+		return;
+
+	/* main heap */
+	FlushRelationBuffers(rel);
+	/* FlushRelationBuffers will have opened rd_smgr */
+	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+
+	/* FSM is not critical, don't bother syncing it */
+
+	/* toast heap, if any */
+	if (OidIsValid(rel->rd_rel->reltoastrelid))
+	{
+		Relation	toastrel;
+
+		toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
+		FlushRelationBuffers(toastrel);
+		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+		heap_close(toastrel, AccessShareLock);
+	}
+}
+
+static tuple_data
+heapam_get_tuple_data(StorageTuple tuple, tuple_data_flags flags)
+{
+	switch (flags)
+	{
+		case XMIN:
+			return (tuple_data)HeapTupleHeaderGetXmin(((HeapTuple)tuple)->t_data);
+			break;
+		case UPDATED_XID:
+			return (tuple_data)HeapTupleHeaderGetUpdateXid(((HeapTuple)tuple)->t_data);
+			break;
+		case CMIN:
+			return (tuple_data)HeapTupleHeaderGetCmin(((HeapTuple)tuple)->t_data);
+			break;
+		case TID:
+			return (tuple_data)((HeapTuple)tuple)->t_self;
+			break;
+		case CTID:
+			return (tuple_data)((HeapTuple)tuple)->t_data->t_ctid;
+			break;
+		default:
+			Assert(0);
+			break;
+	}
+}
+
+static bool
+heapam_tuple_is_heaponly(StorageTuple tuple)
+{
+	return HeapTupleIsHeapOnly((HeapTuple)tuple);
+}
+
+static StorageTuple
+heapam_form_tuple_by_datum(Datum data, Oid tableoid)
+{
+	return heap_form_tuple_by_datum(data, tableoid);
+}
+
+Datum
+heapam_storage_handler(PG_FUNCTION_ARGS)
+{
+	StorageAmRoutine *amroutine = makeNode(StorageAmRoutine);
+
+	amroutine->tuple_fetch = heapam_fetch;
+	amroutine->tuple_insert = heapam_heap_insert;
+	amroutine->tuple_delete = heapam_heap_delete;
+	amroutine->tuple_update = heapam_heap_update;
+	amroutine->tuple_lock = heapam_lock_tuple;
+	amroutine->multi_insert = heapam_multi_insert;
+
+	amroutine->speculative_finish = heapam_finish_speculative;
+	amroutine->speculative_abort = heapam_abort_speculative;
+
+	amroutine->get_tuple_data = heapam_get_tuple_data;
+	amroutine->tuple_is_heaponly = heapam_tuple_is_heaponly;
+	amroutine->tuple_from_datum = heapam_form_tuple_by_datum;
+	amroutine->tuple_get_latest_tid = heapam_get_latest_tid;
+
+	amroutine->relation_sync = heapam_sync;
 	amroutine->slot_storageam = heapam_storage_slot_handler;
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 191f088703..8fba61c4f1 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -111,6 +111,7 @@
 #include "access/heapam_common.h"
 #include "access/heapam_xlog.h"
 #include "access/rewriteheap.h"
+#include "access/storageam.h"
 #include "access/transam.h"
 #include "access/tuptoaster.h"
 #include "access/xact.h"
@@ -127,13 +128,13 @@
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
+#include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/tqual.h"
-#include "storage/procarray.h"
 
 /*
  * State associated with a rewrite operation. This is opaque to the user
@@ -358,7 +359,7 @@ end_heap_rewrite(RewriteState state)
 	 * wrote before the checkpoint.
*/ if (RelationNeedsWAL(state->rs_new_rel)) - heap_sync(state->rs_new_rel); + storage_sync(state->rs_new_rel); logical_end_heap_rewrite(state); diff --git a/src/backend/access/heap/storageam.c b/src/backend/access/heap/storageam.c new file mode 100644 index 0000000000..d1d7364e7f --- /dev/null +++ b/src/backend/access/heap/storageam.c @@ -0,0 +1,306 @@ +/*------------------------------------------------------------------------- + * + * storageam.c + * storage access method code + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/storageam.c + * + * + * NOTES + * This file contains the storage_ routines which implement + * the POSTGRES storage access method used for all POSTGRES + * relations. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/hio.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/relscan.h" +#include "access/storageam.h" +#include "access/storageamapi.h" +#include "access/tuptoaster.h" +#include "access/valid.h" +#include "access/visibilitymap.h" +#include "access/xloginsert.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/rel.h" +#include "utils/tqual.h" + + +/* + * storage_fetch - retrieve tuple with given tid + * + * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding + * the tuple, fill in the remaining fields of *tuple, and check the tuple + * against the specified snapshot. + * + * If successful (tuple found and passes snapshot time qual), then *userbuf + * is set to the buffer holding the tuple and TRUE is returned. The caller + * must unpin the buffer when done with the tuple. + * + * If the tuple is not found (ie, item number references a deleted slot), + * then tuple->t_data is set to NULL and FALSE is returned. + * + * If the tuple is found but fails the time qual check, then FALSE is returned + * but tuple->t_data is left pointing to the tuple. + * + * keep_buf determines what is done with the buffer in the FALSE-result cases. + * When the caller specifies keep_buf = true, we retain the pin on the buffer + * and return it in *userbuf (so the caller must eventually unpin it); when + * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer. + * + * stats_relation is the relation to charge the heap_fetch operation against + * for statistical purposes. (This could be the heap rel itself, an + * associated index, or NULL to not count the fetch at all.) + * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * + * It is somewhat inconsistent that we ereport() on invalid block number but + * return false on invalid item number. There are a couple of reasons though. + * One is that the caller can relatively easily check the block number for + * validity, but cannot check the item number without reading the page + * himself. 
Another is that when we are following a t_ctid link, we can be + * reasonably confident that the page number is valid (since VACUUM shouldn't + * truncate off the destination page without having killed the referencing + * tuple first), but the item number might well not be good. + */ +bool +storage_fetch(Relation relation, + ItemPointer tid, + Snapshot snapshot, + StorageTuple *stuple, + Buffer *userbuf, + bool keep_buf, + Relation stats_relation) +{ + return relation->rd_stamroutine->tuple_fetch(relation, tid, snapshot, stuple, + userbuf, keep_buf, stats_relation); +} + + +/* + * storage_lock_tuple - lock a tuple in shared or exclusive mode + * + * Note that this acquires a buffer pin, which the caller must release. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tuple->t_self: TID of tuple to lock (rest of struct need not be valid) + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: indicates if shared or exclusive tuple lock is desired + * wait_policy: what to do if tuple lock is not available + * follow_updates: if true, follow the update chain to also lock descendant + * tuples. + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *hufd: filled in failure cases (see below) + * + * Function result may be: + * HeapTupleMayBeUpdated: lock was successfully acquired + * HeapTupleInvisible: lock failed because tuple was never visible to us + * HeapTupleSelfUpdated: lock failed because tuple updated by self + * HeapTupleUpdated: lock failed because tuple updated by other xact + * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip + * + * In the failure cases other than HeapTupleInvisible, the routine fills + * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, + * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated, + * since we cannot obtain cmax from a combocid generated by another + * transaction). + * See comments for struct HeapUpdateFailureData for additional info. + * + * See README.tuplock for a thorough explanation of this mechanism. 
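+ *
+ * Illustrative caller sketch (hypothetical; its shape mirrors the
+ * GetTupleForTrigger() change later in this patch).  relation, tid, estate
+ * and lockmode are assumed to come from the usual executor context:
+ *
+ *		StorageTuple stuple;
+ *		Buffer buffer;
+ *		HeapUpdateFailureData hufd;
+ *		HTSU_Result test;
+ *
+ *		test = storage_lock_tuple(relation, tid, &stuple,
+ *								  estate->es_output_cid, lockmode,
+ *								  LockWaitBlock, false, &buffer, &hufd);
+ *		if (test == HeapTupleMayBeUpdated)
+ *			... use stuple, a palloc'd copy of the locked row ...
+ *		else if (test == HeapTupleUpdated)
+ *			... follow hufd.ctid to the newer version, or report a
+ *			serialization failure ...
+ *		ReleaseBuffer(buffer);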
+ */ +HTSU_Result +storage_lock_tuple(Relation relation, ItemPointer tid, StorageTuple *stuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, Buffer *buffer, HeapUpdateFailureData *hufd) +{ + return relation->rd_stamroutine->tuple_lock(relation, tid, stuple, + cid, mode, wait_policy, + follow_updates, buffer, hufd); +} + +/* + * Insert a tuple from a slot into storage AM routine + */ +Oid +storage_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate) +{ + return relation->rd_stamroutine->tuple_insert(relation, slot, cid, + options, bistate); +} + +/* + * Delete a tuple from tid using storage AM routine + */ +HTSU_Result +storage_delete(Relation relation, ItemPointer tid, CommandId cid, + Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd) +{ + return relation->rd_stamroutine->tuple_delete(relation, tid, cid, + crosscheck, wait, hufd); +} + +/* + * update a tuple from tid using storage AM routine + */ +HTSU_Result +storage_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode) +{ + return relation->rd_stamroutine->tuple_update(relation, otid, slot, cid, + crosscheck, wait, hufd, lockmode); +} + + +/* + * storage_multi_insert - insert multiple tuple into a storage + * + * This is like heap_insert(), but inserts multiple tuples in one operation. + * That's faster than calling heap_insert() in a loop, because when multiple + * tuples can be inserted on a single page, we can write just a single WAL + * record covering all of them, and only need to lock/unlock the page once. + * + * Note: this leaks memory into the current memory context. You can create a + * temporary context before calling this, if that's a problem. + */ +void +storage_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, + CommandId cid, int options, BulkInsertState bistate) +{ + relation->rd_stamroutine->multi_insert(relation, tuples, ntuples, + cid, options, bistate); +} + + +/* + * storage_finish_speculative - mark speculative insertion as successful + * + * To successfully finish a speculative insertion we have to clear speculative + * token from tuple. To do so the t_ctid field, which will contain a + * speculative token value, is modified in place to point to the tuple itself, + * which is characteristic of a newly inserted ordinary tuple. + * + * NB: It is not ok to commit without either finishing or aborting a + * speculative insertion. We could treat speculative tuples of committed + * transactions implicitly as completed, but then we would have to be prepared + * to deal with speculative tokens on committed tuples. That wouldn't be + * difficult - no-one looks at the ctid field of a tuple with invalid xmax - + * but clearing the token at completion isn't very expensive either. + * An explicit confirmation WAL record also makes logical decoding simpler. + */ +void +storage_finish_speculative(Relation relation, TupleTableSlot *slot) +{ + relation->rd_stamroutine->speculative_finish(relation, slot); +} + +/* + * storage_abort_speculative - kill a speculatively inserted tuple + * + * Marks a tuple that was speculatively inserted in the same command as dead, + * by setting its xmin as invalid. That makes it immediately appear as dead + * to all transactions, including our own. 
In particular, it makes + * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend + * inserting a duplicate key value won't unnecessarily wait for our whole + * transaction to finish (it'll just wait for our speculative insertion to + * finish). + * + * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks + * that arise due to a mutual dependency that is not user visible. By + * definition, unprincipled deadlocks cannot be prevented by the user + * reordering lock acquisition in client code, because the implementation level + * lock acquisitions are not under the user's direct control. If speculative + * inserters did not take this precaution, then under high concurrency they + * could deadlock with each other, which would not be acceptable. + * + * This is somewhat redundant with heap_delete, but we prefer to have a + * dedicated routine with stripped down requirements. Note that this is also + * used to delete the TOAST tuples created during speculative insertion. + * + * This routine does not affect logical decoding as it only looks at + * confirmation records. + */ +void +storage_abort_speculative(Relation relation, TupleTableSlot *slot) +{ + relation->rd_stamroutine->speculative_abort(relation, slot); +} + +tuple_data +storage_tuple_get_data(Relation relation, StorageTuple tuple, tuple_data_flags flags) +{ + return relation->rd_stamroutine->get_tuple_data(tuple, flags); +} + +bool +storage_tuple_is_heaponly(Relation relation, StorageTuple tuple) +{ + return relation->rd_stamroutine->tuple_is_heaponly(tuple); +} + +StorageTuple +storage_tuple_by_datum(Relation relation, Datum data, Oid tableoid) +{ + if (relation) + return relation->rd_stamroutine->tuple_from_datum(data, tableoid); + else + return heap_form_tuple_by_datum(data, tableoid); +} + +void +storage_get_latest_tid(Relation relation, + Snapshot snapshot, + ItemPointer tid) +{ + relation->rd_stamroutine->tuple_get_latest_tid(relation, snapshot, tid); +} + +/* + * storage_sync - sync a heap, for use when no WAL has been written + * + * This forces the heap contents (including TOAST heap if any) down to disk. + * If we skipped using WAL, and WAL is otherwise needed, we must force the + * relation down to disk before it's safe to commit the transaction. This + * requires writing out any dirty buffers and then doing a forced fsync. + * + * Indexes are not touched. (Currently, index operations associated with + * the commands that use this are WAL-logged and so do not need fsync. + * That behavior might change someday, but in any case it's likely that + * any fsync decisions required would be per-index and hence not appropriate + * to be done here.) 
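+ *
+ * Typical caller pattern (sketch; mirrors the COPY, CREATE TABLE AS and
+ * ALTER TABLE rewrite call sites updated by this patch): after a bulk load
+ * performed with WAL skipped, force the relation to disk before commit:
+ *
+ *		if (hi_options & HEAP_INSERT_SKIP_WAL)
+ *			storage_sync(rel);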
+ */ +void +storage_sync(Relation rel) +{ + rel->rd_stamroutine->relation_sync(rel); +} diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 5a8f1dab83..d766a6eb6a 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -32,6 +32,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/storageam.h" #include "access/tuptoaster.h" #include "access/xact.h" #include "catalog/catalog.h" @@ -1777,7 +1778,12 @@ toast_delete_datum(Relation rel, Datum value, bool is_speculative) * Have a chunk, delete it */ if (is_speculative) - heap_abort_speculative(toastrel, toasttup); + { + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(toastrel)); + ExecStoreTuple(toasttup, slot, InvalidBuffer, false); + storage_abort_speculative(toastrel, slot); + ExecDropSingleTupleTableSlot(slot); + } else simple_heap_delete(toastrel, &toasttup->t_self); } diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index f1f546a321..beb7f050fe 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -22,6 +22,7 @@ #include "access/heapam.h" #include "access/htup_details.h" +#include "access/storageam.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/xlog.h" @@ -2696,8 +2697,6 @@ CopyFrom(CopyState cstate) if (slot == NULL) /* "do nothing" */ skip_tuple = true; - else /* trigger might have changed tuple */ - tuple = ExecHeapifySlot(slot); } if (!skip_tuple) @@ -2760,19 +2759,18 @@ CopyFrom(CopyState cstate) List *recheckIndexes = NIL; /* OK, store the tuple and create index entries for it */ - heap_insert(resultRelInfo->ri_RelationDesc, tuple, mycid, - hi_options, bistate); + storage_insert(resultRelInfo->ri_RelationDesc, slot, mycid, hi_options, bistate); if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, - &(tuple->t_self), + &(slot->tts_tid), estate, false, NULL, NIL); /* AFTER ROW INSERT Triggers */ - ExecARInsertTriggers(estate, resultRelInfo, tuple, + ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes, cstate->transition_capture); list_free(recheckIndexes); @@ -2868,7 +2866,7 @@ CopyFrom(CopyState cstate) * indexes since those use WAL anyway) */ if (hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(cstate->rel); + storage_sync(cstate->rel); return processed; } @@ -2901,12 +2899,12 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, * before calling it. 
*/ oldcontext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); - heap_multi_insert(cstate->rel, - bufferedTuples, - nBufferedTuples, - mycid, - hi_options, - bistate); + storage_multi_insert(cstate->rel, + bufferedTuples, + nBufferedTuples, + mycid, + hi_options, + bistate); MemoryContextSwitchTo(oldcontext); /* @@ -2925,7 +2923,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), estate, false, NULL, NIL); ExecARInsertTriggers(estate, resultRelInfo, - bufferedTuples[i], + myslot, recheckIndexes, cstate->transition_capture); list_free(recheckIndexes); } @@ -2942,8 +2940,9 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, for (i = 0; i < nBufferedTuples; i++) { cstate->cur_lineno = firstBufferedLineNo + i; + ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); ExecARInsertTriggers(estate, resultRelInfo, - bufferedTuples[i], + myslot, NIL, cstate->transition_capture); } } diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index a0ec444d33..d119149039 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -26,6 +26,7 @@ #include "access/reloptions.h" #include "access/htup_details.h" +#include "access/storageam.h" #include "access/sysattr.h" #include "access/xact.h" #include "access/xlog.h" @@ -582,25 +583,24 @@ static bool intorel_receive(TupleTableSlot *slot, DestReceiver *self) { DR_intorel *myState = (DR_intorel *) self; - HeapTuple tuple; /* * get the heap tuple out of the tuple table slot, making sure we have a * writable copy */ - tuple = ExecHeapifySlot(slot); + ExecMaterializeSlot(slot); /* * force assignment of new OID (see comments in ExecInsert) */ if (myState->rel->rd_rel->relhasoids) - HeapTupleSetOid(tuple, InvalidOid); + slot->tts_tupleOid = InvalidOid; - heap_insert(myState->rel, - tuple, - myState->output_cid, - myState->hi_options, - myState->bistate); + storage_insert(myState->rel, + slot, + myState->output_cid, + myState->hi_options, + myState->bistate); /* We know this is a newly created relation, so there are no indexes */ @@ -619,7 +619,7 @@ intorel_shutdown(DestReceiver *self) /* If we skipped using WAL, must heap_sync before commit */ if (myState->hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(myState->rel); + storage_sync(myState->rel); /* close rel, but keep lock until commit */ heap_close(myState->rel, NoLock); diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index b440740e28..6102481fde 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -16,6 +16,7 @@ #include "access/htup_details.h" #include "access/multixact.h" +#include "access/storageam.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" @@ -491,16 +492,15 @@ static bool transientrel_receive(TupleTableSlot *slot, DestReceiver *self) { DR_transientrel *myState = (DR_transientrel *) self; - HeapTuple tuple; /* * get the heap tuple out of the tuple table slot, making sure we have a * writable copy */ - tuple = ExecHeapifySlot(slot); + ExecMaterializeSlot(slot); - heap_insert(myState->transientrel, - tuple, + storage_insert(myState->transientrel, + slot, myState->output_cid, myState->hi_options, myState->bistate); @@ -522,7 +522,7 @@ transientrel_shutdown(DestReceiver *self) /* If we skipped using WAL, must heap_sync before commit */ if (myState->hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(myState->transientrel); + 
storage_sync(myState->transientrel); /* close transientrel, but keep lock until commit */ heap_close(myState->transientrel, NoLock); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 563bcda30c..c9e5ae0832 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -19,6 +19,7 @@ #include "access/multixact.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "access/storageam.h" #include "access/sysattr.h" #include "access/tupconvert.h" #include "access/xact.h" @@ -4652,7 +4653,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* Write the tuple out to the new relation */ if (newrel) - heap_insert(newrel, tuple, mycid, hi_options, bistate); + storage_insert(newrel, newslot, mycid, hi_options, bistate); ResetExprContext(econtext); @@ -4676,7 +4677,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* If we skipped writing WAL, then we need to sync the heap. */ if (hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(newrel); + storage_sync(newrel); heap_close(newrel, NoLock); } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 2f530169b8..8e2f351949 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2352,17 +2352,21 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo, void ExecARInsertTriggers(EState *estate, ResultRelInfo *relinfo, - HeapTuple trigtuple, List *recheckIndexes, + TupleTableSlot *slot, List *recheckIndexes, TransitionCaptureState *transition_capture) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; if ((trigdesc && trigdesc->trig_insert_after_row) || (transition_capture && transition_capture->tcs_insert_new_table)) + { + HeapTuple trigtuple = ExecHeapifySlot(slot); + AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_INSERT, true, NULL, trigtuple, recheckIndexes, NULL, transition_capture); + } } TupleTableSlot * @@ -3012,9 +3016,10 @@ GetTupleForTrigger(EState *estate, TupleTableSlot **newSlot) { Relation relation = relinfo->ri_RelationDesc; - HeapTupleData tuple; + StorageTuple tuple; HeapTuple result; Buffer buffer; + tuple_data t_data; if (newSlot != NULL) { @@ -3030,11 +3035,11 @@ GetTupleForTrigger(EState *estate, * lock tuple for update */ ltrmark:; - tuple.t_self = *tid; - test = heap_lock_tuple(relation, &tuple, + test = storage_lock_tuple(relation, tid, &tuple, estate->es_output_cid, lockmode, LockWaitBlock, false, &buffer, &hufd); + result = tuple; switch (test) { case HeapTupleSelfUpdated: @@ -3066,7 +3071,8 @@ ltrmark:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) + t_data = relation->rd_stamroutine->get_tuple_data(tuple, TID); + if (!ItemPointerEquals(&hufd.ctid, &(t_data.tid))) { /* it was updated, so look at the updated version */ TupleTableSlot *epqslot; @@ -3112,6 +3118,7 @@ ltrmark:; { Page page; ItemId lp; + HeapTupleData tupledata; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); @@ -3130,17 +3137,17 @@ ltrmark:; Assert(ItemIdIsNormal(lp)); - tuple.t_data = (HeapTupleHeader) PageGetItem(page, lp); - tuple.t_len = ItemIdGetLength(lp); - tuple.t_self = *tid; - tuple.t_tableOid = RelationGetRelid(relation); + tupledata.t_data = (HeapTupleHeader) PageGetItem(page, lp); + tupledata.t_len = ItemIdGetLength(lp); + tupledata.t_self = *tid; + tupledata.t_tableOid = RelationGetRelid(relation); LockBuffer(buffer, 
BUFFER_LOCK_UNLOCK); + + result = heap_copytuple(&tupledata); } - result = heap_copytuple(&tuple); ReleaseBuffer(buffer); - return result; } @@ -3953,8 +3960,8 @@ AfterTriggerExecute(AfterTriggerEvent event, AfterTriggerShared evtshared = GetTriggerSharedData(event); Oid tgoid = evtshared->ats_tgoid; TriggerData LocTriggerData; - HeapTupleData tuple1; - HeapTupleData tuple2; + StorageTuple tuple1; + StorageTuple tuple2; HeapTuple rettuple; Buffer buffer1 = InvalidBuffer; Buffer buffer2 = InvalidBuffer; @@ -4027,10 +4034,9 @@ AfterTriggerExecute(AfterTriggerEvent event, default: if (ItemPointerIsValid(&(event->ate_ctid1))) { - ItemPointerCopy(&(event->ate_ctid1), &(tuple1.t_self)); - if (!heap_fetch(rel, SnapshotAny, &tuple1, &buffer1, false, NULL)) + if (!storage_fetch(rel, &(event->ate_ctid1), SnapshotAny, &tuple1, &buffer1, false, NULL)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); - LocTriggerData.tg_trigtuple = &tuple1; + LocTriggerData.tg_trigtuple = tuple1; LocTriggerData.tg_trigtuplebuf = buffer1; } else @@ -4044,10 +4050,9 @@ AfterTriggerExecute(AfterTriggerEvent event, AFTER_TRIGGER_2CTID && ItemPointerIsValid(&(event->ate_ctid2))) { - ItemPointerCopy(&(event->ate_ctid2), &(tuple2.t_self)); - if (!heap_fetch(rel, SnapshotAny, &tuple2, &buffer2, false, NULL)) + if (!storage_fetch(rel, &(event->ate_ctid2), SnapshotAny, &tuple2, &buffer2, false, NULL)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); - LocTriggerData.tg_newtuple = &tuple2; + LocTriggerData.tg_newtuple = tuple2; LocTriggerData.tg_newtuplebuf = buffer2; } else diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 62fb05efac..8657a139ed 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1895,7 +1895,7 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, /* See the comment above. */ if (resultRelInfo->ri_PartitionRoot) { - HeapTuple tuple = ExecFetchSlotTuple(slot); + StorageTuple tuple = ExecFetchSlotTuple(slot); TupleDesc old_tupdesc = RelationGetDescr(rel); TupleConversionMap *map; @@ -1975,7 +1975,7 @@ ExecConstraints(ResultRelInfo *resultRelInfo, */ if (resultRelInfo->ri_PartitionRoot) { - HeapTuple tuple = ExecFetchSlotTuple(slot); + StorageTuple tuple = ExecFetchSlotTuple(slot); TupleConversionMap *map; rel = resultRelInfo->ri_PartitionRoot; @@ -2022,7 +2022,7 @@ ExecConstraints(ResultRelInfo *resultRelInfo, /* See the comment above. */ if (resultRelInfo->ri_PartitionRoot) { - HeapTuple tuple = ExecFetchSlotTuple(slot); + StorageTuple tuple = ExecFetchSlotTuple(slot); TupleDesc old_tupdesc = RelationGetDescr(rel); TupleConversionMap *map; @@ -2481,7 +2481,8 @@ EvalPlanQual(EState *estate, EPQState *epqstate, ItemPointer tid, TransactionId priorXmax) { TupleTableSlot *slot; - HeapTuple copyTuple; + StorageTuple copyTuple; + tuple_data t_data; Assert(rti > 0); @@ -2498,7 +2499,9 @@ EvalPlanQual(EState *estate, EPQState *epqstate, * For UPDATE/DELETE we have to return tid of actual row we're executing * PQ for. */ - *tid = copyTuple->t_self; + + t_data = storage_tuple_get_data(relation, copyTuple, TID); + *tid = t_data.tid; /* * Need to run a recheck subquery. Initialize or reinitialize EPQ state. @@ -2529,7 +2532,7 @@ EvalPlanQual(EState *estate, EPQState *epqstate, * is to guard against early re-use of the EPQ query. */ if (!TupIsNull(slot)) - (void) ExecMaterializeSlot(slot); + ExecMaterializeSlot(slot); /* * Clear out the test tuple. 
This is needed in case the EPQ query is @@ -2562,14 +2565,14 @@ EvalPlanQual(EState *estate, EPQState *epqstate, * Note: properly, lockmode should be declared as enum LockTupleMode, * but we use "int" to avoid having to include heapam.h in executor.h. */ -HeapTuple +StorageTuple EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, LockWaitPolicy wait_policy, ItemPointer tid, TransactionId priorXmax) { - HeapTuple copyTuple = NULL; - HeapTupleData tuple; + StorageTuple tuple = NULL; SnapshotData SnapshotDirty; + tuple_data t_data; /* * fetch target tuple @@ -2577,12 +2580,12 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * Loop here to deal with updated or busy tuples */ InitDirtySnapshot(SnapshotDirty); - tuple.t_self = *tid; for (;;) { Buffer buffer; + ItemPointerData ctid; - if (heap_fetch(relation, &SnapshotDirty, &tuple, &buffer, true, NULL)) + if (storage_fetch(relation, tid, &SnapshotDirty, &tuple, &buffer, true, NULL)) { HTSU_Result test; HeapUpdateFailureData hufd; @@ -2596,8 +2599,8 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * atomic, and Xmin never changes in an existing tuple, except to * invalid or frozen, and neither of those can match priorXmax.) */ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), - priorXmax)) + t_data = storage_tuple_get_data(relation, tuple, XMIN); + if (!TransactionIdEquals(t_data.xid, priorXmax)) { ReleaseBuffer(buffer); return NULL; } @@ -2618,7 +2621,8 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, { case LockWaitBlock: XactLockTableWait(SnapshotDirty.xmax, - relation, &tuple.t_self, + relation, + tid, XLTW_FetchUpdated); break; case LockWaitSkip: @@ -2647,17 +2651,20 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * that priorXmax == xmin, so we can test that variable instead of * doing HeapTupleHeaderGetXmin again. */ - if (TransactionIdIsCurrentTransactionId(priorXmax) && - HeapTupleHeaderGetCmin(tuple.t_data) >= estate->es_output_cid) + if (TransactionIdIsCurrentTransactionId(priorXmax)) { - ReleaseBuffer(buffer); - return NULL; + t_data = storage_tuple_get_data(relation, tuple, CMIN); + if (t_data.cid >= estate->es_output_cid) + { + ReleaseBuffer(buffer); + return NULL; + } } /* * This is a live tuple, so now try to lock it. */ - test = heap_lock_tuple(relation, &tuple, + test = storage_lock_tuple(relation, tid, tuple, estate->es_output_cid, lockmode, wait_policy, false, &buffer, &hufd); @@ -2696,12 +2703,15 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); +#if 0 /* XXX: "tuple" is an opaque StorageTuple here; this assertion needs a storage-AM-agnostic replacement */ /* Should not encounter speculative tuple on recheck */ Assert(!HeapTupleHeaderIsSpeculative(tuple.t_data)); - if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) +#endif + t_data = storage_tuple_get_data(relation, tuple, TID); + if (!ItemPointerEquals(&hufd.ctid, &t_data.tid)) { /* it was updated, so look at the updated version */ - tuple.t_self = hufd.ctid; + *tid = hufd.ctid; /* updated row should have xmin matching this xmax */ priorXmax = hufd.xmax; continue; } @@ -2723,10 +2733,6 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, return NULL; /* keep compiler quiet */ } - /* - * We got tuple - now copy it for use by recheck query. 
- */ - copyTuple = heap_copytuple(&tuple); ReleaseBuffer(buffer); break; } @@ -2735,7 +2741,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * If the referenced slot was actually empty, the latest version of * the row must have been deleted, so we need do nothing. */ - if (tuple.t_data == NULL) + if (tuple == NULL) { ReleaseBuffer(buffer); return NULL; @@ -2744,8 +2750,8 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, /* * As above, if xmin isn't what we're expecting, do nothing. */ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), - priorXmax)) + t_data = storage_tuple_get_data(relation, tuple, XMIN); + if (!TransactionIdEquals(t_data.xid, priorXmax)) { ReleaseBuffer(buffer); return NULL; @@ -2763,7 +2769,9 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * As above, it should be safe to examine xmax and t_ctid without the * buffer content lock, because they can't be changing. */ - if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid)) + t_data = storage_tuple_get_data(relation, tuple, CTID); + ctid = t_data.tid; + if (ItemPointerEquals(tid, &ctid)) { /* deleted, so forget about it */ ReleaseBuffer(buffer); @@ -2771,17 +2779,19 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, } /* updated, so look at the updated row */ - tuple.t_self = tuple.t_data->t_ctid; + *tid = ctid; + /* updated row should have xmin matching this xmax */ - priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data); + t_data = storage_tuple_get_data(relation, tuple, UPDATED_XID); + priorXmax = t_data.xid; ReleaseBuffer(buffer); /* loop back to fetch next in chain */ } /* - * Return the copied tuple + * Return the tuple */ - return copyTuple; + return tuple; } /* @@ -2827,7 +2837,7 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) * NB: passed tuple must be palloc'd; it may get freed later */ void -EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple) +EvalPlanQualSetTuple(EPQState *epqstate, Index rti, StorageTuple tuple) { EState *estate = epqstate->estate; @@ -2846,7 +2856,7 @@ EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple) /* * Fetch back the current test tuple (if any) for the specified RTI */ -HeapTuple +StorageTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti) { EState *estate = epqstate->estate; @@ -2874,7 +2884,7 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) ExecRowMark *erm = aerm->rowmark; Datum datum; bool isNull; - HeapTupleData tuple; + StorageTuple tuple; if (RowMarkRequiresRowShareLock(erm->markType)) elog(ERROR, "EvalPlanQual doesn't support locking rowmarks"); @@ -2905,8 +2915,6 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) if (erm->markType == ROW_MARK_REFERENCE) { - HeapTuple copyTuple; - Assert(erm->relation != NULL); /* fetch the tuple's ctid */ @@ -2930,11 +2938,11 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot lock rows in foreign table \"%s\"", RelationGetRelationName(erm->relation)))); - copyTuple = fdwroutine->RefetchForeignRow(epqstate->estate, + tuple = fdwroutine->RefetchForeignRow(epqstate->estate, erm, datum, &updated); - if (copyTuple == NULL) + if (tuple == NULL) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); /* @@ -2948,23 +2956,18 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) /* ordinary table, fetch the tuple */ Buffer buffer; - tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); - if (!heap_fetch(erm->relation, SnapshotAny, 
&tuple, &buffer, + if (!storage_fetch(erm->relation, (ItemPointer) DatumGetPointer(datum), SnapshotAny, &tuple, &buffer, false, NULL)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); - /* successful, copy tuple */ - copyTuple = heap_copytuple(&tuple); ReleaseBuffer(buffer); } /* store tuple */ - EvalPlanQualSetTuple(epqstate, erm->rti, copyTuple); + EvalPlanQualSetTuple(epqstate, erm->rti, tuple); } else { - HeapTupleHeader td; - Assert(erm->markType == ROW_MARK_COPY); /* fetch the whole-row Var for the relation */ @@ -2974,19 +2977,12 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) /* non-locked rels could be on the inside of outer joins */ if (isNull) continue; - td = DatumGetHeapTupleHeader(datum); - - /* build a temporary HeapTuple control structure */ - tuple.t_len = HeapTupleHeaderGetDatumLength(td); - tuple.t_data = td; - /* relation might be a foreign table, if so provide tableoid */ - tuple.t_tableOid = erm->relid; - /* also copy t_ctid in case there's valid data there */ - tuple.t_self = td->t_ctid; - - /* copy and store tuple */ - EvalPlanQualSetTuple(epqstate, erm->rti, - heap_copytuple(&tuple)); + + tuple = storage_tuple_by_datum(erm->relation, datum, erm->relid); + + /* store tuple */ + EvalPlanQualSetTuple(epqstate, erm->rti, tuple); + } } } @@ -3152,8 +3148,8 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) } else { - estate->es_epqTuple = (HeapTuple *) - palloc0(rtsize * sizeof(HeapTuple)); + estate->es_epqTuple = (StorageTuple *) + palloc0(rtsize * sizeof(StorageTuple)); estate->es_epqTupleSet = (bool *) palloc0(rtsize * sizeof(bool)); } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 6700f0ad80..8d625b6cbe 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/relscan.h" +#include "access/storageam.h" #include "access/transam.h" #include "access/xact.h" #include "commands/trigger.h" @@ -169,19 +170,19 @@ retry: Buffer buf; HeapUpdateFailureData hufd; HTSU_Result res; - HeapTupleData locktup; - - ItemPointerCopy(&outslot->tts_tid, &locktup.t_self); + StorageTuple locktup; PushActiveSnapshot(GetLatestSnapshot()); - res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), + res = storage_lock_tuple(rel, &(outslot->tts_tid), &locktup, GetCurrentCommandId(false), lockmode, LockWaitBlock, false /* don't follow updates */ , &buf, &hufd); /* the tuple slot already has the buffer pinned */ - ReleaseBuffer(buf); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + pfree(locktup); PopActiveSnapshot(); @@ -277,19 +278,20 @@ retry: Buffer buf; HeapUpdateFailureData hufd; HTSU_Result res; - HeapTupleData locktup; - - ItemPointerCopy(&outslot->tts_tid, &locktup.t_self); + StorageTuple locktup; PushActiveSnapshot(GetLatestSnapshot()); - res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), + res = storage_lock_tuple(rel, &(outslot->tts_tid), &locktup, GetCurrentCommandId(false), lockmode, LockWaitBlock, false /* don't follow updates */ , &buf, &hufd); /* the tuple slot already has the buffer pinned */ - ReleaseBuffer(buf); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + + pfree(locktup); PopActiveSnapshot(); @@ -327,7 +329,7 @@ void ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) { bool skip_tuple = false; - HeapTuple tuple; + StorageTuple tuple; ResultRelInfo *resultRelInfo = estate->es_result_relation_info; Relation rel = resultRelInfo->ri_RelationDesc; @@ 
-349,6 +351,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) if (!skip_tuple) { List *recheckIndexes = NIL; + tuple_data t_data; /* Check the constraints of the tuple */ if (rel->rd_att->constr) @@ -359,14 +362,15 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* OK, store the tuple and create index entries for it */ simple_heap_insert(rel, tuple); + t_data = storage_tuple_get_data(rel, tuple, TID); if (resultRelInfo->ri_NumIndices > 0) - recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + recheckIndexes = ExecInsertIndexTuples(slot, &(t_data.tid), estate, false, NULL, NIL); /* AFTER ROW INSERT Triggers */ - ExecARInsertTriggers(estate, resultRelInfo, tuple, + ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes, NULL); /* @@ -390,7 +394,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, TupleTableSlot *searchslot, TupleTableSlot *slot) { bool skip_tuple = false; - HeapTuple tuple; + StorageTuple tuple; ResultRelInfo *resultRelInfo = estate->es_result_relation_info; Relation rel = resultRelInfo->ri_RelationDesc; ItemPointer tid = &(searchslot->tts_tid); @@ -426,8 +430,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* OK, update the tuple and index entries for it */ simple_heap_update(rel, tid, tuple); - if (resultRelInfo->ri_NumIndices > 0 && - !HeapTupleIsHeapOnly(tuple)) + if (resultRelInfo->ri_NumIndices > 0 && !storage_tuple_is_heaponly(rel, tuple)) recheckIndexes = ExecInsertIndexTuples(slot, tid, estate, false, NULL, NIL); diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 93895600a5..f06f34a1ec 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -22,6 +22,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/storageam.h" #include "access/xact.h" #include "executor/executor.h" #include "executor/nodeLockRows.h" @@ -74,18 +75,20 @@ lnext: { ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc); ExecRowMark *erm = aerm->rowmark; - HeapTuple *testTuple; + StorageTuple *testTuple; Datum datum; bool isNull; - HeapTupleData tuple; + StorageTuple tuple; Buffer buffer; HeapUpdateFailureData hufd; LockTupleMode lockmode; HTSU_Result test; - HeapTuple copyTuple; + StorageTuple copyTuple; + ItemPointerData tid; + tuple_data t_data; /* clear any leftover test tuple for this rel */ - testTuple = &(node->lr_curtuples[erm->rti - 1]); + testTuple = (StorageTuple)(&(node->lr_curtuples[erm->rti - 1])); if (*testTuple != NULL) heap_freetuple(*testTuple); *testTuple = NULL; @@ -159,7 +162,7 @@ lnext: } /* okay, try to lock the tuple */ - tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); + tid = *((ItemPointer) DatumGetPointer(datum)); switch (erm->markType) { case ROW_MARK_EXCLUSIVE: @@ -180,11 +183,13 @@ lnext: break; } - test = heap_lock_tuple(erm->relation, &tuple, + test = storage_lock_tuple(erm->relation, &tid, &tuple, estate->es_output_cid, lockmode, erm->waitPolicy, true, &buffer, &hufd); - ReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + switch (test) { case HeapTupleWouldBlock: @@ -218,7 +223,8 @@ lnext: ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (ItemPointerEquals(&hufd.ctid, &tuple.t_self)) + t_data = erm->relation->rd_stamroutine->get_tuple_data(tuple, TID); + if (ItemPointerEquals(&hufd.ctid, &(t_data.tid))) { /* Tuple was deleted, so don't return it */ goto lnext; @@ 
-238,7 +244,8 @@ lnext: goto lnext; } /* remember the actually locked tuple's TID */ - tuple.t_self = copyTuple->t_self; + t_data = erm->relation->rd_stamroutine->get_tuple_data(copyTuple, TID); + tid = t_data.tid; /* Save locked tuple for EvalPlanQual testing below */ *testTuple = copyTuple; @@ -258,7 +265,7 @@ lnext: } /* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */ - erm->curCtid = tuple.t_self; + erm->curCtid = tid; } /* @@ -280,7 +287,7 @@ lnext: { ExecAuxRowMark *aerm = (ExecAuxRowMark *) lfirst(lc); ExecRowMark *erm = aerm->rowmark; - HeapTupleData tuple; + StorageTuple tuple; Buffer buffer; /* skip non-active child tables, but clear their test tuples */ @@ -308,14 +315,12 @@ lnext: Assert(ItemPointerIsValid(&(erm->curCtid))); /* okay, fetch the tuple */ - tuple.t_self = erm->curCtid; - if (!heap_fetch(erm->relation, SnapshotAny, &tuple, &buffer, + if (!storage_fetch(erm->relation, &erm->curCtid, SnapshotAny, &tuple, &buffer, false, NULL)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); /* successful, copy and store tuple */ - EvalPlanQualSetTuple(&node->lr_epqstate, erm->rti, - heap_copytuple(&tuple)); + EvalPlanQualSetTuple(&node->lr_epqstate, erm->rti, tuple); ReleaseBuffer(buffer); } @@ -394,8 +399,8 @@ ExecInitLockRows(LockRows *node, EState *estate, int eflags) * Create workspace in which we can remember per-RTE locked tuples */ lrstate->lr_ntables = list_length(estate->es_range_table); - lrstate->lr_curtuples = (HeapTuple *) - palloc0(lrstate->lr_ntables * sizeof(HeapTuple)); + lrstate->lr_curtuples = (StorageTuple *) + palloc0(lrstate->lr_ntables * sizeof(StorageTuple)); /* * Locate the ExecRowMark(s) that this node is responsible for, and diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index dd8f792404..4681dc8dac 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -38,7 +38,9 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/storageam.h" #include "access/xact.h" +#include "catalog/pg_am.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/nodeModifyTable.h" @@ -164,15 +167,13 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, econtext->ecxt_scantuple = tupleSlot; else { - HeapTuple tuple; - /* * RETURNING expressions might reference the tableoid column, so * initialize t_tableOid before evaluating them. */ Assert(!TupIsNull(econtext->ecxt_scantuple)); - tuple = ExecHeapifySlot(econtext->ecxt_scantuple); - tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + ExecSlotUpdateTupleTableoid(econtext->ecxt_scantuple, + RelationGetRelid(resultRelInfo->ri_RelationDesc)); } econtext->ecxt_outertuple = planSlot; @@ -191,7 +192,7 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, static void ExecCheckHeapTupleVisible(EState *estate, Relation rel, - HeapTuple tuple, + StorageTuple tuple, Buffer buffer) { if (!IsolationUsesXactSnapshot()) @@ -204,13 +205,15 @@ ExecCheckHeapTupleVisible(EState *estate, LockBuffer(buffer, BUFFER_LOCK_SHARE); if (!HeapTupleSatisfiesVisibility(rel->rd_stamroutine, tuple, estate->es_snapshot, buffer)) { + tuple_data t_data = storage_tuple_get_data(rel, tuple, XMIN); + /* * We should not raise a serialization failure if the conflict is * against a tuple inserted by our own transaction, even if it's not * visible to our snapshot. 
(This would happen, for example, if * conflicting keys are proposed for insertion in a single command.) */ - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + if (!TransactionIdIsCurrentTransactionId(t_data.xid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); @@ -226,19 +229,20 @@ ExecCheckTIDVisible(EState *estate, ResultRelInfo *relinfo, ItemPointer tid) { + Buffer buffer; Relation rel = relinfo->ri_RelationDesc; - Buffer buffer; - HeapTupleData tuple; + StorageTuple tuple; /* Redundantly check isolation level */ if (!IsolationUsesXactSnapshot()) return; - tuple.t_self = *tid; - if (!heap_fetch(rel, SnapshotAny, &tuple, &buffer, false, NULL)) + if (!storage_fetch(rel, tid, SnapshotAny, &tuple, &buffer, false, NULL)) elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); - ExecCheckHeapTupleVisible(estate, rel, &tuple, buffer); - ReleaseBuffer(buffer); + ExecCheckHeapTupleVisible(estate, rel, tuple, buffer); + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + pfree(tuple); } /* ---------------------------------------------------------------- @@ -259,7 +263,7 @@ ExecInsert(ModifyTableState *mtstate, EState *estate, bool canSetTag) { - HeapTuple tuple; + StorageTuple tuple; ResultRelInfo *resultRelInfo; ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; @@ -267,12 +271,6 @@ ExecInsert(ModifyTableState *mtstate, List *recheckIndexes = NIL; TupleTableSlot *result = NULL; - /* - * get the heap tuple out of the tuple table slot, making sure we have a - * writable copy - */ - tuple = ExecHeapifySlot(slot); - /* * get information on the (current) result relation */ @@ -284,6 +282,8 @@ ExecInsert(ModifyTableState *mtstate, int leaf_part_index; TupleConversionMap *map; + tuple = ExecHeapifySlot(slot); + /* * Away we go ... If we end up not finding a partition after all, * ExecFindPartition() does not return and errors out instead. @@ -374,19 +374,31 @@ ExecInsert(ModifyTableState *mtstate, resultRelationDesc = resultRelInfo->ri_RelationDesc; /* - * If the result relation has OIDs, force the tuple's OID to zero so that - * heap_insert will assign a fresh OID. Usually the OID already will be - * zero at this point, but there are corner cases where the plan tree can - * return a tuple extracted literally from some table with the same - * rowtype. + * Materialize the slot, so that it holds a local, writable copy of the + * tuple rather than a reference into a buffer or some other module's + * storage; the OID assignment and triggers below may modify its contents. + * XXX: revisit whether this needs to happen this early. + */ + ExecMaterializeSlot(slot); + + + /* + * If the result relation uses heapam and has OIDs, force the tuple's OID + * to zero so that heap_insert will assign a fresh OID. Usually the OID + * already will be zero at this point, but there are corner cases where the + * plan tree can return a tuple extracted literally from some table with + * the same rowtype. * * XXX if we ever wanted to allow users to assign their own OIDs to new * rows, this'd be the place to do it. For the moment, we make a point of * doing this before calling triggers, so that a user-supplied trigger * could hack the OID if desired. */ - if (resultRelationDesc->rd_rel->relhasoids) - HeapTupleSetOid(tuple, InvalidOid); + if (resultRelationDesc->rd_rel->relam == HEAPAM_STORAGE_AM_OID && + resultRelationDesc->rd_rel->relhasoids) + { + slot->tts_tupleOid = InvalidOid; + } /* * BEFORE ROW INSERT Triggers. 
@@ -404,9 +416,6 @@ ExecInsert(ModifyTableState *mtstate, if (slot == NULL) /* "do nothing" */ return NULL; - - /* trigger might have changed tuple */ - tuple = ExecHeapifySlot(slot); } /* INSTEAD OF ROW INSERT Triggers */ @@ -418,9 +427,6 @@ ExecInsert(ModifyTableState *mtstate, if (slot == NULL) /* "do nothing" */ return NULL; - /* trigger might have changed tuple */ - tuple = ExecHeapifySlot(slot); - newId = InvalidOid; } else if (resultRelInfo->ri_FdwRoutine) @@ -436,14 +442,12 @@ ExecInsert(ModifyTableState *mtstate, if (slot == NULL) /* "do nothing" */ return NULL; - /* FDW might have changed tuple */ - tuple = ExecHeapifySlot(slot); - /* * AFTER ROW Triggers or RETURNING expressions might reference the * tableoid column, so initialize t_tableOid before evaluating them. */ - tuple->t_tableOid = RelationGetRelid(resultRelationDesc); + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + ExecSlotUpdateTupleTableoid(slot, slot->tts_tableOid); newId = InvalidOid; } @@ -463,7 +467,8 @@ ExecInsert(ModifyTableState *mtstate, * Constraints might reference the tableoid column, so initialize * t_tableOid before evaluating them. */ - tuple->t_tableOid = RelationGetRelid(resultRelationDesc); + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); + ExecSlotUpdateTupleTableoid(slot, slot->tts_tableOid); /* * Check any RLS INSERT WITH CHECK policies @@ -554,24 +559,24 @@ ExecInsert(ModifyTableState *mtstate, * waiting for the whole transaction to complete. */ specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); - HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken); + slot->tts_speculativeToken = specToken; /* insert the tuple, with the speculative token */ - newId = heap_insert(resultRelationDesc, tuple, + newId = storage_insert(resultRelationDesc, slot, estate->es_output_cid, HEAP_INSERT_SPECULATIVE, NULL); /* insert index entries for tuple */ - recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + recheckIndexes = ExecInsertIndexTuples(slot, &(slot->tts_tid), estate, true, &specConflict, arbiterIndexes); /* adjust the tuple's state accordingly */ if (!specConflict) - heap_finish_speculative(resultRelationDesc, tuple); + storage_finish_speculative(resultRelationDesc, slot); else - heap_abort_speculative(resultRelationDesc, tuple); + storage_abort_speculative(resultRelationDesc, slot); /* * Wake up anyone waiting for our decision. They will re-check @@ -599,17 +604,14 @@ ExecInsert(ModifyTableState *mtstate, { /* * insert the tuple normally. - * - * Note: heap_insert returns the tid (location) of the new tuple - * in the t_self field. 
*/ - newId = heap_insert(resultRelationDesc, tuple, + newId = storage_insert(resultRelationDesc, slot, estate->es_output_cid, 0, NULL); /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) - recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + recheckIndexes = ExecInsertIndexTuples(slot, &(slot->tts_tid), estate, false, NULL, arbiterIndexes); } @@ -619,11 +621,11 @@ ExecInsert(ModifyTableState *mtstate, { (estate->es_processed)++; estate->es_lastoid = newId; - setLastTid(&(tuple->t_self)); + setLastTid(&(slot->tts_tid)); } /* AFTER ROW INSERT Triggers */ - ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, + ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes, mtstate->mt_transition_capture); list_free(recheckIndexes); @@ -674,7 +676,7 @@ ExecInsert(ModifyTableState *mtstate, static TupleTableSlot * ExecDelete(ModifyTableState *mtstate, ItemPointer tupleid, - HeapTuple oldtuple, + StorageTuple oldtuple, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, @@ -719,8 +721,6 @@ ExecDelete(ModifyTableState *mtstate, } else if (resultRelInfo->ri_FdwRoutine) { - HeapTuple tuple; - /* * delete from foreign table: let the FDW do it * @@ -746,8 +746,10 @@ ExecDelete(ModifyTableState *mtstate, */ if (slot->tts_isempty) ExecStoreAllNullTuple(slot); - tuple = ExecHeapifySlot(slot); - tuple->t_tableOid = RelationGetRelid(resultRelationDesc); + + ExecMaterializeSlot(slot); + + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); } else { @@ -761,7 +763,7 @@ ExecDelete(ModifyTableState *mtstate, * mode transactions. */ ldelete:; - result = heap_delete(resultRelationDesc, tupleid, + result = storage_delete(resultRelationDesc, tupleid, estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , @@ -861,7 +863,7 @@ ldelete:; * gotta fetch it. We can use the trigger tuple slot. */ TupleTableSlot *rslot; - HeapTupleData deltuple; + StorageTuple deltuple = NULL; Buffer delbuffer; if (resultRelInfo->ri_FdwRoutine) @@ -875,20 +877,19 @@ ldelete:; slot = estate->es_trig_tuple_slot; if (oldtuple != NULL) { - deltuple = *oldtuple; + deltuple = heap_copytuple(oldtuple); delbuffer = InvalidBuffer; } else { - deltuple.t_self = *tupleid; - if (!heap_fetch(resultRelationDesc, SnapshotAny, - &deltuple, &delbuffer, false, NULL)) + if (!storage_fetch(resultRelationDesc, tupleid, SnapshotAny, + &deltuple, &delbuffer, false, NULL)) elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); } if (slot->tts_tupleDescriptor != RelationGetDescr(resultRelationDesc)) ExecSetSlotDescriptor(slot, RelationGetDescr(resultRelationDesc)); - ExecStoreTuple(&deltuple, slot, InvalidBuffer, false); + ExecStoreTuple(deltuple, slot, InvalidBuffer, false); } rslot = ExecProcessReturning(resultRelInfo, slot, planSlot); @@ -897,7 +898,7 @@ ldelete:; * Before releasing the target tuple again, make sure rslot has a * local copy of any pass-by-reference values. 
*/ - ExecHeapifySlot(rslot); + ExecMaterializeSlot(rslot); ExecClearTuple(slot); if (BufferIsValid(delbuffer)) @@ -934,14 +935,14 @@ ldelete:; static TupleTableSlot * ExecUpdate(ModifyTableState *mtstate, ItemPointer tupleid, - HeapTuple oldtuple, + StorageTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, bool canSetTag) { - HeapTuple tuple; + StorageTuple tuple; ResultRelInfo *resultRelInfo; Relation resultRelationDesc; HTSU_Result result; @@ -1006,14 +1007,14 @@ ExecUpdate(ModifyTableState *mtstate, if (slot == NULL) /* "do nothing" */ return NULL; - /* FDW might have changed tuple */ - tuple = ExecHeapifySlot(slot); - /* * AFTER ROW Triggers or RETURNING expressions might reference the * tableoid column, so initialize t_tableOid before evaluating them. */ - tuple->t_tableOid = RelationGetRelid(resultRelationDesc); + ExecSlotUpdateTupleTableoid(slot, RelationGetRelid(resultRelationDesc)); + + /* FDW might have changed tuple */ + tuple = ExecHeapifySlot(slot); } else { @@ -1023,7 +1024,7 @@ ExecUpdate(ModifyTableState *mtstate, * Constraints might reference the tableoid column, so initialize * t_tableOid before evaluating them. */ - tuple->t_tableOid = RelationGetRelid(resultRelationDesc); + slot->tts_tableOid = RelationGetRelid(resultRelationDesc); /* * Check any RLS UPDATE WITH CHECK policies @@ -1059,7 +1060,7 @@ lreplace:; * needed for referential integrity updates in transaction-snapshot * mode transactions. */ - result = heap_update(resultRelationDesc, tupleid, tuple, + result = storage_update(resultRelationDesc, tupleid, slot, estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , @@ -1151,8 +1152,8 @@ lreplace:; * * If it's a HOT update, we mustn't insert new index entries. */ - if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) - recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + if ((resultRelInfo->ri_NumIndices > 0) && !storage_tuple_is_heaponly(resultRelationDesc, tuple)) + recheckIndexes = ExecInsertIndexTuples(slot, &(slot->tts_tid), estate, false, NULL, NIL); } @@ -1211,11 +1212,12 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, ExprContext *econtext = mtstate->ps.ps_ExprContext; Relation relation = resultRelInfo->ri_RelationDesc; ExprState *onConflictSetWhere = resultRelInfo->ri_onConflictSetWhere; - HeapTupleData tuple; + StorageTuple tuple = NULL; HeapUpdateFailureData hufd; LockTupleMode lockmode; HTSU_Result test; Buffer buffer; + tuple_data t_data; /* Determine lock mode to use */ lockmode = ExecUpdateLockMode(estate, resultRelInfo); @@ -1226,10 +1228,8 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, * previous conclusion that the tuple is conclusively committed is not * true anymore. */ - tuple.t_self = *conflictTid; - test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, - lockmode, LockWaitBlock, false, &buffer, - &hufd); + test = storage_lock_tuple(relation, conflictTid, &tuple, estate->es_output_cid, + lockmode, LockWaitBlock, false, &buffer, &hufd); switch (test) { case HeapTupleMayBeUpdated: @@ -1254,7 +1254,8 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, * that for SQL MERGE, an exception must be raised in the event of * an attempt to update the same row twice. 
*/ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple.t_data))) + t_data = storage_tuple_get_data(relation, tuple, XMIN); + if (TransactionIdIsCurrentTransactionId(t_data.xid)) ereport(ERROR, (errcode(ERRCODE_CARDINALITY_VIOLATION), errmsg("ON CONFLICT DO UPDATE command cannot affect row a second time"), @@ -1285,7 +1286,9 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, * loop here, as the new version of the row might not conflict * anymore, or the conflicting tuple has actually been deleted. */ - ReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + pfree(tuple); return false; default: @@ -1313,10 +1316,10 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, * snapshot. This is in line with the way UPDATE deals with newer tuple * versions. */ - ExecCheckHeapTupleVisible(estate, relation, &tuple, buffer); + ExecCheckHeapTupleVisible(estate, relation, tuple, buffer); /* Store target's existing tuple in the state's dedicated slot */ - ExecStoreTuple(&tuple, mtstate->mt_existing, buffer, false); + ExecStoreTuple(tuple, mtstate->mt_existing, buffer, false); /* * Make tuple and any needed join variables available to ExecQual and @@ -1331,7 +1334,9 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, if (!ExecQual(onConflictSetWhere, econtext)) { - ReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + pfree(tuple); InstrCountFiltered1(&mtstate->ps, 1); return true; /* done with the tuple */ } @@ -1371,12 +1376,14 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, */ /* Execute UPDATE with projection */ - *returning = ExecUpdate(mtstate, &tuple.t_self, NULL, + *returning = ExecUpdate(mtstate, conflictTid, NULL, mtstate->mt_conflproj, planSlot, &mtstate->mt_epqstate, mtstate->ps.state, canSetTag); - ReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + pfree(tuple); return true; } @@ -1569,7 +1576,7 @@ ExecModifyTable(PlanState *pstate) ItemPointer tupleid = NULL; ItemPointerData tuple_ctid; HeapTupleData oldtupdata; - HeapTuple oldtuple; + StorageTuple oldtuple; CHECK_FOR_INTERRUPTS(); diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 0ee76e7d25..8a6b2172ea 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -22,6 +22,7 @@ */ #include "postgres.h" +#include "access/storageam.h" #include "access/sysattr.h" #include "catalog/pg_type.h" #include "executor/execdebug.h" @@ -306,7 +307,7 @@ TidNext(TidScanState *node) ScanDirection direction; Snapshot snapshot; Relation heapRelation; - HeapTuple tuple; + StorageTuple tuple; TupleTableSlot *slot; Buffer buffer = InvalidBuffer; ItemPointerData *tidList; @@ -331,12 +332,6 @@ TidNext(TidScanState *node) tidList = node->tss_TidList; numTids = node->tss_NumTids; - /* - * We use node->tss_htup as the tuple pointer; note this can't just be a - * local variable here, as the scan tuple slot will keep a pointer to it. - */ - tuple = &(node->tss_htup); - /* * Initialize or advance scan position, depending on direction. */ @@ -364,7 +359,7 @@ TidNext(TidScanState *node) while (node->tss_TidPtr >= 0 && node->tss_TidPtr < numTids) { - tuple->t_self = tidList[node->tss_TidPtr]; + ItemPointerData tid = tidList[node->tss_TidPtr]; /* * For WHERE CURRENT OF, the tuple retrieved from the cursor might @@ -372,9 +367,9 @@ TidNext(TidScanState *node) * current according to our snapshot. 
*/ if (node->tss_isCurrentOf) - heap_get_latest_tid(heapRelation, snapshot, &tuple->t_self); + storage_get_latest_tid(heapRelation, snapshot, &tid); - if (heap_fetch(heapRelation, snapshot, tuple, &buffer, false, NULL)) + if (storage_fetch(heapRelation, &tid, snapshot, &tuple, &buffer, false, NULL)) { /* * store the scanned tuple in the scan tuple slot of the scan @@ -385,14 +380,16 @@ */ ExecStoreTuple(tuple, /* tuple to store */ slot, /* slot to store in */ - buffer, /* buffer associated with tuple */ - false); /* don't pfree */ + InvalidBuffer, /* buffer associated with tuple */ + true); /* pfree this copy when the slot is cleared */ /* * At this point we have an extra pin on the buffer, because * ExecStoreTuple incremented the pin count. Drop our local pin. */ + /* the storage AM may not have returned a pinned buffer */ + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); return slot; } diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 083f7d60a7..52779b7256 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -21,6 +21,7 @@ #include #include "access/heapam.h" +#include "access/storageam.h" #include "access/sysattr.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" @@ -354,7 +355,7 @@ currtid_byreloid(PG_FUNCTION_ARGS) ItemPointerCopy(tid, result); snapshot = RegisterSnapshot(GetLatestSnapshot()); - heap_get_latest_tid(rel, snapshot, result); + storage_get_latest_tid(rel, snapshot, result); UnregisterSnapshot(snapshot); heap_close(rel, AccessShareLock); @@ -389,7 +390,7 @@ currtid_byrelname(PG_FUNCTION_ARGS) ItemPointerCopy(tid, result); snapshot = RegisterSnapshot(GetLatestSnapshot()); - heap_get_latest_tid(rel, snapshot, result); + storage_get_latest_tid(rel, snapshot, result); UnregisterSnapshot(snapshot); heap_close(rel, AccessShareLock); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 4e41024e92..cdd45ef313 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -133,40 +133,19 @@ extern void heap_parallelscan_initialize(ParallelHeapScanDesc target, extern void heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan); extern HeapScanDesc heap_beginscan_parallel(Relation, ParallelHeapScanDesc); -extern bool heap_fetch(Relation relation, Snapshot snapshot, - HeapTuple tuple, Buffer *userbuf, bool keep_buf, - Relation stats_relation); +extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, + int options, BulkInsertState bistate); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call); extern bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, bool *all_dead); - -extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, - ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern BulkInsertState GetBulkInsertState(void); extern void FreeBulkInsertState(BulkInsertState); extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); -extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, - int options, BulkInsertState bistate); -extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, - CommandId cid, int options, BulkInsertState bistate); -extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd); -extern void heap_finish_speculative(Relation relation, HeapTuple tuple); -extern void 
heap_abort_speculative(Relation relation, HeapTuple tuple); -extern HTSU_Result heap_update(Relation relation, ItemPointer otid, - HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); -extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_update, - Buffer *buffer, HeapUpdateFailureData *hufd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, TransactionId cutoff_multi); @@ -179,7 +158,6 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); -extern void heap_sync(Relation relation); extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); /* in heap/pruneheap.c */ diff --git a/src/include/access/heapam_common.h b/src/include/access/heapam_common.h index 1fe15ede56..799b4edada 100644 --- a/src/include/access/heapam_common.h +++ b/src/include/access/heapam_common.h @@ -34,6 +34,111 @@ #include "utils/relcache.h" #include "utils/snapshot.h" +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. + * + * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock + * instead. + */ +static const struct +{ + LOCKMODE hwlock; + int lockstatus; + int updstatus; +} + + tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + +/* + * This table maps tuple lock strength values for each particular + * MultiXactStatus value. + */ +static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = +{ + LockTupleKeyShare, /* ForKeyShare */ + LockTupleShare, /* ForShare */ + LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ + LockTupleExclusive, /* ForUpdate */ + LockTupleNoKeyExclusive, /* NoKeyUpdate */ + LockTupleExclusive /* Update */ +}; + +/* Get the LockTupleMode for a given MultiXactStatus */ +#define TUPLOCK_from_mxstatus(status) \ + (MultiXactStatusLock[(status)]) + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. + * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. 
+ */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +/* Get the LOCKMODE for a given MultiXactStatus */ +#define LOCKMODE_from_mxstatus(status) \ + (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) +extern HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, + TransactionId xid, CommandId cid, int options); + +extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); +extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + +extern XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); +extern uint8 compute_infobits(uint16 infomask, uint16 infomask2); +extern void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2); +extern void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid); +extern bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, + LockTupleMode lockmode); +extern bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + uint16 infomask, Relation rel, int *remaining); + +extern void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, + Relation rel, ItemPointer ctid, XLTW_Oper oper, + int *remaining); +extern MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update); + +extern void heap_inplace_update(Relation relation, HeapTuple tuple); +extern bool heap_acquire_tuplock(Relation relation, ItemPointer tid, + LockTupleMode mode, LockWaitPolicy wait_policy, + bool *have_tuple_lock); /* in heap/heapam_common.c */ extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, @@ -43,6 +148,28 @@ extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin); typedef struct StorageSlotAmRoutine StorageSlotAmRoutine; extern StorageSlotAmRoutine* heapam_storage_slot_handler(void); + +/* + * Given two versions of the same t_infomask for a tuple, compare them and + * return whether the relevant status for a tuple Xmax has changed. This is + * used after a buffer lock has been released and reacquired: we want to ensure + * that the tuple state continues to be the same it was when we previously + * examined it. + * + * Note the Xmax field itself must be compared separately. 
+ */ +static inline bool +xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) +{ + const uint16 interesting = + HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; + + if ((new_infomask & interesting) != (old_infomask & interesting)) + return true; + + return false; +} + /* * SetHintBits() * diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 9539d67bec..16dfb3e748 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -811,6 +811,7 @@ extern Datum heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, extern HeapTuple heap_copytuple(HeapTuple tuple); extern void heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest); extern Datum heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc); +extern HeapTuple heap_form_tuple_by_datum(Datum data, Oid relid); extern HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull); extern HeapTuple heap_modify_tuple(HeapTuple tuple, diff --git a/src/include/access/storageam.h b/src/include/access/storageam.h new file mode 100644 index 0000000000..9502c92318 --- /dev/null +++ b/src/include/access/storageam.h @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * storageam.h + * POSTGRES storage access method definitions. + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/storageam.h + * + *------------------------------------------------------------------------- + */ +#ifndef STORAGEAM_H +#define STORAGEAM_H + +#include "access/heapam.h" +#include "executor/tuptable.h" + +/* A physical tuple coming from a storage AM scan */ +typedef void *StorageTuple; + +typedef union tuple_data +{ + TransactionId xid; + CommandId cid; + ItemPointerData tid; +} tuple_data; + +typedef enum tuple_data_flags +{ + XMIN = 0, + UPDATED_XID, + CMIN, + TID, + CTID +} tuple_data_flags; + +extern bool storage_fetch(Relation relation, + ItemPointer tid, + Snapshot snapshot, + StorageTuple *stuple, + Buffer *userbuf, + bool keep_buf, + Relation stats_relation); + +extern HTSU_Result storage_lock_tuple(Relation relation, ItemPointer tid, StorageTuple *stuple, + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_updates, + Buffer *buffer, HeapUpdateFailureData *hufd); + +extern Oid storage_insert(Relation relation, TupleTableSlot *slot, CommandId cid, + int options, BulkInsertState bistate); + +extern HTSU_Result storage_delete(Relation relation, ItemPointer tid, CommandId cid, + Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); + +extern HTSU_Result storage_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + +extern void storage_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, + CommandId cid, int options, BulkInsertState bistate); + +extern void storage_abort_speculative(Relation relation, TupleTableSlot *slot); +extern void storage_finish_speculative(Relation relation, TupleTableSlot *slot); + +extern tuple_data storage_tuple_get_data(Relation relation, StorageTuple tuple, tuple_data_flags flags); + +extern bool storage_tuple_is_heaponly(Relation relation, StorageTuple tuple); + +extern StorageTuple storage_tuple_by_datum(Relation relation, Datum data, Oid tableoid); + +extern void 
storage_get_latest_tid(Relation relation, + Snapshot snapshot, + ItemPointer tid); + +extern void storage_sync(Relation rel); + +#endif diff --git a/src/include/access/storageamapi.h b/src/include/access/storageamapi.h index 95fe02888f..c2e6dc2aef 100644 --- a/src/include/access/storageamapi.h +++ b/src/include/access/storageamapi.h @@ -13,32 +13,13 @@ #include "access/htup.h" #include "access/heapam.h" +#include "access/storageam.h" #include "access/sdir.h" #include "access/skey.h" #include "executor/tuptable.h" #include "utils/relcache.h" #include "utils/snapshot.h" -/* A physical tuple coming from a storage AM scan */ -typedef void *StorageTuple; - -typedef union tuple_data -{ - TransactionId xid; - CommandId cid; - ItemPointerData tid; -} tuple_data; - -typedef enum tuple_data_flags -{ - XMIN = 0, - UPDATED_XID, - CMIN, - TID, - CTID -} tuple_data_flags; - - typedef HeapScanDesc (*scan_begin_hook) (Relation relation, Snapshot snapshot, int nkeys, ScanKey key, diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index adbcfa1297..203371148c 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -190,7 +190,7 @@ extern TupleTableSlot *ExecBRInsertTriggers(EState *estate, TupleTableSlot *slot); extern void ExecARInsertTriggers(EState *estate, ResultRelInfo *relinfo, - HeapTuple trigtuple, + TupleTableSlot *slot, List *recheckIndexes, TransitionCaptureState *transition_capture); extern TupleTableSlot *ExecIRInsertTriggers(EState *estate, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 770881849c..8704b7b54c 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -196,7 +196,7 @@ extern ExecAuxRowMark *ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist); extern TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate, Relation relation, Index rti, int lockmode, ItemPointer tid, TransactionId priorXmax); -extern HeapTuple EvalPlanQualFetch(EState *estate, Relation relation, +extern StorageTuple EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, LockWaitPolicy wait_policy, ItemPointer tid, TransactionId priorXmax); extern void EvalPlanQualInit(EPQState *epqstate, EState *estate, @@ -204,8 +204,8 @@ extern void EvalPlanQualInit(EPQState *epqstate, EState *estate, extern void EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks); extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, - HeapTuple tuple); -extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); + StorageTuple tuple); +extern StorageTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); extern void ExecSetupPartitionTupleRouting(Relation rel, Index resultRTindex, EState *estate, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index c6d3021c85..c19698089b 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -503,7 +503,7 @@ typedef struct EState * remember if the tuple has been returned already. Arrays are of size * list_length(es_range_table) and are indexed by scan node scanrelid - 1. 
*/ - HeapTuple *es_epqTuple; /* array of EPQ substitute tuples */ + StorageTuple *es_epqTuple; /* array of EPQ substitute tuples */ bool *es_epqTupleSet; /* true if EPQ tuple is provided */ bool *es_epqScanDone; /* true if EPQ tuple has been fetched */ @@ -2023,7 +2023,7 @@ typedef struct LockRowsState PlanState ps; /* its first field is NodeTag */ List *lr_arowMarks; /* List of ExecAuxRowMarks */ EPQState lr_epqstate; /* for evaluating EvalPlanQual rechecks */ - HeapTuple *lr_curtuples; /* locked tuples (one entry per RT entry) */ + StorageTuple *lr_curtuples; /* locked tuples (one entry per RT entry) */ int lr_ntables; /* length of lr_curtuples[] array */ } LockRowsState; -- 2.14.1.windows.1
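
For reviewers, here is a rough sketch (not part of the patch, appended after the trailer so it does not affect applying it) of how a caller is expected to drive the new slot-based insert API. The function example_storage_insert and its parameters are invented for illustration; the cid/options choices simply mirror the non-speculative path in ExecInsert above, and error handling, index maintenance and WAL considerations are omitted.

#include "postgres.h"

#include "access/storageam.h"		/* storage_insert() and friends */
#include "access/xact.h"		/* GetCurrentCommandId() */
#include "executor/tuptable.h"		/* TupleTableSlot routines */
#include "utils/rel.h"			/* RelationGetDescr() */

/*
 * Illustrative only: insert one already-formed heap tuple into "rel"
 * through the storage AM instead of calling heap_insert() directly.
 */
static Oid
example_storage_insert(Relation rel, HeapTuple tup)
{
	TupleTableSlot *slot;
	Oid			newId;

	/* The storage AM insert path works on slots, so wrap the tuple. */
	slot = MakeSingleTupleTableSlot(RelationGetDescr(rel));
	ExecStoreTuple(tup, slot, InvalidBuffer, false);

	/* Slot-based equivalent of heap_insert(rel, tup, cid, 0, NULL). */
	newId = storage_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);

	/* After the call, slot->tts_tid carries the TID chosen by the AM. */
	ExecDropSingleTupleTableSlot(slot);

	return newId;
}

The point of the slot-based signature is that the caller no longer hands the AM a HeapTuple directly: the AM extracts whatever representation it needs from the slot and reports the new tuple's location back through the slot (tts_tid), which is what the CopyFrom and ExecInsert changes above rely on when building index entries.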