From 889d108c71bb390543826f982d6163d122db6979 Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Tue, 13 Feb 2018 12:37:52 +0530 Subject: [PATCH 1/2] Invalidate ip_blkid v6 v6: - Changes w.r.t Amit's suggestion[11]. - AFAICS, Other than previous places(v6-wip), I didn't found any other places where we need the check for invalid block numberother than. - Minor change in the comments : 'partition key' -> 'the partition key' - ran pgindent. v6-wip: Update w.r.t Andres Freund review comments[8] - Added HeapTupleHeaderValidBlockNumber, HeapTupleHeaderSetBlockNumber and ItemPointerValidBlockNumber macro. - Fixed comments as per Andres' suggestions. - Also added invalid block number check in heap_get_latest_tid as discussed in the thread[9], similar change made in heap_lock_updated_tuple_rec. - In heap_lock_updated_tuple, rewrite_heap_tuple & EvalPlanQualFetch, I've added check for invalid block number where ItemPointerEquals used to conclude tuple has been updated/deleted. TODO: 1. Yet to test changes made in the heap_lock_updated_tuple, rewrite_heap_tuple & EvalPlanQualFetch function are valid or not. Also, address TODO tags in the same function. 2. Also check there are other places where similar changes like above needed(wrt Pavan Deolasee[10] concern) v5: - Added code in heap_mask to skip wal_consistency_checking[7] - Fixed previous todos. v5-wip2: - Minor changes in RelationFindReplTupleByIndex() and RelationFindReplTupleSeq() - TODO; Same as the privious v5-wip: Update w.r.t Amit Kapila's comments[6]. - Reverted error message in nodeModifyTable.c from 'tuple to be locked' to 'tuple to be updated'. - TODO: 1. Yet to made a decision of having LOG/ELOG/ASSERT in the RelationFindReplTupleByIndex() and RelationFindReplTupleSeq(). v4: Rebased on "UPDATE of partition key v35" patch[5]. v3: Update w.r.t Amit Kapila's[3] & Alvaro Herrera[4] comments - typo in all error message and comment : "to an another" -> "to another" - error message change : "tuple to be updated" -> "tuple to be locked" - In ExecOnConflictUpdate(), error report converted into assert & comments added. v2: Updated w.r.t Robert review comments[2] - Updated couple of comment of heap_delete argument and ItemPointerData - Added same concurrent update error logic in ExecOnConflictUpdate, RelationFindReplTupleByIndex and RelationFindReplTupleSeq v1: Initial version -- as per Amit Kapila's suggestions[1] - When tuple is being moved to another partition then ip_blkid in the tuple header mark to InvalidBlockNumber. ------------- References: ------------- 1] https://postgr.es/m/CAA4eK1KEZQ%2BCyXbBzfn1jFHoEfa_OemDLhLyy7xfD1QUZLo1DQ%40mail.gmail.com 2] https://postgr.es/m/CA%2BTgmoYY98AEjh7RDtuzaLC--_0smCozXRu6bFmZTaX5Ne%3DB5Q%40mail.gmail.com 3] https://postgr.es/m/CAA4eK1LQS6TmsGaEwR9HgF-9TZTHxrdAELuX6wOZBDbbjOfDjQ@mail.gmail.com 4] https://postgr.es/m/20171124160756.eyljpmpfzwd6jmnr@alvherre.pgsql 5] https://postgr.es/m/CAJ3gD9dixkkMzNnnP1CaZ1H17-U17ok_sVbjZZo+wnB=rJH6yg@mail.gmail.com 6] https://postgr.es/m/CAA4eK1LHVnNWYF53F1gUGx6CTxuvznozvU-Lr-dfE=Qeu1gEcg@mail.gmail.com 7] https://postgr.es/m/CAAJ_b94_29wiUA83W8LQjtfjv9XNV=+PT8+ioWRPjnnFHe3eqw@mail.gmail.com 8] https://postgr.es/m/20180305232353.gpue7jldnm4bjf4i@alap3.anarazel.de 9] https://postgr.es/m/CAAJ_b97BBkRWFowGRs9VNzFykoK0ikGB1yYEsWfYK8xR5enSrw@mail.gmail.com 10] https://postgr.es/m/CABOikdPXwqkLGgTZZm2qYwTn4L69V36rCh55fFma1fAYbon7Vg@mail.gmail.com 11] https://postgr.es/m/CAAJ_b97ohg=+WfiFT3g2x14rvXsXOFXvjH43GkYbgcLZvF7k+w@mail.gmail.com --- src/backend/access/heap/heapam.c | 37 +++++++++++++++++++++++++++++++--- src/backend/access/heap/rewriteheap.c | 1 + src/backend/commands/trigger.c | 5 +++++ src/backend/executor/execMain.c | 7 ++++++- src/backend/executor/execReplication.c | 26 ++++++++++++++++-------- src/backend/executor/nodeLockRows.c | 5 +++++ src/backend/executor/nodeModifyTable.c | 28 +++++++++++++++++++++---- src/include/access/heapam.h | 2 +- src/include/access/htup_details.h | 6 ++++++ src/include/storage/itemptr.h | 11 +++++++++- 10 files changed, 110 insertions(+), 18 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c08ab14c02..54e6c6bde0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2304,6 +2304,7 @@ heap_get_latest_tid(Relation relation, */ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(tp.t_data) || + !HeapTupleHeaderValidBlockNumber(tp.t_data) || ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) { UnlockReleaseBuffer(buffer); @@ -2314,6 +2315,9 @@ heap_get_latest_tid(Relation relation, priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); UnlockReleaseBuffer(buffer); } /* end of loop */ + + /* Make sure that the return value has a valid block number */ + Assert(ItemPointerValidBlockNumber(tid)); } @@ -3037,6 +3041,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) * crosscheck - if not InvalidSnapshot, also check tuple against this * wait - true if should wait for any conflicting update to commit/abort * hufd - output parameter, filled in failure cases (see below) + * changing_part - true iff the tuple is being moved to another partition + * table due to an update of the partition key. Otherwise, + * false. * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we did delete it. Failure return codes are @@ -3052,7 +3059,7 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) HTSU_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd) + HeapUpdateFailureData *hufd, bool changing_part) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3320,6 +3327,13 @@ l1: /* Make sure there is no forward chain link in t_ctid */ tp.t_data->t_ctid = tp.t_self; + /* + * Sets a block identifier to the InvalidBlockNumber to indicate such an + * update being moved tuple to another partition. + */ + if (changing_part) + HeapTupleHeaderSetBlockNumber(tp.t_data, InvalidBlockNumber); + MarkBufferDirty(buffer); /* @@ -3445,7 +3459,7 @@ simple_heap_delete(Relation relation, ItemPointer tid) result = heap_delete(relation, tid, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */ , - &hufd); + &hufd, false); switch (result) { case HeapTupleSelfUpdated: @@ -5956,6 +5970,7 @@ l4: next: /* if we find the end of update chain, we're done. */ if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || + !HeapTupleHeaderValidBlockNumber(mytup.t_data) || ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || HeapTupleHeaderIsOnlyLocked(mytup.t_data)) { @@ -6007,7 +6022,8 @@ static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, TransactionId xid, LockTupleMode mode) { - if (!ItemPointerEquals(&tuple->t_self, ctid)) + if (ItemPointerValidBlockNumber(ctid) && + !ItemPointerEquals(&tuple->t_self, ctid)) { /* * If this is the first possibly-multixact-able operation in the @@ -9322,6 +9338,21 @@ heap_mask(char *pagedata, BlockNumber blkno) */ if (HeapTupleHeaderIsSpeculative(page_htup)) ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * For a deleted tuple, a block identifier is set to + * InvalidBlockNumber to indicate that the tuple has been moved to + * another partition due to an update of the partition key. + * + * During redo, heap_xlog_delete sets t_ctid to current block + * number and self offset number. It doesn't verify the tuple is + * deleted by usual delete/update or deleted by the update of the + * partition key on the master. Hence, like speculative tuple, to + * ignore any inconsistency set block identifier to current block + * number. + */ + if (!HeapTupleHeaderValidBlockNumber(page_htup)) + HeapTupleHeaderSetBlockNumber(page_htup, blkno); } /* diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 7d466c2588..0943d95ea1 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -424,6 +424,7 @@ rewrite_heap_tuple(RewriteState state, */ if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && + HeapTupleHeaderValidBlockNumber(old_tuple->t_data) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index fbd176b5d0..93c1f2a51f 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -3071,6 +3071,11 @@ ltrmark:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* it was updated, so look at the updated version */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 91ba939bdc..884c012e5b 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2712,6 +2712,10 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); /* Should not encounter speculative tuple on recheck */ Assert(!HeapTupleHeaderIsSpeculative(tuple.t_data)); @@ -2780,7 +2784,8 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, * As above, it should be safe to examine xmax and t_ctid without the * buffer content lock, because they can't be changing. */ - if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid)) + if (!HeapTupleHeaderValidBlockNumber(tuple.t_data) || + ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid)) { /* deleted, so forget about it */ ReleaseBuffer(buffer); diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 32891abbdf..8430420de7 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -190,10 +190,15 @@ retry: case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: - /* XXX: Improve handling here */ - ereport(LOG, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("concurrent update, retrying"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(LOG, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); goto retry; case HeapTupleInvisible: elog(ERROR, "attempted to lock invisible tuple"); @@ -348,10 +353,15 @@ retry: case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: - /* XXX: Improve handling here */ - ereport(LOG, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("concurrent update, retrying"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(LOG, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying"))); + else + /* XXX: Improve handling here */ + ereport(LOG, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("concurrent update, retrying"))); goto retry; case HeapTupleInvisible: elog(ERROR, "attempted to lock invisible tuple"); diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index b39ccf7dc1..dd4d5f25ca 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -218,6 +218,11 @@ lnext: ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be locked was already moved to another partition due to concurrent update"))); + if (ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* Tuple was deleted, so don't return it */ diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c32928d9bd..eae2b4c732 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -719,7 +719,8 @@ ExecDelete(ModifyTableState *mtstate, EState *estate, bool *tupleDeleted, bool processReturning, - bool canSetTag) + bool canSetTag, + bool changing_part) { ResultRelInfo *resultRelInfo; Relation resultRelationDesc; @@ -810,7 +811,8 @@ ldelete:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &hufd); + &hufd, + changing_part); switch (result) { case HeapTupleSelfUpdated: @@ -856,6 +858,11 @@ ldelete:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be updated was already moved to another partition due to concurrent update"))); + if (!ItemPointerEquals(tupleid, &hufd.ctid)) { TupleTableSlot *epqslot; @@ -1158,7 +1165,7 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, - &tuple_deleted, false, false); + &tuple_deleted, false, false, true); /* * For some reason if DELETE didn't happen (e.g. trigger prevented @@ -1303,6 +1310,11 @@ lreplace:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + if (!ItemPointerValidBlockNumber(&hufd.ctid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tuple to be updated was already moved to another partition due to concurrent update"))); + if (!ItemPointerEquals(tupleid, &hufd.ctid)) { TupleTableSlot *epqslot; @@ -1473,6 +1485,14 @@ ExecOnConflictUpdate(ModifyTableState *mtstate, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(ItemPointerValidBlockNumber(&hufd.ctid)); + /* * Tell caller to try again from the very start. * @@ -2062,7 +2082,7 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag); + NULL, true, node->canSetTag, false); break; default: elog(ERROR, "unknown operation"); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 4c0256b18a..e8da83c303 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -156,7 +156,7 @@ extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd); + HeapUpdateFailureData *hufd, bool changing_part); extern void heap_finish_speculative(Relation relation, HeapTuple tuple); extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 2ab1815390..12df36c70b 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -441,6 +441,12 @@ do { \ ItemPointerSet(&(tup)->t_ctid, token, SpecTokenOffsetNumber) \ ) +#define HeapTupleHeaderSetBlockNumber(tup, blkno) \ + ItemPointerSetBlockNumber(&(tup)->t_ctid, blkno) + +#define HeapTupleHeaderValidBlockNumber(tup) \ + ItemPointerValidBlockNumber(&(tup)->t_ctid) + #define HeapTupleHeaderGetDatumLength(tup) \ VARSIZE(tup) diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h index 6c9ed3696b..131ec518bf 100644 --- a/src/include/storage/itemptr.h +++ b/src/include/storage/itemptr.h @@ -23,7 +23,9 @@ * This is a pointer to an item within a disk page of a known file * (for example, a cross-link from an index to its parent table). * blkid tells us which block, posid tells us which entry in the linp - * (ItemIdData) array we want. + * (ItemIdData) array we want. blkid is marked InvalidBlockNumber when + * a tuple is moved to another partition relation due to an update of + * the partition key. * * Note: because there is an item pointer in each tuple header and index * tuple header on disk, it's very important not to waste space with @@ -60,6 +62,13 @@ typedef ItemPointerData *ItemPointer; #define ItemPointerIsValid(pointer) \ ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0))) +/* + * ItemPointerIsValid + * True iff the block number of the item pointer is valid. + */ +#define ItemPointerValidBlockNumber(pointer) \ + ((bool) (BlockNumberIsValid(ItemPointerGetBlockNumberNoCheck(pointer)))) + /* * ItemPointerGetBlockNumberNoCheck * Returns the block number of a disk item pointer. -- 2.14.1