diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c32928d..75b7e10 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -705,7 +705,9 @@ ExecInsert(ModifyTableState *mtstate, * foreign table, tupleid is invalid; the FDW has to figure out * which row to delete using data from the planSlot. oldtuple is * passed to foreign table triggers; it is NULL when the foreign - * table has no relevant triggers. + * table has no relevant triggers. When this DELETE is a part of + * an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. * * Returns RETURNING result if any, otherwise NULL. * ---------------------------------------------------------------- @@ -718,6 +720,7 @@ ExecDelete(ModifyTableState *mtstate, EPQState *epqstate, EState *estate, bool *tupleDeleted, + TupleTableSlot **epqslot, bool processReturning, bool canSetTag) { @@ -858,19 +861,37 @@ ldelete:; errmsg("could not serialize access due to concurrent update"))); if (!ItemPointerEquals(tupleid, &hufd.ctid)) { - TupleTableSlot *epqslot; + TupleTableSlot *my_epqslot; - epqslot = EvalPlanQual(estate, + my_epqslot = EvalPlanQual(estate, epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, LockTupleExclusive, &hufd.ctid, hufd.xmax); - if (!TupIsNull(epqslot)) + if (!TupIsNull(my_epqslot)) { *tupleid = hufd.ctid; - goto ldelete; + + /* + * If this is part of an UPDATE of partition-key, the + * epq tuple will contain the changes from this + * transaction over and above the updates done by the + * other transaction. The caller should now use this + * tuple as its NEW tuple, rather than the earlier NEW + * tuple. + */ + if (epqslot) + { + *epqslot = my_epqslot; + return NULL; + } + else + { + /* Normal DELETE: So delete the re-fetched row. */ + goto ldelete; + } } } /* tuple already deleted; nothing to do */ @@ -1141,6 +1162,7 @@ lreplace:; { bool tuple_deleted; TupleTableSlot *ret_slot; + TupleTableSlot *epqslot = NULL; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; int map_index; TupleConversionMap *tupconv_map; @@ -1158,7 +1180,7 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, - &tuple_deleted, false, false); + &tuple_deleted, &epqslot, false, false); /* * For some reason if DELETE didn't happen (e.g. trigger prevented @@ -1181,7 +1203,23 @@ lreplace:; * resurrect it. */ if (!tuple_deleted) - return NULL; + { + /* + * epqslot will be typically NULL. But when ExecDelete() finds + * that another transaction has concurrently updated the same + * row, it re-fetches the row, skips the delete, and epqslot is + * set to the re-fetched tuple slot. In that case, we need to + * do all the checks again. + */ + if (TupIsNull(epqslot)) + return NULL; + else + { + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); + tuple = ExecMaterializeSlot(slot); + goto lreplace; + } + } /* * Updates set the transition capture map only when a new subplan @@ -2062,7 +2100,7 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag); + NULL, NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); diff --git a/src/test/isolation/expected/partition-key-update-1.out b/src/test/isolation/expected/partition-key-update-1.out new file mode 100644 index 0000000..79eb899 --- /dev/null +++ b/src/test/isolation/expected/partition-key-update-1.out @@ -0,0 +1,12 @@ +Parsed test spec with 2 sessions + +starting permutation: s2u s1u s2c s1c s1s +step s2u: UPDATE foo SET val = val || ' update2' WHERE key = 1; +step s1u: UPDATE foo SET key = key + 1, val = val || ' update1' WHERE key=1; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT * FROM foo ORDER BY 1; +key val + +2 initial update2 update1 diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 74d7d59..fbe325c 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -66,3 +66,4 @@ test: async-notify test: vacuum-reltuples test: timeouts test: vacuum-concurrent-drop +test: partition-key-update-1 diff --git a/src/test/isolation/specs/partition-key-update-1.spec b/src/test/isolation/specs/partition-key-update-1.spec new file mode 100644 index 0000000..da27a98 --- /dev/null +++ b/src/test/isolation/specs/partition-key-update-1.spec @@ -0,0 +1,31 @@ +# Concurrency behaviour from ExecUpdate and ExecDelete. + +# Transaction A is moving a row into another partition, but is waiting for +# another transaction B that is updating the original row. The row that ends up +# in the new partition should contain the changes made by transaction B. + +setup +{ + CREATE TABLE foo (key integer, val text) PARTITION BY LIST (key); + CREATE TABLE foo_part1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo_part2 PARTITION OF foo FOR VALUES IN (2); + INSERT INTO foo VALUES (1, 'initial'); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +setup { BEGIN; } +step "s1u" { UPDATE foo SET key = key + 1, val = val || ' update1' WHERE key=1; } +step "s1c" { COMMIT; } +step "s1s" { SELECT * FROM foo ORDER BY 1; } + +session "s2" +setup { BEGIN; } +step "s2u" { UPDATE foo SET val = val || ' update2' WHERE key = 1; } +step "s2c" { COMMIT; } + +permutation "s2u" "s1u" "s2c" "s1c" "s1s"