diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml
index e6f50ec..1517757 100644
--- a/doc/src/sgml/ddl.sgml
+++ b/doc/src/sgml/ddl.sgml
@@ -3005,6 +3005,11 @@ VALUES ('Albany', NULL, NULL, 'NY');
foreign table partitions.
+
+ Updating the partition key of a row might cause it to be moved into a
+ different partition, one whose partition constraint the new row satisfies.
+
+
Example
@@ -3297,9 +3302,22 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02
- An UPDATE that causes a row to move from one partition to
- another fails, because the new value of the row fails to satisfy the
- implicit partition constraint of the original partition.
+ When an UPDATE causes a row to move from one
+ partition to another, there is a chance that a concurrent
+ UPDATE or DELETE will miss this row.
+ Suppose session 1 is performing an UPDATE on a
+ partition key, and meanwhile session 2, for which this row is still
+ visible, performs an UPDATE or
+ DELETE operation on the same row. Session 2 can
+ silently miss the row if it is deleted from the partition by session
+ 1's activity. In such a case, session 2's
+ UPDATE/DELETE, being unaware of
+ the row movement, concludes that the row has just been deleted, so there
+ is nothing to be done for it. By contrast, in the usual case where the
+ table is not partitioned, or where there is no row movement, session 2
+ would have identified the newly updated row and carried out the
+ UPDATE/DELETE on this new row
+ version.
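+
+ For example, here is a sketch of that scenario using the measurement
+ table from the example above; it assumes a row for city_id 1 currently
+ lives in measurement_y2008m01:
+
+-- session 1: changing the partition key moves the row; behind the scenes
+-- this is a DELETE from measurement_y2008m01 followed by an INSERT into
+-- measurement_y2008m02
+UPDATE measurement SET logdate = DATE '2008-02-15'
+    WHERE city_id = 1 AND logdate = DATE '2008-01-15';
+
+-- session 2, concurrently: without the row movement this would update the
+-- new version of the row once session 1 commits; with the row movement it
+-- silently skips the moved row instead
+UPDATE measurement SET peaktemp = peaktemp + 1 WHERE city_id = 1;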
diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml
index c0d0f71..3c665f0 100644
--- a/doc/src/sgml/ref/update.sgml
+++ b/doc/src/sgml/ref/update.sgml
@@ -282,10 +282,17 @@ UPDATE count
In the case of a partitioned table, updating a row might cause it to no
- longer satisfy the partition constraint. Since there is no provision to
- move the row to the partition appropriate to the new value of its
- partitioning key, an error will occur in this case. This can also happen
- when updating a partition directly.
+ longer satisfy the partition constraint of the containing partition. In that
+ case, if there is some other partition in the partition tree for which this
+ row satisfies its partition constraint, then the row is moved to that
+ partition. If there is no such partition, an error will occur. The error
+ will also occur when updating a partition directly. Behind the scenes, the
+ row movement is actually a DELETE and
+ INSERT operation. However, a concurrent
+ UPDATE or DELETE of the
+ same row may miss it. For details see the section
+ .
+
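+
+   For example, a sketch using a hypothetical range-partitioned table
+   parted (these names do not appear elsewhere on this page):
+
+CREATE TABLE parted (a int, b text) PARTITION BY RANGE (a);
+CREATE TABLE parted_1 PARTITION OF parted FOR VALUES FROM (1) TO (10);
+CREATE TABLE parted_2 PARTITION OF parted FOR VALUES FROM (10) TO (20);
+INSERT INTO parted VALUES (5, 'five');
+
+-- moves the row: a DELETE from parted_1 followed by an INSERT into parted_2
+UPDATE parted SET a = 15 WHERE b = 'five';
+
+-- updating the partition directly cannot move the row, so this raises a
+-- partition constraint violation error instead
+UPDATE parted_2 SET a = 5 WHERE b = 'five';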
diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml
index bf5d3f9..aaffc4d 100644
--- a/doc/src/sgml/trigger.sgml
+++ b/doc/src/sgml/trigger.sgml
@@ -154,6 +154,29 @@
+ If an UPDATE on a partitioned table causes a row to move
+ to another partition, it will be performed as a DELETE
+ from the original partition followed by an INSERT into the
+ new partition. In this case, all row-level BEFORE
+ UPDATE triggers and all row-level
+ BEFORE DELETE triggers are fired on
+ the original partition. Then all row-level BEFORE
+ INSERT triggers are fired on the destination partition.
+ All of these triggers can affect the row being moved, so the possibility
+ of surprising outcomes should be considered. As far as AFTER ROW
+ triggers are concerned, AFTER DELETE
+ and AFTER INSERT triggers are
+ applied; but AFTER UPDATE triggers
+ are not applied because the UPDATE has been converted to
+ a DELETE and an INSERT. As far as
+ statement-level triggers are concerned, none of the
+ DELETE or INSERT triggers are fired,
+ even if row movement occurs; only the UPDATE triggers
+ defined on the target table of the UPDATE statement
+ will be fired.
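+
+ A sketch of the resulting behavior, assuming partitions p1 and p2 of a
+ partitioned table tbl, each having row-level BEFORE and AFTER triggers
+ for all three events (these names are purely illustrative):
+
+-- suppose this UPDATE moves a row from p1 to p2
+UPDATE tbl SET keycol = 42 WHERE id = 1;
+
+-- fired on p1: row-level BEFORE UPDATE, row-level BEFORE DELETE,
+--              and row-level AFTER DELETE triggers
+-- fired on p2: row-level BEFORE INSERT and row-level AFTER INSERT triggers
+-- not fired:   row-level AFTER UPDATE triggers, and statement-level
+--              DELETE or INSERT triggers; only statement-level UPDATE
+--              triggers on tbl are fired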
+
+
+
Trigger functions invoked by per-statement triggers should always
return NULL. Trigger functions invoked by per-row
triggers can return a table row (a value of
diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index d622305..57dc08f 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -1441,7 +1441,8 @@ get_qual_from_partbound(Relation rel, Relation parent,
/*
* map_partition_varattnos - maps varattno of any Vars in expr from the
- * parent attno to partition attno.
+ * attnos of 'from_rel' to the attnos of 'to_rel'.
+ * Either rel can be a leaf partition or a partitioned table.
*
* We must allow for cases where physical attnos of a partition can be
* different from the parent's.
@@ -1454,8 +1455,8 @@ get_qual_from_partbound(Relation rel, Relation parent,
* are working on Lists, so it's less messy to do the casts internally.
*/
List *
-map_partition_varattnos(List *expr, int target_varno,
- Relation partrel, Relation parent,
+map_partition_varattnos(List *expr, int fromrel_varno,
+ Relation to_rel, Relation from_rel,
bool *found_whole_row)
{
bool my_found_whole_row = false;
@@ -1464,14 +1465,14 @@ map_partition_varattnos(List *expr, int target_varno,
{
AttrNumber *part_attnos;
- part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel),
- RelationGetDescr(parent),
+ part_attnos = convert_tuples_by_name_map(RelationGetDescr(to_rel),
+ RelationGetDescr(from_rel),
gettext_noop("could not convert row type"));
expr = (List *) map_variable_attnos((Node *) expr,
- target_varno, 0,
+ fromrel_varno, 0,
part_attnos,
- RelationGetDescr(parent)->natts,
- RelationGetForm(partrel)->reltype,
+ RelationGetDescr(from_rel)->natts,
+ RelationGetForm(to_rel)->reltype,
&my_found_whole_row);
}
@@ -2595,6 +2596,69 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull)
}
/*
+ * Checks if any of the 'attnums' is a partition key attribute for rel
+ *
+ * Sets *used_in_expr if any of the 'attnums' is found to be referenced in some
+ * partition key expression. It's possible for a column to be both used
+ * directly and as part of an expression; if that happens, *used_in_expr may
+ * end up as either true or false. That's OK for current uses of this
+ * function, because *used_in_expr is only used to tailor the error message
+ * text.
+ */
+bool
+has_partition_attrs(Relation rel, Bitmapset *attnums, bool *used_in_expr)
+{
+ PartitionKey key;
+ int partnatts;
+ List *partexprs;
+ ListCell *partexprs_item;
+ int i;
+
+ if (attnums == NULL || rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ return false;
+
+ key = RelationGetPartitionKey(rel);
+ partnatts = get_partition_natts(key);
+ partexprs = get_partition_exprs(key);
+
+ partexprs_item = list_head(partexprs);
+ for (i = 0; i < partnatts; i++)
+ {
+ AttrNumber partattno = get_partition_col_attnum(key, i);
+
+ if (partattno != 0)
+ {
+ if (bms_is_member(partattno - FirstLowInvalidHeapAttributeNumber,
+ attnums))
+ {
+ if (used_in_expr)
+ *used_in_expr = false;
+ return true;
+ }
+ }
+ else
+ {
+ /* Arbitrary expression */
+ Node *expr = (Node *) lfirst(partexprs_item);
+ Bitmapset *expr_attrs = NULL;
+
+ /* Find all attributes referenced */
+ pull_varattnos(expr, 1, &expr_attrs);
+ partexprs_item = lnext(partexprs_item);
+
+ if (bms_overlap(attnums, expr_attrs))
+ {
+ if (used_in_expr)
+ *used_in_expr = true;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
* qsort_partition_hbound_cmp
*
* We sort hash bounds by modulus, then by remainder.
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index d6b235c..39c2921 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -165,12 +165,9 @@ typedef struct CopyStateData
bool volatile_defexprs; /* is any of defexprs volatile? */
List *range_table;
- PartitionDispatch *partition_dispatch_info;
- int num_dispatch; /* Number of entries in the above array */
- int num_partitions; /* Number of members in the following arrays */
- ResultRelInfo **partitions; /* Per partition result relation pointers */
- TupleConversionMap **partition_tupconv_maps;
- TupleTableSlot *partition_tuple_slot;
+ PartitionTupleRouting *partition_tuple_routing; /* all tuple-routing info
+ * for partitions.
+ */
TransitionCaptureState *transition_capture;
TupleConversionMap **transition_tupconv_maps;
@@ -2471,27 +2468,16 @@ CopyFrom(CopyState cstate)
*/
if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
{
- PartitionDispatch *partition_dispatch_info;
- ResultRelInfo **partitions;
- TupleConversionMap **partition_tupconv_maps;
- TupleTableSlot *partition_tuple_slot;
- int num_parted,
- num_partitions;
+ PartitionTupleRouting *ptr;
ExecSetupPartitionTupleRouting(cstate->rel,
+ NULL,
+ 0,
1,
estate,
- &partition_dispatch_info,
- &partitions,
- &partition_tupconv_maps,
- &partition_tuple_slot,
- &num_parted, &num_partitions);
- cstate->partition_dispatch_info = partition_dispatch_info;
- cstate->num_dispatch = num_parted;
- cstate->partitions = partitions;
- cstate->num_partitions = num_partitions;
- cstate->partition_tupconv_maps = partition_tupconv_maps;
- cstate->partition_tuple_slot = partition_tuple_slot;
+ &cstate->partition_tuple_routing);
+
+ ptr = cstate->partition_tuple_routing;
/*
* If we are capturing transition tuples, they may need to be
@@ -2504,11 +2490,11 @@ CopyFrom(CopyState cstate)
int i;
cstate->transition_tupconv_maps = (TupleConversionMap **)
- palloc0(sizeof(TupleConversionMap *) * cstate->num_partitions);
- for (i = 0; i < cstate->num_partitions; ++i)
+ palloc0(sizeof(TupleConversionMap *) * ptr->num_partitions);
+ for (i = 0; i < ptr->num_partitions; ++i)
{
cstate->transition_tupconv_maps[i] =
- convert_tuples_by_name(RelationGetDescr(cstate->partitions[i]->ri_RelationDesc),
+ convert_tuples_by_name(RelationGetDescr(ptr->partitions[i]->ri_RelationDesc),
RelationGetDescr(cstate->rel),
gettext_noop("could not convert row type"));
}
@@ -2528,7 +2514,7 @@ CopyFrom(CopyState cstate)
if ((resultRelInfo->ri_TrigDesc != NULL &&
(resultRelInfo->ri_TrigDesc->trig_insert_before_row ||
resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) ||
- cstate->partition_dispatch_info != NULL ||
+ cstate->partition_tuple_routing != NULL ||
cstate->volatile_defexprs)
{
useHeapMultiInsert = false;
@@ -2603,10 +2589,11 @@ CopyFrom(CopyState cstate)
ExecStoreTuple(tuple, slot, InvalidBuffer, false);
/* Determine the partition to heap_insert the tuple into */
- if (cstate->partition_dispatch_info)
+ if (cstate->partition_tuple_routing)
{
int leaf_part_index;
TupleConversionMap *map;
+ PartitionTupleRouting *ptr = cstate->partition_tuple_routing;
/*
* Away we go ... If we end up not finding a partition after all,
@@ -2617,11 +2604,11 @@ CopyFrom(CopyState cstate)
* partition, respectively.
*/
leaf_part_index = ExecFindPartition(resultRelInfo,
- cstate->partition_dispatch_info,
+ ptr->partition_dispatch_info,
slot,
estate);
Assert(leaf_part_index >= 0 &&
- leaf_part_index < cstate->num_partitions);
+ leaf_part_index < ptr->num_partitions);
/*
* If this tuple is mapped to a partition that is not same as the
@@ -2639,7 +2626,7 @@ CopyFrom(CopyState cstate)
* to the selected partition.
*/
saved_resultRelInfo = resultRelInfo;
- resultRelInfo = cstate->partitions[leaf_part_index];
+ resultRelInfo = ptr->partitions[leaf_part_index];
/* We do not yet have a way to insert into a foreign partition */
if (resultRelInfo->ri_FdwRoutine)
@@ -2686,7 +2673,7 @@ CopyFrom(CopyState cstate)
* We might need to convert from the parent rowtype to the
* partition rowtype.
*/
- map = cstate->partition_tupconv_maps[leaf_part_index];
+ map = ptr->parentchild_tupconv_maps[leaf_part_index];
if (map)
{
Relation partrel = resultRelInfo->ri_RelationDesc;
@@ -2698,7 +2685,7 @@ CopyFrom(CopyState cstate)
* point on. Use a dedicated slot from this point on until
* we're finished dealing with the partition.
*/
- slot = cstate->partition_tuple_slot;
+ slot = ptr->partition_tuple_slot;
Assert(slot != NULL);
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
@@ -2749,7 +2736,7 @@ CopyFrom(CopyState cstate)
/* Check the constraints of the tuple */
if (cstate->rel->rd_att->constr || check_partition_constr)
- ExecConstraints(resultRelInfo, slot, estate);
+ ExecConstraints(resultRelInfo, slot, estate, true);
if (useHeapMultiInsert)
{
@@ -2850,8 +2837,9 @@ CopyFrom(CopyState cstate)
ExecCloseIndices(resultRelInfo);
/* Close all the partitioned tables, leaf partitions, and their indices */
- if (cstate->partition_dispatch_info)
+ if (cstate->partition_tuple_routing)
{
+ PartitionTupleRouting *ptr = cstate->partition_tuple_routing;
int i;
/*
@@ -2860,23 +2848,23 @@ CopyFrom(CopyState cstate)
* the main target table of COPY that will be closed eventually by
* DoCopy(). Also, tupslot is NULL for the root partitioned table.
*/
- for (i = 1; i < cstate->num_dispatch; i++)
+ for (i = 1; i < ptr->num_dispatch; i++)
{
- PartitionDispatch pd = cstate->partition_dispatch_info[i];
+ PartitionDispatch pd = ptr->partition_dispatch_info[i];
heap_close(pd->reldesc, NoLock);
ExecDropSingleTupleTableSlot(pd->tupslot);
}
- for (i = 0; i < cstate->num_partitions; i++)
+ for (i = 0; i < ptr->num_partitions; i++)
{
- ResultRelInfo *resultRelInfo = cstate->partitions[i];
+ ResultRelInfo *resultRelInfo = ptr->partitions[i];
ExecCloseIndices(resultRelInfo);
heap_close(resultRelInfo->ri_RelationDesc, NoLock);
}
/* Release the standalone partition tuple descriptor */
- ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot);
+ ExecDropSingleTupleTableSlot(ptr->partition_tuple_slot);
}
/* Close any trigger target relations */
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index d979ce2..64c2185 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -468,7 +468,6 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid,
Oid oldRelOid, void *arg);
static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid,
Oid oldrelid, void *arg);
-static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr);
static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy);
static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs,
List **partexprs, Oid *partopclass, Oid *partcollation, char strategy);
@@ -6492,68 +6491,6 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing,
}
/*
- * Checks if attnum is a partition attribute for rel
- *
- * Sets *used_in_expr if attnum is found to be referenced in some partition
- * key expression. It's possible for a column to be both used directly and
- * as part of an expression; if that happens, *used_in_expr may end up as
- * either true or false. That's OK for current uses of this function, because
- * *used_in_expr is only used to tailor the error message text.
- */
-static bool
-is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr)
-{
- PartitionKey key;
- int partnatts;
- List *partexprs;
- ListCell *partexprs_item;
- int i;
-
- if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
- return false;
-
- key = RelationGetPartitionKey(rel);
- partnatts = get_partition_natts(key);
- partexprs = get_partition_exprs(key);
-
- partexprs_item = list_head(partexprs);
- for (i = 0; i < partnatts; i++)
- {
- AttrNumber partattno = get_partition_col_attnum(key, i);
-
- if (partattno != 0)
- {
- if (attnum == partattno)
- {
- if (used_in_expr)
- *used_in_expr = false;
- return true;
- }
- }
- else
- {
- /* Arbitrary expression */
- Node *expr = (Node *) lfirst(partexprs_item);
- Bitmapset *expr_attrs = NULL;
-
- /* Find all attributes referenced */
- pull_varattnos(expr, 1, &expr_attrs);
- partexprs_item = lnext(partexprs_item);
-
- if (bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
- expr_attrs))
- {
- if (used_in_expr)
- *used_in_expr = true;
- return true;
- }
- }
- }
-
- return false;
-}
-
-/*
* Return value is the address of the dropped column.
*/
static ObjectAddress
@@ -6613,7 +6550,9 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName,
colName)));
/* Don't drop columns used in the partition key */
- if (is_partition_attr(rel, attnum, &is_expr))
+ if (has_partition_attrs(rel,
+ bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber),
+ &is_expr))
{
if (!is_expr)
ereport(ERROR,
@@ -8837,7 +8776,9 @@ ATPrepAlterColumnType(List **wqueue,
colName)));
/* Don't alter columns used in the partition key */
- if (is_partition_attr(rel, attnum, &is_expr))
+ if (has_partition_attrs(rel,
+ bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber),
+ &is_expr))
{
if (!is_expr)
ereport(ERROR,
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 92ae382..73ec872 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2854,8 +2854,13 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo,
{
HeapTuple trigtuple;
- Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid));
- if (fdw_trigtuple == NULL)
+ /*
+ * Note: if the UPDATE is converted into a DELETE+INSERT as part of an
+ * update-partition-key operation, then this function is also called
+ * separately for DELETE and INSERT to capture transition table rows.
+ * In such a case, either the old tuple or the new tuple can be NULL.
+ */
+ if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid))
trigtuple = GetTupleForTrigger(estate,
NULL,
relinfo,
@@ -5414,7 +5419,12 @@ AfterTriggerPendingOnRel(Oid relid)
* triggers actually need to be queued. It is also called after each row,
* even if there are no triggers for that event, if there are any AFTER
* STATEMENT triggers for the statement which use transition tables, so that
- * the transition tuplestores can be built.
+ * the transition tuplestores can be built. Furthermore, if the transition
+ * capture is happening for UPDATEd rows being moved to another partition
+ * due to a partition-key change, then this function is called once when the
+ * row is deleted (to capture the OLD row), and once when the row is inserted
+ * into another partition (to capture the NEW row). This is done separately
+ * because DELETE and INSERT happen on different tables.
*
* Transition tuplestores are built now, rather than when events are pulled
* off of the queue because AFTER ROW triggers are allowed to select from the
@@ -5463,12 +5473,25 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
bool update_new_table = transition_capture->tcs_update_new_table;
bool insert_new_table = transition_capture->tcs_insert_new_table;;
- if ((event == TRIGGER_EVENT_DELETE && delete_old_table) ||
- (event == TRIGGER_EVENT_UPDATE && update_old_table))
+ /*
+ * For INSERT events, newtup should be non-NULL; for DELETE events,
+ * oldtup should be non-NULL; whereas for UPDATE events, normally both
+ * oldtup and newtup are non-NULL. But for an UPDATE event fired for
+ * capturing transition tuples during partition-key row
+ * movement, oldtup is NULL when the event is for the row being inserted,
+ * whereas newtup is NULL when the event is for the row being deleted.
+ */
+ Assert(!(event == TRIGGER_EVENT_DELETE && delete_old_table &&
+ oldtup == NULL));
+ Assert(!(event == TRIGGER_EVENT_INSERT && insert_new_table &&
+ newtup == NULL));
+
+ if (oldtup != NULL &&
+ ((event == TRIGGER_EVENT_DELETE && delete_old_table) ||
+ (event == TRIGGER_EVENT_UPDATE && update_old_table)))
{
Tuplestorestate *old_tuplestore;
- Assert(oldtup != NULL);
old_tuplestore = transition_capture->tcs_private->old_tuplestore;
if (map != NULL)
@@ -5481,12 +5504,12 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
else
tuplestore_puttuple(old_tuplestore, oldtup);
}
- if ((event == TRIGGER_EVENT_INSERT && insert_new_table) ||
- (event == TRIGGER_EVENT_UPDATE && update_new_table))
+ if (newtup != NULL &&
+ ((event == TRIGGER_EVENT_INSERT && insert_new_table) ||
+ (event == TRIGGER_EVENT_UPDATE && update_new_table)))
{
Tuplestorestate *new_tuplestore;
- Assert(newtup != NULL);
new_tuplestore = transition_capture->tcs_private->new_tuplestore;
if (original_insert_tuple != NULL)
@@ -5502,11 +5525,17 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
tuplestore_puttuple(new_tuplestore, newtup);
}
- /* If transition tables are the only reason we're here, return. */
+ /*
+ * If transition tables are the only reason we're here, return. As
+ * mentioned above, we can also be here during update tuple routing in
+ * the presence of transition tables, in which case this function is
+ * called separately for oldtup and newtup, so either one can be NULL,
+ * but not both.
+ */
if (trigdesc == NULL ||
(event == TRIGGER_EVENT_DELETE && !trigdesc->trig_delete_after_row) ||
(event == TRIGGER_EVENT_INSERT && !trigdesc->trig_insert_after_row) ||
- (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row))
+ (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row) ||
+ (event == TRIGGER_EVENT_UPDATE && ((oldtup == NULL) ^ (newtup == NULL))))
return;
}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index dbaa47f..5ec92d5 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1849,16 +1849,12 @@ ExecRelCheck(ResultRelInfo *resultRelInfo,
* ExecPartitionCheck --- check that tuple meets the partition constraint.
*
* Exported in executor.h for outside use.
+ * Returns true if it meets the partition constraint, else returns false.
*/
-void
+bool
ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot,
EState *estate)
{
- Relation rel = resultRelInfo->ri_RelationDesc;
- TupleDesc tupdesc = RelationGetDescr(rel);
- Bitmapset *modifiedCols;
- Bitmapset *insertedCols;
- Bitmapset *updatedCols;
ExprContext *econtext;
/*
@@ -1886,52 +1882,69 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot,
* As in case of the catalogued constraints, we treat a NULL result as
* success here, not a failure.
*/
- if (!ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext))
- {
- char *val_desc;
- Relation orig_rel = rel;
+ return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext);
+}
+
+/*
+ * ExecPartitionCheckEmitError - Form and emit an error message after a failed
+ * partition constraint check.
+ */
+void
+ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot,
+ EState *estate)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ Relation orig_rel = rel;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ char *val_desc;
+ Bitmapset *modifiedCols;
+ Bitmapset *insertedCols;
+ Bitmapset *updatedCols;
- /* See the comment above. */
- if (resultRelInfo->ri_PartitionRoot)
+ /*
+ * Need to first convert the tuple to the root partitioned table's row
+ * type. For details, check similar comments in ExecConstraints().
+ */
+ if (resultRelInfo->ri_PartitionRoot)
+ {
+ HeapTuple tuple = ExecFetchSlotTuple(slot);
+ TupleDesc old_tupdesc = RelationGetDescr(rel);
+ TupleConversionMap *map;
+
+ rel = resultRelInfo->ri_PartitionRoot;
+ tupdesc = RelationGetDescr(rel);
+ /* a reverse map */
+ map = convert_tuples_by_name(old_tupdesc, tupdesc,
+ gettext_noop("could not convert row type"));
+ if (map != NULL)
{
- HeapTuple tuple = ExecFetchSlotTuple(slot);
- TupleDesc old_tupdesc = RelationGetDescr(rel);
- TupleConversionMap *map;
-
- rel = resultRelInfo->ri_PartitionRoot;
- tupdesc = RelationGetDescr(rel);
- /* a reverse map */
- map = convert_tuples_by_name(old_tupdesc, tupdesc,
- gettext_noop("could not convert row type"));
- if (map != NULL)
- {
- tuple = do_convert_tuple(tuple, map);
- ExecSetSlotDescriptor(slot, tupdesc);
- ExecStoreTuple(tuple, slot, InvalidBuffer, false);
- }
+ tuple = do_convert_tuple(tuple, map);
+ ExecSetSlotDescriptor(slot, tupdesc);
+ ExecStoreTuple(tuple, slot, InvalidBuffer, false);
}
-
- insertedCols = GetInsertedColumns(resultRelInfo, estate);
- updatedCols = GetUpdatedColumns(resultRelInfo, estate);
- modifiedCols = bms_union(insertedCols, updatedCols);
- val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
- slot,
- tupdesc,
- modifiedCols,
- 64);
- ereport(ERROR,
- (errcode(ERRCODE_CHECK_VIOLATION),
- errmsg("new row for relation \"%s\" violates partition constraint",
- RelationGetRelationName(orig_rel)),
- val_desc ? errdetail("Failing row contains %s.", val_desc) : 0));
}
+
+ insertedCols = GetInsertedColumns(resultRelInfo, estate);
+ updatedCols = GetUpdatedColumns(resultRelInfo, estate);
+ modifiedCols = bms_union(insertedCols, updatedCols);
+ val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel),
+ slot,
+ tupdesc,
+ modifiedCols,
+ 64);
+ ereport(ERROR,
+ (errcode(ERRCODE_CHECK_VIOLATION),
+ errmsg("new row for relation \"%s\" violates partition constraint",
+ RelationGetRelationName(orig_rel)),
+ val_desc ? errdetail("Failing row contains %s.", val_desc) : 0));
}
/*
* ExecConstraints - check constraints of the tuple in 'slot'
*
- * This checks the traditional NOT NULL and check constraints, as well as
- * the partition constraint, if any.
+ * This checks the traditional NOT NULL and check constraints, and if
+ * requested, checks the partition constraint.
*
* Note: 'slot' contains the tuple to check the constraints of, which may
* have been converted from the original input tuple after tuple routing.
@@ -1939,7 +1952,8 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot,
*/
void
ExecConstraints(ResultRelInfo *resultRelInfo,
- TupleTableSlot *slot, EState *estate)
+ TupleTableSlot *slot, EState *estate,
+ bool check_partition_constraint)
{
Relation rel = resultRelInfo->ri_RelationDesc;
TupleDesc tupdesc = RelationGetDescr(rel);
@@ -2055,8 +2069,9 @@ ExecConstraints(ResultRelInfo *resultRelInfo,
}
}
- if (resultRelInfo->ri_PartitionCheck)
- ExecPartitionCheck(resultRelInfo, slot, estate);
+ if (check_partition_constraint && resultRelInfo->ri_PartitionCheck &&
+ !ExecPartitionCheck(resultRelInfo, slot, estate))
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
}
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 2fc411a..180798f 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -41,54 +41,91 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
* ExecSetupPartitionTupleRouting - set up information needed during
* tuple routing for partitioned tables
*
+ * 'update_rri' contains the UPDATE per-subplan result rels. For the output
+ * param 'partitions', we don't allocate new ResultRelInfo objects for
+ * leaf partitions that are already present in 'update_rri'.
+ *
+ * 'num_update_rri' is the number of elements in the 'update_rri' array, or
+ * zero for INSERT.
+ *
* Output arguments:
- * 'pd' receives an array of PartitionDispatch objects with one entry for
- * every partitioned table in the partition tree
- * 'partitions' receives an array of ResultRelInfo* objects with one entry for
- * every leaf partition in the partition tree
- * 'tup_conv_maps' receives an array of TupleConversionMap objects with one
- * entry for every leaf partition (required to convert input tuple based
- * on the root table's rowtype to a leaf partition's rowtype after tuple
- * routing is done)
- * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used
- * to manipulate any given leaf partition's rowtype after that partition
- * is chosen by tuple-routing.
- * 'num_parted' receives the number of partitioned tables in the partition
- * tree (= the number of entries in the 'pd' output array)
- * 'num_partitions' receives the number of leaf partitions in the partition
- * tree (= the number of entries in the 'partitions' and 'tup_conv_maps'
- * output arrays
+ *
+ * 'partition_tuple_routing' encapsulates all the partition-related
+ * information required to do tuple routing.
*
* Note that all the relations in the partition tree are locked using the
* RowExclusiveLock mode upon return from this function.
*/
void
ExecSetupPartitionTupleRouting(Relation rel,
+ ResultRelInfo *update_rri,
+ int num_update_rri,
Index resultRTindex,
EState *estate,
- PartitionDispatch **pd,
- ResultRelInfo ***partitions,
- TupleConversionMap ***tup_conv_maps,
- TupleTableSlot **partition_tuple_slot,
- int *num_parted, int *num_partitions)
+ PartitionTupleRouting **partition_tuple_routing)
{
TupleDesc tupDesc = RelationGetDescr(rel);
List *leaf_parts;
ListCell *cell;
int i;
- ResultRelInfo *leaf_part_rri;
+ ResultRelInfo *leaf_part_arr = NULL;
+ int update_rri_index = 0;
+ bool is_update = (num_update_rri > 0);
+ PartitionTupleRouting *ptr;
/*
* Get the information about the partition tree after locking all the
* partitions.
*/
(void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
- *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts);
- *num_partitions = list_length(leaf_parts);
- *partitions = (ResultRelInfo **) palloc(*num_partitions *
+ ptr = *partition_tuple_routing =
+ (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
+ ptr->partition_dispatch_info =
+ RelationGetPartitionDispatchInfo(rel, &ptr->num_dispatch, &leaf_parts);
+ ptr->num_partitions = list_length(leaf_parts);
+ ptr->partitions = (ResultRelInfo **) palloc(ptr->num_partitions *
sizeof(ResultRelInfo *));
- *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions *
- sizeof(TupleConversionMap *));
+ ptr->parentchild_tupconv_maps =
+ (TupleConversionMap **) palloc0(ptr->num_partitions *
+ sizeof(TupleConversionMap *));
+
+ if (is_update)
+ {
+ /*
+ * For updates, if the leaf partition is already present in the
+ * per-subplan result rels, we re-use it rather than initializing a new
+ * result rel. The per-subplan result rels and the leaf partition OIDs
+ * are in the same canonical order, so while scanning the leaf partition
+ * OIDs we just keep track of the next per-subplan result rel to look
+ * for. Start update_rri_index at the first per-subplan result rel and
+ * advance it each time one is found.
+ */
+ update_rri_index = 0;
+
+ /*
+ * Prepare for generating the mapping from subplan result rels to leaf
+ * partition position.
+ */
+ ptr->subplan_partition_offsets = palloc(num_update_rri * sizeof(int));
+
+ /*
+ * For UPDATEs, we require an additional tuple slot for storing
+ * transient tuples that are converted to the root table descriptor.
+ */
+ ptr->root_tuple_slot = MakeTupleTableSlot();
+ }
+ else
+ {
+ /*
+ * For inserts, we need to create all new result rels, so avoid
+ * repeated pallocs by allocating memory for all the result rels in
+ * bulk.
+ */
+ leaf_part_arr = (ResultRelInfo *) palloc0(ptr->num_partitions *
+ sizeof(ResultRelInfo));
+ }
/*
* Initialize an empty slot that will be used to manipulate tuples of any
@@ -96,39 +133,82 @@ ExecSetupPartitionTupleRouting(Relation rel,
* (such as ModifyTableState) and released when the node finishes
* processing.
*/
- *partition_tuple_slot = MakeTupleTableSlot();
+ ptr->partition_tuple_slot = MakeTupleTableSlot();
- leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions *
- sizeof(ResultRelInfo));
i = 0;
foreach(cell, leaf_parts)
{
- Relation partrel;
+ ResultRelInfo *leaf_part_rri;
+ Relation partrel = NULL;
TupleDesc part_tupdesc;
+ Oid leaf_oid = lfirst_oid(cell);
+
+ if (is_update)
+ {
+ /* Is this leaf partition present in the update resultrel? */
+ if (update_rri_index < num_update_rri &&
+ RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
+ {
+ leaf_part_rri = &update_rri[update_rri_index];
+ partrel = leaf_part_rri->ri_RelationDesc;
+
+ /*
+ * This is required when we convert the partition's tuple to be
+ * compatible with the root partitioned table's tuple
+ * descriptor. When generating the per-subplan UPDATE result
+ * rels, this was not set.
+ */
+ leaf_part_rri->ri_PartitionRoot = rel;
+
+ /*
+ * Save the position of this update rel in the leaf partitions
+ * array
+ */
+ ptr->subplan_partition_offsets[update_rri_index] = i;
+
+ update_rri_index++;
+ }
+ else
+ leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo));
+ }
+ else
+ {
+ /* For INSERTs, we already have an array of result rels allocated */
+ leaf_part_rri = leaf_part_arr + i;
+ }
/*
- * We locked all the partitions above including the leaf partitions.
- * Note that each of the relations in *partitions are eventually
- * closed by the caller.
+ * If we didn't open the partition rel, it means we haven't initialized
+ * the result rel either.
*/
- partrel = heap_open(lfirst_oid(cell), NoLock);
+ if (!partrel)
+ {
+ /*
+ * We locked all the partitions above including the leaf
+ * partitions. Note that each of the newly opened relations in
+ * *partitions are eventually closed by the caller.
+ */
+ partrel = heap_open(leaf_oid, NoLock);
+ InitResultRelInfo(leaf_part_rri,
+ partrel,
+ resultRTindex,
+ rel,
+ estate->es_instrument);
+ }
+
part_tupdesc = RelationGetDescr(partrel);
/*
* Save a tuple conversion map to convert a tuple routed to this
* partition from the parent's type to the partition's.
*/
- (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc,
+ ptr->parentchild_tupconv_maps[i] = convert_tuples_by_name(tupDesc, part_tupdesc,
gettext_noop("could not convert row type"));
- InitResultRelInfo(leaf_part_rri,
- partrel,
- resultRTindex,
- rel,
- estate->es_instrument);
-
/*
- * Verify result relation is a valid target for INSERT.
+ * Verify result relation is a valid target for insert operation. Even
+ * for updates, we are doing this for tuple-routing, so again, we need
+ * to check the validity for insert operation.
*/
CheckValidResultRel(leaf_part_rri, CMD_INSERT);
@@ -144,9 +224,15 @@ ExecSetupPartitionTupleRouting(Relation rel,
estate->es_leaf_result_relations =
lappend(estate->es_leaf_result_relations, leaf_part_rri);
- (*partitions)[i] = leaf_part_rri++;
+ ptr->partitions[i] = leaf_part_rri;
i++;
}
+
+ /*
+ * For UPDATE, we should have found all the per-subplan resultrels in the
+ * leaf partitions.
+ */
+ Assert(!is_update || update_rri_index == num_update_rri);
}
/*
@@ -177,8 +263,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
* First check the root table's partition constraint, if any. No point in
* routing the tuple if it doesn't belong in the root table itself.
*/
- if (resultRelInfo->ri_PartitionCheck)
- ExecPartitionCheck(resultRelInfo, slot, estate);
+ if (resultRelInfo->ri_PartitionCheck &&
+ !ExecPartitionCheck(resultRelInfo, slot, estate))
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
/* start with the root partitioned table */
parent = pd[0];
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index fb538c0..e11f7cb 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -401,7 +401,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot)
/* Check the constraints of the tuple */
if (rel->rd_att->constr)
- ExecConstraints(resultRelInfo, slot, estate);
+ ExecConstraints(resultRelInfo, slot, estate, true);
/* Store the slot into tuple that we can inspect. */
tuple = ExecMaterializeSlot(slot);
@@ -466,7 +466,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate,
/* Check the constraints of the tuple */
if (rel->rd_att->constr)
- ExecConstraints(resultRelInfo, slot, estate);
+ ExecConstraints(resultRelInfo, slot, estate, true);
/* Store the slot into tuple that we can write. */
tuple = ExecMaterializeSlot(slot);
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 201c607..919b32d 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -46,6 +46,7 @@
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
+#include "optimizer/var.h"
#include "parser/parsetree.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
@@ -63,7 +64,16 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate,
EState *estate,
bool canSetTag,
TupleTableSlot **returning);
-
+static void ExecSetupChildParentMap(ModifyTableState *mtstate,
+ ResultRelInfo *rootRelInfo,
+ int numResultRelInfos, bool perleaf);
+static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
+ int whichplan);
+static HeapTuple ConvertPartitionTupleSlot(ModifyTableState *mtstate,
+ TupleConversionMap *map,
+ HeapTuple tuple,
+ TupleTableSlot *new_slot,
+ TupleTableSlot **p_my_slot);
/*
* Verify that the tuples to be produced by INSERT or UPDATE match the
* target relation's rowtype
@@ -241,6 +251,38 @@ ExecCheckTIDVisible(EState *estate,
ReleaseBuffer(buffer);
}
+/*
+ * ConvertPartitionTupleSlot -- convenience function for tuple conversion using
+ * 'map'. The tuple, if converted, is stored in 'new_slot', and 'p_my_slot' is
+ * updated with the 'new_slot'. 'new_slot' typically should be one of the
+ * dedicated partition tuple slots. If map is NULL, keeps p_my_slot unchanged.
+ *
+ * Returns the converted tuple, unless map is NULL, in which case original
+ * tuple is returned unmodified.
+ */
+static HeapTuple
+ConvertPartitionTupleSlot(ModifyTableState *mtstate,
+ TupleConversionMap *map,
+ HeapTuple tuple,
+ TupleTableSlot *new_slot,
+ TupleTableSlot **p_my_slot)
+{
+ if (!map)
+ return tuple;
+
+ tuple = do_convert_tuple(tuple, map);
+
+ /*
+ * Change the partition tuple slot descriptor to match the converted tuple.
+ */
+ *p_my_slot = new_slot;
+ Assert(new_slot != NULL);
+ ExecSetSlotDescriptor(new_slot, map->outdesc);
+ ExecStoreTuple(tuple, new_slot, InvalidBuffer, true);
+
+ return tuple;
+}
+
/* ----------------------------------------------------------------
* ExecInsert
*
@@ -266,6 +308,9 @@ ExecInsert(ModifyTableState *mtstate,
Oid newId;
List *recheckIndexes = NIL;
TupleTableSlot *result = NULL;
+ TransitionCaptureState *transition_capture;
+
+ transition_capture = mtstate->mt_transition_capture;
/*
* get the heap tuple out of the tuple table slot, making sure we have a
@@ -279,32 +324,32 @@ ExecInsert(ModifyTableState *mtstate,
resultRelInfo = estate->es_result_relation_info;
/* Determine the partition to heap_insert the tuple into */
- if (mtstate->mt_partition_dispatch_info)
+ if (mtstate->mt_partition_tuple_routing)
{
int leaf_part_index;
- TupleConversionMap *map;
+ PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing;
/*
* Away we go ... If we end up not finding a partition after all,
* ExecFindPartition() does not return and errors out instead.
* Otherwise, the returned value is to be used as an index into arrays
- * mt_partitions[] and mt_partition_tupconv_maps[] that will get us
- * the ResultRelInfo and TupleConversionMap for the partition,
+ * ptr->partitions[] and ptr->parentchild_tupconv_maps[] that will get
+ * us the ResultRelInfo and TupleConversionMap for the partition,
* respectively.
*/
leaf_part_index = ExecFindPartition(resultRelInfo,
- mtstate->mt_partition_dispatch_info,
+ ptr->partition_dispatch_info,
slot,
estate);
Assert(leaf_part_index >= 0 &&
- leaf_part_index < mtstate->mt_num_partitions);
+ leaf_part_index < ptr->num_partitions);
/*
* Save the old ResultRelInfo and switch to the one corresponding to
* the selected partition.
*/
saved_resultRelInfo = resultRelInfo;
- resultRelInfo = mtstate->mt_partitions[leaf_part_index];
+ resultRelInfo = ptr->partitions[leaf_part_index];
/* We do not yet have a way to insert into a foreign partition */
if (resultRelInfo->ri_FdwRoutine)
@@ -331,8 +376,10 @@ ExecInsert(ModifyTableState *mtstate,
* back to tuplestore format.
*/
mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+
+ Assert(mtstate->mt_is_tupconv_perpart == true);
mtstate->mt_transition_capture->tcs_map =
- mtstate->mt_transition_tupconv_maps[leaf_part_index];
+ mtstate->mt_childparent_tupconv_maps[leaf_part_index];
}
else
{
@@ -345,30 +392,21 @@ ExecInsert(ModifyTableState *mtstate,
}
}
if (mtstate->mt_oc_transition_capture != NULL)
+ {
+ Assert(mtstate->mt_is_tupconv_perpart == true);
mtstate->mt_oc_transition_capture->tcs_map =
- mtstate->mt_transition_tupconv_maps[leaf_part_index];
+ mtstate->mt_childparent_tupconv_maps[leaf_part_index];
+ }
/*
* We might need to convert from the parent rowtype to the partition
* rowtype.
*/
- map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
- if (map)
- {
- Relation partrel = resultRelInfo->ri_RelationDesc;
-
- tuple = do_convert_tuple(tuple, map);
-
- /*
- * We must use the partition's tuple descriptor from this point
- * on, until we're finished dealing with the partition. Use the
- * dedicated slot for that.
- */
- slot = mtstate->mt_partition_tuple_slot;
- Assert(slot != NULL);
- ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
- ExecStoreTuple(tuple, slot, InvalidBuffer, true);
- }
+ tuple = ConvertPartitionTupleSlot(mtstate,
+ ptr->parentchild_tupconv_maps[leaf_part_index],
+ tuple,
+ ptr->partition_tuple_slot,
+ &slot);
}
resultRelationDesc = resultRelInfo->ri_RelationDesc;
@@ -486,7 +524,7 @@ ExecInsert(ModifyTableState *mtstate,
/* Check the constraints of the tuple */
if (resultRelationDesc->rd_att->constr || check_partition_constr)
- ExecConstraints(resultRelInfo, slot, estate);
+ ExecConstraints(resultRelInfo, slot, estate, true);
if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0)
{
@@ -622,9 +660,32 @@ ExecInsert(ModifyTableState *mtstate,
setLastTid(&(tuple->t_self));
}
+ /*
+ * If this INSERT is part of a partition-key UPDATE and we are capturing
+ * transition tables, put this row into the transition NEW TABLE.
+ * (The deleted row is likewise added to the OLD TABLE in ExecDelete().)
+ * This is done separately for DELETE and INSERT because they happen on
+ * different tables.
+ */
+ if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+ && mtstate->mt_transition_capture->tcs_update_new_table)
+ {
+ ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+ NULL,
+ tuple,
+ NULL,
+ mtstate->mt_transition_capture);
+
+ /*
+ * Now that we have already captured the NEW TABLE row, the AR INSERT
+ * trigger below must not capture it again; arrange that by clearing
+ * transition_capture.
+ */
+ transition_capture = NULL;
+ }
+
/* AFTER ROW INSERT Triggers */
ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes,
- mtstate->mt_transition_capture);
+ transition_capture);
list_free(recheckIndexes);
@@ -678,6 +739,8 @@ ExecDelete(ModifyTableState *mtstate,
TupleTableSlot *planSlot,
EPQState *epqstate,
EState *estate,
+ bool *tuple_deleted,
+ bool process_returning,
bool canSetTag)
{
ResultRelInfo *resultRelInfo;
@@ -685,6 +748,12 @@ ExecDelete(ModifyTableState *mtstate,
HTSU_Result result;
HeapUpdateFailureData hufd;
TupleTableSlot *slot = NULL;
+ TransitionCaptureState *transition_capture;
+
+ transition_capture = mtstate->mt_transition_capture;
+
+ if (tuple_deleted)
+ *tuple_deleted = false;
/*
* get information on the (current) result relation
@@ -849,12 +918,39 @@ ldelete:;
if (canSetTag)
(estate->es_processed)++;
+ /* The delete has actually happened, so inform that to the caller */
+ if (tuple_deleted)
+ *tuple_deleted = true;
+
+ /*
+ * In case this is part of update tuple routing, put this row into the
+ * transition OLD TABLE if we are capturing transition tables. We need to
+ * do this separately for DELETE and INSERT because they happen on
+ * different tables.
+ */
+ if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+ && mtstate->mt_transition_capture->tcs_update_old_table)
+ {
+ ExecARUpdateTriggers(estate, resultRelInfo,
+ tupleid,
+ oldtuple,
+ NULL,
+ NULL,
+ mtstate->mt_transition_capture);
+
+ /*
+ * Now that we have already captured the OLD TABLE row, the AR DELETE
+ * trigger below must not capture it again; arrange that by clearing
+ * transition_capture.
+ */
+ transition_capture = NULL;
+ }
+
/* AFTER ROW DELETE Triggers */
ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
- mtstate->mt_transition_capture);
+ transition_capture);
- /* Process RETURNING if present */
- if (resultRelInfo->ri_projectReturning)
+ /* Process RETURNING if present and if requested */
+ if (process_returning && resultRelInfo->ri_projectReturning)
{
/*
* We have to put the target tuple into a slot, which means first we
@@ -947,6 +1043,7 @@ ExecUpdate(ModifyTableState *mtstate,
HTSU_Result result;
HeapUpdateFailureData hufd;
List *recheckIndexes = NIL;
+ TupleConversionMap *saved_tcs_map = NULL;
/*
* abort the operation if not running transactions
@@ -1043,12 +1140,117 @@ lreplace:;
resultRelInfo, slot, estate);
/*
+ * If a partition check fails, try to move the row into the right
+ * partition.
+ */
+ if (resultRelInfo->ri_PartitionCheck &&
+ !ExecPartitionCheck(resultRelInfo, slot, estate))
+ {
+ bool tuple_deleted;
+ TupleTableSlot *ret_slot;
+ PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing;
+ int map_index;
+ TupleConversionMap *tupconv_map;
+
+ /*
+ * When an UPDATE is run directly on a leaf partition, we will not have
+ * set up partition tuple routing. In that case, fail with a partition
+ * constraint violation error.
+ */
+ if (ptr == NULL)
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+ /* Do the row movement. */
+
+ /*
+ * Skip RETURNING processing for DELETE. We want to return rows
+ * from INSERT.
+ */
+ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate,
+ &tuple_deleted, false, false);
+
+ /*
+ * If the DELETE didn't happen for some reason (e.g. a trigger prevented
+ * it, the row was already deleted by this command, or it was concurrently
+ * deleted by another transaction), then we should skip the INSERT as
+ * well; otherwise, there would effectively be one new row inserted.
+ *
+ * For a normal UPDATE, the case where the tuple has been the
+ * subject of a concurrent UPDATE or DELETE would be handled by
+ * the EvalPlanQual machinery, but for an UPDATE that we've
+ * translated into a DELETE from this partition and an INSERT into
+ * some other partition, that's not available, because CTID chains
+ * can't span relation boundaries. We mimic the semantics to a
+ * limited extent by skipping the INSERT if the DELETE fails to
+ * find a tuple. This ensures that two concurrent attempts to
+ * UPDATE the same tuple at the same time can't turn one tuple
+ * into two, and that an UPDATE of a just-deleted tuple can't
+ * resurrect it.
+ */
+ if (!tuple_deleted)
+ return NULL;
+
+ /*
+ * UPDATEs set the transition capture map only when a new subplan
+ * is chosen. But for INSERTs, it is set for each row. So after
+ * INSERT, we need to revert back to the map created for UPDATE;
+ * otherwise the next UPDATE will incorrectly use the one created
+ * for INESRT. So first save the one created for UPDATE.
+ */
+ if (mtstate->mt_transition_capture)
+ saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
+
+ /*
+ * resultRelInfo is one of the per-subplan resultRelInfos. So we
+ * should convert the tuple into root's tuple descriptor, since
+ * ExecInsert() starts the search from root. The tuple conversion
+ * map list is in the order of mtstate->resultRelInfo[], so to
+ * retrieve the one for this resultRel, we need to know the
+ * position of the resultRel in mtstate->resultRelInfo[].
+ */
+ map_index = resultRelInfo - mtstate->resultRelInfo;
+ Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
+ tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
+ tuple = ConvertPartitionTupleSlot(mtstate,
+ tupconv_map,
+ tuple,
+ ptr->root_tuple_slot,
+ &slot);
+
+ /*
+ * For ExecInsert(), make it look like we are inserting into the
+ * root.
+ */
+ Assert(mtstate->rootResultRelInfo != NULL);
+ estate->es_result_relation_info = mtstate->rootResultRelInfo;
+
+ ret_slot = ExecInsert(mtstate, slot, planSlot, NULL,
+ ONCONFLICT_NONE, estate, canSetTag);
+
+ /*
+ * Restore the active result relation and the active transition
+ * capture map that we changed above.
+ */
+ estate->es_result_relation_info = resultRelInfo;
+ if (mtstate->mt_transition_capture)
+ {
+ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+ mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+ }
+ return ret_slot;
+ }
+
+ /*
* Check the constraints of the tuple. Note that we pass the same
* slot for the orig_slot argument, because unlike ExecInsert(), no
* tuple-routing is performed here, hence the slot remains unchanged.
+ * We've already checked the partition constraint above; however, we
+ * must still ensure the tuple passes all other constraints, so we will
+ * call ExecConstraints() and have it validate all remaining checks.
*/
- if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck)
- ExecConstraints(resultRelInfo, slot, estate);
+ if (resultRelationDesc->rd_att->constr)
+ ExecConstraints(resultRelInfo, slot, estate, false);
/*
* replace the heap tuple
@@ -1476,7 +1678,6 @@ static void
ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
{
ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate);
- int i;
/* Check for transition tables on the directly targeted relation. */
mtstate->mt_transition_capture =
@@ -1500,60 +1701,148 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
mtstate->mt_oc_transition_capture != NULL)
{
int numResultRelInfos;
+ PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing;
- numResultRelInfos = (mtstate->mt_partition_tuple_slot != NULL ?
- mtstate->mt_num_partitions :
+ numResultRelInfos = (ptr != NULL ?
+ ptr->num_partitions :
mtstate->mt_nplans);
+ ExecSetupChildParentMap(mtstate, targetRelInfo, numResultRelInfos,
+ (ptr != NULL));
+
/*
- * Build array of conversion maps from each child's TupleDesc to the
- * one used in the tuplestore. The map pointers may be NULL when no
- * conversion is necessary, which is hopefully a common case for
- * partitions.
+ * Install the conversion map for the first plan for UPDATE and DELETE
+ * operations. It will be advanced each time we switch to the next
+ * plan. (INSERT operations set it every time, so we need not update
+ * mtstate->mt_oc_transition_capture here.)
*/
- mtstate->mt_transition_tupconv_maps = (TupleConversionMap **)
- palloc0(sizeof(TupleConversionMap *) * numResultRelInfos);
+ if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT)
+ mtstate->mt_transition_capture->tcs_map =
+ tupconv_map_for_subplan(mtstate, 0);
+ }
+}
- /* Choose the right set of partitions */
- if (mtstate->mt_partition_dispatch_info != NULL)
- {
- /*
- * For tuple routing among partitions, we need TupleDescs based
- * on the partition routing table.
- */
- ResultRelInfo **resultRelInfos = mtstate->mt_partitions;
+/*
+ * Initialize the child-to-root tuple conversion map array.
+ *
+ * This map array is required for two purposes:
+ * 1. For update tuple routing, we need to convert the tuple from the
+ * subplan result rel's descriptor to the root partitioned table's descriptor.
+ * 2. For capturing transition tuples of tables that are partitions. For
+ * UPDATEs, we need to convert the tuple from the subplan result rel's
+ * descriptor to the target table's descriptor, and for INSERTs, from the
+ * leaf partition's descriptor to the target table's descriptor.
+ *
+ * The caller can request either a per-subplan map or per-leaf-partition map.
+ */
+static void
+ExecSetupChildParentMap(ModifyTableState *mtstate,
+ ResultRelInfo *rootRelInfo,
+ int numResultRelInfos, bool perleaf)
+{
+ TupleDesc outdesc;
+ int i;
- for (i = 0; i < numResultRelInfos; ++i)
- {
- mtstate->mt_transition_tupconv_maps[i] =
- convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc),
- RelationGetDescr(targetRelInfo->ri_RelationDesc),
- gettext_noop("could not convert row type"));
- }
- }
- else
- {
- /* Otherwise we need the ResultRelInfo for each subplan. */
- ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+ /* First check if there is already one */
+ if (mtstate->mt_childparent_tupconv_maps)
+ {
+ /*
+ * If a per-leaf map is required and the map is already created, that
+ * map has to be per-leaf. If the existing map were per-subplan, we
+ * would not be able to access it leaf-partition-wise. A per-leaf map,
+ * on the other hand, can also be accessed subplan-wise through
+ * subplan_partition_offsets, via tupconv_map_for_subplan(). So callers
+ * that need to access the map both leaf-partition-wise and
+ * subplan-wise must make sure that the first call to this function
+ * passes perleaf=true, so that the map created is per-leaf, not
+ * per-subplan.
+ */
+ Assert(!(perleaf && !mtstate->mt_is_tupconv_perpart));
+ return;
+ }
- for (i = 0; i < numResultRelInfos; ++i)
- {
- mtstate->mt_transition_tupconv_maps[i] =
- convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
- RelationGetDescr(targetRelInfo->ri_RelationDesc),
- gettext_noop("could not convert row type"));
- }
+ /* Get tuple descriptor of the root partitioned table. */
+ outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc);
+
+ /*
+ * Build array of conversion maps from each child's TupleDesc to the
+ * one used in the tuplestore. The map pointers may be NULL when no
+ * conversion is necessary, which is hopefully a common case for
+ * partitions.
+ */
+ mtstate->mt_childparent_tupconv_maps = (TupleConversionMap **)
+ palloc0(sizeof(TupleConversionMap *) * numResultRelInfos);
+
+ /* Choose the right set of partitions */
+ if (perleaf)
+ {
+ /*
+ * For tuple routing among partitions, we need TupleDescs based
+ * on the partition routing table.
+ */
+ ResultRelInfo **resultRelInfos;
+
+ Assert(mtstate->mt_partition_tuple_routing != NULL);
+ resultRelInfos = mtstate->mt_partition_tuple_routing->partitions;
+
+ for (i = 0; i < numResultRelInfos; ++i)
+ {
+ mtstate->mt_childparent_tupconv_maps[i] =
+ convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc),
+ outdesc,
+ gettext_noop("could not convert row type"));
}
/*
- * Install the conversion map for the first plan for UPDATE and DELETE
- * operations. It will be advanced each time we switch to the next
- * plan. (INSERT operations set it every time, so we need not update
- * mtstate->mt_oc_transition_capture here.)
+ * Save the fact that the tuple conversion map is per-leaf, not
+ * per-subplan.
*/
- if (mtstate->mt_transition_capture)
- mtstate->mt_transition_capture->tcs_map =
- mtstate->mt_transition_tupconv_maps[0];
+ mtstate->mt_is_tupconv_perpart = true;
+ }
+ else
+ {
+ /* Otherwise we need the ResultRelInfo for each subplan. */
+ ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+
+ for (i = 0; i < numResultRelInfos; ++i)
+ {
+ mtstate->mt_childparent_tupconv_maps[i] =
+ convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
+ outdesc,
+ gettext_noop("could not convert row type"));
+ }
+ }
+
+}
+
+/*
+ * For a given subplan index, get the tuple conversion map.
+ */
+static TupleConversionMap *
+tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan)
+{
+ Assert(mtstate->mt_childparent_tupconv_maps != NULL);
+
+ /*
+ * If the tuple conversion map array is per-partition, we need to first get
+ * the index into the partition array.
+ */
+ if (mtstate->mt_is_tupconv_perpart)
+ {
+ int leaf_index;
+ PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing;
+
+ Assert(ptr && ptr->subplan_partition_offsets != NULL);
+ leaf_index = ptr->subplan_partition_offsets[whichplan];
+
+ Assert(leaf_index >= 0 && leaf_index < ptr->num_partitions);
+ return mtstate->mt_childparent_tupconv_maps[leaf_index];
+ }
+ else
+ {
+ Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
+ return mtstate->mt_childparent_tupconv_maps[whichplan];
}
}
@@ -1660,15 +1949,13 @@ ExecModifyTable(PlanState *pstate)
/* Prepare to convert transition tuples from this child. */
if (node->mt_transition_capture != NULL)
{
- Assert(node->mt_transition_tupconv_maps != NULL);
node->mt_transition_capture->tcs_map =
- node->mt_transition_tupconv_maps[node->mt_whichplan];
+ tupconv_map_for_subplan(node, node->mt_whichplan);
}
if (node->mt_oc_transition_capture != NULL)
{
- Assert(node->mt_transition_tupconv_maps != NULL);
node->mt_oc_transition_capture->tcs_map =
- node->mt_transition_tupconv_maps[node->mt_whichplan];
+ tupconv_map_for_subplan(node, node->mt_whichplan);
}
continue;
}
@@ -1785,7 +2072,8 @@ ExecModifyTable(PlanState *pstate)
break;
case CMD_DELETE:
slot = ExecDelete(node, tupleid, oldtuple, planSlot,
- &node->mt_epqstate, estate, node->canSetTag);
+ &node->mt_epqstate, estate,
+ NULL, true, node->canSetTag);
break;
default:
elog(ERROR, "unknown operation");
@@ -1830,9 +2118,14 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
ResultRelInfo *resultRelInfo;
TupleDesc tupDesc;
Plan *subplan;
+ int firstVarno = 0;
+ Relation firstResultRel = NULL;
ListCell *l;
int i;
Relation rel;
+ bool update_tuple_routing_needed = node->partKeyUpdated;
+ PartitionTupleRouting *ptr = NULL;
+ int num_partitions = 0;
/* check for unsupported flags */
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -1905,6 +2198,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
resultRelInfo->ri_IndexRelationDescs == NULL)
ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE);
+ /*
+ * If this is an UPDATE and a BEFORE UPDATE trigger is present, the
+ * trigger itself might modify the partition-key values. So arrange for
+ * tuple routing.
+ */
+ if (resultRelInfo->ri_TrigDesc &&
+ resultRelInfo->ri_TrigDesc->trig_update_before_row &&
+ operation == CMD_UPDATE)
+ update_tuple_routing_needed = true;
+
/* Now init the plan for this result rel */
estate->es_result_relation_info = resultRelInfo;
mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags);
@@ -1942,31 +2245,36 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
else
rel = mtstate->resultRelInfo->ri_RelationDesc;
- /* Build state for INSERT tuple routing */
- if (operation == CMD_INSERT &&
- rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
- {
- PartitionDispatch *partition_dispatch_info;
- ResultRelInfo **partitions;
- TupleConversionMap **partition_tupconv_maps;
- TupleTableSlot *partition_tuple_slot;
- int num_parted,
- num_partitions;
+ /*
+ * If it's not a partitioned table after all, UPDATE tuple routing should
+ * not be attempted.
+ */
+ if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ update_tuple_routing_needed = false;
+ /*
+	 * Build state for tuple routing if it's an INSERT or an UPDATE of the
+	 * partition key.
+ */
+ if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
+ (operation == CMD_INSERT || update_tuple_routing_needed))
+ {
ExecSetupPartitionTupleRouting(rel,
+ mtstate->resultRelInfo,
+ (operation == CMD_UPDATE ? nplans : 0),
node->nominalRelation,
estate,
- &partition_dispatch_info,
- &partitions,
- &partition_tupconv_maps,
- &partition_tuple_slot,
- &num_parted, &num_partitions);
- mtstate->mt_partition_dispatch_info = partition_dispatch_info;
- mtstate->mt_num_dispatch = num_parted;
- mtstate->mt_partitions = partitions;
- mtstate->mt_num_partitions = num_partitions;
- mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
- mtstate->mt_partition_tuple_slot = partition_tuple_slot;
+ &mtstate->mt_partition_tuple_routing);
+
+ ptr = mtstate->mt_partition_tuple_routing;
+ num_partitions = ptr->num_partitions;
+
+ /*
+		 * The following are needed as reference objects for mapping partition
+		 * attnos in expressions such as WithCheckOptions and RETURNING.
+ */
+ firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
+ firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
}
/*
@@ -1977,6 +2285,18 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
ExecSetupTransitionCaptureState(mtstate, estate);
/*
+	 * Construct a mapping from each of the per-subplan partition attnos to
+	 * the root attno. This is required when, during UPDATE row movement, the
+	 * tuple descriptor of a source partition does not match the root
+	 * partitioned table descriptor. In such a case we need to convert tuples
+	 * to the root tuple descriptor, because the search for the destination
+	 * partition starts from the root. Skip this setup if it's not a
+	 * partition-key update.
+ */
+ if (update_tuple_routing_needed)
+ ExecSetupChildParentMap(mtstate, getASTriggerResultRelInfo(mtstate),
+ mtstate->mt_nplans, false);
+
+ /*
* Initialize any WITH CHECK OPTION constraints if needed.
*/
resultRelInfo = mtstate->resultRelInfo;
@@ -2006,45 +2326,57 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
* Build WITH CHECK OPTION constraints for each leaf partition rel. Note
* that we didn't build the withCheckOptionList for each partition within
* the planner, but simple translation of the varattnos for each partition
- * will suffice. This only occurs for the INSERT case; UPDATE/DELETE
- * cases are handled above.
+ * will suffice. This only occurs for the INSERT case or for UPDATE row
+ * movement. DELETEs and local UPDATEs are handled above.
*/
- if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0)
+ if (node->withCheckOptionLists != NIL && num_partitions > 0)
{
- List *wcoList;
- PlanState *plan;
+ List *first_wcoList;
/*
* In case of INSERT on partitioned tables, there is only one plan.
* Likewise, there is only one WITH CHECK OPTIONS list, not one per
- * partition. We make a copy of the WCO qual for each partition; note
- * that, if there are SubPlans in there, they all end up attached to
- * the one parent Plan node.
+		 * partition. For UPDATE, in contrast, there are as many WCO lists as
+		 * there are plans. In either case, use the WCO list of the first
+		 * resultRelInfo as a reference to calculate attnos for the WCO list
+		 * of each of the partitions. We make a copy of the WCO qual for each
+		 * partition. Note that, if there are SubPlans in there, they all end
+		 * up attached to the one parent Plan node.
*/
- Assert(operation == CMD_INSERT &&
- list_length(node->withCheckOptionLists) == 1 &&
- mtstate->mt_nplans == 1);
- wcoList = linitial(node->withCheckOptionLists);
- plan = mtstate->mt_plans[0];
- for (i = 0; i < mtstate->mt_num_partitions; i++)
+ Assert(update_tuple_routing_needed ||
+ (operation == CMD_INSERT &&
+ list_length(node->withCheckOptionLists) == 1 &&
+ mtstate->mt_nplans == 1));
+
+ first_wcoList = linitial(node->withCheckOptionLists);
+ for (i = 0; i < num_partitions; i++)
{
Relation partrel;
List *mapped_wcoList;
List *wcoExprs = NIL;
ListCell *ll;
- resultRelInfo = mtstate->mt_partitions[i];
+ resultRelInfo = ptr->partitions[i];
+
+ /*
+ * If we are referring to a resultRelInfo from one of the update
+ * result rels, that result rel would already have WithCheckOptions
+ * initialized.
+ */
+ if (resultRelInfo->ri_WithCheckOptions)
+ continue;
+
partrel = resultRelInfo->ri_RelationDesc;
- /* varno = node->nominalRelation */
- mapped_wcoList = map_partition_varattnos(wcoList,
- node->nominalRelation,
- partrel, rel, NULL);
+ mapped_wcoList = map_partition_varattnos(first_wcoList,
+ firstVarno,
+ partrel, firstResultRel,
+ NULL);
foreach(ll, mapped_wcoList)
{
WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
- plan);
+ &mtstate->ps);
wcoExprs = lappend(wcoExprs, wcoExpr);
}
@@ -2061,7 +2393,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
{
TupleTableSlot *slot;
ExprContext *econtext;
- List *returningList;
+ List *firstReturningList;
/*
* Initialize result tuple slot and assign its rowtype using the first
@@ -2098,22 +2430,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
* Build a projection for each leaf partition rel. Note that we
* didn't build the returningList for each partition within the
* planner, but simple translation of the varattnos for each partition
- * will suffice. This only occurs for the INSERT case; UPDATE/DELETE
- * are handled above.
+ * will suffice. This only occurs for the INSERT case or for UPDATE
+ * row movement. DELETEs and local UPDATEs are handled above.
*/
- returningList = linitial(node->returningLists);
- for (i = 0; i < mtstate->mt_num_partitions; i++)
+ firstReturningList = linitial(node->returningLists);
+ for (i = 0; i < num_partitions; i++)
{
Relation partrel;
List *rlist;
- resultRelInfo = mtstate->mt_partitions[i];
+ resultRelInfo = ptr->partitions[i];
+
+ /*
+ * If we are referring to a resultRelInfo from one of the update
+ * result rels, that result rel would already have a returningList
+ * built.
+ */
+ if (resultRelInfo->ri_projectReturning)
+ continue;
+
partrel = resultRelInfo->ri_RelationDesc;
- /* varno = node->nominalRelation */
- rlist = map_partition_varattnos(returningList,
- node->nominalRelation,
- partrel, rel, NULL);
+ /*
+			 * Use the RETURNING list of the first resultRelInfo as a
+			 * reference to calculate attnos for the RETURNING list of
+			 * each of the partitions.
+ */
+ rlist = map_partition_varattnos(firstReturningList,
+ firstVarno,
+ partrel, firstResultRel, NULL);
resultRelInfo->ri_projectReturning =
ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
resultRelInfo->ri_RelationDesc->rd_att);
@@ -2358,6 +2703,7 @@ void
ExecEndModifyTable(ModifyTableState *node)
{
int i;
+ CmdType operation = node->operation;
/*
* Allow any FDWs to shut down
@@ -2376,29 +2722,46 @@ ExecEndModifyTable(ModifyTableState *node)
/*
* Close all the partitioned tables, leaf partitions, and their indices
*
- * Remember node->mt_partition_dispatch_info[0] corresponds to the root
+ * Remember ptr->partition_dispatch_info[0] corresponds to the root
* partitioned table, which we must not try to close, because it is the
* main target table of the query that will be closed by ExecEndPlan().
* Also, tupslot is NULL for the root partitioned table.
*/
- for (i = 1; i < node->mt_num_dispatch; i++)
+ if (node->mt_partition_tuple_routing)
{
- PartitionDispatch pd = node->mt_partition_dispatch_info[i];
+ PartitionTupleRouting *ptr = node->mt_partition_tuple_routing;
- heap_close(pd->reldesc, NoLock);
- ExecDropSingleTupleTableSlot(pd->tupslot);
- }
- for (i = 0; i < node->mt_num_partitions; i++)
- {
- ResultRelInfo *resultRelInfo = node->mt_partitions[i];
+ for (i = 1; i < ptr->num_dispatch; i++)
+ {
+ PartitionDispatch pd = ptr->partition_dispatch_info[i];
- ExecCloseIndices(resultRelInfo);
- heap_close(resultRelInfo->ri_RelationDesc, NoLock);
- }
+ heap_close(pd->reldesc, NoLock);
+ ExecDropSingleTupleTableSlot(pd->tupslot);
+ }
+ for (i = 0; i < ptr->num_partitions; i++)
+ {
+ ResultRelInfo *resultRelInfo = ptr->partitions[i];
+
+ /*
+ * If this result rel is one of the subplan result rels, let
+ * ExecEndPlan() close it. For INSERTs, this does not apply because
+ * leaf partition result rels are always newly allocated.
+ */
+ if (operation == CMD_UPDATE &&
+ resultRelInfo >= node->resultRelInfo &&
+ resultRelInfo < node->resultRelInfo + node->mt_nplans)
+ continue;
- /* Release the standalone partition tuple descriptor, if any */
- if (node->mt_partition_tuple_slot)
- ExecDropSingleTupleTableSlot(node->mt_partition_tuple_slot);
+ ExecCloseIndices(resultRelInfo);
+ heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+ }
+
+ /* Release the standalone partition tuple descriptors, if any */
+ if (ptr->root_tuple_slot)
+ ExecDropSingleTupleTableSlot(ptr->root_tuple_slot);
+ if (ptr->partition_tuple_slot)
+ ExecDropSingleTupleTableSlot(ptr->partition_tuple_slot);
+ }
/*
* Free the exprcontext
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index d9ff8a7..0f2f970 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -204,6 +204,7 @@ _copyModifyTable(const ModifyTable *from)
COPY_SCALAR_FIELD(canSetTag);
COPY_SCALAR_FIELD(nominalRelation);
COPY_NODE_FIELD(partitioned_rels);
+ COPY_SCALAR_FIELD(partKeyUpdated);
COPY_NODE_FIELD(resultRelations);
COPY_SCALAR_FIELD(resultRelIndex);
COPY_SCALAR_FIELD(rootResultRelIndex);
@@ -2261,6 +2262,7 @@ _copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from)
COPY_SCALAR_FIELD(parent_relid);
COPY_NODE_FIELD(child_rels);
+ COPY_SCALAR_FIELD(is_partition_key_update);
return newnode;
}
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 2866fd7..6e2e3dd 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -908,6 +908,7 @@ _equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const Partitione
{
COMPARE_SCALAR_FIELD(parent_relid);
COMPARE_NODE_FIELD(child_rels);
+ COMPARE_SCALAR_FIELD(is_partition_key_update);
return true;
}
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index c97ee24..a5e71a2 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -372,6 +372,7 @@ _outModifyTable(StringInfo str, const ModifyTable *node)
WRITE_BOOL_FIELD(canSetTag);
WRITE_UINT_FIELD(nominalRelation);
WRITE_NODE_FIELD(partitioned_rels);
+ WRITE_BOOL_FIELD(partKeyUpdated);
WRITE_NODE_FIELD(resultRelations);
WRITE_INT_FIELD(resultRelIndex);
WRITE_INT_FIELD(rootResultRelIndex);
@@ -2103,6 +2104,7 @@ _outModifyTablePath(StringInfo str, const ModifyTablePath *node)
WRITE_BOOL_FIELD(canSetTag);
WRITE_UINT_FIELD(nominalRelation);
WRITE_NODE_FIELD(partitioned_rels);
+ WRITE_BOOL_FIELD(partKeyUpdated);
WRITE_NODE_FIELD(resultRelations);
WRITE_NODE_FIELD(subpaths);
WRITE_NODE_FIELD(subroots);
@@ -2525,6 +2527,7 @@ _outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node)
WRITE_UINT_FIELD(parent_relid);
WRITE_NODE_FIELD(child_rels);
+ WRITE_BOOL_FIELD(is_partition_key_update);
}
static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 7eb67fc0..9542b94 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1568,6 +1568,7 @@ _readModifyTable(void)
READ_BOOL_FIELD(canSetTag);
READ_UINT_FIELD(nominalRelation);
READ_NODE_FIELD(partitioned_rels);
+ READ_BOOL_FIELD(partKeyUpdated);
READ_NODE_FIELD(resultRelations);
READ_INT_FIELD(resultRelIndex);
READ_INT_FIELD(rootResultRelIndex);
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 44f6b03..be34463 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -1359,7 +1359,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
case RTE_RELATION:
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
partitioned_rels =
- get_partitioned_child_rels(root, rel->relid);
+ get_partitioned_child_rels(root, rel->relid, NULL);
break;
case RTE_SUBQUERY:
build_partitioned_rels = true;
@@ -1397,7 +1397,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
{
List *cprels;
- cprels = get_partitioned_child_rels(root, childrel->relid);
+ cprels = get_partitioned_child_rels(root, childrel->relid, NULL);
partitioned_rels = list_concat(partitioned_rels,
list_copy(cprels));
}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index d445477..549821e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -278,6 +278,7 @@ static ProjectSet *make_project_set(List *tlist, Plan *subplan);
static ModifyTable *make_modifytable(PlannerInfo *root,
CmdType operation, bool canSetTag,
Index nominalRelation, List *partitioned_rels,
+ bool partKeyUpdated,
List *resultRelations, List *subplans,
List *withCheckOptionLists, List *returningLists,
List *rowMarks, OnConflictExpr *onconflict, int epqParam);
@@ -2371,6 +2372,7 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path)
best_path->canSetTag,
best_path->nominalRelation,
best_path->partitioned_rels,
+ best_path->partKeyUpdated,
best_path->resultRelations,
subplans,
best_path->withCheckOptionLists,
@@ -6428,6 +6430,7 @@ static ModifyTable *
make_modifytable(PlannerInfo *root,
CmdType operation, bool canSetTag,
Index nominalRelation, List *partitioned_rels,
+ bool partKeyUpdated,
List *resultRelations, List *subplans,
List *withCheckOptionLists, List *returningLists,
List *rowMarks, OnConflictExpr *onconflict, int epqParam)
@@ -6454,6 +6457,7 @@ make_modifytable(PlannerInfo *root,
node->canSetTag = canSetTag;
node->nominalRelation = nominalRelation;
node->partitioned_rels = partitioned_rels;
+ node->partKeyUpdated = partKeyUpdated;
node->resultRelations = resultRelations;
node->resultRelIndex = -1; /* will be set correctly in setrefs.c */
node->rootResultRelIndex = -1; /* will be set correctly in setrefs.c */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index ef2eaea..ce26bbe 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1101,6 +1101,7 @@ inheritance_planner(PlannerInfo *root)
Query *parent_parse;
Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex);
PlannerInfo **parent_roots = NULL;
+ bool partColsUpdated = false;
Assert(parse->commandType != CMD_INSERT);
@@ -1172,7 +1173,8 @@ inheritance_planner(PlannerInfo *root)
if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE)
{
nominalRelation = top_parentRTindex;
- partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex);
+ partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex,
+ &partColsUpdated);
/* The root partitioned table is included as a child rel */
Assert(list_length(partitioned_rels) >= 1);
}
@@ -1512,6 +1514,7 @@ inheritance_planner(PlannerInfo *root)
parse->canSetTag,
nominalRelation,
partitioned_rels,
+ partColsUpdated,
resultRelations,
subpaths,
subroots,
@@ -2123,6 +2126,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update,
parse->canSetTag,
parse->resultRelation,
NIL,
+ false,
list_make1_int(parse->resultRelation),
list_make1(path),
list_make1(root),
@@ -6152,17 +6156,22 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
/*
* get_partitioned_child_rels
* Returns a list of the RT indexes of the partitioned child relations
- * with rti as the root parent RT index.
+ *		with rti as the root parent RT index. Also, if is_partition_key_update
+ *		is non-NULL, it is set to true if any of the root rte's updated
+ *		columns is used in the partition key of any partitioned table in that
+ *		partition tree.
*
* Note: This function might get called even for range table entries that
* are not partitioned tables; in such a case, it will simply return NIL.
*/
List *
-get_partitioned_child_rels(PlannerInfo *root, Index rti)
+get_partitioned_child_rels(PlannerInfo *root, Index rti,
+ bool *is_partition_key_update)
{
List *result = NIL;
ListCell *l;
+ if (is_partition_key_update)
+ *is_partition_key_update = false;
+
foreach(l, root->pcinfo_list)
{
PartitionedChildRelInfo *pc = lfirst_node(PartitionedChildRelInfo, l);
@@ -6170,6 +6179,8 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti)
if (pc->parent_relid == rti)
{
result = pc->child_rels;
+ if (is_partition_key_update)
+ *is_partition_key_update = pc->is_partition_key_update;
break;
}
}
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index f620243..7babb35 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -105,7 +105,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root,
RangeTblEntry *parentrte,
Index parentRTindex, Relation parentrel,
PlanRowMark *top_parentrc, LOCKMODE lockmode,
- List **appinfos, List **partitioned_child_rels);
+ List **appinfos, List **partitioned_child_rels,
+ bool *is_partition_key_update);
static void expand_single_inheritance_child(PlannerInfo *root,
RangeTblEntry *parentrte,
Index parentRTindex, Relation parentrel,
@@ -1466,16 +1467,19 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
if (RelationGetPartitionDesc(oldrelation) != NULL)
{
List *partitioned_child_rels = NIL;
+ bool is_partition_key_update = false;
Assert(rte->relkind == RELKIND_PARTITIONED_TABLE);
/*
* If this table has partitions, recursively expand them in the order
- * in which they appear in the PartitionDesc.
+		 * in which they appear in the PartitionDesc. While at it, also record
+		 * whether any partition key columns of the partitioned tables are
+		 * being updated.
*/
expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc,
lockmode, &root->append_rel_list,
- &partitioned_child_rels);
+ &partitioned_child_rels,
+ &is_partition_key_update);
/*
* We keep a list of objects in root, each of which maps a root
@@ -1492,6 +1496,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
pcinfo = makeNode(PartitionedChildRelInfo);
pcinfo->parent_relid = rti;
pcinfo->child_rels = partitioned_child_rels;
+ pcinfo->is_partition_key_update = is_partition_key_update;
root->pcinfo_list = lappend(root->pcinfo_list, pcinfo);
}
}
@@ -1568,7 +1573,8 @@ static void
expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
Index parentRTindex, Relation parentrel,
PlanRowMark *top_parentrc, LOCKMODE lockmode,
- List **appinfos, List **partitioned_child_rels)
+ List **appinfos, List **partitioned_child_rels,
+ bool *is_partition_key_update)
{
int i;
RangeTblEntry *childrte;
@@ -1583,6 +1589,17 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
Assert(parentrte->inh);
+ /*
+	 * Note down whether any partition key columns are being updated. Although
+	 * it is the root partitioned table's updatedCols that we are interested
+	 * in, we use parentrte's updatedCols instead; this is convenient because
+	 * parentrte already has the root partrel's updatedCols translated to
+	 * match the attribute ordering of parentrel.
+ */
+ if (!*is_partition_key_update)
+ *is_partition_key_update =
+ has_partition_attrs(parentrel, parentrte->updatedCols, NULL);
+
/* First expand the partitioned table itself. */
expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel,
top_parentrc, parentrel,
@@ -1622,7 +1639,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
expand_partitioned_rtentry(root, childrte, childRTindex,
childrel, top_parentrc, lockmode,
- appinfos, partitioned_child_rels);
+ appinfos, partitioned_child_rels,
+ is_partition_key_update);
/* Close child relation, but keep locks */
heap_close(childrel, NoLock);
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 68dee0f..0ce5339 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3207,6 +3207,8 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel,
* 'partitioned_rels' is an integer list of RT indexes of non-leaf tables in
* the partition tree, if this is an UPDATE/DELETE to a partitioned table.
* Otherwise NIL.
+ * 'partKeyUpdated' is true if any partitioning columns are being updated,
+ *		either from the named relation or a descendant partitioned table.
* 'resultRelations' is an integer list of actual RT indexes of target rel(s)
* 'subpaths' is a list of Path(s) producing source data (one per rel)
* 'subroots' is a list of PlannerInfo structs (one per rel)
@@ -3220,6 +3222,7 @@ ModifyTablePath *
create_modifytable_path(PlannerInfo *root, RelOptInfo *rel,
CmdType operation, bool canSetTag,
Index nominalRelation, List *partitioned_rels,
+ bool partKeyUpdated,
List *resultRelations, List *subpaths,
List *subroots,
List *withCheckOptionLists, List *returningLists,
@@ -3287,6 +3290,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel,
pathnode->canSetTag = canSetTag;
pathnode->nominalRelation = nominalRelation;
pathnode->partitioned_rels = list_copy(partitioned_rels);
+ pathnode->partKeyUpdated = partKeyUpdated;
pathnode->resultRelations = resultRelations;
pathnode->subpaths = subpaths;
pathnode->subroots = subroots;
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 295e9d2..c6fee08 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -54,12 +54,16 @@ extern void check_new_partition_bound(char *relname, Relation parent,
extern Oid get_partition_parent(Oid relid);
extern List *get_qual_from_partbound(Relation rel, Relation parent,
PartitionBoundSpec *spec);
-extern List *map_partition_varattnos(List *expr, int target_varno,
- Relation partrel, Relation parent,
+extern List *map_partition_varattnos(List *expr, int fromrel_varno,
+ Relation to_rel, Relation from_rel,
bool *found_whole_row);
extern List *RelationGetPartitionQual(Relation rel);
extern Expr *get_partition_qual_relid(Oid relid);
-
+extern void pull_child_partition_columns(Relation rel,
+ Relation parent,
+ Bitmapset **partcols);
+extern bool has_partition_attrs(Relation rel, Bitmapset *attnums,
+ bool *used_in_expr);
extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc);
extern Oid get_default_partition_oid(Oid parentId);
extern void update_default_partition_oid(Oid parentId, Oid defaultPartId);
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 64e5aab..7e69c48 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -49,14 +49,51 @@ typedef struct PartitionDispatchData
typedef struct PartitionDispatchData *PartitionDispatch;
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to execute
+ * tuple-routing between partitions.
+ *
+ * partition_dispatch_info Array of PartitionDispatch objects with one
+ * entry for every partitioned table in the
+ * partition tree.
+ * num_dispatch				Number of partitioned tables in the partition
+ *								tree (= length of partition_dispatch_info[])
+ * partitions Array of ResultRelInfo* objects with one entry
+ * for every leaf partition in the partition tree.
+ * num_partitions Number of leaf partitions in the partition tree
+ * (= 'partitions' array length)
+ * parentchild_tupconv_maps Array of TupleConversionMap objects with one
+ * entry for every leaf partition (required to
+ * convert input tuple based on the root table's
+ * rowtype to a leaf partition's rowtype after
+ * tuple routing is done)
+ * subplan_partition_offsets	Integer array ordered by UPDATE subplans. Each
+ *								element contains the index, in the 'partitions'
+ *								array, of the leaf partition targeted by the
+ *								corresponding subplan.
+ * partition_tuple_slot TupleTableSlot to be used to manipulate any
+ * given leaf partition's rowtype after that
+ * partition is chosen for insertion by
+ * tuple-routing.
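+ * root_tuple_slot				TupleTableSlot to be used to transiently hold
+ *								a copy of the tuple being moved during UPDATE
+ *								tuple routing, converted to the root
+ *								partitioned table's rowtype.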
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+ PartitionDispatch *partition_dispatch_info;
+ int num_dispatch;
+ ResultRelInfo **partitions;
+ int num_partitions;
+ TupleConversionMap **parentchild_tupconv_maps;
+ int *subplan_partition_offsets;
+ TupleTableSlot *partition_tuple_slot;
+ TupleTableSlot *root_tuple_slot;
+} PartitionTupleRouting;
+
extern void ExecSetupPartitionTupleRouting(Relation rel,
+ ResultRelInfo *update_rri,
+ int num_update_rri,
Index resultRTindex,
EState *estate,
- PartitionDispatch **pd,
- ResultRelInfo ***partitions,
- TupleConversionMap ***tup_conv_maps,
- TupleTableSlot **partition_tuple_slot,
- int *num_parted, int *num_partitions);
+ PartitionTupleRouting **partition_tuple_routing);
extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
PartitionDispatch *pd,
TupleTableSlot *slot,
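As an editorial aside (not part of the patch): a minimal sketch of how the
subplan_partition_offsets array described above is intended to be consulted,
mirroring the lookup done in tupconv_map_for_subplan(); the helper name below
is hypothetical.

    static ResultRelInfo *
    leaf_part_rri_for_subplan(PartitionTupleRouting *ptr, int whichplan)
    {
        /* Each UPDATE subplan has a fixed offset into the leaf partitions array */
        int         leaf_index = ptr->subplan_partition_offsets[whichplan];

        Assert(leaf_index >= 0 && leaf_index < ptr->num_partitions);
        return ptr->partitions[leaf_index];
    }

For plain INSERT tuple routing, the patch passes zero UPDATE result rels to
ExecSetupPartitionTupleRouting, so no such per-subplan lookup is needed in
that case.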
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index b5578f5..5a385e2 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -187,9 +187,12 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid);
extern void ExecCleanUpTriggerState(EState *estate);
extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids);
extern void ExecConstraints(ResultRelInfo *resultRelInfo,
- TupleTableSlot *slot, EState *estate);
-extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot, EState *estate,
+ bool check_partition_constraint);
+extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo,
TupleTableSlot *slot, EState *estate);
+extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo,
+ TupleTableSlot *slot, EState *estate);
extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo,
TupleTableSlot *slot, EState *estate);
extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e05bc04..64cf3dd 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -976,21 +976,15 @@ typedef struct ModifyTableState
TupleTableSlot *mt_existing; /* slot to store existing target tuple in */
List *mt_excludedtlist; /* the excluded pseudo relation's tlist */
TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */
- struct PartitionDispatchData **mt_partition_dispatch_info;
- /* Tuple-routing support info */
- int mt_num_dispatch; /* Number of entries in the above array */
- int mt_num_partitions; /* Number of members in the following
- * arrays */
- ResultRelInfo **mt_partitions; /* Per partition result relation pointers */
- TupleConversionMap **mt_partition_tupconv_maps;
- /* Per partition tuple conversion map */
- TupleTableSlot *mt_partition_tuple_slot;
+ struct PartitionTupleRouting *mt_partition_tuple_routing; /* Tuple-routing support info */
struct TransitionCaptureState *mt_transition_capture;
/* controls transition table population for specified operation */
struct TransitionCaptureState *mt_oc_transition_capture;
/* controls transition table population for INSERT...ON CONFLICT UPDATE */
- TupleConversionMap **mt_transition_tupconv_maps;
- /* Per plan/partition tuple conversion */
+ TupleConversionMap **mt_childparent_tupconv_maps;
+ /* Per plan/partition map for tuple conversion from child to root */
+ bool mt_is_tupconv_perpart; /* Is the above map per-partition? */
} ModifyTableState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 9b38d44..b36dafc 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -219,6 +219,7 @@ typedef struct ModifyTable
Index nominalRelation; /* Parent RT index for use of EXPLAIN */
/* RT indexes of non-leaf tables in a partition tree */
List *partitioned_rels;
+ bool partKeyUpdated; /* some part key in hierarchy updated */
List *resultRelations; /* integer list of RT indexes */
int resultRelIndex; /* index of first resultRel in plan's list */
int rootResultRelIndex; /* index of the partitioned table root */
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 9e68e65..43d0164 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -1667,6 +1667,7 @@ typedef struct ModifyTablePath
Index nominalRelation; /* Parent RT index for use of EXPLAIN */
/* RT indexes of non-leaf tables in a partition tree */
List *partitioned_rels;
+ bool partKeyUpdated; /* some part key in hierarchy updated */
List *resultRelations; /* integer list of RT indexes */
List *subpaths; /* Path(s) producing source data */
List *subroots; /* per-target-table PlannerInfos */
@@ -2117,6 +2118,9 @@ typedef struct PartitionedChildRelInfo
Index parent_relid;
List *child_rels;
+ bool is_partition_key_update; /* is the partition key of any of
+ * the partitioned tables
+ * updated? */
} PartitionedChildRelInfo;
/*
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e9ed16a..39ce47d 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -238,6 +238,7 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root,
RelOptInfo *rel,
CmdType operation, bool canSetTag,
Index nominalRelation, List *partitioned_rels,
+ bool partColsUpdated,
List *resultRelations, List *subpaths,
List *subroots,
List *withCheckOptionLists, List *returningLists,
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h
index 2801bfd..9f0533c 100644
--- a/src/include/optimizer/planner.h
+++ b/src/include/optimizer/planner.h
@@ -57,7 +57,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr);
extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
-extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti);
+extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti,
+ bool *is_partition_key_update);
extern List *get_partitioned_child_rels_for_join(PlannerInfo *root,
Relids join_relids);
diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
index b69ceaa..dd6242b 100644
--- a/src/test/regress/expected/update.out
+++ b/src/test/regress/expected/update.out
@@ -198,36 +198,371 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a)
DROP TABLE update_test;
DROP TABLE upsert_test;
--- update to a partition should check partition bound constraint for the new tuple
-create table range_parted (
+---------------------------
+-- UPDATE with row movement
+---------------------------
+-- update to a partition should check partition bound constraint for the new tuple.
+-- If the partition key is updated, the row should be moved to the appropriate
+-- partition. Updatable views using partitions should enforce the check options
+-- for the rows that have been moved.
+create table mintab(c1 int);
+insert into mintab values (120);
+CREATE TABLE range_parted (
a text,
- b int
+ b bigint,
+ c numeric,
+ d int,
+ e varchar
) partition by range (a, b);
-create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
-create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION;
+-- Create partitions intentionally in descending bound order, so as to test
+-- that update-row-movement works with the leaf partitions not in bound order.
+create table part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int);
+alter table range_parted attach partition part_b_20_b_30 for values from ('b', 20) to ('b', 30);
+create table part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) partition by range (c);
create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10);
-create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20);
-insert into part_a_1_a_10 values ('a', 1);
-insert into part_b_10_b_20 values ('b', 10);
--- fail
-update part_a_1_a_10 set a = 'b' where a = 'a';
-ERROR: new row for relation "part_a_1_a_10" violates partition constraint
-DETAIL: Failing row contains (b, 1).
-update range_parted set b = b - 1 where b = 10;
-ERROR: new row for relation "part_b_10_b_20" violates partition constraint
-DETAIL: Failing row contains (b, 9).
--- ok
-update range_parted set b = b + 1 where b = 10;
+alter table range_parted attach partition part_b_10_b_20 for values from ('b', 10) to ('b', 20);
+create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
+-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions
+update part_b_10_b_20 set b = b - 6;
+-- As mentioned above, the partition creation is intentionally kept in descending bound order.
+create table part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) partition by range (d);
+alter table part_c_100_200 drop column e, drop column c, drop column a;
+alter table part_c_100_200 add column c numeric, add column e varchar, add column a text;
+alter table part_c_100_200 drop column b;
+alter table part_c_100_200 add column b bigint;
+create table part_d_1_15 partition of part_c_100_200 for values from (1) to (15);
+create table part_d_15_20 partition of part_c_100_200 for values from (15) to (20);
+alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200);
+create table part_c_1_100 (e varchar, d int, c numeric, b bigint, a text);
+alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100);
+\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)'
+\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted order by 1, 2, 3, 4, 5, 6'
+:init_range_parted;
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 12 | 96 | 1 |
+ part_c_1_100 | b | 13 | 97 | 2 |
+ part_d_15_20 | b | 15 | 105 | 16 |
+ part_d_15_20 | b | 17 | 105 | 19 |
+(6 rows)
+
+-- The order of subplans should be in bound order
+explain (costs off) update range_parted set c = c - 50 where c > 97;
+ QUERY PLAN
+-------------------------------------
+ Update on range_parted
+ Update on part_a_1_a_10
+ Update on part_a_10_a_20
+ Update on part_b_1_b_10
+ Update on part_c_1_100
+ Update on part_d_1_15
+ Update on part_d_15_20
+ Update on part_b_20_b_30
+ -> Seq Scan on part_a_1_a_10
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_a_10_a_20
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_b_1_b_10
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_c_1_100
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_d_1_15
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_d_15_20
+ Filter: (c > '97'::numeric)
+ -> Seq Scan on part_b_20_b_30
+ Filter: (c > '97'::numeric)
+(22 rows)
+
+-- fail (row movement happens only within the partition subtree):
+update part_c_100_200 set c = c - 20, d = c where c = 105;
+ERROR: new row for relation "part_c_100_200" violates partition constraint
+DETAIL: Failing row contains (105, 85, null, b, 15).
+-- fail (no partition key update, so no attempt to move tuple, but "a = 'a'" violates partition constraint enforced by root partition)
+update part_b_10_b_20 set a = 'a';
+ERROR: new row for relation "part_c_1_100" violates partition constraint
+DETAIL: Failing row contains (null, 1, 96, 12, a).
+-- success; partition key update, no constraint violation
+update range_parted set d = d - 10 where d > 10;
+-- success; no partition key update, no constraint violation
+update range_parted set e = d;
+-- No row found:
+update part_c_1_100 set c = c + 20 where c = 98;
+-- ok (row movement)
+update part_b_10_b_20 set c = c + 20 returning c, b, a;
+ c | b | a
+-----+----+---
+ 116 | 12 | b
+ 117 | 13 | b
+ 125 | 15 | b
+ 125 | 17 | b
+(4 rows)
+
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+---+---
+ part_a_10_a_20 | a | 10 | 200 | 1 | 1
+ part_a_1_a_10 | a | 1 | 1 | 1 | 1
+ part_d_1_15 | b | 12 | 116 | 1 | 1
+ part_d_1_15 | b | 13 | 117 | 2 | 2
+ part_d_1_15 | b | 15 | 125 | 6 | 6
+ part_d_1_15 | b | 17 | 125 | 9 | 9
+(6 rows)
+
+-- fail (row movement happens only within the partition subtree):
+update part_b_10_b_20 set b = b - 6 where c > 116 returning *;
+ERROR: new row for relation "part_d_1_15" violates partition constraint
+DETAIL: Failing row contains (2, 117, 2, b, 7).
+-- ok (row movement, with subset of rows moved into different partition)
+update range_parted set b = b - 6 where c > 116 returning a, b + c;
+ a | ?column?
+---+----------
+ a | 204
+ b | 124
+ b | 134
+ b | 136
+(4 rows)
+
+:show_data;
+ partname | a | b | c | d | e
+---------------+---+----+-----+---+---
+ part_a_1_a_10 | a | 1 | 1 | 1 | 1
+ part_a_1_a_10 | a | 4 | 200 | 1 | 1
+ part_b_1_b_10 | b | 7 | 117 | 2 | 2
+ part_b_1_b_10 | b | 9 | 125 | 6 | 6
+ part_d_1_15 | b | 11 | 125 | 9 | 9
+ part_d_1_15 | b | 12 | 116 | 1 | 1
+(6 rows)
+
+-- update partition key using updatable view.
+-- succeeds
+update upview set c = 199 where b = 4;
+-- fail, check option violation
+update upview set c = 120 where b = 4;
+ERROR: new row violates check option for view "upview"
+DETAIL: Failing row contains (a, 4, 120, 1, 1).
+-- fail, row movement with check option violation
+update upview set a = 'b', b = 15, c = 120 where b = 4;
+ERROR: new row violates check option for view "upview"
+DETAIL: Failing row contains (b, 15, 120, 1, 1).
+-- succeeds, row movement, check option passes
+update upview set a = 'b', b = 15 where b = 4;
+:show_data;
+ partname | a | b | c | d | e
+---------------+---+----+-----+---+---
+ part_a_1_a_10 | a | 1 | 1 | 1 | 1
+ part_b_1_b_10 | b | 7 | 117 | 2 | 2
+ part_b_1_b_10 | b | 9 | 125 | 6 | 6
+ part_d_1_15 | b | 11 | 125 | 9 | 9
+ part_d_1_15 | b | 12 | 116 | 1 | 1
+ part_d_1_15 | b | 15 | 199 | 1 | 1
+(6 rows)
+
+-- cleanup
+drop view upview;
+-- RETURNING having whole-row vars.
+----------------------------------
+:init_range_parted;
+update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *;
+ range_parted | a | b | c | d | e
+---------------+---+----+----+----+---
+ (b,15,95,16,) | b | 15 | 95 | 16 |
+ (b,17,95,19,) | b | 17 | 95 | 19 |
+(2 rows)
+
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 12 | 96 | 1 |
+ part_c_1_100 | b | 13 | 97 | 2 |
+ part_c_1_100 | b | 15 | 95 | 16 |
+ part_c_1_100 | b | 17 | 95 | 19 |
+(6 rows)
+
+-- Transition tables with update row movement
+---------------------------------------------
+:init_range_parted;
+create function trans_updatetrigfunc() returns trigger language plpgsql as
+$$
+ begin
+ raise notice 'trigger = %, old table = %, new table = %',
+ TG_NAME,
+ (select string_agg(old_table::text, ', ' order by a) from old_table),
+ (select string_agg(new_table::text, ', ' order by a) from new_table);
+ return null;
+ end;
+$$;
+create trigger trans_updatetrig
+ after update on range_parted referencing old table as old_table new table as new_table
+ for each statement execute procedure trans_updatetrigfunc();
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,)
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 13 | 98 | 2 |
+ part_d_15_20 | b | 15 | 106 | 16 |
+ part_d_15_20 | b | 17 | 106 | 19 |
+ part_d_1_15 | b | 12 | 110 | 1 |
+(6 rows)
+
+:init_range_parted;
+-- Enabling OLD TABLE capture for both DELETE and UPDATE statement triggers
+-- should not cause DELETEd rows to be captured twice. The same applies to
+-- INSERT triggers and inserted rows.
+create trigger trans_deletetrig
+ after delete on range_parted referencing old table as old_table
+ for each statement execute procedure trans_updatetrigfunc();
+create trigger trans_inserttrig
+ after insert on range_parted referencing new table as new_table
+ for each statement execute procedure trans_updatetrigfunc();
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,)
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_d_15_20 | b | 15 | 155 | 16 |
+ part_d_15_20 | b | 17 | 155 | 19 |
+ part_d_1_15 | b | 12 | 146 | 1 |
+ part_d_1_15 | b | 13 | 147 | 2 |
+(6 rows)
+
+drop trigger trans_updatetrig ON range_parted;
+drop trigger trans_deletetrig ON range_parted;
+drop trigger trans_inserttrig ON range_parted;
+-- Install BR triggers on the child partitions, so that transition tuple conversion takes place.
+create function func_parted_mod_b() returns trigger as $$
+begin
+ NEW.b = NEW.b + 1;
+ return NEW;
+end $$ language plpgsql;
+create trigger trig_c1_100 before update or insert on part_c_1_100
+ for each row execute procedure func_parted_mod_b();
+create trigger trig_d1_15 before update or insert on part_d_1_15
+ for each row execute procedure func_parted_mod_b();
+create trigger trig_d15_20 before update or insert on part_d_15_20
+ for each row execute procedure func_parted_mod_b();
+:init_range_parted;
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 15 | 98 | 2 |
+ part_d_15_20 | b | 17 | 106 | 16 |
+ part_d_15_20 | b | 19 | 106 | 19 |
+ part_d_1_15 | b | 15 | 110 | 1 |
+(6 rows)
+
+:init_range_parted;
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_d_15_20 | b | 17 | 155 | 16 |
+ part_d_15_20 | b | 19 | 155 | 19 |
+ part_d_1_15 | b | 15 | 146 | 1 |
+ part_d_1_15 | b | 16 | 147 | 2 |
+(6 rows)
+
+drop trigger trig_c1_100 ON part_c_1_100;
+drop trigger trig_d1_15 ON part_d_1_15;
+drop trigger trig_d15_20 ON part_d_15_20;
+drop function func_parted_mod_b();
+-- statement triggers with update row movement
+---------------------------------------------------
+:init_range_parted;
+create function trigfunc() returns trigger language plpgsql as
+$$
+ begin
+ raise notice 'trigger = % fired on table % during %',
+ TG_NAME, TG_TABLE_NAME, TG_OP;
+ return null;
+ end;
+$$;
+-- Triggers on root partition
+create trigger parent_delete_trig
+ after delete on range_parted for each statement execute procedure trigfunc();
+create trigger parent_update_trig
+ after update on range_parted for each statement execute procedure trigfunc();
+create trigger parent_insert_trig
+ after insert on range_parted for each statement execute procedure trigfunc();
+-- Triggers on leaf partition part_c_1_100
+create trigger c1_delete_trig
+ after delete on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_update_trig
+ after update on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_insert_trig
+ after insert on part_c_1_100 for each statement execute procedure trigfunc();
+-- Triggers on leaf partition part_d_1_15
+create trigger d1_delete_trig
+ after delete on part_d_1_15 for each statement execute procedure trigfunc();
+create trigger d1_update_trig
+ after update on part_d_1_15 for each statement execute procedure trigfunc();
+create trigger d1_insert_trig
+ after insert on part_d_1_15 for each statement execute procedure trigfunc();
+-- Triggers on leaf partition part_d_15_20
+create trigger d15_delete_trig
+ after delete on part_d_15_20 for each statement execute procedure trigfunc();
+create trigger d15_update_trig
+ after update on part_d_15_20 for each statement execute procedure trigfunc();
+create trigger d15_insert_trig
+ after insert on part_d_15_20 for each statement execute procedure trigfunc();
+-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired.
+update range_parted set c = c - 50 where c > 97;
+NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 150 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 12 | 96 | 1 |
+ part_c_1_100 | b | 13 | 97 | 2 |
+ part_c_1_100 | b | 15 | 55 | 16 |
+ part_c_1_100 | b | 17 | 55 | 19 |
+(6 rows)
+
+drop trigger parent_delete_trig ON range_parted;
+drop trigger parent_update_trig ON range_parted;
+drop trigger parent_insert_trig ON range_parted;
+drop trigger c1_delete_trig ON part_c_1_100;
+drop trigger c1_update_trig ON part_c_1_100;
+drop trigger c1_insert_trig ON part_c_1_100;
+drop trigger d1_delete_trig ON part_d_1_15;
+drop trigger d1_update_trig ON part_d_1_15;
+drop trigger d1_insert_trig ON part_d_1_15;
+drop trigger d15_delete_trig ON part_d_15_20;
+drop trigger d15_update_trig ON part_d_15_20;
+drop trigger d15_insert_trig ON part_d_15_20;
+drop table mintab;
-- Creating default partition for range
+:init_range_parted;
create table part_def partition of range_parted default;
\d+ part_def
- Table "public.part_def"
- Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
---------+---------+-----------+----------+---------+----------+--------------+-------------
- a | text | | | | extended | |
- b | integer | | | | plain | |
+ Table "public.part_def"
+ Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+--------+-------------------+-----------+----------+---------+----------+--------------+-------------
+ a | text | | | | extended | |
+ b | bigint | | | | plain | |
+ c | numeric | | | | main | |
+ d | integer | | | | plain | |
+ e | character varying | | | | extended | |
Partition of: range_parted DEFAULT
-Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20)))))
+Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint)))))
insert into range_parted values ('c', 9);
-- ok
@@ -235,7 +570,55 @@ update part_def set a = 'd' where a = 'c';
-- fail
update part_def set a = 'a' where a = 'd';
ERROR: new row for relation "part_def" violates partition constraint
-DETAIL: Failing row contains (a, 9).
+DETAIL: Failing row contains (a, 9, null, null, null).
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 12 | 96 | 1 |
+ part_c_1_100 | b | 13 | 97 | 2 |
+ part_d_15_20 | b | 15 | 105 | 16 |
+ part_d_15_20 | b | 17 | 105 | 19 |
+ part_def | d | 9 | | |
+(7 rows)
+
+-- Update row movement from non-default to default partition.
+-- Fail, default partition is not under part_a_10_a_20;
+update part_a_10_a_20 set a = 'ad' where a = 'a';
+ERROR: new row for relation "part_a_10_a_20" violates partition constraint
+DETAIL: Failing row contains (ad, 10, 200, 1, null).
+-- Success
+update range_parted set a = 'ad' where a = 'a';
+update range_parted set a = 'bd' where a = 'b';
+:show_data;
+ partname | a | b | c | d | e
+----------+----+----+-----+----+---
+ part_def | ad | 1 | 1 | 1 |
+ part_def | ad | 10 | 200 | 1 |
+ part_def | bd | 12 | 96 | 1 |
+ part_def | bd | 13 | 97 | 2 |
+ part_def | bd | 15 | 105 | 16 |
+ part_def | bd | 17 | 105 | 19 |
+ part_def | d | 9 | | |
+(7 rows)
+
+-- Update row movement from default to non-default partitions.
+-- Success
+update range_parted set a = 'a' where a = 'ad';
+update range_parted set a = 'b' where a = 'bd';
+:show_data;
+ partname | a | b | c | d | e
+----------------+---+----+-----+----+---
+ part_a_10_a_20 | a | 10 | 200 | 1 |
+ part_a_1_a_10 | a | 1 | 1 | 1 |
+ part_c_1_100 | b | 12 | 96 | 1 |
+ part_c_1_100 | b | 13 | 97 | 2 |
+ part_d_15_20 | b | 15 | 105 | 16 |
+ part_d_15_20 | b | 17 | 105 | 19 |
+ part_def | d | 9 | | |
+(7 rows)
+
create table list_parted (
a text,
b int
@@ -250,6 +633,111 @@ ERROR: new row for relation "list_default" violates partition constraint
DETAIL: Failing row contains (a, 10).
-- ok
update list_default set a = 'x' where a = 'd';
+drop table list_parted;
+--------------
+-- UPDATE with
+-- partition key or non-partition columns, with different column ordering,
+-- triggers.
+--------------
+-- Setup
+--------
+create table list_parted (a numeric, b int, c int8) partition by list (a);
+create table sub_parted partition of list_parted for values in (1) partition by list (b);
+create table sub_part1(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part1 for values in (1);
+create table sub_part2(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part2 for values in (2);
+create table list_part1(a numeric, b int, c int8);
+alter table list_parted attach partition list_part1 for values in (2,3);
+insert into list_parted values (2,5,50);
+insert into list_parted values (3,6,60);
+insert into sub_parted values (1,1,60);
+insert into sub_parted values (1,2,10);
+-- Test partition constraint violation when intermediate ancestor is used and
+-- constraint is inherited from upper root.
+update sub_parted set a = 2 where c = 10;
+ERROR: new row for relation "sub_part2" violates partition constraint
+DETAIL: Failing row contains (2, 10, 2).
+-- UPDATE that does not modify the partition key of the partitions chosen for the update.
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+ tableoid | a | b | c
+------------+---+---+----
+ list_part1 | 2 | 5 | 50
+(1 row)
+
+update list_parted set b = c + a where a = 2;
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+ tableoid | a | b | c
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+(1 row)
+
+-----------
+-- Triggers can cause UPDATE row movement if they modify the partition key.
+-----------
+create function func_parted_mod_b() returns trigger as $$
+begin
+ NEW.b = 2; -- This is changing partition key column.
+ return NEW;
+end $$ language plpgsql;
+create trigger parted_mod_b before update on sub_part1
+ for each row execute procedure func_parted_mod_b();
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+ tableoid | a | b | c
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 | 6 | 60
+ sub_part1 | 1 | 1 | 60
+ sub_part2 | 1 | 2 | 10
+(4 rows)
+
+-- This should do the tuple routing even though there is no explicit
+-- partition-key update, because there is a trigger on sub_part1
+update list_parted set c = 70 where b = 1 ;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+ tableoid | a | b | c
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 | 6 | 60
+ sub_part2 | 1 | 2 | 10
+ sub_part2 | 1 | 2 | 70
+(4 rows)
+
+drop trigger parted_mod_b ON sub_part1 ;
+-- If a BR DELETE trigger prevents the DELETE from happening, we should also
+-- skip the INSERT when that DELETE is part of an UPDATE=>DELETE+INSERT.
+create or replace function func_parted_mod_b() returns trigger as $$
+begin return NULL; end $$ language plpgsql;
+create trigger trig_skip_delete before delete on sub_part1
+ for each row execute procedure func_parted_mod_b();
+update list_parted set b = 1 where c = 70;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+ tableoid | a | b | c
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 | 6 | 60
+ sub_part1 | 1 | 1 | 70
+ sub_part2 | 1 | 2 | 10
+(4 rows)
+
+drop trigger trig_skip_delete ON sub_part1 ;
+-- UPDATE partition-key with FROM clause. If join produces multiple output
+-- rows for the same row to be modified, we should tuple-route the row only once.
+-- There should not be any rows inserted.
+create table non_parted (id int);
+insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3);
+update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+ tableoid | a | b | c
+------------+---+----+----
+ list_part1 | 2 | 1 | 70
+ list_part1 | 2 | 2 | 10
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 | 6 | 60
+(4 rows)
+
+drop table non_parted;
+drop function func_parted_mod_b();
-- create custom operator class and hash function, for the same reason
-- explained in alter_table.sql
create or replace function dummy_hashint4(a int4, seed int8) returns int8 as
@@ -271,9 +759,8 @@ insert into hpart4 values (3, 4);
update hpart1 set a = 3, b=4 where a = 1;
ERROR: new row for relation "hpart1" violates partition constraint
DETAIL: Failing row contains (3, 4).
+-- ok: row movement
update hash_parted set b = b - 1 where b = 1;
-ERROR: new row for relation "hpart1" violates partition constraint
-DETAIL: Failing row contains (1, 0).
-- ok
update hash_parted set b = b + 8 where b = 1;
-- cleanup
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index 0c70d64..10c10c7 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -107,25 +107,233 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a)
DROP TABLE update_test;
DROP TABLE upsert_test;
--- update to a partition should check partition bound constraint for the new tuple
-create table range_parted (
+
+---------------------------
+-- UPDATE with row movement
+---------------------------
+
+-- update to a partition should check partition bound constraint for the new tuple.
+-- If the partition key is updated, the row should be moved to the appropriate
+-- partition. Updatable views using partitions should enforce the check options
+-- for the rows that have been moved.
+create table mintab(c1 int);
+insert into mintab values (120);
+CREATE TABLE range_parted (
a text,
- b int
+ b bigint,
+ c numeric,
+ d int,
+ e varchar
) partition by range (a, b);
-create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
-create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION;
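+-- The single row in mintab (120) supplies the cutoff used by upview's WITH
+-- CHECK OPTION: only rows with c > 120 are visible through, and accepted by,
+-- the view.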
+
+-- Create the partitions intentionally in descending bound order, to test that
+-- update row movement works even when the leaf partitions are not in bound order.
+create table part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int);
+alter table range_parted attach partition part_b_20_b_30 for values from ('b', 20) to ('b', 30);
+create table part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) partition by range (c);
create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10);
-create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20);
-insert into part_a_1_a_10 values ('a', 1);
-insert into part_b_10_b_20 values ('b', 10);
+alter table range_parted attach partition part_b_10_b_20 for values from ('b', 10) to ('b', 20);
+create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
+
+-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions
+update part_b_10_b_20 set b = b - 6;
+
+-- As mentioned above, the partition creation is intentionally kept in descending bound order.
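+-- The column drop/re-add dance below gives part_c_100_200 a tuple descriptor
+-- (dropped columns and a different attribute order) that differs from the
+-- root's, presumably so that row movement into its sub-partitions also
+-- exercises tuple conversion between the differing layouts.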
+create table part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) partition by range (d);
+alter table part_c_100_200 drop column e, drop column c, drop column a;
+alter table part_c_100_200 add column c numeric, add column e varchar, add column a text;
+alter table part_c_100_200 drop column b;
+alter table part_c_100_200 add column b bigint;
+create table part_d_1_15 partition of part_c_100_200 for values from (1) to (15);
+create table part_d_15_20 partition of part_c_100_200 for values from (15) to (20);
+
+alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200);
+
+create table part_c_1_100 (e varchar, d int, c numeric, b bigint, a text);
+alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100);
+
+\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)'
+\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted order by 1, 2, 3, 4, 5, 6'
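+-- init_range_parted and show_data are psql variables; the tests below run them
+-- as :init_range_parted and :show_data. The COLLATE "C" in show_data keeps the
+-- ordering of partition names stable across locales.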
+:init_range_parted;
+:show_data;
+
+-- The subplans should appear in partition bound order
+explain (costs off) update range_parted set c = c - 50 where c > 97;
+
+-- fail (row movement happens only within the partition subtree) :
+update part_c_100_200 set c = c - 20, d = c where c = 105;
+-- fail (no partition key update, so no attempt to move tuple, but "a = 'a'" violates partition constraint enforced by root partition)
+update part_b_10_b_20 set a = 'a';
+-- success; partition key update, no constraint violation
+update range_parted set d = d - 10 where d > 10;
+-- success; no partition key update, no constraint violation
+update range_parted set e = d;
+-- No row found :
+update part_c_1_100 set c = c + 20 where c = 98;
+-- ok (row movement)
+update part_b_10_b_20 set c = c + 20 returning c, b, a;
+:show_data;
+
+-- fail (row movement happens only within the partition subtree) :
+update part_b_10_b_20 set b = b - 6 where c > 116 returning *;
+-- ok (row movement, with subset of rows moved into different partition)
+update range_parted set b = b - 6 where c > 116 returning a, b + c;
+
+:show_data;
+
+-- update partition key using updatable view.
+
+-- succeeds
+update upview set c = 199 where b = 4;
+-- fail, check option violation
+update upview set c = 120 where b = 4;
+-- fail, row movement with check option violation
+update upview set a = 'b', b = 15, c = 120 where b = 4;
+-- succeeds, row movement, check option passes
+update upview set a = 'b', b = 15 where b = 4;
+
+:show_data;
+
+-- cleanup
+drop view upview;
+
+-- RETURNING with whole-row vars.
+----------------------------------
+:init_range_parted;
+update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *;
+:show_data;
+
+
+-- Transition tables with update row movement
+---------------------------------------------
+:init_range_parted;
+
+create function trans_updatetrigfunc() returns trigger language plpgsql as
+$$
+ begin
+ raise notice 'trigger = %, old table = %, new table = %',
+ TG_NAME,
+ (select string_agg(old_table::text, ', ' order by a) from old_table),
+ (select string_agg(new_table::text, ', ' order by a) from new_table);
+ return null;
+ end;
+$$;
+
+create trigger trans_updatetrig
+ after update on range_parted referencing old table as old_table new table as new_table
+ for each statement execute procedure trans_updatetrigfunc();
+
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+:show_data;
+:init_range_parted;
+
+-- Enabling OLD TABLE capture for both DELETE and UPDATE statement triggers
+-- should not cause deleted rows to be captured twice. The same applies to
+-- INSERT triggers and inserted rows.
+create trigger trans_deletetrig
+ after delete on range_parted referencing old table as old_table
+ for each statement execute procedure trans_updatetrigfunc();
+create trigger trans_inserttrig
+ after insert on range_parted referencing new table as new_table
+ for each statement execute procedure trans_updatetrigfunc();
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+:show_data;
+drop trigger trans_updatetrig ON range_parted;
+drop trigger trans_deletetrig ON range_parted;
+drop trigger trans_inserttrig ON range_parted;
+
+-- Install BR triggers on the child partitions, so that transition tuple conversion takes place.
+create function func_parted_mod_b() returns trigger as $$
+begin
+ NEW.b = NEW.b + 1;
+ return NEW;
+end $$ language plpgsql;
+create trigger trig_c1_100 before update or insert on part_c_1_100
+ for each row execute procedure func_parted_mod_b();
+create trigger trig_d1_15 before update or insert on part_d_1_15
+ for each row execute procedure func_parted_mod_b();
+create trigger trig_d15_20 before update or insert on part_d_15_20
+ for each row execute procedure func_parted_mod_b();
+:init_range_parted;
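+-- Note: each of the three BR triggers above adds 1 to b for every row it
+-- processes, so rows inserted into or updated within these partitions
+-- (including rows arriving via row movement) show b incremented in the
+-- output below.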
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+:show_data;
+:init_range_parted;
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+:show_data;
+drop trigger trig_c1_100 ON part_c_1_100;
+drop trigger trig_d1_15 ON part_d_1_15;
+drop trigger trig_d15_20 ON part_d_15_20;
+drop function func_parted_mod_b();
+
+
+-- statement triggers with update row movement
+---------------------------------------------------
+
+:init_range_parted;
+
+create function trigfunc() returns trigger language plpgsql as
+$$
+ begin
+ raise notice 'trigger = % fired on table % during %',
+ TG_NAME, TG_TABLE_NAME, TG_OP;
+ return null;
+ end;
+$$;
+-- Triggers on the root partitioned table
+create trigger parent_delete_trig
+ after delete on range_parted for each statement execute procedure trigfunc();
+create trigger parent_update_trig
+ after update on range_parted for each statement execute procedure trigfunc();
+create trigger parent_insert_trig
+ after insert on range_parted for each statement execute procedure trigfunc();
+
+-- Triggers on leaf partition part_c_1_100
+create trigger c1_delete_trig
+ after delete on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_update_trig
+ after update on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_insert_trig
+ after insert on part_c_1_100 for each statement execute procedure trigfunc();
+
+-- Triggers on leaf partition part_d_1_15
+create trigger d1_delete_trig
+ after delete on part_d_1_15 for each statement execute procedure trigfunc();
+create trigger d1_update_trig
+ after update on part_d_1_15 for each statement execute procedure trigfunc();
+create trigger d1_insert_trig
+ after insert on part_d_1_15 for each statement execute procedure trigfunc();
+-- Triggers on leaf partition part_d_15_20
+create trigger d15_delete_trig
+ after delete on part_d_15_20 for each statement execute procedure trigfunc();
+create trigger d15_update_trig
+ after update on part_d_15_20 for each statement execute procedure trigfunc();
+create trigger d15_insert_trig
+ after insert on part_d_15_20 for each statement execute procedure trigfunc();
+
+-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired.
+update range_parted set c = c - 50 where c > 97;
+:show_data;
+
+drop trigger parent_delete_trig ON range_parted;
+drop trigger parent_update_trig ON range_parted;
+drop trigger parent_insert_trig ON range_parted;
+drop trigger c1_delete_trig ON part_c_1_100;
+drop trigger c1_update_trig ON part_c_1_100;
+drop trigger c1_insert_trig ON part_c_1_100;
+drop trigger d1_delete_trig ON part_d_1_15;
+drop trigger d1_update_trig ON part_d_1_15;
+drop trigger d1_insert_trig ON part_d_1_15;
+drop trigger d15_delete_trig ON part_d_15_20;
+drop trigger d15_update_trig ON part_d_15_20;
+drop trigger d15_insert_trig ON part_d_15_20;
+
+drop table mintab;
--- fail
-update part_a_1_a_10 set a = 'b' where a = 'a';
-update range_parted set b = b - 1 where b = 10;
--- ok
-update range_parted set b = b + 1 where b = 10;
-- Creating default partition for range
+:init_range_parted;
create table part_def partition of range_parted default;
\d+ part_def
insert into range_parted values ('c', 9);
@@ -134,6 +342,21 @@ update part_def set a = 'd' where a = 'c';
-- fail
update part_def set a = 'a' where a = 'd';
+:show_data;
+
+-- Update row movement from non-default to the default partition.
+-- Fail, the default partition is not under part_a_10_a_20.
+update part_a_10_a_20 set a = 'ad' where a = 'a';
+-- Success
+update range_parted set a = 'ad' where a = 'a';
+update range_parted set a = 'bd' where a = 'b';
+:show_data;
+-- Update row movement from default to non-default partitions.
+-- Success
+update range_parted set a = 'a' where a = 'ad';
+update range_parted set a = 'b' where a = 'bd';
+:show_data;
+
create table list_parted (
a text,
b int
@@ -148,6 +371,84 @@ update list_default set a = 'a' where a = 'd';
-- ok
update list_default set a = 'x' where a = 'd';
+drop table list_parted;
+
+--------------
+-- UPDATE of partition-key and non-partition columns, on partitions with
+-- different column orderings, and with triggers.
+--------------
+
+-- Setup
+--------
+create table list_parted (a numeric, b int, c int8) partition by list (a);
+create table sub_parted partition of list_parted for values in (1) partition by list (b);
+
+create table sub_part1(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part1 for values in (1);
+create table sub_part2(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part2 for values in (2);
+
+create table list_part1(a numeric, b int, c int8);
+alter table list_parted attach partition list_part1 for values in (2,3);
+
+insert into list_parted values (2,5,50);
+insert into list_parted values (3,6,60);
+insert into sub_parted values (1,1,60);
+insert into sub_parted values (1,2,10);
+
+-- Test partition constraint violation when an intermediate ancestor is the
+-- update target and the violated constraint is inherited from the root.
+update sub_parted set a = 2 where c = 10;
+
+-- UPDATE that does not modify the partition key of the partitions chosen for the update.
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+update list_parted set b = c + a where a = 2;
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+
+
+-----------
+-- Triggers can cause UPDATE row movement if they modify the partition key.
+-----------
+create function func_parted_mod_b() returns trigger as $$
+begin
+ NEW.b = 2; -- This is changing partition key column.
+ return NEW;
+end $$ language plpgsql;
+create trigger parted_mod_b before update on sub_part1
+ for each row execute procedure func_parted_mod_b();
+
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+-- This should perform tuple routing even though there is no explicit
+-- partition-key update, because the trigger on sub_part1 modifies the
+-- partition key.
+update list_parted set c = 70 where b = 1 ;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger parted_mod_b ON sub_part1 ;
+
+-- If a BR DELETE trigger prevents the DELETE from happening, we should also
+-- skip the INSERT when that DELETE is part of an UPDATE converted to
+-- DELETE+INSERT.
+create or replace function func_parted_mod_b() returns trigger as $$
+begin return NULL; end $$ language plpgsql;
+create trigger trig_skip_delete before delete on sub_part1
+ for each row execute procedure func_parted_mod_b();
+update list_parted set b = 1 where c = 70;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger trig_skip_delete ON sub_part1 ;
+
+-- Partition-key UPDATE with a FROM clause. If the join produces multiple
+-- output rows for the same row to be modified, we should tuple-route the row
+-- only once; no extra rows should end up inserted.
+create table non_parted (id int);
+insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3);
+update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+drop table non_parted;
+
+drop function func_parted_mod_b();
+
-- create custom operator class and hash function, for the same reason
-- explained in alter_table.sql
create or replace function dummy_hashint4(a int4, seed int8) returns int8 as
@@ -169,6 +470,7 @@ insert into hpart4 values (3, 4);
-- fail
update hpart1 set a = 3, b=4 where a = 1;
+-- ok (row movement)
update hash_parted set b = b - 1 where b = 1;
-- ok
update hash_parted set b = b + 8 where b = 1;