diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index e6f50ec..1517757 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3005,6 +3005,11 @@ VALUES ('Albany', NULL, NULL, 'NY'); foreign table partitions. + + Updating the partition key of a row might cause it to be moved into a + different partition where this row satisfies its partition constraint. + + Example @@ -3297,9 +3302,22 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - An UPDATE that causes a row to move from one partition to - another fails, because the new value of the row fails to satisfy the - implicit partition constraint of the original partition. + When an UPDATE causes a row to move from one + partition to another, there is a chance that another concurrent + UPDATE or DELETE misses this row. + Suppose, session 1 is performing an UPDATE on a + partition key, and meanwhile a concurrent session 2 for which this row + is visible, performs an UPDATE or + DELETE operation on this row. Session 2 can silently + miss the row if the row is deleted from the partition due to session + 1's activity. In such case, session 2's + UPDATE/DELETE, being unaware of + the row movement, interprets that the row has just been deleted so there + is nothing to be done for this row. Whereas, in the usual case where the + table is not partitioned, or where there is no row movement, session 2 + would have identified the newly updated row and carried + UPDATE/DELETE on this new row + version. diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml index c0d0f71..3c665f0 100644 --- a/doc/src/sgml/ref/update.sgml +++ b/doc/src/sgml/ref/update.sgml @@ -282,10 +282,17 @@ UPDATE count In the case of a partitioned table, updating a row might cause it to no - longer satisfy the partition constraint. Since there is no provision to - move the row to the partition appropriate to the new value of its - partitioning key, an error will occur in this case. This can also happen - when updating a partition directly. + longer satisfy the partition constraint of the containing partition. In that + case, if there is some other partition in the partition tree for which this + row satisfies its partition constraint, then the row is moved to that + partition. If there isn't such a partition, an error will occur. The error + will also occur when updating a partition directly. Behind the scenes, the + row movement is actually a DELETE and + INSERT operation. However, there is a possibility that a + concurrent UPDATE or DELETE on the + same row may miss this row. For details see the section + . + diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml index bf5d3f9..aaffc4d 100644 --- a/doc/src/sgml/trigger.sgml +++ b/doc/src/sgml/trigger.sgml @@ -154,6 +154,29 @@ + If an UPDATE on a partitioned table causes a row to move + to another partition, it will be performed as a DELETE + from the original partition followed by INSERT into the + new partition. In this case, all row-level BEFORE + UPDATE triggers and all row-level + BEFORE DELETE triggers are fired on + the original partition. Then all row-level BEFORE + INSERT triggers are fired on the destination partition. + The possibility of surprising outcomes should be considered when all these + triggers affect the row being moved. As far as AFTER ROW + triggers are concerned, AFTER DELETE + and AFTER INSERT triggers are + applied; but AFTER UPDATE triggers + are not applied because the UPDATE has been converted to + a DELETE and INSERT. 
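[Reviewer note, not part of the patch] A rough SQL-level sketch of the row-level trigger behaviour described just above; table and trigger names are invented, and the outcome assumes a server with this patch applied:

    CREATE TABLE events (id int, category text) PARTITION BY LIST (category);
    CREATE TABLE events_a PARTITION OF events FOR VALUES IN ('a');
    CREATE TABLE events_b PARTITION OF events FOR VALUES IN ('b');

    CREATE FUNCTION log_trigger() RETURNS trigger LANGUAGE plpgsql AS $$
    BEGIN
      -- report which trigger fired on which partition
      RAISE NOTICE '% % on %', TG_WHEN, TG_OP, TG_TABLE_NAME;
      IF TG_OP = 'DELETE' THEN
        RETURN OLD;
      END IF;
      RETURN NEW;
    END;
    $$;

    CREATE TRIGGER bu BEFORE UPDATE ON events_a
      FOR EACH ROW EXECUTE PROCEDURE log_trigger();
    CREATE TRIGGER bd BEFORE DELETE ON events_a
      FOR EACH ROW EXECUTE PROCEDURE log_trigger();
    CREATE TRIGGER bi BEFORE INSERT ON events_b
      FOR EACH ROW EXECUTE PROCEDURE log_trigger();

    INSERT INTO events VALUES (1, 'a');

    -- Moving the row from events_a to events_b should raise NOTICEs for
    -- BEFORE UPDATE and BEFORE DELETE on events_a, then BEFORE INSERT on
    -- events_b; no AFTER UPDATE row trigger fires for the moved row.
    UPDATE events SET category = 'b' WHERE id = 1;
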
As far as + statement-level triggers are concerned, none of the + DELETE or INSERT triggers are fired, + even if row movement occurs; only the UPDATE triggers + defined on the target table used in the UPDATE statement + will be fired. + + + Trigger functions invoked by per-statement triggers should always return NULL. Trigger functions invoked by per-row triggers can return a table row (a value of diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index d622305..57dc08f 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1441,7 +1441,8 @@ get_qual_from_partbound(Relation rel, Relation parent, /* * map_partition_varattnos - maps varattno of any Vars in expr from the - * parent attno to partition attno. + * attno's of 'from_rel' partition to the attno's of 'to_rel' partition. + * The rels can be both leaf partition or a partitioned table. * * We must allow for cases where physical attnos of a partition can be * different from the parent's. @@ -1454,8 +1455,8 @@ get_qual_from_partbound(Relation rel, Relation parent, * are working on Lists, so it's less messy to do the casts internally. */ List * -map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row) { bool my_found_whole_row = false; @@ -1464,14 +1465,14 @@ map_partition_varattnos(List *expr, int target_varno, { AttrNumber *part_attnos; - part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), - RelationGetDescr(parent), + part_attnos = convert_tuples_by_name_map(RelationGetDescr(to_rel), + RelationGetDescr(from_rel), gettext_noop("could not convert row type")); expr = (List *) map_variable_attnos((Node *) expr, - target_varno, 0, + fromrel_varno, 0, part_attnos, - RelationGetDescr(parent)->natts, - RelationGetForm(partrel)->reltype, + RelationGetDescr(from_rel)->natts, + RelationGetForm(to_rel)->reltype, &my_found_whole_row); } @@ -2595,6 +2596,69 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) } /* + * Checks if any of the 'attnums' is a partition key attribute for rel + * + * Sets *used_in_expr if any of the 'attnums' is found to be referenced in some + * partition key expression. It's possible for a column to be both used + * directly and as part of an expression; if that happens, *used_in_expr may + * end up as either true or false. That's OK for current uses of this + * function, because *used_in_expr is only used to tailor the error message + * text. 
+ */ +bool +has_partition_attrs(Relation rel, Bitmapset *attnums, bool *used_in_expr) +{ + PartitionKey key; + int partnatts; + List *partexprs; + ListCell *partexprs_item; + int i; + + if (attnums == NULL || rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + key = RelationGetPartitionKey(rel); + partnatts = get_partition_natts(key); + partexprs = get_partition_exprs(key); + + partexprs_item = list_head(partexprs); + for (i = 0; i < partnatts; i++) + { + AttrNumber partattno = get_partition_col_attnum(key, i); + + if (partattno != 0) + { + if (bms_is_member(partattno - FirstLowInvalidHeapAttributeNumber, + attnums)) + { + if (used_in_expr) + *used_in_expr = false; + return true; + } + } + else + { + /* Arbitrary expression */ + Node *expr = (Node *) lfirst(partexprs_item); + Bitmapset *expr_attrs = NULL; + + /* Find all attributes referenced */ + pull_varattnos(expr, 1, &expr_attrs); + partexprs_item = lnext(partexprs_item); + + if (bms_overlap(attnums, expr_attrs)) + { + if (used_in_expr) + *used_in_expr = true; + return true; + } + } + } + + return false; +} + +/* * qsort_partition_hbound_cmp * * We sort hash bounds by modulus, then by remainder. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index d6b235c..39c2921 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -165,12 +165,9 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; - PartitionDispatch *partition_dispatch_info; - int num_dispatch; /* Number of entries in the above array */ - int num_partitions; /* Number of members in the following arrays */ - ResultRelInfo **partitions; /* Per partition result relation pointers */ - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; + PartitionTupleRouting *partition_tuple_routing; /* all tuple-routing info + * for partitions. 
+ */ TransitionCaptureState *transition_capture; TupleConversionMap **transition_tupconv_maps; @@ -2471,27 +2468,16 @@ CopyFrom(CopyState cstate) */ if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo **partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; + PartitionTupleRouting *ptr; ExecSetupPartitionTupleRouting(cstate->rel, + NULL, + 0, 1, estate, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - cstate->partition_dispatch_info = partition_dispatch_info; - cstate->num_dispatch = num_parted; - cstate->partitions = partitions; - cstate->num_partitions = num_partitions; - cstate->partition_tupconv_maps = partition_tupconv_maps; - cstate->partition_tuple_slot = partition_tuple_slot; + &cstate->partition_tuple_routing); + + ptr = cstate->partition_tuple_routing; /* * If we are capturing transition tuples, they may need to be @@ -2504,11 +2490,11 @@ CopyFrom(CopyState cstate) int i; cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * cstate->num_partitions); - for (i = 0; i < cstate->num_partitions; ++i) + palloc0(sizeof(TupleConversionMap *) * ptr->num_partitions); + for (i = 0; i < ptr->num_partitions; ++i) { cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(cstate->partitions[i]->ri_RelationDesc), + convert_tuples_by_name(RelationGetDescr(ptr->partitions[i]->ri_RelationDesc), RelationGetDescr(cstate->rel), gettext_noop("could not convert row type")); } @@ -2528,7 +2514,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || - cstate->partition_dispatch_info != NULL || + cstate->partition_tuple_routing != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2603,10 +2589,11 @@ CopyFrom(CopyState cstate) ExecStoreTuple(tuple, slot, InvalidBuffer, false); /* Determine the partition to heap_insert the tuple into */ - if (cstate->partition_dispatch_info) + if (cstate->partition_tuple_routing) { int leaf_part_index; TupleConversionMap *map; + PartitionTupleRouting *ptr = cstate->partition_tuple_routing; /* * Away we go ... If we end up not finding a partition after all, @@ -2617,11 +2604,11 @@ CopyFrom(CopyState cstate) * partition, respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - cstate->partition_dispatch_info, + ptr->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < cstate->num_partitions); + leaf_part_index < ptr->num_partitions); /* * If this tuple is mapped to a partition that is not same as the @@ -2639,7 +2626,7 @@ CopyFrom(CopyState cstate) * to the selected partition. */ saved_resultRelInfo = resultRelInfo; - resultRelInfo = cstate->partitions[leaf_part_index]; + resultRelInfo = ptr->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -2686,7 +2673,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = cstate->partition_tupconv_maps[leaf_part_index]; + map = ptr->parentchild_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -2698,7 +2685,7 @@ CopyFrom(CopyState cstate) * point on. 
Use a dedicated slot from this point on until * we're finished dealing with the partition. */ - slot = cstate->partition_tuple_slot; + slot = ptr->partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -2749,7 +2736,7 @@ CopyFrom(CopyState cstate) /* Check the constraints of the tuple */ if (cstate->rel->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); if (useHeapMultiInsert) { @@ -2850,8 +2837,9 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); /* Close all the partitioned tables, leaf partitions, and their indices */ - if (cstate->partition_dispatch_info) + if (cstate->partition_tuple_routing) { + PartitionTupleRouting *ptr = cstate->partition_tuple_routing; int i; /* @@ -2860,23 +2848,23 @@ CopyFrom(CopyState cstate) * the main target table of COPY that will be closed eventually by * DoCopy(). Also, tupslot is NULL for the root partitioned table. */ - for (i = 1; i < cstate->num_dispatch; i++) + for (i = 1; i < ptr->num_dispatch; i++) { - PartitionDispatch pd = cstate->partition_dispatch_info[i]; + PartitionDispatch pd = ptr->partition_dispatch_info[i]; heap_close(pd->reldesc, NoLock); ExecDropSingleTupleTableSlot(pd->tupslot); } - for (i = 0; i < cstate->num_partitions; i++) + for (i = 0; i < ptr->num_partitions; i++) { - ResultRelInfo *resultRelInfo = cstate->partitions[i]; + ResultRelInfo *resultRelInfo = ptr->partitions[i]; ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); } /* Release the standalone partition tuple descriptor */ - ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot); + ExecDropSingleTupleTableSlot(ptr->partition_tuple_slot); } /* Close any trigger target relations */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index d979ce2..64c2185 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -468,7 +468,6 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, void *arg); static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, void *arg); -static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); @@ -6492,68 +6491,6 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing, } /* - * Checks if attnum is a partition attribute for rel - * - * Sets *used_in_expr if attnum is found to be referenced in some partition - * key expression. It's possible for a column to be both used directly and - * as part of an expression; if that happens, *used_in_expr may end up as - * either true or false. That's OK for current uses of this function, because - * *used_in_expr is only used to tailor the error message text. 
- */ -static bool -is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr) -{ - PartitionKey key; - int partnatts; - List *partexprs; - ListCell *partexprs_item; - int i; - - if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - return false; - - key = RelationGetPartitionKey(rel); - partnatts = get_partition_natts(key); - partexprs = get_partition_exprs(key); - - partexprs_item = list_head(partexprs); - for (i = 0; i < partnatts; i++) - { - AttrNumber partattno = get_partition_col_attnum(key, i); - - if (partattno != 0) - { - if (attnum == partattno) - { - if (used_in_expr) - *used_in_expr = false; - return true; - } - } - else - { - /* Arbitrary expression */ - Node *expr = (Node *) lfirst(partexprs_item); - Bitmapset *expr_attrs = NULL; - - /* Find all attributes referenced */ - pull_varattnos(expr, 1, &expr_attrs); - partexprs_item = lnext(partexprs_item); - - if (bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, - expr_attrs)) - { - if (used_in_expr) - *used_in_expr = true; - return true; - } - } - } - - return false; -} - -/* * Return value is the address of the dropped column. */ static ObjectAddress @@ -6613,7 +6550,9 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, colName))); /* Don't drop columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, @@ -8837,7 +8776,9 @@ ATPrepAlterColumnType(List **wqueue, colName))); /* Don't alter columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 92ae382..73ec872 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2854,8 +2854,13 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, { HeapTuple trigtuple; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) + /* + * Note: if the UPDATE is converted into a DELETE+INSERT as part of + * update-partition-key operation, then this function is also called + * separately for DELETE and INSERT to capture transition table rows. + * In such case, either old tuple or new tuple can be NULL. + */ + if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) trigtuple = GetTupleForTrigger(estate, NULL, relinfo, @@ -5414,7 +5419,12 @@ AfterTriggerPendingOnRel(Oid relid) * triggers actually need to be queued. It is also called after each row, * even if there are no triggers for that event, if there are any AFTER * STATEMENT triggers for the statement which use transition tables, so that - * the transition tuplestores can be built. + * the transition tuplestores can be built. Furthermore, if the transition + * capture is happening for UPDATEd rows being moved to another partition due + * partition-key change, then this function is called once when the row is + * deleted (to capture OLD row), and once when the row is inserted to another + * partition (to capture NEW row). This is done separately because DELETE and + * INSERT happen on different tables. 
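[Reviewer note, not part of the patch] At the SQL level, the point of capturing the OLD image at DELETE time and the NEW image at INSERT time is that a statement-level trigger with transition tables still sees the moved row once on each side. A minimal sketch with invented names, assuming a server with this patch applied:

    CREATE TABLE t (a int) PARTITION BY RANGE (a);
    CREATE TABLE t1 PARTITION OF t FOR VALUES FROM (1) TO (10);
    CREATE TABLE t2 PARTITION OF t FOR VALUES FROM (10) TO (20);

    CREATE FUNCTION show_transition() RETURNS trigger LANGUAGE plpgsql AS $$
    BEGIN
      -- count the rows visible through the OLD and NEW transition tables
      RAISE NOTICE 'old rows: %, new rows: %',
        (SELECT count(*) FROM old_rows), (SELECT count(*) FROM new_rows);
      RETURN NULL;
    END;
    $$;

    CREATE TRIGGER t_au AFTER UPDATE ON t
      REFERENCING OLD TABLE AS old_rows NEW TABLE AS new_rows
      FOR EACH STATEMENT EXECUTE PROCEDURE show_transition();

    INSERT INTO t VALUES (5);

    -- The row moves from t1 to t2; its OLD image is captured when it is
    -- deleted from t1 and its NEW image when it is inserted into t2, so the
    -- trigger should report one row in each transition table.
    UPDATE t SET a = 15 WHERE a = 5;
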
* * Transition tuplestores are built now, rather than when events are pulled * off of the queue because AFTER ROW triggers are allowed to select from the @@ -5463,12 +5473,25 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, bool update_new_table = transition_capture->tcs_update_new_table; bool insert_new_table = transition_capture->tcs_insert_new_table;; - if ((event == TRIGGER_EVENT_DELETE && delete_old_table) || - (event == TRIGGER_EVENT_UPDATE && update_old_table)) + /* + * For INSERT events newtup should be non-NULL, for DELETE events + * oldtup should be non-NULL, whereas for UPDATE events normally both + * oldtup and newtup are non-NULL. But for UPDATE event fired for + * capturing transition tuples during UPDATE partition-key row + * movement, oldtup is NULL when the event is for row being inserted, + * whereas newtup is NULL when the event is for row being deleted. + */ + Assert(!(event == TRIGGER_EVENT_DELETE && delete_old_table && + oldtup == NULL)); + Assert(!(event == TRIGGER_EVENT_INSERT && insert_new_table && + newtup == NULL)); + + if (oldtup != NULL && + ((event == TRIGGER_EVENT_DELETE && delete_old_table) || + (event == TRIGGER_EVENT_UPDATE && update_old_table))) { Tuplestorestate *old_tuplestore; - Assert(oldtup != NULL); old_tuplestore = transition_capture->tcs_private->old_tuplestore; if (map != NULL) @@ -5481,12 +5504,12 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, else tuplestore_puttuple(old_tuplestore, oldtup); } - if ((event == TRIGGER_EVENT_INSERT && insert_new_table) || - (event == TRIGGER_EVENT_UPDATE && update_new_table)) + if (newtup != NULL && + ((event == TRIGGER_EVENT_INSERT && insert_new_table) || + (event == TRIGGER_EVENT_UPDATE && update_new_table))) { Tuplestorestate *new_tuplestore; - Assert(newtup != NULL); new_tuplestore = transition_capture->tcs_private->new_tuplestore; if (original_insert_tuple != NULL) @@ -5502,11 +5525,17 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tuplestore_puttuple(new_tuplestore, newtup); } - /* If transition tables are the only reason we're here, return. */ + /* + * If transition tables are the only reason we're here, return. As + * mentioned above, we can also be here during update tuple routing in + * presence of transition tables, in which case this function is called + * separately for oldtup and newtup, so either can be NULL, not both. + */ if (trigdesc == NULL || (event == TRIGGER_EVENT_DELETE && !trigdesc->trig_delete_after_row) || (event == TRIGGER_EVENT_INSERT && !trigdesc->trig_insert_after_row) || - (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row)) + (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row) || + (event == TRIGGER_EVENT_UPDATE && ((oldtup == NULL) ^ (newtup == NULL)))) return; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index dbaa47f..5ec92d5 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1849,16 +1849,12 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, * ExecPartitionCheck --- check that tuple meets the partition constraint. * * Exported in executor.h for outside use. + * Returns true if it meets the partition constraint, else returns false. 
*/ -void +bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { - Relation rel = resultRelInfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(rel); - Bitmapset *modifiedCols; - Bitmapset *insertedCols; - Bitmapset *updatedCols; ExprContext *econtext; /* @@ -1886,52 +1882,69 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. */ - if (!ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext)) - { - char *val_desc; - Relation orig_rel = rel; + return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); +} + +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + Relation orig_rel = rel; + TupleDesc tupdesc = RelationGetDescr(rel); + char *val_desc; + Bitmapset *modifiedCols; + Bitmapset *insertedCols; + Bitmapset *updatedCols; - /* See the comment above. */ - if (resultRelInfo->ri_PartitionRoot) + /* + * Need to first convert the tuple to the root partitioned table's row + * type. For details, check similar comments in ExecConstraints(). + */ + if (resultRelInfo->ri_PartitionRoot) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + TupleDesc old_tupdesc = RelationGetDescr(rel); + TupleConversionMap *map; + + rel = resultRelInfo->ri_PartitionRoot; + tupdesc = RelationGetDescr(rel); + /* a reverse map */ + map = convert_tuples_by_name(old_tupdesc, tupdesc, + gettext_noop("could not convert row type")); + if (map != NULL) { - HeapTuple tuple = ExecFetchSlotTuple(slot); - TupleDesc old_tupdesc = RelationGetDescr(rel); - TupleConversionMap *map; - - rel = resultRelInfo->ri_PartitionRoot; - tupdesc = RelationGetDescr(rel); - /* a reverse map */ - map = convert_tuples_by_name(old_tupdesc, tupdesc, - gettext_noop("could not convert row type")); - if (map != NULL) - { - tuple = do_convert_tuple(tuple, map); - ExecSetSlotDescriptor(slot, tupdesc); - ExecStoreTuple(tuple, slot, InvalidBuffer, false); - } + tuple = do_convert_tuple(tuple, map); + ExecSetSlotDescriptor(slot, tupdesc); + ExecStoreTuple(tuple, slot, InvalidBuffer, false); } - - insertedCols = GetInsertedColumns(resultRelInfo, estate); - updatedCols = GetUpdatedColumns(resultRelInfo, estate); - modifiedCols = bms_union(insertedCols, updatedCols); - val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), - slot, - tupdesc, - modifiedCols, - 64); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("new row for relation \"%s\" violates partition constraint", - RelationGetRelationName(orig_rel)), - val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); } + + insertedCols = GetInsertedColumns(resultRelInfo, estate); + updatedCols = GetUpdatedColumns(resultRelInfo, estate); + modifiedCols = bms_union(insertedCols, updatedCols); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates partition constraint", + RelationGetRelationName(orig_rel)), + val_desc ? 
errdetail("Failing row contains %s.", val_desc) : 0)); } /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, as well as - * the partition constraint, if any. + * This checks the traditional NOT NULL and check constraints, and if + * requested, checks the partition constraint. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. @@ -1939,7 +1952,8 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, */ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate) + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint) { Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); @@ -2055,8 +2069,9 @@ ExecConstraints(ResultRelInfo *resultRelInfo, } } - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 2fc411a..180798f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -41,54 +41,91 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * ExecSetupPartitionTupleRouting - set up information needed during * tuple routing for partitioned tables * + * 'update_rri' contains the UPDATE per-subplan result rels. For the output + * param 'partitions', we don't allocate new ResultRelInfo objects for + * leaf partitions for which they are already available in 'update_rri'. + * + * 'num_update_rri' is the number of elements in 'update_rri' array or zero for + * INSERT. + * * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo* objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. - * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays + * + * 'partition_tuple_routing' encapsulates all the partition related information + * required to do tuple routing. * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. 
*/ void ExecSetupPartitionTupleRouting(Relation rel, + ResultRelInfo *update_rri, + int num_update_rri, Index resultRTindex, EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) + PartitionTupleRouting **partition_tuple_routing) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_rri; + ResultRelInfo *leaf_part_arr = NULL; + int update_rri_index = 0; + bool is_update = (num_update_rri > 0); + PartitionTupleRouting *ptr; /* * Get the information about the partition tree after locking all the * partitions. */ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo **) palloc(*num_partitions * + ptr = *partition_tuple_routing = + (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + ptr->partition_dispatch_info = + RelationGetPartitionDispatchInfo(rel, &ptr->num_dispatch, &leaf_parts); + ptr->num_partitions = list_length(leaf_parts); + ptr->partitions = (ResultRelInfo **) palloc(ptr->num_partitions * sizeof(ResultRelInfo *)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); + ptr->parentchild_tupconv_maps = + (TupleConversionMap **) palloc0(ptr->num_partitions * + sizeof(TupleConversionMap *)); + + if (is_update) + { + /* + * For updates, if the leaf partition is already present in the + * per-subplan result rels, we re-use that rather than initialize a new + * result rel. The per-subplan resultrels and the resultrels of the + * leaf partitions are both in the same canonical order. So while going + * through the leaf partition oids, we need to keep track of the next + * per-subplan result rel to be looked for in the leaf partition + * resultrels. So, set update_rri_index to the first per-subplan result + * rel, and then shift it as we find them one by one while scanning the + * leaf partition oids. + */ + update_rri_index = 0; + + /* + * Prepare for generating the mapping from subplan result rels to leaf + * partition position. + */ + ptr->subplan_partition_offsets = palloc(num_update_rri * sizeof(int)); + + /* + * For UPDATEs, we require an additional tuple slot for storing + * transient tuples that are converted to the root table descriptor. + */ + ptr->root_tuple_slot = MakeTupleTableSlot(); + } + else + { + /* + * For inserts, we need to create all new result rels, so avoid + * repeated pallocs by allocating memory for all the result rels in + * bulk. + */ + leaf_part_arr = (ResultRelInfo *) palloc0(ptr->num_partitions * + sizeof(ResultRelInfo)); + } /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -96,39 +133,82 @@ ExecSetupPartitionTupleRouting(Relation rel, * (such as ModifyTableState) and released when the node finishes * processing. */ - *partition_tuple_slot = MakeTupleTableSlot(); + ptr->partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * - sizeof(ResultRelInfo)); i = 0; foreach(cell, leaf_parts) { - Relation partrel; + ResultRelInfo *leaf_part_rri; + Relation partrel = NULL; TupleDesc part_tupdesc; + Oid leaf_oid = lfirst_oid(cell); + + if (is_update) + { + /* Is this leaf partition present in the update resultrel? 
*/ + if (update_rri_index < num_update_rri && + RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) + { + leaf_part_rri = &update_rri[update_rri_index]; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required when we convert the partition's tuple to be + * compatible with the root partitioned table's tuple + * descriptor. When generating the per-subplan UPDATE result + * rels, this was not set. + */ + leaf_part_rri->ri_PartitionRoot = rel; + + /* + * Save the position of this update rel in the leaf partitions + * array + */ + ptr->subplan_partition_offsets[update_rri_index] = i; + + update_rri_index++; + } + else + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + } + else + { + /* For INSERTs, we already have an array of result rels allocated */ + leaf_part_rri = leaf_part_arr + i; + } /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. + * If we didn't open the partition rel, it means we haven't initialized + * the result rel either. */ - partrel = heap_open(lfirst_oid(cell), NoLock); + if (!partrel) + { + /* + * We locked all the partitions above including the leaf + * partitions. Note that each of the newly opened relations in + * *partitions are eventually closed by the caller. + */ + partrel = heap_open(leaf_oid, NoLock); + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + } + part_tupdesc = RelationGetDescr(partrel); /* * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. */ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + ptr->parentchild_tupconv_maps[i] = convert_tuples_by_name(tupDesc, part_tupdesc, gettext_noop("could not convert row type")); - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - /* - * Verify result relation is a valid target for INSERT. + * Verify result relation is a valid target for insert operation. Even + * for updates, we are doing this for tuple-routing, so again, we need + * to check the validity for insert operation. */ CheckValidResultRel(leaf_part_rri, CMD_INSERT); @@ -144,9 +224,15 @@ ExecSetupPartitionTupleRouting(Relation rel, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - (*partitions)[i] = leaf_part_rri++; + ptr->partitions[i] = leaf_part_rri; i++; } + + /* + * For UPDATE, we should have found all the per-subplan resultrels in the + * leaf partitions. + */ + Assert(!is_update || update_rri_index == num_update_rri); } /* @@ -177,8 +263,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. 
*/ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); /* start with the root partitioned table */ parent = pd[0]; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index fb538c0..e11f7cb 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -401,7 +401,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); /* Store the slot into tuple that we can inspect. */ tuple = ExecMaterializeSlot(slot); @@ -466,7 +466,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); /* Store the slot into tuple that we can write. */ tuple = ExecMaterializeSlot(slot); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 201c607..919b32d 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -46,6 +46,7 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "optimizer/var.h" #include "parser/parsetree.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -63,7 +64,16 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); - +static void ExecSetupChildParentMap(ModifyTableState *mtstate, + ResultRelInfo *rootRelInfo, + int numResultRelInfos, bool perleaf); +static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, + int whichplan); +static HeapTuple ConvertPartitionTupleSlot(ModifyTableState *mtstate, + TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_old_slot); /* * Verify that the tuples to be produced by INSERT or UPDATE match the * target relation's rowtype @@ -241,6 +251,38 @@ ExecCheckTIDVisible(EState *estate, ReleaseBuffer(buffer); } +/* + * ConvertPartitionTupleSlot -- convenience function for tuple conversion using + * 'map'. The tuple, if converted, is stored in 'new_slot', and 'p_my_slot' is + * updated with the 'new_slot'. 'new_slot' typically should be one of the + * dedicated partition tuple slots. If map is NULL, keeps p_my_slot unchanged. + * + * Returns the converted tuple, unless map is NULL, in which case original + * tuple is returned unmodified. + */ +static HeapTuple +ConvertPartitionTupleSlot(ModifyTableState *mtstate, + TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_my_slot) +{ + if (!map) + return tuple; + + tuple = do_convert_tuple(tuple, map); + + /* + * Change the partition tuple slot descriptor, as per converted tuple. 
+ */ + *p_my_slot = new_slot; + Assert(new_slot != NULL); + ExecSetSlotDescriptor(new_slot, map->outdesc); + ExecStoreTuple(tuple, new_slot, InvalidBuffer, true); + + return tuple; +} + /* ---------------------------------------------------------------- * ExecInsert * @@ -266,6 +308,9 @@ ExecInsert(ModifyTableState *mtstate, Oid newId; List *recheckIndexes = NIL; TupleTableSlot *result = NULL; + TransitionCaptureState *transition_capture; + + transition_capture = mtstate->mt_transition_capture; /* * get the heap tuple out of the tuple table slot, making sure we have a @@ -279,32 +324,32 @@ ExecInsert(ModifyTableState *mtstate, resultRelInfo = estate->es_result_relation_info; /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_dispatch_info) + if (mtstate->mt_partition_tuple_routing) { int leaf_part_index; - TupleConversionMap *map; + PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing; /* * Away we go ... If we end up not finding a partition after all, * ExecFindPartition() does not return and errors out instead. * Otherwise, the returned value is to be used as an index into arrays - * mt_partitions[] and mt_partition_tupconv_maps[] that will get us - * the ResultRelInfo and TupleConversionMap for the partition, + * ptr->partitions[] and ptr->parentchild_tupconv_maps[] that will get + * us the ResultRelInfo and TupleConversionMap for the partition, * respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, + ptr->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < mtstate->mt_num_partitions); + leaf_part_index < ptr->num_partitions); /* * Save the old ResultRelInfo and switch to the one corresponding to * the selected partition. */ saved_resultRelInfo = resultRelInfo; - resultRelInfo = mtstate->mt_partitions[leaf_part_index]; + resultRelInfo = ptr->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -331,8 +376,10 @@ ExecInsert(ModifyTableState *mtstate, * back to tuplestore format. */ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + + Assert(mtstate->mt_is_tupconv_perpart == true); mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + mtstate->mt_childparent_tupconv_maps[leaf_part_index]; } else { @@ -345,30 +392,21 @@ ExecInsert(ModifyTableState *mtstate, } } if (mtstate->mt_oc_transition_capture != NULL) + { + Assert(mtstate->mt_is_tupconv_perpart == true); mtstate->mt_oc_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + mtstate->mt_childparent_tupconv_maps[leaf_part_index]; + } /* * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map); - - /* - * We must use the partition's tuple descriptor from this point - * on, until we're finished dealing with the partition. Use the - * dedicated slot for that. 
- */ - slot = mtstate->mt_partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(mtstate, + ptr->parentchild_tupconv_maps[leaf_part_index], + tuple, + ptr->partition_tuple_slot, + &slot); } resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -486,7 +524,7 @@ ExecInsert(ModifyTableState *mtstate, /* Check the constraints of the tuple */ if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { @@ -622,9 +660,32 @@ ExecInsert(ModifyTableState *mtstate, setLastTid(&(tuple->t_self)); } + /* + * If this INSERT is part of a partition-key-UPDATE and we are capturing + * transition tables, put this row into the transition NEW TABLE. + * (Similarly we need to add the deleted row in OLD TABLE). We need to do + * this separately for DELETE and INSERT because they happen on different + * tables. + */ + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + tuple, + NULL, + mtstate->mt_transition_capture); + + /* + * Now that we have already captured NEW TABLE row, any AR INSERT + * trigger should not again capture it below. Arrange for the same. + */ + transition_capture = NULL; + } + /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, - mtstate->mt_transition_capture); + transition_capture); list_free(recheckIndexes); @@ -678,6 +739,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tuple_deleted, + bool process_returning, bool canSetTag) { ResultRelInfo *resultRelInfo; @@ -685,6 +748,12 @@ ExecDelete(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; TupleTableSlot *slot = NULL; + TransitionCaptureState *transition_capture; + + transition_capture = mtstate->mt_transition_capture; + + if (tuple_deleted) + *tuple_deleted = false; /* * get information on the (current) result relation @@ -849,12 +918,39 @@ ldelete:; if (canSetTag) (estate->es_processed)++; + /* The delete has actually happened, so inform that to the caller */ + if (tuple_deleted) + *tuple_deleted = true; + + /* + * In case this is part of update tuple routing, put this row into the + * transition OLD TABLE if we are capturing transition tables. We need to + * do this separately for DELETE and INSERT because they happen on + * different tables. + */ + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * Now that we have already captured OLD TABLE row, any AR DELETE + * trigger should not again capture it below. Arrange for the same. 
+ */ + transition_capture = NULL; + } + /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, - mtstate->mt_transition_capture); + transition_capture); - /* Process RETURNING if present */ - if (resultRelInfo->ri_projectReturning) + /* Process RETURNING if present and if requested */ + if (process_returning && resultRelInfo->ri_projectReturning) { /* * We have to put the target tuple into a slot, which means first we @@ -947,6 +1043,7 @@ ExecUpdate(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + TupleConversionMap *saved_tcs_map = NULL; /* * abort the operation if not running transactions @@ -1043,12 +1140,117 @@ lreplace:; resultRelInfo, slot, estate); /* + * If a partition check fails, try to move the row into the right + * partition. + */ + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + { + bool tuple_deleted; + TupleTableSlot *ret_slot; + PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing; + int map_index; + TupleConversionMap *tupconv_map; + + /* + * When an UPDATE is run with a leaf partition, we would not have + * partition tuple routing setup. In that case, fail with + * partition constraint violation error. + */ + if (ptr == NULL) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* Do the row movement. */ + + /* + * Skip RETURNING processing for DELETE. We want to return rows + * from INSERT. + */ + ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + &tuple_deleted, false, false); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented + * it, or it was already deleted by self, or it was concurrently + * deleted by another transaction), then we should skip INSERT as + * well, otherwise, there will be effectively one new row inserted. + * + * For a normal UPDATE, the case where the tuple has been the + * subject of a concurrent UPDATE or DELETE would be handled by + * the EvalPlanQual machinery, but for an UPDATE that we've + * translated into a DELETE from this partition and an INSERT into + * some other partition, that's not available, because CTID chains + * can't span relation boundaries. We mimic the semantics to a + * limited extent by skipping the INSERT if the DELETE fails to + * find a tuple. This ensures that two concurrent attempts to + * UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't + * resurrect it. + */ + if (!tuple_deleted) + return NULL; + + /* + * UPDATEs set the transition capture map only when a new subplan + * is chosen. But for INSERTs, it is set for each row. So after + * INSERT, we need to revert back to the map created for UPDATE; + * otherwise the next UPDATE will incorrectly use the one created + * for INESRT. So first save the one created for UPDATE. + */ + if (mtstate->mt_transition_capture) + saved_tcs_map = mtstate->mt_transition_capture->tcs_map; + + /* + * resultRelInfo is one of the per-subplan resultRelInfos. So we + * should convert the tuple into root's tuple descriptor, since + * ExecInsert() starts the search from root. The tuple conversion + * map list is in the order of mtstate->resultRelInfo[], so to + * retrieve the one for this resultRel, we need to know the + * position of the resultRel in mtstate->resultRelInfo[]. 
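[Reviewer note, not part of the patch] A SQL-level sketch of the two cases handled in the branch above (invented table names, assuming a patched server): an UPDATE through the partitioned parent can move the row, while the same change issued directly on a leaf partition cannot be routed and is expected to fail with a partition constraint violation:

    CREATE TABLE m (city_id int, logdate date) PARTITION BY RANGE (logdate);
    CREATE TABLE m_y2017 PARTITION OF m
      FOR VALUES FROM ('2017-01-01') TO ('2018-01-01');
    CREATE TABLE m_y2018 PARTITION OF m
      FOR VALUES FROM ('2018-01-01') TO ('2019-01-01');

    INSERT INTO m VALUES (1, '2017-06-01');

    -- Routed through the root: the row is deleted from m_y2017 and
    -- inserted into m_y2018.
    UPDATE m SET logdate = '2018-06-01' WHERE city_id = 1;

    -- Issued directly on the leaf partition: no tuple routing is set up,
    -- so this should raise a partition constraint violation error.
    UPDATE m_y2018 SET logdate = '2017-06-01' WHERE city_id = 1;
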
+ */ + map_index = resultRelInfo - mtstate->resultRelInfo; + Assert(map_index >= 0 && map_index < mtstate->mt_nplans); + tupconv_map = tupconv_map_for_subplan(mtstate, map_index); + tuple = ConvertPartitionTupleSlot(mtstate, + tupconv_map, + tuple, + ptr->root_tuple_slot, + &slot); + + + /* + * For ExecInsert(), make it look like we are inserting into the + * root. + */ + Assert(mtstate->rootResultRelInfo != NULL); + estate->es_result_relation_info = mtstate->rootResultRelInfo; + + ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, + ONCONFLICT_NONE, estate, canSetTag); + + /* + * Revert back the active result relation and the active transition + * capture map that we changed above. + */ + estate->es_result_relation_info = resultRelInfo; + if (mtstate->mt_transition_capture) + { + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = saved_tcs_map; + } + return ret_slot; + } + + /* * Check the constraints of the tuple. Note that we pass the same * slot for the orig_slot argument, because unlike ExecInsert(), no * tuple-routing is performed here, hence the slot remains unchanged. + * We've already checked the partition constraint above; however, we + * must still ensure the tuple passes all other constraints, so we will + * call ExecConstraints() and have it validate all remaining checks. */ - if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate); + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate, false); /* * replace the heap tuple @@ -1476,7 +1678,6 @@ static void ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) { ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate); - int i; /* Check for transition tables on the directly targeted relation. */ mtstate->mt_transition_capture = @@ -1500,60 +1701,148 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) mtstate->mt_oc_transition_capture != NULL) { int numResultRelInfos; + PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing; - numResultRelInfos = (mtstate->mt_partition_tuple_slot != NULL ? - mtstate->mt_num_partitions : + numResultRelInfos = (ptr != NULL ? + ptr->num_partitions : mtstate->mt_nplans); + ExecSetupChildParentMap(mtstate, targetRelInfo, numResultRelInfos, + (ptr != NULL)); + /* - * Build array of conversion maps from each child's TupleDesc to the - * one used in the tuplestore. The map pointers may be NULL when no - * conversion is necessary, which is hopefully a common case for - * partitions. + * Install the conversion map for the first plan for UPDATE and DELETE + * operations. It will be advanced each time we switch to the next + * plan. (INSERT operations set it every time, so we need not update + * mtstate->mt_oc_transition_capture here.) */ - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT) + mtstate->mt_transition_capture->tcs_map = + tupconv_map_for_subplan(mtstate, 0); + } +} - /* Choose the right set of partitions */ - if (mtstate->mt_partition_dispatch_info != NULL) - { - /* - * For tuple routing among partitions, we need TupleDescs based - * on the partition routing table. - */ - ResultRelInfo **resultRelInfos = mtstate->mt_partitions; +/* + * Initialize the child-to-root tuple conversion map array. 
+ * + * This map array is required for two purposes : + * 1. For update-tuple-routing. We need to convert the tuple from the subplan + * result rel to the root partitioned table descriptor. + * 2. For capturing transition tables that are partitions. For UPDATEs, we need + * to convert the tuple from subplan result rel to target table descriptor, + * and for INSERTs, we need to convert the inserted tuple from leaf partition + * to the target table descriptor. + * + * The caller can request either a per-subplan map or per-leaf-partition map. + */ +static void +ExecSetupChildParentMap(ModifyTableState *mtstate, + ResultRelInfo *rootRelInfo, + int numResultRelInfos, bool perleaf) +{ + TupleDesc outdesc; + int i; - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - } - } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + /* First check if there is already one */ + if (mtstate->mt_childparent_tupconv_maps) + { + /* + * If per-leaf map is required and the map is already created, that map + * has to be per-leaf. If that map is per-subplan, we won't be able to + * access the maps leaf-partition-wise. But if the map is per-leaf, we + * will be able to access the maps subplan-wise using the + * subplan_partition_offsets map using function + * tupconv_map_for_subplan(). So if the callers might need to access + * the map both leaf-partition-wise and subplan-wise, they should make + * sure that the first time this function is called, it should be + * called with perleaf=true so that the map created is per-leaf, not + * per-subplan. + */ + Assert(!(perleaf && !mtstate->mt_is_tupconv_perpart)); + return; + } - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - } + /* Get tuple descriptor of the root partitioned table. */ + outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + + /* + * Build array of conversion maps from each child's TupleDesc to the + * one used in the tuplestore. The map pointers may be NULL when no + * conversion is necessary, which is hopefully a common case for + * partitions. + */ + mtstate->mt_childparent_tupconv_maps = (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + + /* Choose the right set of partitions */ + if (perleaf) + { + /* + * For tuple routing among partitions, we need TupleDescs based + * on the partition routing table. + */ + ResultRelInfo **resultRelInfos; + + Assert(mtstate->mt_partition_tuple_routing != NULL); + resultRelInfos = mtstate->mt_partition_tuple_routing->partitions; + + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_childparent_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), + outdesc, + gettext_noop("could not convert row type")); } /* - * Install the conversion map for the first plan for UPDATE and DELETE - * operations. It will be advanced each time we switch to the next - * plan. (INSERT operations set it every time, so we need not update - * mtstate->mt_oc_transition_capture here.) 
+ * Save the info that the tuple conversion map is per-leaf, not + * per-subplan */ - if (mtstate->mt_transition_capture) - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; + mtstate->mt_is_tupconv_perpart = true; + } + else + { + /* Otherwise we need the ResultRelInfo for each subplan. */ + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_childparent_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), + outdesc, + gettext_noop("could not convert row type")); + } + } + +} + +/* + * For a given subplan index, get the tuple conversion map. + */ +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + Assert(mtstate->mt_childparent_tupconv_maps != NULL); + + /* + * If the tuple conversion map array is per-partition, we need to first get + * the index into the partition array. + */ + if (mtstate->mt_is_tupconv_perpart) + { + int leaf_index; + PartitionTupleRouting *ptr = mtstate->mt_partition_tuple_routing; + + Assert(ptr && ptr->subplan_partition_offsets != NULL); + leaf_index = ptr->subplan_partition_offsets[whichplan]; + + Assert(leaf_index >= 0 && leaf_index < ptr->num_partitions); + return mtstate->mt_childparent_tupconv_maps[leaf_index]; + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_childparent_tupconv_maps[whichplan]; } } @@ -1660,15 +1949,13 @@ ExecModifyTable(PlanState *pstate) /* Prepare to convert transition tuples from this child. */ if (node->mt_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } if (node->mt_oc_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_oc_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -1785,7 +2072,8 @@ ExecModifyTable(PlanState *pstate) break; case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); @@ -1830,9 +2118,14 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partKeyUpdated; + PartitionTupleRouting *ptr = NULL; + int num_partitions = 0; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1905,6 +2198,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange for + * tuple routing. 
+ */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row && + operation == CMD_UPDATE) + update_tuple_routing_needed = true; + /* Now init the plan for this result rel */ estate->es_result_relation_info = resultRelInfo; mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags); @@ -1942,31 +2245,36 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) else rel = mtstate->resultRelInfo->ri_RelationDesc; - /* Build state for INSERT tuple routing */ - if (operation == CMD_INSERT && - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo **partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; + /* + * If it's not a partitioned table after all, UPDATE tuple routing should + * not be attempted. + */ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + update_tuple_routing_needed = false; + /* + * Build state for tuple routing if it's an INSERT or if it's an UPDATE of + * partition key. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + (operation == CMD_INSERT || update_tuple_routing_needed)) + { ExecSetupPartitionTupleRouting(rel, + mtstate->resultRelInfo, + (operation == CMD_UPDATE ? nplans : 0), node->nominalRelation, estate, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; - mtstate->mt_partitions = partitions; - mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; - mtstate->mt_partition_tuple_slot = partition_tuple_slot; + &mtstate->mt_partition_tuple_routing); + + ptr = mtstate->mt_partition_tuple_routing; + num_partitions = ptr->num_partitions; + + /* + * Below are required as reference objects for mapping partition + * attno's in expressions such as WithCheckOptions and RETURNING. + */ + firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; } /* @@ -1977,6 +2285,18 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ExecSetupTransitionCaptureState(mtstate, estate); /* + * Construct mapping from each of the per-subplan partition attnos to the + * root attno. This is required when during update row movement the tuple + * descriptor of a source partition does not match the root partitioned + * table descriptor. In such a case we need to convert tuples to the root + * tuple descriptor, because the search for destination partition starts + * from the root. Skip this setup if it's not a partition key update. + */ + if (update_tuple_routing_needed) + ExecSetupChildParentMap(mtstate, getASTriggerResultRelInfo(mtstate), + mtstate->mt_nplans, false); + + /* * Initialize any WITH CHECK OPTION constraints if needed. */ resultRelInfo = mtstate->resultRelInfo; @@ -2006,45 +2326,57 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build WITH CHECK OPTION constraints for each leaf partition rel. Note * that we didn't build the withCheckOptionList for each partition within * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE row + * movement. 
DELETEs and local UPDATEs are handled above. */ - if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0) + if (node->withCheckOptionLists != NIL && num_partitions > 0) { - List *wcoList; - PlanState *plan; + List *first_wcoList; /* * In case of INSERT on partitioned tables, there is only one plan. * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. + * partition. Whereas for UPDATE, there are as many WCOs as there are + * plans. So in either case, use the WCO expression of the first + * resultRelInfo as a reference to calculate attno's for the WCO + * expression of each of the partitions. We make a copy of the WCO + * qual for each partition. Note that, if there are SubPlans in there, + * they all end up attached to the one parent Plan node. */ - Assert(operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; - for (i = 0; i < mtstate->mt_num_partitions; i++) + Assert(update_tuple_routing_needed || + (operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + mtstate->mt_nplans == 1)); + + first_wcoList = linitial(node->withCheckOptionLists); + for (i = 0; i < num_partitions; i++) { Relation partrel; List *mapped_wcoList; List *wcoExprs = NIL; ListCell *ll; - resultRelInfo = mtstate->mt_partitions[i]; + resultRelInfo = ptr->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have WithCheckOptions + * initialized. + */ + if (resultRelInfo->ri_WithCheckOptions) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); + mapped_wcoList = map_partition_varattnos(first_wcoList, + firstVarno, + partrel, firstResultRel, + NULL); foreach(ll, mapped_wcoList) { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } @@ -2061,7 +2393,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; + List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2098,22 +2430,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build a projection for each leaf partition rel. Note that we * didn't build the returningList for each partition within the * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE + * row movement. DELETEs and local UPDATEs are handled above. 
*/ - returningList = linitial(node->returningLists); - for (i = 0; i < mtstate->mt_num_partitions; i++) + firstReturningList = linitial(node->returningLists); + for (i = 0; i < num_partitions; i++) { Relation partrel; List *rlist; - resultRelInfo = mtstate->mt_partitions[i]; + resultRelInfo = ptr->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have a returningList + * built. + */ + if (resultRelInfo->ri_projectReturning) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); + /* + * Use the returning expression of the first resultRelInfo as a + * reference to calculate attno's for the returning expression of + * each of the partitions. + */ + rlist = map_partition_varattnos(firstReturningList, + firstVarno, + partrel, firstResultRel, NULL); resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); @@ -2358,6 +2703,7 @@ void ExecEndModifyTable(ModifyTableState *node) { int i; + CmdType operation = node->operation; /* * Allow any FDWs to shut down @@ -2376,29 +2722,46 @@ ExecEndModifyTable(ModifyTableState *node) /* * Close all the partitioned tables, leaf partitions, and their indices * - * Remember node->mt_partition_dispatch_info[0] corresponds to the root + * Remember ptr->partition_dispatch_info[0] corresponds to the root * partitioned table, which we must not try to close, because it is the * main target table of the query that will be closed by ExecEndPlan(). * Also, tupslot is NULL for the root partitioned table. */ - for (i = 1; i < node->mt_num_dispatch; i++) + if (node->mt_partition_tuple_routing) { - PartitionDispatch pd = node->mt_partition_dispatch_info[i]; + PartitionTupleRouting *ptr = node->mt_partition_tuple_routing; - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); - } - for (i = 0; i < node->mt_num_partitions; i++) - { - ResultRelInfo *resultRelInfo = node->mt_partitions[i]; + for (i = 1; i < ptr->num_dispatch; i++) + { + PartitionDispatch pd = ptr->partition_dispatch_info[i]; - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); - } + heap_close(pd->reldesc, NoLock); + ExecDropSingleTupleTableSlot(pd->tupslot); + } + for (i = 0; i < ptr->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = ptr->partitions[i]; + + /* + * If this result rel is one of the subplan result rels, let + * ExecEndPlan() close it. For INSERTs, this does not apply because + * leaf partition result rels are always newly allocated. 
+ */ + if (operation == CMD_UPDATE && + resultRelInfo >= node->resultRelInfo && + resultRelInfo < node->resultRelInfo + node->mt_nplans) + continue; - /* Release the standalone partition tuple descriptor, if any */ - if (node->mt_partition_tuple_slot) - ExecDropSingleTupleTableSlot(node->mt_partition_tuple_slot); + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + + /* Release the standalone partition tuple descriptors, if any */ + if (ptr->root_tuple_slot) + ExecDropSingleTupleTableSlot(ptr->root_tuple_slot); + if (ptr->partition_tuple_slot) + ExecDropSingleTupleTableSlot(ptr->partition_tuple_slot); + } /* * Free the exprcontext diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index d9ff8a7..0f2f970 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -204,6 +204,7 @@ _copyModifyTable(const ModifyTable *from) COPY_SCALAR_FIELD(canSetTag); COPY_SCALAR_FIELD(nominalRelation); COPY_NODE_FIELD(partitioned_rels); + COPY_SCALAR_FIELD(partKeyUpdated); COPY_NODE_FIELD(resultRelations); COPY_SCALAR_FIELD(resultRelIndex); COPY_SCALAR_FIELD(rootResultRelIndex); @@ -2261,6 +2262,7 @@ _copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) COPY_SCALAR_FIELD(parent_relid); COPY_NODE_FIELD(child_rels); + COPY_SCALAR_FIELD(is_partition_key_update); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 2866fd7..6e2e3dd 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -908,6 +908,7 @@ _equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const Partitione { COMPARE_SCALAR_FIELD(parent_relid); COMPARE_NODE_FIELD(child_rels); + COMPARE_SCALAR_FIELD(is_partition_key_update); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index c97ee24..a5e71a2 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -372,6 +372,7 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partKeyUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_INT_FIELD(resultRelIndex); WRITE_INT_FIELD(rootResultRelIndex); @@ -2103,6 +2104,7 @@ _outModifyTablePath(StringInfo str, const ModifyTablePath *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partKeyUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_NODE_FIELD(subpaths); WRITE_NODE_FIELD(subroots); @@ -2525,6 +2527,7 @@ _outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) WRITE_UINT_FIELD(parent_relid); WRITE_NODE_FIELD(child_rels); + WRITE_BOOL_FIELD(is_partition_key_update); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 7eb67fc0..9542b94 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1568,6 +1568,7 @@ _readModifyTable(void) READ_BOOL_FIELD(canSetTag); READ_UINT_FIELD(nominalRelation); READ_NODE_FIELD(partitioned_rels); + READ_BOOL_FIELD(partKeyUpdated); READ_NODE_FIELD(resultRelations); READ_INT_FIELD(resultRelIndex); READ_INT_FIELD(rootResultRelIndex); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 44f6b03..be34463 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1359,7 +1359,7 @@ add_paths_to_append_rel(PlannerInfo 
*root, RelOptInfo *rel, case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) partitioned_rels = - get_partitioned_child_rels(root, rel->relid); + get_partitioned_child_rels(root, rel->relid, NULL); break; case RTE_SUBQUERY: build_partitioned_rels = true; @@ -1397,7 +1397,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { List *cprels; - cprels = get_partitioned_child_rels(root, childrel->relid); + cprels = get_partitioned_child_rels(root, childrel->relid, NULL); partitioned_rels = list_concat(partitioned_rels, list_copy(cprels)); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index d445477..549821e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -278,6 +278,7 @@ static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partKeyUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam); @@ -2371,6 +2372,7 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) best_path->canSetTag, best_path->nominalRelation, best_path->partitioned_rels, + best_path->partKeyUpdated, best_path->resultRelations, subplans, best_path->withCheckOptionLists, @@ -6428,6 +6430,7 @@ static ModifyTable * make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partKeyUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam) @@ -6454,6 +6457,7 @@ make_modifytable(PlannerInfo *root, node->canSetTag = canSetTag; node->nominalRelation = nominalRelation; node->partitioned_rels = partitioned_rels; + node->partKeyUpdated = partKeyUpdated; node->resultRelations = resultRelations; node->resultRelIndex = -1; /* will be set correctly in setrefs.c */ node->rootResultRelIndex = -1; /* will be set correctly in setrefs.c */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ef2eaea..ce26bbe 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1101,6 +1101,7 @@ inheritance_planner(PlannerInfo *root) Query *parent_parse; Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); PlannerInfo **parent_roots = NULL; + bool partColsUpdated = false; Assert(parse->commandType != CMD_INSERT); @@ -1172,7 +1173,8 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { nominalRelation = top_parentRTindex; - partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex); + partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex, + &partColsUpdated); /* The root partitioned table is included as a child rel */ Assert(list_length(partitioned_rels) >= 1); } @@ -1512,6 +1514,7 @@ inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, + partColsUpdated, resultRelations, subpaths, subroots, @@ -2123,6 +2126,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, parse->canSetTag, parse->resultRelation, NIL, + false, list_make1_int(parse->resultRelation), list_make1(path), list_make1(root), @@ -6152,17 +6156,22 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) /* * get_partitioned_child_rels * 
Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. + * with rti as the root parent RT index. Also sets is_partition_key_update + * to true if any of the root rte's updated columns is a partition key. * * Note: This function might get called even for range table entries that * are not partitioned tables; in such a case, it will simply return NIL. */ List * -get_partitioned_child_rels(PlannerInfo *root, Index rti) +get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *is_partition_key_update) { List *result = NIL; ListCell *l; + if (is_partition_key_update) + *is_partition_key_update = false; + foreach(l, root->pcinfo_list) { PartitionedChildRelInfo *pc = lfirst_node(PartitionedChildRelInfo, l); @@ -6170,6 +6179,8 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) if (pc->parent_relid == rti) { result = pc->child_rels; + if (is_partition_key_update) + *is_partition_key_update = pc->is_partition_key_update; break; } } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index f620243..7babb35 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,7 +105,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels); + List **appinfos, List **partitioned_child_rels, + bool *is_partition_key_update); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, @@ -1466,16 +1467,19 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) if (RelationGetPartitionDesc(oldrelation) != NULL) { List *partitioned_child_rels = NIL; + bool is_partition_key_update = false; Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); /* * If this table has partitions, recursively expand them in the order - * in which they appear in the PartitionDesc. + * in which they appear in the PartitionDesc. While at it, also + * extract the partition key columns of all the partitioned tables. */ expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, lockmode, &root->append_rel_list, - &partitioned_child_rels); + &partitioned_child_rels, + &is_partition_key_update); /* * We keep a list of objects in root, each of which maps a root @@ -1492,6 +1496,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) pcinfo = makeNode(PartitionedChildRelInfo); pcinfo->parent_relid = rti; pcinfo->child_rels = partitioned_child_rels; + pcinfo->is_partition_key_update = is_partition_key_update; root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } } @@ -1568,7 +1573,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels) + List **appinfos, List **partitioned_child_rels, + bool *is_partition_key_update) { int i; RangeTblEntry *childrte; @@ -1583,6 +1589,17 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->inh); + /* + * Note down whether any partition key cols are being updated. Though it's + * the root partitioned table's updatedCols we are interested in, we + * instead use parentrte to get the updatedCols. 
This is convenient because + * parentrte already has the root partrel's updatedCols translated to match + * the attribute ordering of parentrel. + */ + if (!*is_partition_key_update) + *is_partition_key_update = + has_partition_attrs(parentrel, parentrte->updatedCols, NULL); + /* First expand the partitioned table itself. */ expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, top_parentrc, parentrel, @@ -1622,7 +1639,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) expand_partitioned_rtentry(root, childrte, childRTindex, childrel, top_parentrc, lockmode, - appinfos, partitioned_child_rels); + appinfos, partitioned_child_rels, + is_partition_key_update); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 68dee0f..0ce5339 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3207,6 +3207,8 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, * 'partitioned_rels' is an integer list of RT indexes of non-leaf tables in * the partition tree, if this is an UPDATE/DELETE to a partitioned table. * Otherwise NIL. + * 'partKeyUpdated' is true if any partitioning columns are being updated, + * either from the named relation or a descendent partitioned table. * 'resultRelations' is an integer list of actual RT indexes of target rel(s) * 'subpaths' is a list of Path(s) producing source data (one per rel) * 'subroots' is a list of PlannerInfo structs (one per rel) @@ -3220,6 +3222,7 @@ ModifyTablePath * create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partKeyUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, @@ -3287,6 +3290,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, pathnode->canSetTag = canSetTag; pathnode->nominalRelation = nominalRelation; pathnode->partitioned_rels = list_copy(partitioned_rels); + pathnode->partKeyUpdated = partKeyUpdated; pathnode->resultRelations = resultRelations; pathnode->subpaths = subpaths; pathnode->subroots = subroots; diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 295e9d2..c6fee08 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -54,12 +54,16 @@ extern void check_new_partition_bound(char *relname, Relation parent, extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +extern List *map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); - +extern void pull_child_partition_columns(Relation rel, + Relation parent, + Bitmapset **partcols); +extern bool has_partition_attrs(Relation rel, Bitmapset *attnums, + bool *used_in_expr); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); diff --git a/src/include/executor/execPartition.h 
b/src/include/executor/execPartition.h index 64e5aab..7e69c48 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -49,14 +49,51 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; +/*----------------------- + * PartitionTupleRouting - Encapsulates all information required to execute + * tuple-routing between partitions. + * + * partition_dispatch_info Array of PartitionDispatch objects with one + * entry for every partitioned table in the + * partition tree. + * num_dispatch number of partitioned tables in the partition + * tree (= length of partition_dispatch_info[]) + * partitions Array of ResultRelInfo* objects with one entry + * for every leaf partition in the partition tree. + * num_partitions Number of leaf partitions in the partition tree + * (= 'partitions' array length) + * parentchild_tupconv_maps Array of TupleConversionMap objects with one + * entry for every leaf partition (required to + * convert input tuple based on the root table's + * rowtype to a leaf partition's rowtype after + * tuple routing is done) + * subplan_partition_offsets int Array ordered by UPDATE subplans. Each + * element of this array has the index into the + * corresponding partition in 'partitions' array. + * partition_tuple_slot TupleTableSlot to be used to manipulate any + * given leaf partition's rowtype after that + * partition is chosen for insertion by + * tuple-routing. + *----------------------- + */ +typedef struct PartitionTupleRouting +{ + PartitionDispatch *partition_dispatch_info; + int num_dispatch; + ResultRelInfo **partitions; + int num_partitions; + TupleConversionMap **parentchild_tupconv_maps; + int *subplan_partition_offsets; + TupleTableSlot *partition_tuple_slot; + TupleTableSlot *root_tuple_slot; +} PartitionTupleRouting; + extern void ExecSetupPartitionTupleRouting(Relation rel, + ResultRelInfo *update_rri, + int num_update_rri, Index resultRTindex, EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); + PartitionTupleRouting **partition_tuple_routing); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index b5578f5..5a385e2 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -187,9 +187,12 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); -extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index e05bc04..64cf3dd 100644 --- a/src/include/nodes/execnodes.h +++ 
b/src/include/nodes/execnodes.h @@ -976,21 +976,15 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */ - struct PartitionDispatchData **mt_partition_dispatch_info; - /* Tuple-routing support info */ - int mt_num_dispatch; /* Number of entries in the above array */ - int mt_num_partitions; /* Number of members in the following - * arrays */ - ResultRelInfo **mt_partitions; /* Per partition result relation pointers */ - TupleConversionMap **mt_partition_tupconv_maps; - /* Per partition tuple conversion map */ - TupleTableSlot *mt_partition_tuple_slot; + struct PartitionTupleRouting *mt_partition_tuple_routing; /* Tuple-routing support info */ struct TransitionCaptureState *mt_transition_capture; /* controls transition table population for specified operation */ struct TransitionCaptureState *mt_oc_transition_capture; /* controls transition table population for INSERT...ON CONFLICT UPDATE */ - TupleConversionMap **mt_transition_tupconv_maps; - /* Per plan/partition tuple conversion */ + TupleConversionMap **mt_childparent_tupconv_maps; + /* Per plan/partition map for tuple conversion from child to root */ + bool mt_is_tupconv_perpart; /* Is the above map per-partition ? */ + /* Stores position of update result rels in leaf partitions */ } ModifyTableState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 9b38d44..b36dafc 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -219,6 +219,7 @@ typedef struct ModifyTable Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partKeyUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ int resultRelIndex; /* index of first resultRel in plan's list */ int rootResultRelIndex; /* index of the partitioned table root */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 9e68e65..43d0164 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1667,6 +1667,7 @@ typedef struct ModifyTablePath Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partKeyUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ List *subpaths; /* Path(s) producing source data */ List *subroots; /* per-target-table PlannerInfos */ @@ -2117,6 +2118,9 @@ typedef struct PartitionedChildRelInfo Index parent_relid; List *child_rels; + bool is_partition_key_update; /* is the partition key of any of + * the partitioned tables + * updated? 
*/ } PartitionedChildRelInfo; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e9ed16a..39ce47d 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -238,6 +238,7 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 2801bfd..9f0533c 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -57,7 +57,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *is_partition_key_update); extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids); diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index b69ceaa..dd6242b 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -198,36 +198,371 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( +--------------------------- +-- UPDATE with row movement +--------------------------- +-- update to a partition should check partition bound constraint for the new tuple. +-- If partition key is updated, the row should be moved to the appropriate +-- partition. updatable views using partitions should enforce the check options +-- for the rows that have been moved. +create table mintab(c1 int); +insert into mintab values (120); +CREATE TABLE range_parted ( a text, - b int + b bigint, + c numeric, + d int, + e varchar ) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION; +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +create table part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +alter table range_parted attach partition part_b_20_b_30 for values from ('b', 20) to ('b', 30); +create table part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) partition by range (c); create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -ERROR: new row for relation "part_a_1_a_10" violates partition constraint -DETAIL: Failing row contains (b, 1). -update range_parted set b = b - 1 where b = 10; -ERROR: new row for relation "part_b_10_b_20" violates partition constraint -DETAIL: Failing row contains (b, 9). 
--- ok -update range_parted set b = b + 1 where b = 10; +alter table range_parted attach partition part_b_10_b_20 for values from ('b', 10) to ('b', 20); +create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); +-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions +update part_b_10_b_20 set b = b - 6; +-- As mentioned above, the partition creation is intentionally kept in descending bound order. +create table part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) partition by range (d); +alter table part_c_100_200 drop column e, drop column c, drop column a; +alter table part_c_100_200 add column c numeric, add column e varchar, add column a text; +alter table part_c_100_200 drop column b; +alter table part_c_100_200 add column b bigint; +create table part_d_1_15 partition of part_c_100_200 for values from (1) to (15); +create table part_d_15_20 partition of part_c_100_200 for values from (15) to (20); +alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200); +create table part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100); +\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted order by 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | +(6 rows) + +-- The order of subplans should be in bound order +explain (costs off) update range_parted set c = c - 50 where c > 97; + QUERY PLAN +------------------------------------- + Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_d_1_15 + Update on part_d_15_20 + Update on part_b_20_b_30 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_20_b_30 + Filter: (c > '97'::numeric) +(22 rows) + +-- fail (row movement happens only within the partition subtree) : +update part_c_100_200 set c = c - 20, d = c where c = 105; +ERROR: new row for relation "part_c_100_200" violates partition constraint +DETAIL: Failing row contains (105, 85, null, b, 15). +-- fail (no partition key update, so no attempt to move tuple, but "a = 'a'" violates partition constraint enforced by root partition) +update part_b_10_b_20 set a = 'a'; +ERROR: new row for relation "part_c_1_100" violates partition constraint +DETAIL: Failing row contains (null, 1, 96, 12, a). 
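As an illustrative aside (not part of the regression output above), the rule these two failing cases exercise — a row is moved only when its partition key changes, and only within the partition subtree named in the UPDATE — can be checked interactively against the tables set up earlier. A minimal, hypothetical session, wrapped in a transaction so the test data is left untouched:

    begin;
    -- ('b', 12, 96, 1) currently sits in part_c_1_100; the new key ('b', 25)
    -- falls in part_b_20_b_30, so issuing the UPDATE through the root moves
    -- the row there (internally a DELETE from the old partition plus an INSERT).
    update range_parted set b = 25 where a = 'b' and b = 12;
    select tableoid::regclass, a, b, c from range_parted where a = 'b' and b = 25;
    rollback;

tableoid::regclass in the final SELECT is expected to report the destination partition, part_b_20_b_30.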
+-- success; partition key update, no constraint violation +update range_parted set d = d - 10 where d > 10; +-- success; no partition key update, no constraint violation +update range_parted set e = d; +-- No row found : +update part_c_1_100 set c = c + 20 where c = 98; +-- ok (row movement) +update part_b_10_b_20 set c = c + 20 returning c, b, a; + c | b | a +-----+----+--- + 116 | 12 | b + 117 | 13 | b + 125 | 15 | b + 125 | 17 | b +(4 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 13 | 117 | 2 | 2 + part_d_1_15 | b | 15 | 125 | 6 | 6 + part_d_1_15 | b | 17 | 125 | 9 | 9 +(6 rows) + +-- fail (row movement happens only within the partition subtree) : +update part_b_10_b_20 set b = b - 6 where c > 116 returning *; +ERROR: new row for relation "part_d_1_15" violates partition constraint +DETAIL: Failing row contains (2, 117, 2, b, 7). +-- ok (row movement, with subset of rows moved into different partition) +update range_parted set b = b - 6 where c > 116 returning a, b + c; + a | ?column? +---+---------- + a | 204 + b | 124 + b | 134 + b | 136 +(4 rows) + +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_a_1_a_10 | a | 4 | 200 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 +(6 rows) + +-- update partition key using updatable view. +-- succeeds +update upview set c = 199 where b = 4; +-- fail, check option violation +update upview set c = 120 where b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (a, 4, 120, 1, 1). +-- fail, row movement with check option violation +update upview set a = 'b', b = 15, c = 120 where b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (b, 15, 120, 1, 1). +-- succeeds, row movement , check option passes +update upview set a = 'b', b = 15 where b = 4; +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 15 | 199 | 1 | 1 +(6 rows) + +-- cleanup +drop view upview; +-- RETURNING having whole-row vars. 
+---------------------------------- +:init_range_parted; +update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *; + range_parted | a | b | c | d | e +---------------+---+----+----+----+--- + (b,15,95,16,) | b | 15 | 95 | 16 | + (b,17,95,19,) | b | 17 | 95 | 19 | +(2 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 95 | 16 | + part_c_1_100 | b | 17 | 95 | 19 | +(6 rows) + +-- Transition tables with update row movement +--------------------------------------------- +:init_range_parted; +create function trans_updatetrigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' order by a) from old_table), + (select string_agg(new_table::text, ', ' order by a) from new_table); + return null; + end; +$$; +create trigger trans_updatetrig + after update on range_parted referencing old table as old_table new table as new_table + for each statement execute procedure trans_updatetrigfunc(); +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 13 | 98 | 2 | + part_d_15_20 | b | 15 | 106 | 16 | + part_d_15_20 | b | 17 | 106 | 19 | + part_d_1_15 | b | 12 | 110 | 1 | +(6 rows) + +:init_range_parted; +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +create trigger trans_deletetrig + after delete on range_parted referencing old table as old_table + for each statement execute procedure trans_updatetrigfunc(); +create trigger trans_inserttrig + after insert on range_parted referencing new table as new_table + for each statement execute procedure trans_updatetrigfunc(); +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 15 | 155 | 16 | + part_d_15_20 | b | 17 | 155 | 19 | + part_d_1_15 | b | 12 | 146 | 1 | + part_d_1_15 | b | 13 | 147 | 2 | +(6 rows) + +drop trigger trans_updatetrig ON range_parted; +drop trigger trans_deletetrig ON range_parted; +drop trigger trans_inserttrig ON range_parted; +-- Install BR triggers on child partition, so that transition tuple conversion takes place. 
+create function func_parted_mod_b() returns trigger as $$ +begin + NEW.b = NEW.b + 1; + return NEW; +end $$ language plpgsql; +create trigger trig_c1_100 before update or insert on part_c_1_100 + for each row execute procedure func_parted_mod_b(); +create trigger trig_d1_15 before update or insert on part_d_1_15 + for each row execute procedure func_parted_mod_b(); +create trigger trig_d15_20 before update or insert on part_d_15_20 + for each row execute procedure func_parted_mod_b(); +:init_range_parted; +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 15 | 98 | 2 | + part_d_15_20 | b | 17 | 106 | 16 | + part_d_15_20 | b | 19 | 106 | 19 | + part_d_1_15 | b | 15 | 110 | 1 | +(6 rows) + +:init_range_parted; +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 17 | 155 | 16 | + part_d_15_20 | b | 19 | 155 | 19 | + part_d_1_15 | b | 15 | 146 | 1 | + part_d_1_15 | b | 16 | 147 | 2 | +(6 rows) + +drop trigger trig_c1_100 ON part_c_1_100; +drop trigger trig_d1_15 ON part_d_1_15; +drop trigger trig_d15_20 ON part_d_15_20; +drop function func_parted_mod_b(); +-- statement triggers with update row movement +--------------------------------------------------- +:init_range_parted; +create function trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +create trigger parent_delete_trig + after delete on range_parted for each statement execute procedure trigfunc(); +create trigger parent_update_trig + after update on range_parted for each statement execute procedure trigfunc(); +create trigger parent_insert_trig + after insert on range_parted for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_c_1_100 +create trigger c1_delete_trig + after delete on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_update_trig + after update on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_insert_trig + after insert on part_c_1_100 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_1_15 +create trigger d1_delete_trig + after delete on part_d_1_15 for each statement execute procedure trigfunc(); +create trigger d1_update_trig + after update on part_d_1_15 for each statement execute procedure trigfunc(); +create trigger d1_insert_trig + after insert on part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +create trigger d15_delete_trig + after delete on part_d_15_20 for each statement execute procedure trigfunc(); +create trigger d15_update_trig + after update on part_d_15_20 for each statement execute procedure trigfunc(); +create trigger d15_insert_trig + after insert on part_d_15_20 for each statement execute procedure trigfunc(); +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired. 
+update range_parted set c = c - 50 where c > 97; +NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 150 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 55 | 16 | + part_c_1_100 | b | 17 | 55 | 19 | +(6 rows) + +drop trigger parent_delete_trig ON range_parted; +drop trigger parent_update_trig ON range_parted; +drop trigger parent_insert_trig ON range_parted; +drop trigger c1_delete_trig ON part_c_1_100; +drop trigger c1_update_trig ON part_c_1_100; +drop trigger c1_insert_trig ON part_c_1_100; +drop trigger d1_delete_trig ON part_d_1_15; +drop trigger d1_update_trig ON part_d_1_15; +drop trigger d1_insert_trig ON part_d_1_15; +drop trigger d15_delete_trig ON part_d_15_20; +drop trigger d15_update_trig ON part_d_15_20; +drop trigger d15_insert_trig ON part_d_15_20; +drop table mintab; -- Creating default partition for range +:init_range_parted; create table part_def partition of range_parted default; \d+ part_def - Table "public.part_def" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+----------+--------------+------------- - a | text | | | | extended | | - b | integer | | | | plain | | + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-------------------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | bigint | | | | plain | | + c | numeric | | | | main | | + d | integer | | | | plain | | + e | character varying | | | | extended | | Partition of: range_parted DEFAULT -Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20))))) +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) insert into range_parted values ('c', 9); -- ok @@ -235,7 +570,55 @@ update part_def set a = 'd' where a = 'c'; -- fail update part_def set a = 'a' where a = 'd'; ERROR: new row for relation "part_def" violates partition constraint -DETAIL: Failing row contains (a, 9). +DETAIL: Failing row contains (a, 9, null, null, null). +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from non-default to default partition. +-- Fail, default partition is not under part_a_10_a_20; +update part_a_10_a_20 set a = 'ad' where a = 'a'; +ERROR: new row for relation "part_a_10_a_20" violates partition constraint +DETAIL: Failing row contains (ad, 10, 200, 1, null). 
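As a hypothetical aside (not part of the expected output), the earlier direct UPDATE of part_def fails because a leaf targeted by name cannot hand its row to a sibling, but the same key change issued through the root is free to route the row out of the default partition. A sketch, rolled back so the surrounding results are unaffected and assuming the data shown just above:

    begin;
    -- ('d', 9) sits in part_def; with a changed to 'a' the row fits
    -- part_a_1_a_10, so routing through the root moves it there.
    update range_parted set a = 'a' where a = 'd' and b = 9;
    select tableoid::regclass, a, b from range_parted where b = 9;
    rollback;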
+-- Success +update range_parted set a = 'ad' where a = 'a'; +update range_parted set a = 'bd' where a = 'b'; +:show_data; + partname | a | b | c | d | e +----------+----+----+-----+----+--- + part_def | ad | 1 | 1 | 1 | + part_def | ad | 10 | 200 | 1 | + part_def | bd | 12 | 96 | 1 | + part_def | bd | 13 | 97 | 2 | + part_def | bd | 15 | 105 | 16 | + part_def | bd | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from default to non-default partitions. +-- Success +update range_parted set a = 'a' where a = 'ad'; +update range_parted set a = 'b' where a = 'bd'; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + create table list_parted ( a text, b int @@ -250,6 +633,111 @@ ERROR: new row for relation "list_default" violates partition constraint DETAIL: Failing row contains (a, 10). -- ok update list_default set a = 'x' where a = 'd'; +drop table list_parted; +-------------- +-- UPDATE with +-- partition key or non-partition columns, with different column ordering, +-- triggers. +-------------- +-- Setup +-------- +create table list_parted (a numeric, b int, c int8) partition by list (a); +create table sub_parted partition of list_parted for values in (1) partition by list (b); +create table sub_part1(b int, c int8, a numeric); +alter table sub_parted attach partition sub_part1 for values in (1); +create table sub_part2(b int, c int8, a numeric); +alter table sub_parted attach partition sub_part2 for values in (2); +create table list_part1(a numeric, b int, c int8); +alter table list_parted attach partition list_part1 for values in (2,3); +insert into list_parted values (2,5,50); +insert into list_parted values (3,6,60); +insert into sub_parted values (1,1,60); +insert into sub_parted values (1,2,10); +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +update sub_parted set a = 2 where c = 10; +ERROR: new row for relation "sub_part2" violates partition constraint +DETAIL: Failing row contains (2, 10, 2). +-- UPDATE which does not modify partition key of partitions that are chosen for update. +select tableoid::regclass::text , * from list_parted where a = 2 order by 1; + tableoid | a | b | c +------------+---+---+---- + list_part1 | 2 | 5 | 50 +(1 row) + +update list_parted set b = c + a where a = 2; +select tableoid::regclass::text , * from list_parted where a = 2 order by 1; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 +(1 row) + +----------- +-- Triggers can cause UPDATE row movement if it modified partition key. +----------- +create function func_parted_mod_b() returns trigger as $$ +begin + NEW.b = 2; -- This is changing partition key column. 
+ return NEW; +end $$ language plpgsql; +create trigger parted_mod_b before update on sub_part1 + for each row execute procedure func_parted_mod_b(); +select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 60 + sub_part2 | 1 | 2 | 10 +(4 rows) + +-- This should do the tuple routing even though there is no explicit +-- partition-key update, because there is a trigger on sub_part1 +update list_parted set c = 70 where b = 1 ; +select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part2 | 1 | 2 | 10 + sub_part2 | 1 | 2 | 70 +(4 rows) + +drop trigger parted_mod_b ON sub_part1 ; +-- If BR DELETE trigger prevented DELETE from happening, we should also skip +-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. +create or replace function func_parted_mod_b() returns trigger as $$ +begin return NULL; end $$ language plpgsql; +create trigger trig_skip_delete before delete on sub_part1 + for each row execute procedure func_parted_mod_b(); +update list_parted set b = 1 where c = 70; +select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 70 + sub_part2 | 1 | 2 | 10 +(4 rows) + +drop trigger trig_skip_delete ON sub_part1 ; +-- UPDATE partition-key with FROM clause. If join produces multiple output +-- rows for the same row to be modified, we should tuple-route the row only once. +-- There should not be any rows inserted. +create table non_parted (id int); +insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3); +update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1; +select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 1 | 70 + list_part1 | 2 | 2 | 10 + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 +(4 rows) + +drop table non_parted; +drop function func_parted_mod_b(); -- create custom operator class and hash function, for the same reason -- explained in alter_table.sql create or replace function dummy_hashint4(a int4, seed int8) returns int8 as @@ -271,9 +759,8 @@ insert into hpart4 values (3, 4); update hpart1 set a = 3, b=4 where a = 1; ERROR: new row for relation "hpart1" violates partition constraint DETAIL: Failing row contains (3, 4). +-- ok : row movement update hash_parted set b = b - 1 where b = 1; -ERROR: new row for relation "hpart1" violates partition constraint -DETAIL: Failing row contains (1, 0). -- ok update hash_parted set b = b + 8 where b = 1; -- cleanup diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 0c70d64..10c10c7 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -107,25 +107,233 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( + +--------------------------- +-- UPDATE with row movement +--------------------------- + +-- update to a partition should check partition bound constraint for the new tuple. 
+-- If partition key is updated, the row should be moved to the appropriate +-- partition. updatable views using partitions should enforce the check options +-- for the rows that have been moved. +create table mintab(c1 int); +insert into mintab values (120); +CREATE TABLE range_parted ( a text, - b int + b bigint, + c numeric, + d int, + e varchar ) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION; + +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +create table part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +alter table range_parted attach partition part_b_20_b_30 for values from ('b', 20) to ('b', 30); +create table part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) partition by range (c); create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); +alter table range_parted attach partition part_b_10_b_20 for values from ('b', 10) to ('b', 20); +create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); + +-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions +update part_b_10_b_20 set b = b - 6; + +-- As mentioned above, the partition creation is intentionally kept in descending bound order. 
+create table part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) partition by range (d); +alter table part_c_100_200 drop column e, drop column c, drop column a; +alter table part_c_100_200 add column c numeric, add column e varchar, add column a text; +alter table part_c_100_200 drop column b; +alter table part_c_100_200 add column b bigint; +create table part_d_1_15 partition of part_c_100_200 for values from (1) to (15); +create table part_d_15_20 partition of part_c_100_200 for values from (15) to (20); + +alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200); + +create table part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100); + +\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted order by 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + +-- The order of subplans should be in bound order +explain (costs off) update range_parted set c = c - 50 where c > 97; + +-- fail (row movement happens only within the partition subtree) : +update part_c_100_200 set c = c - 20, d = c where c = 105; +-- fail (no partition key update, so no attempt to move tuple, but "a = 'a'" violates partition constraint enforced by root partition) +update part_b_10_b_20 set a = 'a'; +-- success; partition key update, no constraint violation +update range_parted set d = d - 10 where d > 10; +-- success; no partition key update, no constraint violation +update range_parted set e = d; +-- No row found : +update part_c_1_100 set c = c + 20 where c = 98; +-- ok (row movement) +update part_b_10_b_20 set c = c + 20 returning c, b, a; +:show_data; + +-- fail (row movement happens only within the partition subtree) : +update part_b_10_b_20 set b = b - 6 where c > 116 returning *; +-- ok (row movement, with subset of rows moved into different partition) +update range_parted set b = b - 6 where c > 116 returning a, b + c; + +:show_data; + +-- update partition key using updatable view. + +-- succeeds +update upview set c = 199 where b = 4; +-- fail, check option violation +update upview set c = 120 where b = 4; +-- fail, row movement with check option violation +update upview set a = 'b', b = 15, c = 120 where b = 4; +-- succeeds, row movement , check option passes +update upview set a = 'b', b = 15 where b = 4; + +:show_data; + +-- cleanup +drop view upview; + +-- RETURNING having whole-row vars. 
+---------------------------------- +:init_range_parted; +update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *; +:show_data; + + +-- Transition tables with update row movement +--------------------------------------------- +:init_range_parted; + +create function trans_updatetrigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' order by a) from old_table), + (select string_agg(new_table::text, ', ' order by a) from new_table); + return null; + end; +$$; + +create trigger trans_updatetrig + after update on range_parted referencing old table as old_table new table as new_table + for each statement execute procedure trans_updatetrigfunc(); + +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; + +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +create trigger trans_deletetrig + after delete on range_parted referencing old table as old_table + for each statement execute procedure trans_updatetrigfunc(); +create trigger trans_inserttrig + after insert on range_parted referencing new table as new_table + for each statement execute procedure trans_updatetrigfunc(); +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +:show_data; +drop trigger trans_updatetrig ON range_parted; +drop trigger trans_deletetrig ON range_parted; +drop trigger trans_inserttrig ON range_parted; + +-- Install BR triggers on child partition, so that transition tuple conversion takes place. 
+create function func_parted_mod_b() returns trigger as $$ +begin + NEW.b = NEW.b + 1; + return NEW; +end $$ language plpgsql; +create trigger trig_c1_100 before update or insert on part_c_1_100 + for each row execute procedure func_parted_mod_b(); +create trigger trig_d1_15 before update or insert on part_d_1_15 + for each row execute procedure func_parted_mod_b(); +create trigger trig_d15_20 before update or insert on part_d_15_20 + for each row execute procedure func_parted_mod_b(); +:init_range_parted; +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +:show_data; +drop trigger trig_c1_100 ON part_c_1_100; +drop trigger trig_d1_15 ON part_d_1_15; +drop trigger trig_d15_20 ON part_d_15_20; +drop function func_parted_mod_b(); + + +-- statement triggers with update row movement +--------------------------------------------------- + +:init_range_parted; + +create function trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +create trigger parent_delete_trig + after delete on range_parted for each statement execute procedure trigfunc(); +create trigger parent_update_trig + after update on range_parted for each statement execute procedure trigfunc(); +create trigger parent_insert_trig + after insert on range_parted for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_c_1_100 +create trigger c1_delete_trig + after delete on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_update_trig + after update on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_insert_trig + after insert on part_c_1_100 for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_d_1_15 +create trigger d1_delete_trig + after delete on part_d_1_15 for each statement execute procedure trigfunc(); +create trigger d1_update_trig + after update on part_d_1_15 for each statement execute procedure trigfunc(); +create trigger d1_insert_trig + after insert on part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +create trigger d15_delete_trig + after delete on part_d_15_20 for each statement execute procedure trigfunc(); +create trigger d15_update_trig + after update on part_d_15_20 for each statement execute procedure trigfunc(); +create trigger d15_insert_trig + after insert on part_d_15_20 for each statement execute procedure trigfunc(); + +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired. 
+update range_parted set c = c - 50 where c > 97;
+:show_data;
+
+drop trigger parent_delete_trig ON range_parted;
+drop trigger parent_update_trig ON range_parted;
+drop trigger parent_insert_trig ON range_parted;
+drop trigger c1_delete_trig ON part_c_1_100;
+drop trigger c1_update_trig ON part_c_1_100;
+drop trigger c1_insert_trig ON part_c_1_100;
+drop trigger d1_delete_trig ON part_d_1_15;
+drop trigger d1_update_trig ON part_d_1_15;
+drop trigger d1_insert_trig ON part_d_1_15;
+drop trigger d15_delete_trig ON part_d_15_20;
+drop trigger d15_update_trig ON part_d_15_20;
+drop trigger d15_insert_trig ON part_d_15_20;
+
+drop table mintab;
 
--- fail
-update part_a_1_a_10 set a = 'b' where a = 'a';
-update range_parted set b = b - 1 where b = 10;
--- ok
-update range_parted set b = b + 1 where b = 10;
 
 -- Creating default partition for range
+:init_range_parted;
 create table part_def partition of range_parted default;
 \d+ part_def
 insert into range_parted values ('c', 9);
@@ -134,6 +342,21 @@ update part_def set a = 'd' where a = 'c';
 -- fail
 update part_def set a = 'a' where a = 'd';
+:show_data;
+
+-- Update row movement from non-default to default partition.
+-- Fail, default partition is not under part_a_10_a_20;
+update part_a_10_a_20 set a = 'ad' where a = 'a';
+-- Success
+update range_parted set a = 'ad' where a = 'a';
+update range_parted set a = 'bd' where a = 'b';
+:show_data;
+-- Update row movement from default to non-default partitions.
+-- Success
+update range_parted set a = 'a' where a = 'ad';
+update range_parted set a = 'b' where a = 'bd';
+:show_data;
+
 
 create table list_parted (
 	a text,
 	b int
@@ -148,6 +371,84 @@ update list_default set a = 'a' where a = 'd';
 -- ok
 update list_default set a = 'x' where a = 'd';
+drop table list_parted;
+
+--------------
+-- UPDATE with
+-- partition key or non-partition columns, with different column ordering,
+-- triggers.
+--------------
+
+-- Setup
+--------
+create table list_parted (a numeric, b int, c int8) partition by list (a);
+create table sub_parted partition of list_parted for values in (1) partition by list (b);
+
+create table sub_part1(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part1 for values in (1);
+create table sub_part2(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part2 for values in (2);
+
+create table list_part1(a numeric, b int, c int8);
+alter table list_parted attach partition list_part1 for values in (2,3);
+
+insert into list_parted values (2,5,50);
+insert into list_parted values (3,6,60);
+insert into sub_parted values (1,1,60);
+insert into sub_parted values (1,2,10);
+
+-- Test partition constraint violation when intermediate ancestor is used and
+-- constraint is inherited from upper root.
+update sub_parted set a = 2 where c = 10;
+
+-- UPDATE which does not modify partition key of partitions that are chosen for update.
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+update list_parted set b = c + a where a = 2;
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+
+
+-----------
+-- Triggers can cause UPDATE row movement if it modified partition key.
+-----------
+create function func_parted_mod_b() returns trigger as $$
+begin
+    NEW.b = 2; -- This is changing partition key column.
+    return NEW;
+end $$ language plpgsql;
+create trigger parted_mod_b before update on sub_part1
+  for each row execute procedure func_parted_mod_b();
+
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+-- This should do the tuple routing even though there is no explicit
+-- partition-key update, because there is a trigger on sub_part1
+update list_parted set c = 70 where b = 1 ;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger parted_mod_b ON sub_part1 ;
+
+-- If BR DELETE trigger prevented DELETE from happening, we should also skip
+-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT.
+create or replace function func_parted_mod_b() returns trigger as $$
+begin return NULL; end $$ language plpgsql;
+create trigger trig_skip_delete before delete on sub_part1
+  for each row execute procedure func_parted_mod_b();
+update list_parted set b = 1 where c = 70;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger trig_skip_delete ON sub_part1 ;
+
+-- UPDATE partition-key with FROM clause. If join produces multiple output
+-- rows for the same row to be modified, we should tuple-route the row only once.
+-- There should not be any rows inserted.
+create table non_parted (id int);
+insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3);
+update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+drop table non_parted;
+
+drop function func_parted_mod_b();
+
 -- create custom operator class and hash function, for the same reason
 -- explained in alter_table.sql
 create or replace function dummy_hashint4(a int4, seed int8) returns int8 as
@@ -169,6 +470,7 @@ insert into hpart4 values (3, 4);
 -- fail
 update hpart1 set a = 3, b=4 where a = 1;
+-- ok : row movement
 update hash_parted set b = b - 1 where b = 1;
 -- ok
 update hash_parted set b = b + 8 where b = 1;