diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index b05a9c2..5a436a1 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -2993,6 +2993,11 @@ VALUES ('Albany', NULL, NULL, 'NY'); foreign table partitions. + + Updating the partition key of a row might cause it to be moved into a + different partition where this row satisfies its partition constraint. + + Example @@ -3285,9 +3290,20 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - An UPDATE that causes a row to move from one partition to - another fails, because the new value of the row fails to satisfy the - implicit partition constraint of the original partition. + When an UPDATE causes a row to move from one partition to + another, there is a chance that another concurrent UPDATE or + DELETE misses this row. Suppose, during the row movement, + the row is still visible for the concurrent session, and it is about to + do an UPDATE or DELETE operation on the same + row. This DML operation can silently miss this row if the row now gets + deleted from the partition by the first session as part of its + UPDATE row movement. In such a case, the concurrent + UPDATE/DELETE, being unaware of the row + movement, concludes that the row has just been deleted so there is + nothing to be done for this row. Whereas, in the usual case where the + table is not partitioned, or where there is no row movement, the second + session would have identified the newly updated row and carried out the + UPDATE/DELETE on this new row version. diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml index 8a1619f..28cfc1a 100644 --- a/doc/src/sgml/ref/update.sgml +++ b/doc/src/sgml/ref/update.sgml @@ -282,10 +282,17 @@ UPDATE count In the case of a partitioned table, updating a row might cause it to no - longer satisfy the partition constraint. Since there is no provision to - move the row to the partition appropriate to the new value of its - partitioning key, an error will occur in this case. 
This can also happen - when updating a partition directly. + longer satisfy the partition constraint of the containing partition. In that + case, if there is some other partition in the partition tree for which this + row satisfies its partition constraint, then the row is moved to that + partition. If there isn't such a partition, an error will occur. The error + will also occur when updating a partition directly. Behind the scenes, the + row movement is actually a DELETE and + INSERT operation. However, there is a possibility that a + concurrent UPDATE or DELETE on the same row may miss + this row. For details see the section + . + diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml index 950245d..72300a0 100644 --- a/doc/src/sgml/trigger.sgml +++ b/doc/src/sgml/trigger.sgml @@ -160,6 +160,29 @@ + If an UPDATE on a partitioned table causes a row to + move to another partition, it will be performed as a + DELETE from the original partition followed by + INSERT into the new partition. In this case, all + row-level BEFORE UPDATE triggers and all + row-level BEFORE DELETE triggers are fired + on the original partition. Then all row-level BEFORE + INSERT triggers are fired on the destination partition. + The possibility of surprising outcomes should be considered when all these + triggers affect the row being moved. As far as AFTER ROW + triggers are concerned, AFTER DELETE and + AFTER INSERT triggers are applied; but + AFTER UPDATE triggers are not applied + because the UPDATE has been converted to a + DELETE and INSERT. As far as + statement-level triggers are concerned, none of the + DELETE or INSERT triggers are fired, + even if row movement occurs; only the UPDATE triggers + defined on the target table used in the UPDATE statement + will be fired. + + + Trigger functions invoked by per-statement triggers should always return NULL. 
Trigger functions invoked by per-row triggers can return a table row (a value of diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index c6bd02f..7539dde 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -878,7 +878,8 @@ get_qual_from_partbound(Relation rel, Relation parent, /* * map_partition_varattnos - maps varattno of any Vars in expr from the - * parent attno to partition attno. + * attno's of 'from_rel' partition to the attno's of 'to_rel' partition. + * The rels can be both leaf partition or a partitioned table. * * We must allow for cases where physical attnos of a partition can be * different from the parent's. @@ -891,8 +892,8 @@ get_qual_from_partbound(Relation rel, Relation parent, * are working on Lists, so it's less messy to do the casts internally. */ List * -map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row) { AttrNumber *part_attnos; @@ -901,14 +902,14 @@ map_partition_varattnos(List *expr, int target_varno, if (expr == NIL) return NIL; - part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), - RelationGetDescr(parent), + part_attnos = convert_tuples_by_name_map(RelationGetDescr(to_rel), + RelationGetDescr(from_rel), gettext_noop("could not convert row type")); expr = (List *) map_variable_attnos((Node *) expr, - target_varno, 0, + fromrel_varno, 0, part_attnos, - RelationGetDescr(parent)->natts, - RelationGetForm(partrel)->reltype, + RelationGetDescr(from_rel)->natts, + RelationGetForm(to_rel)->reltype, &my_found_whole_row); if (found_whole_row) *found_whole_row = my_found_whole_row; @@ -2054,6 +2055,77 @@ error_exit: } /* + * For each column of rel which is in the partition key or which appears + * in an expression which is in the partition key, translate the attribute + * number of that column according to the 
given parent, and add the resulting + * column number to the bitmapset, offset as we frequently do by + * FirstLowInvalidHeapAttributeNumber. + */ +void +pull_child_partition_columns(Bitmapset **bitmapset, + Relation rel, + Relation parent) +{ + PartitionKey key = RelationGetPartitionKey(rel); + int16 partnatts = get_partition_natts(key); + List *partexprs = get_partition_exprs(key); + ListCell *lc; + Bitmapset *child_keycols = NULL; + int i; + AttrNumber *map; + int child_keycol = -1; + + /* + * First, compute the complete set of partition columns for this rel. For + * compatibility with the API exposed by pull_varattnos, we offset the + * column numbers by FirstLowInvalidHeapAttributeNumber. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber partattno = get_partition_col_attnum(key, i); + + if (partattno != 0) + child_keycols = + bms_add_member(child_keycols, + partattno - FirstLowInvalidHeapAttributeNumber); + } + foreach(lc, partexprs) + { + Node *expr = (Node *) lfirst(lc); + + pull_varattnos(expr, 1, &child_keycols); + } + + /* + * Next, work out how to convert from the attribute numbers for the child + * to the attribute numbers for the parent. + */ + map = + convert_tuples_by_name_map(RelationGetDescr(parent), + RelationGetDescr(rel), + gettext_noop("could not convert row type")); + + /* + * For each child key column we have identified, translate to the + * corresponding parent key column. Entry 0 in the map array corresponds + * to attribute number 1, which corresponds to a bitmapset entry for 1 - + * FirstLowInvalidHeapAttributeNumber. + */ + while ((child_keycol = bms_next_member(child_keycols, child_keycol)) >= 0) + { + int kc = child_keycol + FirstLowInvalidHeapAttributeNumber; + + Assert(kc > 0 && kc <= RelationGetNumberOfAttributes(rel)); + *bitmapset = + bms_add_member(*bitmapset, + map[kc - 1] - FirstLowInvalidHeapAttributeNumber); + } + + /* Release memory. 
*/ + pfree(map); +} + +/* * qsort_partition_list_value_cmp * * Compare two list partition bound datums diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cfa3f05..4ac5bd6 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -168,7 +168,7 @@ typedef struct CopyStateData PartitionDispatch *partition_dispatch_info; int num_dispatch; /* Number of entries in the above array */ int num_partitions; /* Number of members in the following arrays */ - ResultRelInfo *partitions; /* Per partition result relation */ + ResultRelInfo **partitions; /* Per partition result relation pointers */ TupleConversionMap **partition_tupconv_maps; TupleTableSlot *partition_tuple_slot; TransitionCaptureState *transition_capture; @@ -2446,13 +2446,15 @@ CopyFrom(CopyState cstate) if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; + ResultRelInfo **partitions; TupleConversionMap **partition_tupconv_maps; TupleTableSlot *partition_tuple_slot; int num_parted, num_partitions; ExecSetupPartitionTupleRouting(cstate->rel, + NULL, + 0, 1, estate, &partition_dispatch_info, @@ -2482,7 +2484,7 @@ CopyFrom(CopyState cstate) for (i = 0; i < cstate->num_partitions; ++i) { cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(cstate->partitions[i].ri_RelationDesc), + convert_tuples_by_name(RelationGetDescr(cstate->partitions[i]->ri_RelationDesc), RelationGetDescr(cstate->rel), gettext_noop("could not convert row type")); } @@ -2616,7 +2618,7 @@ CopyFrom(CopyState cstate) * to the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = cstate->partitions + leaf_part_index; + resultRelInfo = cstate->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -2726,7 +2728,7 @@ CopyFrom(CopyState cstate) /* Check the constraints of the tuple */ if (cstate->rel->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); if (useHeapMultiInsert) { @@ -2846,7 +2848,7 @@ CopyFrom(CopyState cstate) } for (i = 0; i < cstate->num_partitions; i++) { - ResultRelInfo *resultRelInfo = cstate->partitions + i; + ResultRelInfo *resultRelInfo = cstate->partitions[i]; ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index da0850b..6904c4e 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -67,15 +67,6 @@ int SessionReplicationRole = SESSION_REPLICATION_ROLE_ORIGIN; /* How many levels deep into trigger execution are we? */ static int MyTriggerDepth = 0; -/* - * Note that similar macros also exist in executor/execMain.c. There does not - * appear to be any good header to put them into, given the structures that - * they use, so we let them be duplicated. Be sure to update all if one needs - * to be changed, however. 
- */ -#define GetUpdatedColumns(relinfo, estate) \ - (rt_fetch((relinfo)->ri_RangeTableIndex, (estate)->es_range_table)->updatedCols) - /* Local function prototypes */ static void ConvertTriggerToFK(CreateTrigStmt *stmt, Oid funcoid); static void SetTriggerFlags(TriggerDesc *trigdesc, Trigger *trigger); @@ -2903,8 +2894,13 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, { HeapTuple trigtuple; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) + /* + * Note: if the UPDATE is converted into a DELETE+INSERT as part of an + * update-partition-key operation, then this function is also called + * separately for DELETE and INSERT to capture transition table rows. + * In such a case, either old tuple or new tuple can be NULL. + */ + if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) trigtuple = GetTupleForTrigger(estate, NULL, relinfo, @@ -5211,7 +5207,12 @@ AfterTriggerPendingOnRel(Oid relid) * triggers actually need to be queued. It is also called after each row, * even if there are no triggers for that event, if there are any AFTER * STATEMENT triggers for the statement which use transition tables, so that - * the transition tuplestores can be built. + * the transition tuplestores can be built. Furthermore, if the transition + * capture is happening for UPDATEd rows being moved to another partition due + * to a partition-key change, then this function is called once when the row is + * deleted (to capture the OLD row), and once when the row is inserted into another + * partition (to capture the NEW row). This is done separately because DELETE and + * INSERT happen on different tables. 
* * Transition tuplestores are built now, rather than when events are pulled * off of the queue because AFTER ROW triggers are allowed to select from the @@ -5260,12 +5261,27 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, bool update_new_table = transition_capture->tcs_update_new_table; bool insert_new_table = transition_capture->tcs_insert_new_table;; - if ((event == TRIGGER_EVENT_DELETE && delete_old_table) || - (event == TRIGGER_EVENT_UPDATE && update_old_table)) + /* + * For capturing transition tuples for UPDATE events fired during + * partition row movement, either oldtup or newtup can be NULL, + * depending on whether the event is for row being deleted from old + * partition or it's for row being inserted into the new partition. But + * in any case, oldtup should always be non-NULL for DELETE events, and + * newtup should be non-NULL for INSERT events, because for transition + * capture with partition row movement, INSERT and DELETE events don't + * fire; only UPDATE event is fired. 
+ */ + Assert(!(event == TRIGGER_EVENT_DELETE && delete_old_table && + oldtup == NULL)); + Assert(!(event == TRIGGER_EVENT_INSERT && insert_new_table && + newtup == NULL)); + + if (oldtup != NULL && + ((event == TRIGGER_EVENT_DELETE && delete_old_table) || + (event == TRIGGER_EVENT_UPDATE && update_old_table))) { Tuplestorestate *old_tuplestore; - Assert(oldtup != NULL); old_tuplestore = transition_capture->tcs_old_tuplestore; if (map != NULL) @@ -5278,12 +5294,12 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, else tuplestore_puttuple(old_tuplestore, oldtup); } - if ((event == TRIGGER_EVENT_INSERT && insert_new_table) || - (event == TRIGGER_EVENT_UPDATE && update_new_table)) + if (newtup != NULL && + ((event == TRIGGER_EVENT_INSERT && insert_new_table) || + (event == TRIGGER_EVENT_UPDATE && update_new_table))) { Tuplestorestate *new_tuplestore; - Assert(newtup != NULL); if (event == TRIGGER_EVENT_INSERT) new_tuplestore = transition_capture->tcs_insert_tuplestore; else @@ -5306,7 +5322,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, if (trigdesc == NULL || (event == TRIGGER_EVENT_DELETE && !trigdesc->trig_delete_after_row) || (event == TRIGGER_EVENT_INSERT && !trigdesc->trig_insert_after_row) || - (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row)) + (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row) || + (event == TRIGGER_EVENT_UPDATE && (oldtup == NULL || newtup == NULL))) return; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4b594d4..1508f72 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -65,6 +65,18 @@ #include "utils/snapmgr.h" #include "utils/tqual.h" +/* + * Entry of a temporary hash table. During UPDATE tuple routing, we want to + * know which of the leaf partitions are present in the UPDATE per-subplan + * resultRelInfo array (ModifyTableState->resultRelInfo[]). 
This hash table + * is searchable by the oids of the subplan result rels. + */ +typedef struct ResultRelOidsEntry +{ + Oid rel_oid; + ResultRelInfo *resultRelInfo; +} ResultRelOidsEntry; + /* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */ ExecutorStart_hook_type ExecutorStart_hook = NULL; @@ -104,19 +116,6 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, int maxfieldlen); static void EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree); -static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); - -/* - * Note that GetUpdatedColumns() also exists in commands/trigger.c. There does - * not appear to be any good header to put it into, given the structures that - * it uses, so we let them be duplicated. Be sure to update both if one needs - * to be changed, however. - */ -#define GetInsertedColumns(relinfo, estate) \ - (rt_fetch((relinfo)->ri_RangeTableIndex, (estate)->es_range_table)->insertedCols) -#define GetUpdatedColumns(relinfo, estate) \ - (rt_fetch((relinfo)->ri_RangeTableIndex, (estate)->es_range_table)->updatedCols) /* end of local decls */ @@ -1850,15 +1849,10 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. */ -static void +bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { - Relation rel = resultRelInfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(rel); - Bitmapset *modifiedCols; - Bitmapset *insertedCols; - Bitmapset *updatedCols; ExprContext *econtext; /* @@ -1886,52 +1880,66 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. 
*/ - if (!ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext)) - { - char *val_desc; - Relation orig_rel = rel; + return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); +} - /* See the comment above. */ - if (resultRelInfo->ri_PartitionRoot) +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + Relation orig_rel = rel; + TupleDesc tupdesc = RelationGetDescr(rel); + char *val_desc; + Bitmapset *modifiedCols; + Bitmapset *insertedCols; + Bitmapset *updatedCols; + + /* See the comments in ExecConstraints. */ + if (resultRelInfo->ri_PartitionRoot) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + TupleDesc old_tupdesc = RelationGetDescr(rel); + TupleConversionMap *map; + + rel = resultRelInfo->ri_PartitionRoot; + tupdesc = RelationGetDescr(rel); + /* a reverse map */ + map = convert_tuples_by_name(old_tupdesc, tupdesc, + gettext_noop("could not convert row type")); + if (map != NULL) { - HeapTuple tuple = ExecFetchSlotTuple(slot); - TupleDesc old_tupdesc = RelationGetDescr(rel); - TupleConversionMap *map; - - rel = resultRelInfo->ri_PartitionRoot; - tupdesc = RelationGetDescr(rel); - /* a reverse map */ - map = convert_tuples_by_name(old_tupdesc, tupdesc, - gettext_noop("could not convert row type")); - if (map != NULL) - { - tuple = do_convert_tuple(tuple, map); - ExecSetSlotDescriptor(slot, tupdesc); - ExecStoreTuple(tuple, slot, InvalidBuffer, false); - } + tuple = do_convert_tuple(tuple, map); + ExecSetSlotDescriptor(slot, tupdesc); + ExecStoreTuple(tuple, slot, InvalidBuffer, false); } - - insertedCols = GetInsertedColumns(resultRelInfo, estate); - updatedCols = GetUpdatedColumns(resultRelInfo, estate); - modifiedCols = bms_union(insertedCols, updatedCols); - val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), 
- slot, - tupdesc, - modifiedCols, - 64); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("new row for relation \"%s\" violates partition constraint", - RelationGetRelationName(orig_rel)), - val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); } + + insertedCols = GetInsertedColumns(resultRelInfo, estate); + updatedCols = GetUpdatedColumns(resultRelInfo, estate); + modifiedCols = bms_union(insertedCols, updatedCols); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupdesc, + modifiedCols, + 64); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("new row for relation \"%s\" violates partition constraint", + RelationGetRelationName(orig_rel)), + val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); } /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, as well as - * the partition constraint, if any. + * This checks the traditional NOT NULL and check constraints, and if requested, + * checks the partition constraint. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. 
@@ -1939,7 +1947,8 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, */ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate) + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint) { Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); @@ -2055,8 +2064,9 @@ ExecConstraints(ResultRelInfo *resultRelInfo, } } - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } @@ -3242,34 +3252,40 @@ EvalPlanQualEnd(EPQState *epqstate) * ExecSetupPartitionTupleRouting - set up information needed during * tuple routing for partitioned tables * + * 'update_rri' has the UPDATE per-subplan result rels. + * 'num_update_rri' : number of UPDATE per-subplan result rels. For INSERT, + * this is 0. 
+ * * Output arguments: * 'pd' receives an array of PartitionDispatch objects with one entry for * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo objects with one entry for + * 'partitions' receives an array of ResultRelInfo* objects with one entry for * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) + * 'perleaf_parentchild_maps' receives an array of TupleConversionMap objects + * with one entry for every leaf partition (required to convert input tuple + * based on the root table's rowtype to a leaf partition's rowtype after + * tuple routing is done) * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used * to manipulate any given leaf partition's rowtype after that partition * is chosen by tuple-routing. * 'num_parted' receives the number of partitioned tables in the partition * tree (= the number of entries in the 'pd' output array) * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays + * tree (= the number of entries in the 'partitions' and + * 'perleaf_parentchild_maps' output arrays) * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. 
*/ void ExecSetupPartitionTupleRouting(Relation rel, + ResultRelInfo *update_rri, + int num_update_rri, Index resultRTindex, EState *estate, PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, + ResultRelInfo ***partitions, + TupleConversionMap ***perleaf_parentchild_maps, TupleTableSlot **partition_tuple_slot, int *num_parted, int *num_partitions) { @@ -3277,7 +3293,10 @@ ExecSetupPartitionTupleRouting(Relation rel, List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_rri; + HTAB *result_rel_oids = NULL; + HASHCTL ctl; + ResultRelOidsEntry *hash_entry; + ResultRelInfo *leaf_part_arr; /* * Get the information about the partition tree after locking all the @@ -3286,10 +3305,50 @@ ExecSetupPartitionTupleRouting(Relation rel, (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo *) palloc(*num_partitions * - sizeof(ResultRelInfo)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); + *partitions = (ResultRelInfo **) palloc(*num_partitions * + sizeof(ResultRelInfo *)); + *perleaf_parentchild_maps = (TupleConversionMap **) palloc0(*num_partitions * + sizeof(TupleConversionMap *)); + + /* + * For Updates, if the leaf partition is already present in the per-subplan + * result rels, we re-use that rather than initialize a new result rel. So + * to find whether a given leaf partition already has a resultRel, we build + * the hash table for searching each of the leaf partitions by oid. 
+ */ + if (num_update_rri != 0) + { + ResultRelInfo *resultRelInfo; + + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(ResultRelOidsEntry); + ctl.hcxt = CurrentMemoryContext; + result_rel_oids = hash_create("result_rel_oids temporary hash", + 32, /* start small and extend */ + &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + resultRelInfo = update_rri; + for (i = 0; i < num_update_rri; i++, resultRelInfo++) + { + Oid reloid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + hash_entry = hash_search(result_rel_oids, &reloid, + HASH_ENTER, NULL); + hash_entry->resultRelInfo = resultRelInfo; + } + } + else + { + /* + * For inserts, we need to create all new result rels, so avoid + * repeated pallocs by allocating memory for all the result rels in + * bulk. + */ + leaf_part_arr = (ResultRelInfo *) palloc0(*num_partitions * + sizeof(ResultRelInfo)); + } /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -3299,36 +3358,76 @@ ExecSetupPartitionTupleRouting(Relation rel, */ *partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = *partitions; i = 0; foreach(cell, leaf_parts) { - Relation partrel; + ResultRelInfo *leaf_part_rri; + Relation partrel = NULL; TupleDesc part_tupdesc; + Oid leaf_oid = lfirst_oid(cell); + + if (num_update_rri != 0) + { + /* + * If this leaf partition is already present in the per-subplan + * resultRelInfos, re-use that resultRelInfo along with its + * already-opened relation; otherwise create a new result rel. + */ + hash_entry = hash_search(result_rel_oids, &leaf_oid, + HASH_FIND, NULL); + if (hash_entry != NULL) + { + leaf_part_rri = hash_entry->resultRelInfo; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required when converting tuple as per root + * partition tuple descriptor. When generating the update + * plans, this was not set. 
+ */ + leaf_part_rri->ri_PartitionRoot = rel; + } + else + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + } + else + { + /* For INSERTs, we already have an array of result rels allocated */ + leaf_part_rri = leaf_part_arr + i; + } /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. + * If we didn't open the partition rel, it means we haven't + * initialized the result rel as well. */ - partrel = heap_open(lfirst_oid(cell), NoLock); + if (!partrel) + { + /* + * We locked all the partitions above including the leaf + * partitions. Note that each of the newly opened relations in + * *partitions are eventually closed by the caller. + */ + partrel = heap_open(leaf_oid, NoLock); + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + } + part_tupdesc = RelationGetDescr(partrel); /* * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. */ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); + (*perleaf_parentchild_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); /* - * Verify result relation is a valid target for INSERT. + * Verify result relation is a valid target for insert operation. Even + * for updates, we are doing this for tuple-routing, so again, we need + * to check the validity for insert operation. 
*/ CheckValidResultRel(leaf_part_rri, CMD_INSERT); @@ -3344,9 +3443,12 @@ ExecSetupPartitionTupleRouting(Relation rel, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - leaf_part_rri++; + (*partitions)[i] = leaf_part_rri; i++; } + + if (result_rel_oids != NULL) + hash_destroy(result_rel_oids); } /* @@ -3372,8 +3474,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); result = get_partition_for_tuple(pd, slot, estate, &failed_at, &failed_slot); diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index fbb8108..47afe09 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -401,7 +401,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); /* Store the slot into tuple that we can inspect. */ tuple = ExecMaterializeSlot(slot); @@ -466,7 +466,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); /* Store the slot into tuple that we can write. 
*/ tuple = ExecMaterializeSlot(slot); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index bd84778..ecf51db 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -45,6 +45,7 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "optimizer/var.h" #include "parser/parsetree.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -53,7 +54,6 @@ #include "utils/rel.h" #include "utils/tqual.h" - static bool ExecOnConflictUpdate(ModifyTableState *mtstate, ResultRelInfo *resultRelInfo, ItemPointer conflictTid, @@ -240,6 +240,36 @@ ExecCheckTIDVisible(EState *estate, ReleaseBuffer(buffer); } +/* + * ConvertPartitionTupleSlot -- convenience function for converting a tuple and + * storing it into a tuple slot provided through 'new_slot', which typically + * should be one of the dedicated partition tuple slots. Passes the partition + * tuple slot back into output param p_old_slot. If no mapping is present, keeps + * p_old_slot unchanged. + * + * Returns the converted tuple, or the input tuple itself if no mapping is present. + */ +static HeapTuple +ConvertPartitionTupleSlot(ModifyTableState *mtstate, TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, TupleTableSlot **p_old_slot) +{ + if (!map) + return tuple; + + tuple = do_convert_tuple(tuple, map); + + /* + * Change the partition tuple slot descriptor, as per converted tuple. 
+ */ + *p_old_slot = new_slot; + Assert(new_slot != NULL); + ExecSetSlotDescriptor(new_slot, map->outdesc); + ExecStoreTuple(tuple, new_slot, InvalidBuffer, true); + + return tuple; +} + /* ---------------------------------------------------------------- * ExecInsert * @@ -281,17 +311,50 @@ ExecInsert(ModifyTableState *mtstate, if (mtstate->mt_partition_dispatch_info) { int leaf_part_index; - TupleConversionMap *map; + ResultRelInfo *rootResultRelInfo; + + /* + * If the original operation is UPDATE, the root partition rel needs + * to be fetched from mtstate->rootResultRelInfo. + */ + rootResultRelInfo = (mtstate->rootResultRelInfo ? + mtstate->rootResultRelInfo : resultRelInfo); + + /* + * If the resultRelInfo is not the root partition (which happens for + * UPDATE), we should convert the tuple into root partition's tuple + * descriptor, since ExecFindPartition() starts the search from root. + * The tuple conversion map list is in the order of + * mtstate->resultRelInfo[], so to retrieve the one for this + * resultRel, we need to know the position of the resultRel in + * mtstate->resultRelInfo[]. Note: We assume that if the resultRelInfo + * does not belong to subplans, then it already matches the root tuple + * descriptor; although there is no such known scenario where this + * could happen. + */ + if (rootResultRelInfo != resultRelInfo && + mtstate->mt_persubplan_childparent_maps != NULL && + resultRelInfo >= mtstate->resultRelInfo && + resultRelInfo <= mtstate->resultRelInfo + mtstate->mt_nplans - 1) + { + int map_index = resultRelInfo - mtstate->resultRelInfo; + + tuple = ConvertPartitionTupleSlot(mtstate, + mtstate->mt_persubplan_childparent_maps[map_index], + tuple, + mtstate->mt_rootpartition_tuple_slot, + &slot); + } /* * Away we go ... If we end up not finding a partition after all, * ExecFindPartition() does not return and errors out instead. 
* Otherwise, the returned value is to be used as an index into arrays - * mt_partitions[] and mt_partition_tupconv_maps[] that will get us + * mt_partitions[] and mt_perleaf_parentchild_maps[] that will get us * the ResultRelInfo and TupleConversionMap for the partition, * respectively. */ - leaf_part_index = ExecFindPartition(resultRelInfo, + leaf_part_index = ExecFindPartition(rootResultRelInfo, mtstate->mt_partition_dispatch_info, slot, estate); @@ -303,7 +366,7 @@ ExecInsert(ModifyTableState *mtstate, * the selected partition. */ saved_resultRelInfo = resultRelInfo; - resultRelInfo = mtstate->mt_partitions + leaf_part_index; + resultRelInfo = mtstate->mt_partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -331,7 +394,7 @@ ExecInsert(ModifyTableState *mtstate, */ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + mtstate->mt_perleaf_childparent_maps[leaf_part_index]; } else { @@ -348,23 +411,11 @@ ExecInsert(ModifyTableState *mtstate, * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map); - - /* - * We must use the partition's tuple descriptor from this point - * on, until we're finished dealing with the partition. Use the - * dedicated slot for that. 
- */ - slot = mtstate->mt_partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(mtstate, + mtstate->mt_perleaf_parentchild_maps[leaf_part_index], + tuple, + mtstate->mt_partition_tuple_slot, + &slot); } resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -482,7 +533,7 @@ ExecInsert(ModifyTableState *mtstate, /* Check the constraints of the tuple */ if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { @@ -622,6 +673,19 @@ ExecInsert(ModifyTableState *mtstate, ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, mtstate->mt_transition_capture); + /* + * In case this is part of update tuple routing, put this row into the + * transition NEW TABLE if we are capturing transition tables. We need to + * do this separately for DELETE and INSERT because they happen on + * different tables. 
+ */ + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture) + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + tuple, + NULL, + mtstate->mt_transition_capture); + list_free(recheckIndexes); /* @@ -674,6 +738,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *delete_skipped, + bool process_returning, bool canSetTag) { ResultRelInfo *resultRelInfo; @@ -682,6 +748,9 @@ ExecDelete(ModifyTableState *mtstate, HeapUpdateFailureData hufd; TupleTableSlot *slot = NULL; + if (delete_skipped) + *delete_skipped = true; + /* * get information on the (current) result relation */ @@ -845,12 +914,29 @@ ldelete:; if (canSetTag) (estate->es_processed)++; + /* The delete has actually happened, so inform that to the caller */ + if (delete_skipped) + *delete_skipped = false; + /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, mtstate->mt_transition_capture); - /* Process RETURNING if present */ - if (resultRelInfo->ri_projectReturning) + /* + * In case this is part of update tuple routing, put this row into the + * transition OLD TABLE if we are capturing transition tables. We need to + * do this separately for DELETE and INSERT because they happen on + * different tables. 
+ */ + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture) + ExecARUpdateTriggers(estate, resultRelInfo, tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* Process RETURNING if present and if requested */ + if (process_returning && resultRelInfo->ri_projectReturning) { /* * We have to put the target tuple into a slot, which means first we @@ -943,6 +1029,8 @@ ExecUpdate(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + TupleConversionMap *saved_tcs_map = NULL; + /* * abort the operation if not running transactions @@ -1039,12 +1127,82 @@ lreplace:; resultRelInfo, slot, estate); /* + * If a partition check fails, try to move the row into the right + * partition. + */ + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + { + bool delete_skipped; + TupleTableSlot *ret_slot; + + /* + * When an UPDATE is run with a leaf partition, we would not have + * partition tuple routing setup. In that case, fail with + * partition constraint violation error. + */ + if (mtstate->mt_partition_dispatch_info == NULL) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* Do the row movement. */ + + /* + * Skip RETURNING processing for DELETE. We want to return rows + * from INSERT. + */ + ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + &delete_skipped, false, false); + + /* + * For some reason if DELETE didn't happen (for e.g. trigger + * prevented it, or it was already deleted by self, or it was + * concurrently deleted by another transaction), then we should + * skip INSERT as well, otherwise, there will be effectively one + * new row inserted. 
+ * + * For a normal UPDATE, the case where the tuple has been the + * subject of a concurrent UPDATE or DELETE would be handled by + * the EvalPlanQual machinery, but for an UPDATE that we've + * translated into a DELETE from this partition and an INSERT into + * some other partition, that's not available, because CTID chains + * can't span relation boundaries. We mimic the semantics to a + * limited extent by skipping the INSERT if the DELETE fails to + * find a tuple. This ensures that two concurrent attempts to + * UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't + * resurrect it. + */ + if (delete_skipped) + return NULL; + + if (mtstate->mt_transition_capture) + saved_tcs_map = mtstate->mt_transition_capture->tcs_map; + + ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, + ONCONFLICT_NONE, estate, canSetTag); + + if (mtstate->mt_transition_capture) + { + /* + * Revert back to the transition capture map created for + * UPDATE; otherwise the next UPDATE will incorrectly use the + * one created for INSERT. + */ + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = saved_tcs_map; + } + return ret_slot; + } + + /* + * Check the constraints of the tuple. Note that we pass the same * slot for the orig_slot argument, because unlike ExecInsert(), no * tuple-routing is performed here, hence the slot remains unchanged. + * We have already checked partition constraints above, so skip them + * below. */ - if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate); + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate, false); /* * replace the heap tuple @@ -1463,6 +1621,45 @@ fireASTriggers(ModifyTableState *node) } /* + * Set up per subplan tuple conversion map from child partition to root + * partitioned table. 
The map is needed for collecting transition tuples for + * AFTER triggers, and for UPDATE row movement. + */ +static void +ExecSetupPerSubplanChildParentMap(ModifyTableState *mtstate) +{ + TupleConversionMap **tup_conv_maps; + TupleDesc outdesc; + ResultRelInfo *resultRelInfo; + ResultRelInfo *rootRelInfo; + int nplans = mtstate->mt_nplans; + int i; + + Assert(mtstate->operation != CMD_INSERT); + + if (mtstate->mt_persubplan_childparent_maps != NULL) + return; + + rootRelInfo = getASTriggerResultRelInfo(mtstate); + + mtstate->mt_persubplan_childparent_maps = + (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * nplans); + + /* Get tuple descriptor of the root partition. */ + outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + + resultRelInfo = mtstate->resultRelInfo; + tup_conv_maps = mtstate->mt_persubplan_childparent_maps; + for (i = 0; i < nplans; i++) + { + TupleDesc indesc = RelationGetDescr(resultRelInfo[i].ri_RelationDesc); + + tup_conv_maps[i] = convert_tuples_by_name(indesc, outdesc, + gettext_noop("could not convert row type")); + } +} + +/* * Set up the state needed for collecting transition tuples for AFTER * triggers. */ @@ -1470,63 +1667,115 @@ static void ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) { ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate); + ResultRelInfo **resultRelInfos; + int numResultRelInfos; + int update_rri_index = -1; + ResultRelInfo *update_rri = mtstate->resultRelInfo; + Oid cur_reloid = InvalidOid; int i; /* Check for transition tables on the directly targeted relation. 
*/ mtstate->mt_transition_capture = MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc); + if (mtstate->mt_transition_capture == NULL) + return; + /* - * If we found that we need to collect transition tuples then we may also + * Now that we know that we need to collect transition tuples, we may also * need tuple conversion maps for any children that have TupleDescs that * aren't compatible with the tuplestores. */ - if (mtstate->mt_transition_capture != NULL) + + /* Make sure per-subplan mapping is there. */ + if (mtstate->operation != CMD_INSERT) + ExecSetupPerSubplanChildParentMap(mtstate); + + /* + * Install the conversion map for the first plan for UPDATE and DELETE + * operations. It will be advanced each time we switch to the next plan. + * (INSERT operations set it every time.) + */ + if (mtstate->mt_persubplan_childparent_maps) + { + mtstate->mt_transition_capture->tcs_map = + mtstate->mt_persubplan_childparent_maps[0]; + } + + /* If no tuple routing, return without setting up per-leaf-partition map */ + if (mtstate->mt_partition_dispatch_info == NULL) + return; + + numResultRelInfos = mtstate->mt_num_partitions; + resultRelInfos = mtstate->mt_partitions; + + /* + * Build array of conversion maps from each child's TupleDesc to the one + * used in the tuplestore. The map pointers may be NULL when no + * conversion is necessary, which is hopefully a common case for + * partitions. + */ + mtstate->mt_perleaf_childparent_maps = (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + + /* For Inserts, just create all new map elements. 
*/ + if (mtstate->operation == CMD_INSERT) { - ResultRelInfo *resultRelInfos; - int numResultRelInfos; + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_perleaf_childparent_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), + RelationGetDescr(targetRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + } + return; + } - /* Find the set of partitions so that we can find their TupleDescs. */ - if (mtstate->mt_partition_dispatch_info != NULL) + /* + * But for Updates, we can share the per-subplan maps with the per-leaf + * maps. + */ + update_rri_index = 0; + update_rri = mtstate->resultRelInfo; + if (mtstate->mt_nplans > 0) + cur_reloid = RelationGetRelid(update_rri[0].ri_RelationDesc); + + for (i = 0; i < numResultRelInfos; ++i) + { + ResultRelInfo *resultRelInfo = mtstate->mt_partitions[i]; + + /* Is this leaf partition present in the update resultrel ? */ + if (cur_reloid == RelationGetRelid(resultRelInfo->ri_RelationDesc)) { + Assert(update_rri_index < mtstate->mt_nplans); + + mtstate->mt_perleaf_childparent_maps[i] = + mtstate->mt_persubplan_childparent_maps[update_rri_index]; + update_rri_index++; + /* - * For INSERT via partitioned table, so we need TupleDescs based - * on the partition routing table. + * If this was the last UPDATE resultrel, indicate that by + * invalidating the cur_reloid. */ - resultRelInfos = mtstate->mt_partitions; - numResultRelInfos = mtstate->mt_num_partitions; + if (update_rri_index == mtstate->mt_nplans) + cur_reloid = InvalidOid; + else + cur_reloid = RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc); } else { - /* Otherwise we need the ResultRelInfo for each subplan. */ - resultRelInfos = mtstate->resultRelInfo; - numResultRelInfos = mtstate->mt_nplans; - } - - /* - * Build array of conversion maps from each child's TupleDesc to the - * one used in the tuplestore. 
The map pointers may be NULL when no - * conversion is necessary, which is hopefully a common case for - * partitions. - */ - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), + mtstate->mt_perleaf_childparent_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), RelationGetDescr(targetRelInfo->ri_RelationDesc), gettext_noop("could not convert row type")); } - - /* - * Install the conversion map for the first plan for UPDATE and DELETE - * operations. It will be advanced each time we switch to the next - * plan. (INSERT operations set it every time.) - */ - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; } + + /* + * We should have found all the per-subplan reloids in the leaf + * partitions. + */ + Assert(update_rri_index == mtstate->mt_nplans); } /* ---------------------------------------------------------------- @@ -1632,9 +1881,9 @@ ExecModifyTable(PlanState *pstate) if (node->mt_transition_capture != NULL) { /* Prepare to convert transition tuples from this child. 
*/ - Assert(node->mt_transition_tupconv_maps != NULL); + Assert(node->mt_persubplan_childparent_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + node->mt_persubplan_childparent_maps[node->mt_whichplan]; } continue; } @@ -1750,7 +1999,8 @@ ExecModifyTable(PlanState *pstate) break; case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); @@ -1795,9 +2045,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->part_cols_updated; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1870,6 +2123,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, we may + * need to do update tuple routing. + */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row && + operation == CMD_UPDATE) + update_tuple_routing_needed = true; + /* Now init the plan for this result rel */ estate->es_result_relation_info = resultRelInfo; mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags); @@ -1907,33 +2169,63 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) else rel = mtstate->resultRelInfo->ri_RelationDesc; - /* Build state for INSERT tuple routing */ - if (operation == CMD_INSERT && - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* Decide whether we need to perform update tuple routing. 
*/ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + update_tuple_routing_needed = false; + + /* + * Build state for tuple routing if it's an INSERT or if it's an UPDATE of + * partition key. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + (operation == CMD_INSERT || update_tuple_routing_needed)) { PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; - TupleConversionMap **partition_tupconv_maps; + ResultRelInfo **partitions; + TupleConversionMap **perleaf_parentchild_maps; TupleTableSlot *partition_tuple_slot; int num_parted, num_partitions; ExecSetupPartitionTupleRouting(rel, + (operation == CMD_UPDATE ? + mtstate->resultRelInfo : NULL), + (operation == CMD_UPDATE ? nplans : 0), node->nominalRelation, estate, &partition_dispatch_info, &partitions, - &partition_tupconv_maps, + &perleaf_parentchild_maps, &partition_tuple_slot, &num_parted, &num_partitions); mtstate->mt_partition_dispatch_info = partition_dispatch_info; mtstate->mt_num_dispatch = num_parted; mtstate->mt_partitions = partitions; mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; + mtstate->mt_perleaf_parentchild_maps = perleaf_parentchild_maps; mtstate->mt_partition_tuple_slot = partition_tuple_slot; + mtstate->mt_rootpartition_tuple_slot = MakeTupleTableSlot(); + + /* + * Below are required as reference objects for mapping partition + * attno's in expressions such as WCO and RETURNING. + */ + firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; } + /* + * Construct mapping from each of the resultRelInfo attnos to the root + * attno. This is required when during update row movement the tuple + * descriptor of a source partition does not match the root partition + * descriptor. In such case we need to convert tuples to the root + * partition tuple descriptor, because the search for destination + * partition starts from the root. 
Skip this setup if it's not a partition + * key update. + */ + if (update_tuple_routing_needed) + ExecSetupPerSubplanChildParentMap(mtstate); + /* Build state for collecting transition tuples */ ExecSetupTransitionCaptureState(mtstate, estate); @@ -1967,50 +2259,54 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build WITH CHECK OPTION constraints for each leaf partition rel. Note * that we didn't build the withCheckOptionList for each partition within * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE row + * movement. DELETEs and local UPDATEs are handled above. */ if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0) { - List *wcoList; - PlanState *plan; + List *firstWco; /* * In case of INSERT on partitioned tables, there is only one plan. * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. + * partition. Whereas for UPDATE, there are as many WCOs as there are + * plans. So in either case, use the WCO expression of the first + * resultRelInfo as a reference to calculate attno's for the WCO + * expression of each of the partitions. We make a copy of the WCO + * qual for each partition. Note that, if there are SubPlans in there, + * they all end up attached to the one parent Plan node. 
*/ - Assert(operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; - resultRelInfo = mtstate->mt_partitions; + Assert(update_tuple_routing_needed || + (operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + mtstate->mt_nplans == 1)); + + firstWco = linitial(node->withCheckOptionLists); for (i = 0; i < mtstate->mt_num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; - List *mapped_wcoList; + Relation partrel; + List *mappedWco; List *wcoExprs = NIL; ListCell *ll; - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); - foreach(ll, mapped_wcoList) + resultRelInfo = mtstate->mt_partitions[i]; + + partrel = resultRelInfo->ri_RelationDesc; + mappedWco = map_partition_varattnos(firstWco, + firstVarno, + partrel, firstResultRel, + NULL); + foreach(ll, mappedWco) { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } - resultRelInfo->ri_WithCheckOptions = mapped_wcoList; + resultRelInfo->ri_WithCheckOptions = mappedWco; resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - resultRelInfo++; } } @@ -2021,7 +2317,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; + List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2058,20 +2354,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build a projection for each leaf partition rel. Note that we * didn't build the returningList for each partition within the * planner, but simple translation of the varattnos for each partition - * will suffice. 
This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE + * row movement. DELETEs and local UPDATEs are handled above. */ - resultRelInfo = mtstate->mt_partitions; - returningList = linitial(node->returningLists); + firstReturningList = linitial(node->returningLists); for (i = 0; i < mtstate->mt_num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; + Relation partrel; List *rlist; - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); + resultRelInfo = mtstate->mt_partitions[i]; + partrel = resultRelInfo->ri_RelationDesc; + + /* + * Use the returning expression of the first resultRelInfo as a + * reference to calculate attno's for the returning expression of + * each of the partitions. + */ + rlist = map_partition_varattnos(firstReturningList, + firstVarno, + partrel, firstResultRel, NULL); resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); @@ -2317,6 +2619,7 @@ void ExecEndModifyTable(ModifyTableState *node) { int i; + CmdType operation = node->operation; /* Free transition tables */ if (node->mt_transition_capture != NULL) @@ -2353,13 +2656,25 @@ ExecEndModifyTable(ModifyTableState *node) } for (i = 0; i < node->mt_num_partitions; i++) { - ResultRelInfo *resultRelInfo = node->mt_partitions + i; + ResultRelInfo *resultRelInfo = node->mt_partitions[i]; + + /* + * If this result rel is one of the subplan result rels, let + * ExecEndPlan() close it. For INSERTs, this does not apply because + * all leaf partition result rels are anyway newly allocated. 
+ */ + if (operation == CMD_UPDATE && + resultRelInfo >= node->resultRelInfo && + resultRelInfo < node->resultRelInfo + node->mt_nplans) + continue; ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); } - /* Release the standalone partition tuple descriptor, if any */ + /* Release the standalone partition tuple descriptors, if any */ + if (node->mt_rootpartition_tuple_slot) + ExecDropSingleTupleTableSlot(node->mt_rootpartition_tuple_slot); if (node->mt_partition_tuple_slot) ExecDropSingleTupleTableSlot(node->mt_partition_tuple_slot); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 9bae264..3cdbd97 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -204,6 +204,7 @@ _copyModifyTable(const ModifyTable *from) COPY_SCALAR_FIELD(canSetTag); COPY_SCALAR_FIELD(nominalRelation); COPY_NODE_FIELD(partitioned_rels); + COPY_SCALAR_FIELD(part_cols_updated); COPY_NODE_FIELD(resultRelations); COPY_SCALAR_FIELD(resultRelIndex); COPY_SCALAR_FIELD(rootResultRelIndex); @@ -2260,6 +2261,7 @@ _copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) COPY_SCALAR_FIELD(parent_relid); COPY_NODE_FIELD(child_rels); + COPY_BITMAPSET_FIELD(all_part_cols); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 11731da..a410e46 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -909,6 +909,7 @@ _equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const Partitione { COMPARE_SCALAR_FIELD(parent_relid); COMPARE_NODE_FIELD(child_rels); + COMPARE_BITMAPSET_FIELD(all_part_cols); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 9ee3e23..f642bf2 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -367,6 +367,7 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); 
WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(part_cols_updated); WRITE_NODE_FIELD(resultRelations); WRITE_INT_FIELD(resultRelIndex); WRITE_INT_FIELD(rootResultRelIndex); @@ -2096,6 +2097,7 @@ _outModifyTablePath(StringInfo str, const ModifyTablePath *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(part_cols_updated); WRITE_NODE_FIELD(resultRelations); WRITE_NODE_FIELD(subpaths); WRITE_NODE_FIELD(subroots); @@ -2518,6 +2520,7 @@ _outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) WRITE_UINT_FIELD(parent_relid); WRITE_NODE_FIELD(child_rels); + WRITE_BITMAPSET_FIELD(all_part_cols); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 67b9e19..89dd3cf 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1562,6 +1562,7 @@ _readModifyTable(void) READ_BOOL_FIELD(canSetTag); READ_UINT_FIELD(nominalRelation); READ_NODE_FIELD(partitioned_rels); + READ_BOOL_FIELD(part_cols_updated); READ_NODE_FIELD(resultRelations); READ_INT_FIELD(resultRelIndex); READ_INT_FIELD(rootResultRelIndex); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 2d7e1d8..8c08d50 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1291,7 +1291,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, rte = planner_rt_fetch(rel->relid, root); if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - partitioned_rels = get_partitioned_child_rels(root, rel->relid); + partitioned_rels = get_partitioned_child_rels(root, rel->relid, NULL); /* The root partitioned table is included as a child rel */ Assert(list_length(partitioned_rels) >= 1); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 2821662..85e3126 100644 --- a/src/backend/optimizer/plan/createplan.c +++ 
b/src/backend/optimizer/plan/createplan.c @@ -277,6 +277,7 @@ static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool part_cols_updated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam); @@ -2361,6 +2362,7 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) best_path->canSetTag, best_path->nominalRelation, best_path->partitioned_rels, + best_path->part_cols_updated, best_path->resultRelations, subplans, best_path->withCheckOptionLists, @@ -6405,6 +6407,7 @@ static ModifyTable * make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool part_cols_updated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam) @@ -6431,6 +6434,7 @@ make_modifytable(PlannerInfo *root, node->canSetTag = canSetTag; node->nominalRelation = nominalRelation; node->partitioned_rels = partitioned_rels; + node->part_cols_updated = part_cols_updated; node->resultRelations = resultRelations; node->resultRelIndex = -1; /* will be set correctly in setrefs.c */ node->rootResultRelIndex = -1; /* will be set correctly in setrefs.c */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 6b79b3a..68e0302 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1056,6 +1056,7 @@ inheritance_planner(PlannerInfo *root) Index rti; RangeTblEntry *parent_rte; List *partitioned_rels = NIL; + bool part_cols_updated = false; Assert(parse->commandType != CMD_INSERT); @@ -1370,9 +1371,15 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { - partitioned_rels = 
get_partitioned_child_rels(root, parentRTindex); + Bitmapset *all_part_cols = NULL; + + partitioned_rels = get_partitioned_child_rels(root, parentRTindex, + &all_part_cols); /* The root partitioned table is included as a child rel */ Assert(list_length(partitioned_rels) >= 1); + + if (bms_overlap(all_part_cols, parent_rte->updatedCols)) + part_cols_updated = true; } /* Result path must go into outer query's FINAL upperrel */ @@ -1429,6 +1436,7 @@ inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, + part_cols_updated, resultRelations, subpaths, subroots, @@ -2046,6 +2054,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, parse->canSetTag, parse->resultRelation, NIL, + false, list_make1_int(parse->resultRelation), list_make1(path), list_make1(root), @@ -6076,10 +6085,15 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) * Returns a list of the RT indexes of the partitioned child relations * with rti as the root parent RT index. * + * If all_part_cols_p is non-NULL, *all_part_cols_p is set to a bitmapset + * of all partitioning columns used by the partitioned table or any + * descendent. + * * Note: Only call this function on RTEs known to be partitioned tables. 
*/ List * -get_partitioned_child_rels(PlannerInfo *root, Index rti) +get_partitioned_child_rels(PlannerInfo *root, Index rti, + Bitmapset **all_part_cols_p) { List *result = NIL; ListCell *l; @@ -6091,6 +6105,8 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) if (pc->parent_relid == rti) { result = pc->child_rels; + if (all_part_cols_p) + *all_part_cols_p = pc->all_part_cols; break; } } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ccf2145..fc7c597 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -107,12 +107,14 @@ static void expand_partitioned_rtentry(PlannerInfo *root, PlanRowMark *parentrc, PartitionDesc partdesc, LOCKMODE lockmode, bool *has_child, List **appinfos, + Bitmapset **all_part_cols, List **partitioned_child_rels); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *parentrc, Relation childrel, bool *has_child, List **appinfos, + Bitmapset **all_part_cols, List **partitioned_child_rels); static void make_inh_translation_list(Relation oldrelation, Relation newrelation, @@ -1397,6 +1399,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) bool has_child; PartitionedChildRelInfo *pcinfo; List *partitioned_child_rels = NIL; + Bitmapset *all_part_cols = NULL; /* Does RT entry allow inheritance? 
*/ if (!rte->inh) @@ -1479,11 +1482,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, oldrelation, &has_child, &appinfos, + &all_part_cols, &partitioned_child_rels); expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, RelationGetPartitionDesc(oldrelation), lockmode, &has_child, &appinfos, + &all_part_cols, &partitioned_child_rels); } else @@ -1519,6 +1524,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, newrelation, &has_child, &appinfos, + &all_part_cols, &partitioned_child_rels); /* Close child relations, but keep locks */ @@ -1558,6 +1564,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); pcinfo->parent_relid = rti; pcinfo->child_rels = partitioned_child_rels; + pcinfo->all_part_cols = all_part_cols; root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } @@ -1571,6 +1578,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, PlanRowMark *parentrc, PartitionDesc partdesc, LOCKMODE lockmode, bool *has_child, List **appinfos, + Bitmapset **all_part_cols, List **partitioned_child_rels) { int i; @@ -1595,6 +1603,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, parentrc, childrel, has_child, appinfos, + all_part_cols, partitioned_child_rels); /* If this child is itself partitioned, recurse */ @@ -1604,6 +1613,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, RelationGetPartitionDesc(childrel), lockmode, has_child, appinfos, + all_part_cols, partitioned_child_rels); /* Close child relation, but keep locks */ @@ -1625,6 +1635,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, 
Relation parentrel, PlanRowMark *parentrc, Relation childrel, bool *has_child, List **appinfos, + Bitmapset **all_part_cols, List **partitioned_child_rels) { Query *parse = root->parse; @@ -1695,8 +1706,11 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, } } else + { *partitioned_child_rels = lappend_int(*partitioned_child_rels, childRTindex); + pull_child_partition_columns(all_part_cols, childrel, parentrel); + } /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 26567cb..326c858 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3162,6 +3162,8 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, * 'partitioned_rels' is an integer list of RT indexes of non-leaf tables in * the partition tree, if this is an UPDATE/DELETE to a partitioned table. * Otherwise NIL. + * 'part_cols_updated' is true if any partitioning columns are being updated, either + * from the named relation or a descendent partitioned table. 
* 'resultRelations' is an integer list of actual RT indexes of target rel(s) * 'subpaths' is a list of Path(s) producing source data (one per rel) * 'subroots' is a list of PlannerInfo structs (one per rel) @@ -3175,6 +3177,7 @@ ModifyTablePath * create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool part_cols_updated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, @@ -3242,6 +3245,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, pathnode->canSetTag = canSetTag; pathnode->nominalRelation = nominalRelation; pathnode->partitioned_rels = list_copy(partitioned_rels); + pathnode->part_cols_updated = part_cols_updated; pathnode->resultRelations = resultRelations; pathnode->subpaths = subpaths; pathnode->subroots = subroots; diff --git a/src/backend/rewrite/rewriteManip.c b/src/backend/rewrite/rewriteManip.c index 5c17213..58e98c0 100644 --- a/src/backend/rewrite/rewriteManip.c +++ b/src/backend/rewrite/rewriteManip.c @@ -1224,6 +1224,7 @@ typedef struct /* Target type when converting whole-row vars */ Oid to_rowtype; bool *found_whole_row; /* output flag */ + bool coerced_var; /* var is under ConvertRowtypeExpr */ } map_variable_attnos_context; static Node * @@ -1267,22 +1268,29 @@ map_variable_attnos_mutator(Node *node, /* Don't convert unless necessary. */ if (context->to_rowtype != var->vartype) { - ConvertRowtypeExpr *r; - /* Var itself is converted to the requested type. */ newvar->vartype = context->to_rowtype; /* - * And a conversion node on top to convert back to the - * original type. + * If this var is already under a ConvertRowtypeExpr, + * we don't have to add another one. 
*/ - r = makeNode(ConvertRowtypeExpr); - r->arg = (Expr *) newvar; - r->resulttype = var->vartype; - r->convertformat = COERCE_IMPLICIT_CAST; - r->location = -1; - - return (Node *) r; + if (!context->coerced_var) + { + ConvertRowtypeExpr *r; + + /* + * And a conversion node on top to convert back to + * the original type. + */ + r = makeNode(ConvertRowtypeExpr); + r->arg = (Expr *) newvar; + r->resulttype = var->vartype; + r->convertformat = COERCE_IMPLICIT_CAST; + r->location = -1; + + return (Node *) r; + } } } } @@ -1290,6 +1298,28 @@ map_variable_attnos_mutator(Node *node, } /* otherwise fall through to copy the var normally */ } + else if (IsA(node, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *r = (ConvertRowtypeExpr *) node; + + /* + * If this is coercing a var (which is typical), convert only the var, + * rather than adding another ConvertRowtypeExpr over it. + */ + if (IsA(r->arg, Var)) + { + ConvertRowtypeExpr *newnode; + + newnode = (ConvertRowtypeExpr *) palloc(sizeof(ConvertRowtypeExpr)); + *newnode = *r; + context->coerced_var = true; + newnode->arg = (Expr *) map_variable_attnos_mutator((Node *) r->arg, context); + context->coerced_var = false; + + return (Node *) newnode; + } + /* Else fall through to the expression tree mutator */ + } else if (IsA(node, Query)) { /* Recurse into RTE subquery or not-yet-planned sublink subquery */ @@ -1321,6 +1351,7 @@ map_variable_attnos(Node *node, context.map_length = map_length; context.to_rowtype = to_rowtype; context.found_whole_row = found_whole_row; + context.coerced_var = false; *found_whole_row = false; diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 2283c67..2e29276 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -68,6 +68,12 @@ typedef struct PartitionDispatchData int *indexes; } PartitionDispatchData; +typedef struct PartitionWalker +{ + List *rels_list; + ListCell *cur_cell; +} PartitionWalker; + typedef struct 
PartitionDispatchData *PartitionDispatch; extern void RelationBuildPartitionDesc(Relation relation); @@ -80,12 +86,16 @@ extern void check_new_partition_bound(char *relname, Relation parent, extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +extern List *map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); +extern void partition_walker_init(PartitionWalker *walker, Relation rel); +extern Relation partition_walker_next(PartitionWalker *walker, + Relation *parent); + /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids); @@ -99,4 +109,8 @@ extern int get_partition_for_tuple(PartitionDispatch *pd, EState *estate, PartitionDispatchData **failed_at, TupleTableSlot **failed_slot); +extern void pull_child_partition_columns(Bitmapset **bitmapset, + Relation rel, + Relation parent); + #endif /* PARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7708818..8e2bf5f 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -187,7 +187,10 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint); +extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, 
ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo); @@ -207,10 +210,12 @@ extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); extern void ExecSetupPartitionTupleRouting(Relation rel, + ResultRelInfo *update_rri, + int num_update_rri, Index resultRTindex, EState *estate, PartitionDispatch **pd, - ResultRelInfo **partitions, + ResultRelInfo ***partitions, TupleConversionMap ***tup_conv_maps, TupleTableSlot **partition_tuple_slot, int *num_parted, int *num_partitions); @@ -218,6 +223,8 @@ extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 90a60ab..3034b01 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -511,6 +511,11 @@ typedef struct EState struct dsa_area *es_query_dsa; } EState; +/* For a given result relation, get its columns being inserted/updated. 
*/ +#define GetInsertedColumns(relinfo, estate) \ + (rt_fetch((relinfo)->ri_RangeTableIndex, (estate)->es_range_table)->insertedCols) +#define GetUpdatedColumns(relinfo, estate) \ + (rt_fetch((relinfo)->ri_RangeTableIndex, (estate)->es_range_table)->updatedCols) /* * ExecRowMark - @@ -978,14 +983,31 @@ typedef struct ModifyTableState int mt_num_dispatch; /* Number of entries in the above array */ int mt_num_partitions; /* Number of members in the following * arrays */ - ResultRelInfo *mt_partitions; /* Per partition result relation */ - TupleConversionMap **mt_partition_tupconv_maps; - /* Per partition tuple conversion map */ + ResultRelInfo **mt_partitions; /* Per partition result relation pointers */ + + /* + * Per partition conversion map to convert tuples from root to leaf + * partition + */ + TupleConversionMap **mt_perleaf_parentchild_maps; + + /* + * Per partition conversion map to convert tuples from leaf partition to + * root + */ + TupleConversionMap **mt_perleaf_childparent_maps; + + /* + * Per subplan conversion map to convert tuples from leaf partition to + * root partitioned table + */ + TupleConversionMap **mt_persubplan_childparent_maps; + TupleTableSlot *mt_partition_tuple_slot; + TupleTableSlot *mt_rootpartition_tuple_slot; + struct TransitionCaptureState *mt_transition_capture; /* controls transition table population */ - TupleConversionMap **mt_transition_tupconv_maps; - /* Per plan/partition tuple conversion */ } ModifyTableState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index a382331..6981f58 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -219,6 +219,7 @@ typedef struct ModifyTable Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool part_cols_updated; /* some part col in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ int 
resultRelIndex; /* index of first resultRel in plan's list */ int rootResultRelIndex; /* index of the partitioned table root */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index a39e59d..e3ff127 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1579,6 +1579,7 @@ typedef struct ModifyTablePath Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool part_cols_updated; /* some part col in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ List *subpaths; /* Path(s) producing source data */ List *subroots; /* per-target-table PlannerInfos */ @@ -2021,6 +2022,10 @@ typedef struct AppendRelInfo * The child_rels list must contain at least one element, because the parent * partitioned table is itself counted as a child. * + * all_part_cols contains all attribute numbers from the parent that are + * used as partitioning columns by the parent or some descendent which is + * itself partitioned. + * * These structs are kept in the PlannerInfo node's pcinfo_list. 
*/ typedef struct PartitionedChildRelInfo @@ -2029,6 +2034,7 @@ typedef struct PartitionedChildRelInfo Index parent_relid; List *child_rels; + Bitmapset *all_part_cols; } PartitionedChildRelInfo; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e372f88..b38f2f1 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -238,6 +238,7 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool part_cols_updated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 2a4cf71..c6c15c5 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -57,6 +57,7 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, + Bitmapset **all_part_cols_p); #endif /* PLANNER_H */ diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 9366f04..6c0036b 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -198,25 +198,425 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( +--------------------------- +-- UPDATE with row movement +--------------------------- +-- update to a partition should check partition bound constraint for the new tuple. +-- If partition key is updated, the row should be moved to the appropriate +-- partition. 
updatable views using partitions should enforce the check options +-- for the rows that have been moved. +create table mintab(c1 int); +insert into mintab values (120); +CREATE TABLE range_parted ( a text, - b int + b int, + c numeric ) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION; +-- Create partitions intentionally in descending bound order, so as to test +-- that the sub plans are getting ordered in ascending bound order rather than ordered by the oid values. +create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20) partition by range (c); create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -ERROR: new row for relation "part_a_1_a_10" violates partition constraint -DETAIL: Failing row contains (b, 1). -update range_parted set b = b - 1 where b = 10; -ERROR: new row for relation "part_b_10_b_20" violates partition constraint -DETAIL: Failing row contains (b, 9). --- ok -update range_parted set b = b + 1 where b = 10; +create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); +create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); +-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions +update part_b_10_b_20 set b = b - 6; +-- As mentioned above, the partition creation is intentionally kept in descending bound order. 
+create table part_c_100_200 (c numeric, a text, b int); +alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200); +create table part_c_1_100 (b int, c numeric, a text); +alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100); +\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, NULL), (''a'', 10, 200), (''b'', 12, 96), (''b'', 13, 97), (''b'', 15, 105), (''b'', 17, 105)' +:init_range_parted; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 15 | 105 + part_c_100_200 | b | 17 | 105 + part_c_1_100 | b | 12 | 96 + part_c_1_100 | b | 13 | 97 +(6 rows) + +-- The order of subplans should be in bound order +explain (costs off) update range_parted set c = c - 50 where c > 97; + QUERY PLAN +------------------------------------- + Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_c_100_200 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_100_200 + Filter: (c > '97'::numeric) +(16 rows) + +-- fail (row movement happens only within the partition subtree) : +update part_c_1_100 set c = c + 20 where c = 96; +ERROR: new row for relation "part_c_1_100" violates partition constraint +DETAIL: Failing row contains (12, 116, b). 
+-- No row found : +update part_c_1_100 set c = c + 20 where c = 98; +-- ok (row movement) +update part_b_10_b_20 set c = c + 20 returning c, b, a; + c | b | a +-----+----+--- + 116 | 12 | b + 117 | 13 | b + 125 | 15 | b + 125 | 17 | b +(4 rows) + +select a, b, c from part_c_1_100 order by 1, 2, 3; + a | b | c +---+---+--- +(0 rows) + +select a, b, c from part_c_100_200 order by 1, 2, 3; + a | b | c +---+----+----- + b | 12 | 116 + b | 13 | 117 + b | 15 | 125 + b | 17 | 125 +(4 rows) + +-- fail (row movement happens only within the partition subtree) : +update part_b_10_b_20 set b = b - 6 where c > 116 returning *; +ERROR: new row for relation "part_c_100_200" violates partition constraint +DETAIL: Failing row contains (117, b, 7). +-- ok (row movement, with subset of rows moved into different partition) +update range_parted set b = b - 6 where c > 116 returning a, b + c; + a | ?column? +---+---------- + a | 204 + b | 124 + b | 134 + b | 136 +(4 rows) + +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_1_a_10 | a | 1 | + part_a_1_a_10 | a | 4 | 200 + part_b_1_b_10 | b | 7 | 117 + part_b_1_b_10 | b | 9 | 125 + part_c_100_200 | b | 11 | 125 + part_c_100_200 | b | 12 | 116 +(6 rows) + +-- update partition key using updatable view. +-- succeeds +update upview set c = 199 where b = 4; +-- fail, check option violation +update upview set c = 120 where b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (a, 4, 120). +-- fail, row movement with check option violation +update upview set a = 'b', b = 15, c = 120 where b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (b, 15, 120). 
+-- succeeds, row movement , check option passes +update upview set a = 'b', b = 15 where b = 4; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_1_a_10 | a | 1 | + part_b_1_b_10 | b | 7 | 117 + part_b_1_b_10 | b | 9 | 125 + part_c_100_200 | b | 11 | 125 + part_c_100_200 | b | 12 | 116 + part_c_100_200 | b | 15 | 199 +(6 rows) + -- cleanup -drop table range_parted; +drop view upview; +-- RETURNING having whole-row vars. +---------------------------------- +:init_range_parted; +update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *; + range_parted | a | b | c +--------------+---+----+---- + (b,15,95) | b | 15 | 95 + (b,17,95) | b | 17 | 95 +(2 rows) + +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_1_100 | b | 12 | 96 + part_c_1_100 | b | 13 | 97 + part_c_1_100 | b | 15 | 95 + part_c_1_100 | b | 17 | 95 +(6 rows) + +-- Transition tables with update row movement +--------------------------------------------- +:init_range_parted; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 15 | 105 + part_c_100_200 | b | 17 | 105 + part_c_1_100 | b | 12 | 96 + part_c_1_100 | b | 13 | 97 +(6 rows) + +create function trans_updatetrigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' order by a) from old_table), + (select string_agg(new_table::text, ', ' order by a) from new_table); + return null; + end; +$$; +create trigger trans_updatetrig + after update on range_parted referencing old table as 
old_table new table as new_table + for each statement execute procedure trans_updatetrigfunc(); +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96), (b,13,97), (b,15,105), (b,17,105), new table = (b,12,110), (b,13,98), (b,15,106), (b,17,106) +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 12 | 110 + part_c_100_200 | b | 15 | 106 + part_c_100_200 | b | 17 | 106 + part_c_1_100 | b | 13 | 98 +(6 rows) + +:init_range_parted; +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96), (b,13,97), (b,15,105), (b,17,105), new table = (b,12,146), (b,13,147), (b,15,155), (b,17,155) +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 12 | 146 + part_c_100_200 | b | 13 | 147 + part_c_100_200 | b | 15 | 155 + part_c_100_200 | b | 17 | 155 +(6 rows) + +drop trigger trans_updatetrig ON range_parted; +-- Install BR triggers on child partition, so that transition tuple conversion takes place. 
+create function func_parted_mod_b() returns trigger as $$ +begin + NEW.b = NEW.b + 1; + return NEW; +end $$ language plpgsql; +create trigger trig_c1_100 before update or insert on part_c_1_100 + for each row execute procedure func_parted_mod_b(); +create trigger trig_c100_200 before update or insert on part_c_100_200 + for each row execute procedure func_parted_mod_b(); +:init_range_parted; +update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 15 | 110 + part_c_100_200 | b | 17 | 106 + part_c_100_200 | b | 19 | 106 + part_c_1_100 | b | 15 | 98 +(6 rows) + +:init_range_parted; +update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 15 | 146 + part_c_100_200 | b | 16 | 147 + part_c_100_200 | b | 17 | 155 + part_c_100_200 | b | 19 | 155 +(6 rows) + +drop trigger trig_c1_100 ON part_c_1_100; +drop trigger trig_c100_200 ON part_c_100_200; +drop function func_parted_mod_b(); +-- statement triggers with update row movement +--------------------------------------------------- +:init_range_parted; +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 200 + part_a_1_a_10 | a | 1 | + part_c_100_200 | b | 15 | 105 + part_c_100_200 | b | 17 | 105 + part_c_1_100 | b | 12 | 96 + part_c_1_100 | b | 13 | 97 +(6 rows) + +create function trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, 
TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +create trigger parent_delete_trig + after delete on range_parted for each statement execute procedure trigfunc(); +create trigger parent_update_trig + after update on range_parted for each statement execute procedure trigfunc(); +create trigger parent_insert_trig + after insert on range_parted for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_c_1_100 +create trigger c1_delete_trig + after delete on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_update_trig + after update on part_c_1_100 for each statement execute procedure trigfunc(); +create trigger c1_insert_trig + after insert on part_c_1_100 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_c_100_200 +create trigger c100_delete_trig + after delete on part_c_100_200 for each statement execute procedure trigfunc(); +create trigger c100_update_trig + after update on part_c_100_200 for each statement execute procedure trigfunc(); +create trigger c100_insert_trig + after insert on part_c_100_200 for each statement execute procedure trigfunc(); +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired. +update range_parted set c = c - 50 where c > 97; +NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE +select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4; + partname | a | b | c +----------------+---+----+----- + part_a_10_a_20 | a | 10 | 150 + part_a_1_a_10 | a | 1 | + part_c_1_100 | b | 12 | 96 + part_c_1_100 | b | 13 | 97 + part_c_1_100 | b | 15 | 55 + part_c_1_100 | b | 17 | 55 +(6 rows) + +drop table mintab, range_parted CASCADE; +-------------- +-- UPDATE with +-- partition key or non-partition columns, with different column ordering, +-- triggers. 
+-------------- +-- Setup +-------- +create table list_parted (a numeric, b int, c int8) partition by list (a); +create table sub_parted partition of list_parted for values in (1) partition by list (b); +create table sub_part1(b int, c int8, a numeric); +alter table sub_parted attach partition sub_part1 for values in (1); +create table sub_part2(b int, c int8, a numeric); +alter table sub_parted attach partition sub_part2 for values in (2); +create table list_part1(a numeric, b int, c int8); +alter table list_parted attach partition list_part1 for values in (2,3); +insert into list_parted values (2,5,50); +insert into list_parted values (3,6,60); +insert into sub_parted values (1,1,60); +insert into sub_parted values (1,2,10); +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +update sub_parted set a = 2 where c = 10; +ERROR: new row for relation "sub_part2" violates partition constraint +DETAIL: Failing row contains (2, 10, 2). +-- UPDATE which does not modify partition key of partitions that are chosen for update. +select tableoid::regclass::text , * from list_parted where a = 2 order by 1; + tableoid | a | b | c +------------+---+---+---- + list_part1 | 2 | 5 | 50 +(1 row) + +update list_parted set b = c + a where a = 2; +select tableoid::regclass::text , * from list_parted where a = 2 order by 1; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 +(1 row) + +----------- +-- Triggers can cause UPDATE row movement if it modified partition key. +----------- +create function func_parted_mod_b() returns trigger as $$ +begin + NEW.b = 2; -- This is changing partition key column. 
+   return NEW;
+end $$ language plpgsql;
+create trigger parted_mod_b before update on sub_part1
+   for each row execute procedure func_parted_mod_b();
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+  tableoid  | a | b  | c  
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 |  6 | 60
+ sub_part1  | 1 |  1 | 60
+ sub_part2  | 1 |  2 | 10
+(4 rows)
+
+-- This should do the tuple routing even though there is no explicit
+-- partition-key update, because there is a trigger on sub_part1
+update list_parted set c = 70 where b = 1 ;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+  tableoid  | a | b  | c  
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 |  6 | 60
+ sub_part2  | 1 |  2 | 10
+ sub_part2  | 1 |  2 | 70
+(4 rows)
+
+drop trigger parted_mod_b ON sub_part1 ;
+-- If a BR DELETE trigger on the source partition prevents the DELETE, we should
+-- also skip the INSERT, since that delete is part of UPDATE=>DELETE+INSERT.
+create or replace function func_parted_mod_b() returns trigger as $$
+begin return NULL; end $$ language plpgsql;
+create trigger trig_skip_delete before delete on sub_part2
+   for each row execute procedure func_parted_mod_b();
+update list_parted set b = 1 where c = 70;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+  tableoid  | a | b  | c  
+------------+---+----+----
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 |  6 | 60
+ sub_part2  | 1 |  2 | 10
+ sub_part2  | 1 |  2 | 70
+(4 rows)
+
+drop trigger trig_skip_delete ON sub_part2 ;
+-- UPDATE partition-key with FROM clause. If join produces multiple output
+-- rows for the same row to be modified, we should tuple-route the row only once.
+-- There should not be any rows inserted.
+create table non_parted (id int);
+insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3);
+update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+  tableoid  | a | b  | c  
+------------+---+----+----
+ list_part1 | 2 |  2 | 10
+ list_part1 | 2 |  2 | 70
+ list_part1 | 2 | 52 | 50
+ list_part1 | 3 |  6 | 60
+(4 rows)
+
+drop table non_parted;
+drop function func_parted_mod_b ( ) ;
+drop table list_parted;
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index 6637119..da5130d 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -107,23 +107,253 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a)
 DROP TABLE update_test;
 DROP TABLE upsert_test;
 
--- update to a partition should check partition bound constraint for the new tuple
-create table range_parted (
+
+---------------------------
+-- UPDATE with row movement
+---------------------------
+
+-- update to a partition should check partition bound constraint for the new tuple.
+-- If partition key is updated, the row should be moved to the appropriate
+-- partition. updatable views using partitions should enforce the check options
+-- for the rows that have been moved.
+create table mintab(c1 int);
+insert into mintab values (120);
+CREATE TABLE range_parted (
 	a text,
-	b int
+	b int,
+	c numeric
 ) partition by range (a, b);
-create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
-create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 from mintab) WITH CHECK OPTION;
+
+-- Create partitions intentionally in descending bound order, so as to test
+-- that the sub plans are getting ordered in ascending bound order rather than ordered by the oid values.
+create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20) partition by range (c);
 create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10);
-create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20);
-insert into part_a_1_a_10 values ('a', 1);
-insert into part_b_10_b_20 values ('b', 10);
+create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20);
+create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10);
+
+-- This tests partition-key UPDATE on a partitioned table that does not have any child partitions
+update part_b_10_b_20 set b = b - 6;
+
+-- As mentioned above, the partition creation is intentionally kept in descending bound order.
+create table part_c_100_200 (c numeric, a text, b int);
+alter table part_b_10_b_20 attach partition part_c_100_200 for values from (100) to (200);
+create table part_c_1_100 (b int, c numeric, a text);
+alter table part_b_10_b_20 attach partition part_c_1_100 for values from (1) to (100);
+
+\set init_range_parted 'truncate range_parted; insert into range_parted values (''a'', 1, NULL), (''a'', 10, 200), (''b'', 12, 96), (''b'', 13, 97), (''b'', 15, 105), (''b'', 17, 105)'
+:init_range_parted;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+-- The order of subplans should be in bound order
+explain (costs off) update range_parted set c = c - 50 where c > 97;
+
+-- fail (row movement happens only within the partition subtree) :
+update part_c_1_100 set c = c + 20 where c = 96;
+-- No row found :
+update part_c_1_100 set c = c + 20 where c = 98;
+-- ok (row movement)
+update part_b_10_b_20 set c = c + 20 returning c, b, a;
+select a, b, c from part_c_1_100 order by 1, 2, 3;
+select a, b, c from part_c_100_200 order by 1, 2, 3;
+
+-- fail (row movement happens only within the partition subtree) :
+update part_b_10_b_20 set b = b - 6 where c > 116 returning *;
+-- ok (row movement, with subset of rows moved into different partition)
+update range_parted set b = b - 6 where c > 116 returning a, b + c;
 
--- fail
-update part_a_1_a_10 set a = 'b' where a = 'a';
-update range_parted set b = b - 1 where b = 10;
--- ok
-update range_parted set b = b + 1 where b = 10;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+-- update partition key using updatable view.
+
+-- succeeds
+update upview set c = 199 where b = 4;
+-- fail, check option violation
+update upview set c = 120 where b = 4;
+-- fail, row movement with check option violation
+update upview set a = 'b', b = 15, c = 120 where b = 4;
+-- succeeds, row movement , check option passes
+update upview set a = 'b', b = 15 where b = 4;
+
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
 
 -- cleanup
-drop table range_parted;
+drop view upview;
+
+-- RETURNING having whole-row vars.
+----------------------------------
+:init_range_parted;
+update range_parted set c = 95 where a = 'b' and b > 10 and c > 100 returning (range_parted) , *;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+
+-- Transition tables with update row movement
+---------------------------------------------
+:init_range_parted;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+create function trans_updatetrigfunc() returns trigger language plpgsql as
+$$
+  begin
+    raise notice 'trigger = %, old table = %, new table = %',
+                 TG_NAME,
+                 (select string_agg(old_table::text, ', ' order by a) from old_table),
+                 (select string_agg(new_table::text, ', ' order by a) from new_table);
+    return null;
+  end;
+$$;
+
+create trigger trans_updatetrig
+  after update on range_parted referencing old table as old_table new table as new_table
+  for each statement execute procedure trans_updatetrigfunc();
+
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+:init_range_parted;
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+drop trigger trans_updatetrig ON range_parted;
+
+-- Install BR triggers on child partition, so that transition tuple conversion takes place.
+create function func_parted_mod_b() returns trigger as $$
+begin
+   NEW.b = NEW.b + 1;
+   return NEW;
+end $$ language plpgsql;
+create trigger trig_c1_100 before update or insert on part_c_1_100
+   for each row execute procedure func_parted_mod_b();
+create trigger trig_c100_200 before update or insert on part_c_100_200
+   for each row execute procedure func_parted_mod_b();
+:init_range_parted;
+update range_parted set c = (case when c = 96 then 110 else c + 1 end ) where a = 'b' and b > 10 and c >= 96;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+:init_range_parted;
+update range_parted set c = c + 50 where a = 'b' and b > 10 and c >= 96;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+drop trigger trig_c1_100 ON part_c_1_100;
+drop trigger trig_c100_200 ON part_c_100_200;
+drop function func_parted_mod_b();
+
+
+-- statement triggers with update row movement
+---------------------------------------------------
+
+:init_range_parted;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+create function trigfunc() returns trigger language plpgsql as
+$$
+  begin
+    raise notice 'trigger = % fired on table % during %',
+                 TG_NAME, TG_TABLE_NAME, TG_OP;
+    return null;
+  end;
+$$;
+-- Triggers on root partition
+create trigger parent_delete_trig
+  after delete on range_parted for each statement execute procedure trigfunc();
+create trigger parent_update_trig
+  after update on range_parted for each statement execute procedure trigfunc();
+create trigger parent_insert_trig
+  after insert on range_parted for each statement execute procedure trigfunc();
+
+-- Triggers on leaf partition part_c_1_100
+create trigger c1_delete_trig
+  after delete on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_update_trig
+  after update on part_c_1_100 for each statement execute procedure trigfunc();
+create trigger c1_insert_trig
+  after insert on part_c_1_100 for each statement execute procedure trigfunc();
+
+-- Triggers on leaf partition part_c_100_200
+create trigger c100_delete_trig
+  after delete on part_c_100_200 for each statement execute procedure trigfunc();
+create trigger c100_update_trig
+  after update on part_c_100_200 for each statement execute procedure trigfunc();
+create trigger c100_insert_trig
+  after insert on part_c_100_200 for each statement execute procedure trigfunc();
+
+-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or insert statement triggers should be fired.
+update range_parted set c = c - 50 where c > 97;
+select tableoid::regclass::text partname, * from range_parted order by 1, 2, 3, 4;
+
+drop table mintab, range_parted CASCADE;
+
+
+
+--------------
+-- UPDATE with
+-- partition key or non-partition columns, with different column ordering,
+-- triggers.
+--------------
+
+-- Setup
+--------
+create table list_parted (a numeric, b int, c int8) partition by list (a);
+create table sub_parted partition of list_parted for values in (1) partition by list (b);
+
+create table sub_part1(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part1 for values in (1);
+create table sub_part2(b int, c int8, a numeric);
+alter table sub_parted attach partition sub_part2 for values in (2);
+
+create table list_part1(a numeric, b int, c int8);
+alter table list_parted attach partition list_part1 for values in (2,3);
+
+insert into list_parted values (2,5,50);
+insert into list_parted values (3,6,60);
+insert into sub_parted values (1,1,60);
+insert into sub_parted values (1,2,10);
+
+-- Test partition constraint violation when intermediate ancestor is used and
+-- constraint is inherited from upper root.
+update sub_parted set a = 2 where c = 10;
+
+-- UPDATE which does not modify partition key of partitions that are chosen for update.
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+update list_parted set b = c + a where a = 2;
+select tableoid::regclass::text , * from list_parted where a = 2 order by 1;
+
+
+-----------
+-- Triggers can cause UPDATE row movement if it modified partition key.
+-----------
+create function func_parted_mod_b() returns trigger as $$
+begin
+   NEW.b = 2; -- This is changing partition key column.
+   return NEW;
+end $$ language plpgsql;
+create trigger parted_mod_b before update on sub_part1
+   for each row execute procedure func_parted_mod_b();
+
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+-- This should do the tuple routing even though there is no explicit
+-- partition-key update, because there is a trigger on sub_part1
+update list_parted set c = 70 where b = 1 ;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger parted_mod_b ON sub_part1 ;
+
+-- If a BR DELETE trigger on the source partition prevents the DELETE, we should
+-- also skip the INSERT, since that delete is part of UPDATE=>DELETE+INSERT.
+create or replace function func_parted_mod_b() returns trigger as $$
+begin return NULL; end $$ language plpgsql;
+create trigger trig_skip_delete before delete on sub_part2
+   for each row execute procedure func_parted_mod_b();
+update list_parted set b = 1 where c = 70;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+
+drop trigger trig_skip_delete ON sub_part2 ;
+
+-- UPDATE partition-key with FROM clause. If join produces multiple output
+-- rows for the same row to be modified, we should tuple-route the row only once.
+-- There should not be any rows inserted.
+create table non_parted (id int);
+insert into non_parted values (1), (1), (1), (2), (2), (2), (3), (3), (3);
+update list_parted t1 set a = 2 from non_parted t2 where t1.a = t2.id and a = 1;
+select tableoid::regclass::text , * from list_parted order by 1, 2, 3, 4;
+drop table non_parted;
+
+drop function func_parted_mod_b ( ) ;
+drop table list_parted;