From 690ef5127fd8f7d81b954e1462b65ace542e8f60 Mon Sep 17 00:00:00 2001 From: amit Date: Wed, 27 Jul 2016 16:59:21 +0900 Subject: [PATCH 8/9] Tuple routing for partitioned tables. Both COPY FROM and INSERT. --- src/backend/catalog/partition.c | 347 ++++++++++++++++++++++++++++++- src/backend/commands/copy.c | 205 ++++++++++++++++++- src/backend/commands/tablecmds.c | 1 + src/backend/executor/execMain.c | 47 ++++- src/backend/executor/nodeModifyTable.c | 123 +++++++++++ src/backend/optimizer/plan/createplan.c | 60 ++++++ src/backend/optimizer/util/plancat.c | 20 ++- src/backend/parser/analyze.c | 9 + src/include/catalog/partition.h | 7 + src/include/executor/executor.h | 6 + src/include/nodes/execnodes.h | 10 + src/include/optimizer/plancat.h | 1 + src/test/regress/expected/insert.out | 59 +++++- src/test/regress/sql/insert.sql | 28 +++ 14 files changed, 911 insertions(+), 12 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index fb1ab0e..8f6cbc9 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -260,6 +260,18 @@ static List *generate_partition_qual(Relation rel, bool recurse); static PartitionTreeNode GetPartitionTreeNodeRecurse(Relation rel, int offset); static int get_leaf_partition_count(PartitionTreeNode ptnode); +/* Support get_partition_for_tuple() */ +static PartitionKeyExecInfo *BuildPartitionKeyExecInfo(Relation rel); +static void FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static int list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull); +static int range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum *tuple); + /* List partition related support functions */ static PartitionList *make_list_from_spec(PartitionKey key, PartitionListSpec *list_spec); @@ -280,6 +292,9 @@ static int32 partition_range_bound_cmp(PartitionKey key, 
PartitionRangeBound *b1 PartitionRangeBound *b2); static int32 partition_range_tuple_cmp(PartitionKey key, Datum *val1, Datum *val2); static bool partition_range_overlaps(PartitionKey key, PartitionRange *r1, PartitionRange *r2); +static bool tuple_rightof_bound(PartitionKey key, Datum *tuple, PartitionRangeBound *bound); +static bool tuple_leftof_bound(PartitionKey key, Datum *tuple, PartitionRangeBound *bound); +static int bsearch_ranges(PartitionKey key, int n, RangeInfo *rangeinfo, Datum *tuple); /* * Partition key related functions @@ -1167,7 +1182,7 @@ get_leaf_partition_oids_v2(PartitionTreeNode ptnode) node = node->next; } else - result = lappend_oid(result, ptnode->pdesc->parts[i]->oid); + result = lappend_oid(result, ptnode->pdesc->oids[i]); } return result; @@ -1833,7 +1848,7 @@ GetPartitionTreeNodeRecurse(Relation rel, int offset) /* First build our own node */ parent = (PartitionTreeNode) palloc0(sizeof(PartitionTreeNodeData)); - parent->pkinfo = NULL; + parent->pkinfo = BuildPartitionKeyExecInfo(rel); parent->pdesc = RelationGetPartitionDesc(rel); parent->relid = RelationGetRelid(rel); parent->offset = offset; @@ -1851,7 +1866,7 @@ GetPartitionTreeNodeRecurse(Relation rel, int offset) prev = NULL; for (i = 0; i < parent->pdesc->nparts; i++) { - Oid relid = parent->pdesc->parts[i]->oid; + Oid relid = parent->pdesc->oids[i]; int offset; Relation rel; PartitionTreeNode child; @@ -1917,6 +1932,267 @@ get_leaf_partition_count(PartitionTreeNode ptnode) return result; } +/* + * BuildPartitionKeyExecInfo + * Construct a list of PartitionKeyExecInfo records for an open + * relation + * + * PartitionKeyExecInfo stores the information about the partition key + * that's needed when inserting tuples into a partitioned table; especially, + * partition key expression state if there are any expression columns in + * the partition key. 
Normally we build a PartitionKeyExecInfo for a + * partitioned table just once per command, and then use it for (potentially) + * many tuples. + * pi_ExpressionState starts out NIL here; FormPartitionKeyDatum() sets it up on first use. + */ +static PartitionKeyExecInfo * +BuildPartitionKeyExecInfo(Relation rel) +{ + PartitionKeyExecInfo *pkinfo; + + pkinfo = (PartitionKeyExecInfo *) palloc0(sizeof(PartitionKeyExecInfo)); + pkinfo->pi_Key = copy_partition_key(rel->rd_partkey); + pkinfo->pi_ExpressionState = NIL; + + return pkinfo; +} + +/* + * FormPartitionKeyDatum + * Fill values[] and isnull[] (one entry per partition key column) from the tuple in slot: plain columns via slot_getattr(), expression columns via the prepared expression state; raises an error if the number of expressions does not match the key + */ +static void +FormPartitionKeyDatum(PartitionKeyExecInfo *pkinfo, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pkinfo->pi_Key->partexprs != NIL && pkinfo->pi_ExpressionState == NIL) + { + /* First time through, set up expression evaluation state */ + pkinfo->pi_ExpressionState = (List *) + ExecPrepareExpr((Expr *) pkinfo->pi_Key->partexprs, + estate); + /* Check caller has set up context correctly */ + Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + } + + partexpr_item = list_head(pkinfo->pi_ExpressionState); + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + { + AttrNumber keycol = pkinfo->pi_Key->partattrs[i]; + Datum pkDatum; + bool isNull; + + if (keycol != 0) + { + /* Plain column (nonzero attribute number); fetch the value directly from the slot */ + pkDatum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression column (keycol == 0); evaluate the next prepared expression */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + pkDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull, + NULL); + partexpr_item = lnext(partexpr_item); + } + values[i] = pkDatum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * get_partition_for_tuple + * Recursively finds the "leaf" partition for tuple + * + * 
Returns -1 if no partition is found and sets *failed_at to the OID of + * the partitioned table whose partition was not found. + */ +int +get_partition_for_tuple(PartitionTreeNode ptnode, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at) +{ + Relation partRel; + PartitionKeyExecInfo *pkinfo = ptnode->pkinfo; + PartitionTreeNode node; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + int i; + int index; + + /* Guard against stack overflow due to overly deep partition tree */ + check_stack_depth(); + + if (ptnode->pdesc->nparts == 0) + { + *failed_at = ptnode->relid; + return -1; + } + + /* Extract partition key from tuple */ + Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + FormPartitionKeyDatum(pkinfo, slot, estate, values, isnull); + + /* Disallow nulls, if range partition key */ + for (i = 0; i < pkinfo->pi_Key->partnatts; i++) + if (isnull[i] && pkinfo->pi_Key->strategy == PARTITION_STRAT_RANGE) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("range partition key contains null"))); + + switch (pkinfo->pi_Key->strategy) + { + case PARTITION_STRAT_LIST: + index = list_partition_for_tuple(pkinfo->pi_Key, ptnode->pdesc, + values[0], isnull[0]); + break; + + case PARTITION_STRAT_RANGE: + index = range_partition_for_tuple(pkinfo->pi_Key, ptnode->pdesc, + values); + break; + } + + /* No partition found at this level */ + if (index < 0) + { + *failed_at = ptnode->relid; + return index; + } + + partRel = heap_open(ptnode->pdesc->oids[index], NoLock); + + /* Don't recurse if the index'th partition is a leaf partition. */ + if (partRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + PartitionTreeNode prev; + + /* + * Index returned above is the array index within pdesc->parts[] of + * the parent rel, however, we want to return the leaf partition index + * across the whole partition tree. 
Note that some partitions within + * pdesc->parts[] may be partitioned themselves and hence stand for + * the leaf partitions in their partition subtrees. We would need to + * skip past the indexes of leaf partitions of all such partition + * subtrees if they are to left of the above returned index. In fact, + * finding the PartitionTreeNode of the rightmost subtree is enough + * since its offset counts the leaf partitions on its left including + * those of partition subtrees to its left. + */ + prev = node = ptnode->downlink; + if (node && node->index < index) + { + /* + * Find the partition tree node such that its index value is the + * greatest value less than the above returned index. + */ + while (node) + { + if (node->index > index) + { + node = prev; + break; + } + + prev = node; + node = node->next; + } + + if (!node) + node = prev; + Assert (node != NULL); + + index = node->offset + node->num_leaf_parts + + (index - node->index - 1); + } + else + /* + * The easy case where we don't have any partition subtree to the + * left of the index. + */ + index = ptnode->offset + index; + + heap_close(partRel, NoLock); + return index; + } + + heap_close(partRel, NoLock); + + /* + * Need to perform recursion as the selected partition is partitioned + * itself. Locate the PartitionTreeNode corresponding to the partition + * passing it down. + */ + node = ptnode->downlink; + while (node->next != NULL && node->index != index) + node = node->next; + Assert (node != NULL); + + return get_partition_for_tuple(node, slot, estate, failed_at); +} + +/* + * list_partition_for_tuple + * Find the list partition for a tuple + * + * Returns -1 if none found. 
+ */ +static int +list_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, + Datum value, bool isnull) +{ + ListInfo *listinfo; + int found; + + Assert(pdesc->nparts > 0); + Assert(pdesc->bounds->listinfo != NULL); + listinfo = pdesc->bounds->listinfo; + + if (isnull && listinfo->has_null) + return listinfo->null_index; + else if (!isnull) + { + found = bsearch_list_values(listinfo->values, + listinfo->nvalues, + value, + key); + if (found >= 0) + return listinfo->indexes[found]; + } + + /* Reached if the key is null but !listinfo->has_null, or a non-null value matched no list entry */ + return -1; +} + +/* + * range_partition_for_tuple + * Find the range partition that contains the range key ('tuple') + * + * Returns -1 if none found. + */ +static int +range_partition_for_tuple(PartitionKey key, PartitionDesc pdesc, Datum *tuple) +{ + Assert(pdesc->nparts > 0); + Assert(pdesc->bounds->rangeinfo != NULL); + + return bsearch_ranges(key, pdesc->nparts, + pdesc->bounds->rangeinfo, tuple); +} + /* List partition related support functions */ /* @@ -2285,3 +2561,68 @@ partition_range_tuple_cmp(PartitionKey key, Datum *val1, Datum *val2) return result; } + +/* + * bsearch_ranges + * Workhorse of range_partition_for_tuple: binary search over the n entries of rangeinfo->ranges[]; returns the index of the range containing 'tuple', or -1 if none does (presumably relies on ranges[] being ordered by bound -- confirm where rangeinfo is built) + */ +static int +bsearch_ranges(PartitionKey key, int n, RangeInfo *rangeinfo, Datum *tuple) +{ + int low, high; + + /* Good ol' bsearch */ + low = 0; + high = n - 1; + while (low <= high) + { + int idx = (low + high) / 2; + + if (rangeinfo->ranges[idx]->upper->infinite) + { + if (tuple_rightof_bound(key, tuple, rangeinfo->ranges[idx]->lower)) + return idx; + + break; + } + else if (tuple_leftof_bound(key, tuple, rangeinfo->ranges[idx]->upper)) + { + if (rangeinfo->ranges[idx]->lower->infinite) + return idx; + + if (tuple_rightof_bound(key, tuple, rangeinfo->ranges[idx]->lower)) + return idx; + + high = idx - 1; + continue; + } + + low = idx + 1; + } + + return -1; +} + +/* Does range key lie to the right of partition bound */ +static bool +tuple_rightof_bound(PartitionKey key, Datum *tuple, 
PartitionRangeBound *bound) +{ + int32 cmpval = partition_range_tuple_cmp(key, tuple, bound->val); + + if (!cmpval) + return bound->lower ? bound->inclusive : !bound->inclusive; + + return cmpval > 0; +} + +/* Does range key lie to the left of partition bound */ +static bool +tuple_leftof_bound(PartitionKey key, Datum *tuple, PartitionRangeBound *bound) +{ + int32 cmpval = partition_range_tuple_cmp(key, tuple, bound->val); + + if (!cmpval) + return !bound->lower ? bound->inclusive : !bound->inclusive; + + return cmpval < 0; +} diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 157d219..932ed62 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -30,6 +30,7 @@ #include "commands/defrem.h" #include "commands/trigger.h" #include "executor/executor.h" +#include "foreign/fdwapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "mb/pg_wchar.h" @@ -161,6 +162,11 @@ typedef struct CopyStateData ExprState **defexprs; /* array of default att expressions */ bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; + PartitionTreeNode ptnode; /* partition descriptor node tree */ + ResultRelInfo *partitions; + TupleConversionMap **partition_tupconv_maps; + List *partition_fdw_priv_lists; + int num_partitions; /* * These variables are used to reduce overhead in textual COPY FROM. @@ -1382,6 +1388,94 @@ BeginCopy(ParseState *pstate, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("table \"%s\" does not have OIDs", RelationGetRelationName(cstate->rel)))); + + /* + * Initialize state for CopyFrom tuple routing. Watch out for + * any foreign partitions. 
+ */ + if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + List *leaf_part_oids; + ListCell *cell; + int i; + int num_leaf_parts; + ResultRelInfo *leaf_rel_rri; + PlannerInfo *root = makeNode(PlannerInfo); /* mostly dummy */ + Query *parse = makeNode(Query); /* ditto */ + ModifyTable *plan = makeNode(ModifyTable); /* ditto */ + RangeTblEntry *fdw_rte = makeNode(RangeTblEntry); /* ditto */ + List *fdw_private_lists = NIL; + + cstate->ptnode = RelationGetPartitionTreeNode(rel); + leaf_part_oids = get_leaf_partition_oids_v2(cstate->ptnode); + num_leaf_parts = list_length(leaf_part_oids); + + cstate->num_partitions = num_leaf_parts; + cstate->partitions = (ResultRelInfo *) + palloc0(num_leaf_parts * sizeof(ResultRelInfo)); + cstate->partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + /* For use below, iff a partition found to be a foreign table */ + plan->operation = CMD_INSERT; + plan->plans = list_make1(makeNode(Result)); + fdw_rte->rtekind = RTE_RELATION; + fdw_rte->relkind = RELKIND_FOREIGN_TABLE; + parse->rtable = list_make1(fdw_rte); + root->parse = parse; + + leaf_rel_rri = cstate->partitions; + i = 0; + foreach(cell, leaf_part_oids) + { + Relation leaf_rel; + + leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock); + + /* + * Verify result relation is a valid target for the current + * operation. 
+ */ + CheckValidResultRel(leaf_rel, CMD_INSERT); + + InitResultRelInfo(leaf_rel_rri, + leaf_rel, + 1, /* dummy */ + false, /* no need for partition check */ + 0); + + /* Open partition indices */ + ExecOpenIndices(leaf_rel_rri, false); + + /* Special dance for foreign tables */ + if (leaf_rel_rri->ri_FdwRoutine) + { + List *fdw_private; + + fdw_rte->relid = RelationGetRelid(leaf_rel); + fdw_private = leaf_rel_rri->ri_FdwRoutine->PlanForeignModify(root, + plan, + 1, + 0); + fdw_private_lists = lappend(fdw_private_lists, fdw_private); + } + + if (!equalTupleDescs(tupDesc, RelationGetDescr(leaf_rel))) + cstate->partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, + RelationGetDescr(leaf_rel), + gettext_noop("could not convert row type")); + + leaf_rel_rri++; + i++; + } + + cstate->partition_fdw_priv_lists = fdw_private_lists; + pfree(fdw_rte); + pfree(plan); + pfree(parse); + pfree(root); + } } else { @@ -1677,6 +1771,8 @@ ClosePipeToProgram(CopyState cstate) static void EndCopy(CopyState cstate) { + int i; + if (cstate->is_program) { ClosePipeToProgram(cstate); @@ -1690,6 +1786,23 @@ EndCopy(CopyState cstate) cstate->filename))); } + /* Close all partitions and indices thereof */ + for (i = 0; i < cstate->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = cstate->partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + + /* XXX - EState not handy here to pass to EndForeignModify() */ + if (resultRelInfo->ri_FdwRoutine && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(NULL, resultRelInfo); + + if (cstate->partition_tupconv_maps[i]) + pfree(cstate->partition_tupconv_maps[i]); + } + MemoryContextDelete(cstate->copycontext); pfree(cstate); } @@ -2240,6 +2353,7 @@ CopyFrom(CopyState cstate) Datum *values; bool *nulls; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for 
ExecConstraints() */ ExprContext *econtext; TupleTableSlot *myslot; @@ -2260,7 +2374,8 @@ CopyFrom(CopyState cstate) Assert(cstate->rel); - if (cstate->rel->rd_rel->relkind != RELKIND_RELATION) + if (cstate->rel->rd_rel->relkind != RELKIND_RELATION && + cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) { if (cstate->rel->rd_rel->relkind == RELKIND_VIEW) ereport(ERROR, @@ -2368,6 +2483,7 @@ CopyFrom(CopyState cstate) InitResultRelInfo(resultRelInfo, cstate->rel, 1, /* dummy rangetable index */ + true, /* do load partition check expression */ 0); ExecOpenIndices(resultRelInfo, false); @@ -2395,6 +2511,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || + cstate->ptnode != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2416,10 +2533,46 @@ CopyFrom(CopyState cstate) */ ExecBSInsertTriggers(estate, resultRelInfo); + /* Initialize FDW partition insert plans */ + if (cstate->ptnode) + { + int i, + j; + List *fdw_private_lists = cstate->partition_fdw_priv_lists; + ModifyTableState *mtstate = makeNode(ModifyTableState); + ResultRelInfo *leaf_part_rri; + + /* Mostly dummy containing enough state for BeginForeignModify */ + mtstate->ps.state = estate; + mtstate->operation = CMD_INSERT; + + j = 0; + leaf_part_rri = cstate->partitions; + for (i = 0; i < cstate->num_partitions; i++) + { + if (leaf_part_rri->ri_FdwRoutine) + { + List *fdw_private; + + Assert(fdw_private_lists); + fdw_private = list_nth(fdw_private_lists, j++); + leaf_part_rri->ri_FdwRoutine->BeginForeignModify(mtstate, + leaf_part_rri, + fdw_private, + 0, 0); + } + leaf_part_rri++; + } + } + values = (Datum *) palloc(tupDesc->natts * sizeof(Datum)); nulls = (bool *) palloc(tupDesc->natts * sizeof(bool)); - bistate = GetBulkInsertState(); + if (useHeapMultiInsert) + bistate = GetBulkInsertState(); + else + bistate = NULL; + econtext = 
GetPerTupleExprContext(estate); /* Set up callback to identify error line number */ @@ -2471,6 +2624,31 @@ CopyFrom(CopyState cstate) slot = myslot; ExecStoreTuple(tuple, slot, InvalidBuffer, false); + /* Determine the partition */ + saved_resultRelInfo = resultRelInfo; + if (cstate->ptnode) + { + int i_leaf_partition; + TupleConversionMap *map; + + econtext->ecxt_scantuple = slot; + i_leaf_partition = ExecFindPartition(resultRelInfo, + cstate->ptnode, + slot, + estate); + Assert(i_leaf_partition >= 0 && + i_leaf_partition < cstate->num_partitions); + + resultRelInfo = cstate->partitions + i_leaf_partition; + estate->es_result_relation_info = resultRelInfo; + + map = cstate->partition_tupconv_maps[i_leaf_partition]; + if (map) + tuple = do_convert_tuple(tuple, map); + + tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + } + skip_tuple = false; /* BEFORE ROW INSERT Triggers */ @@ -2491,7 +2669,16 @@ CopyFrom(CopyState cstate) if (cstate->rel->rd_att->constr || resultRelInfo->ri_PartitionCheck) ExecConstraints(resultRelInfo, slot, estate); - if (useHeapMultiInsert) + if (resultRelInfo->ri_FdwRoutine) + { + resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate, + resultRelInfo, + slot, + NULL); + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, resultRelInfo, tuple, NIL); + } + else if (useHeapMultiInsert) { /* Add this tuple to the tuple buffer */ if (nBufferedTuples == 0) @@ -2521,7 +2708,8 @@ CopyFrom(CopyState cstate) List *recheckIndexes = NIL; /* OK, store the tuple and create index entries for it */ - heap_insert(cstate->rel, tuple, mycid, hi_options, bistate); + heap_insert(resultRelInfo->ri_RelationDesc, + tuple, mycid, hi_options, bistate); if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), @@ -2541,6 +2729,12 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. 
*/ processed++; + + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } } } @@ -2554,7 +2748,8 @@ CopyFrom(CopyState cstate) /* Done, clean up */ error_context_stack = errcallback.previous; - FreeBulkInsertState(bistate); + if (bistate) + FreeBulkInsertState(bistate); MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 4a7c98d..152c575 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1236,6 +1236,7 @@ ExecuteTruncate(TruncateStmt *stmt) InitResultRelInfo(resultRelInfo, rel, 0, /* dummy rangetable index */ + false, 0); resultRelInfo++; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 714b49c..e2853a2 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -826,6 +826,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) InitResultRelInfo(resultRelInfo, resultRelation, resultRelationIndex, + true, estate->es_instrument); resultRelInfo++; } @@ -1215,6 +1216,7 @@ void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options) { MemSet(resultRelInfo, 0, sizeof(ResultRelInfo)); @@ -1252,8 +1254,10 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ConstraintExprs = NULL; resultRelInfo->ri_junkFilter = NULL; resultRelInfo->ri_projectReturning = NULL; - resultRelInfo->ri_PartitionCheck = - RelationGetPartitionQual(resultRelationDesc, true); + if (load_partition_check) + resultRelInfo->ri_PartitionCheck = + RelationGetPartitionQual(resultRelationDesc, + true); } /* @@ -1316,6 +1320,7 @@ ExecGetTriggerResultRel(EState *estate, Oid relid) InitResultRelInfo(rInfo, rel, 0, /* dummy rangetable index */ + true, estate->es_instrument); estate->es_trig_target_relations = lappend(estate->es_trig_target_relations, rInfo); @@ 
-2997,3 +3002,41 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->planstate = NULL; epqstate->origslot = NULL; } + +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionTreeNode ptnode, + TupleTableSlot *slot, EState *estate) +{ + int i_leaf_partition; + Oid failed_at; + + i_leaf_partition = get_partition_for_tuple(ptnode, slot, estate, + &failed_at); + + if (i_leaf_partition < 0) + { + Relation rel = resultRelInfo->ri_RelationDesc; + char *val_desc; + Bitmapset *insertedCols, + *updatedCols, + *modifiedCols; + TupleDesc tupDesc = RelationGetDescr(rel); + + insertedCols = GetInsertedColumns(resultRelInfo, estate); + updatedCols = GetUpdatedColumns(resultRelInfo, estate); + modifiedCols = bms_union(insertedCols, updatedCols); + val_desc = ExecBuildSlotValueDescription(RelationGetRelid(rel), + slot, + tupDesc, + modifiedCols, + 64); + Assert(OidIsValid(failed_at)); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + get_rel_name(failed_at)), + val_desc ? 
errdetail("Failing row contains %s.", val_desc) : 0)); + } + + return i_leaf_partition; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 5b0e8cf..cb47035 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -243,6 +243,7 @@ ExecInsert(ModifyTableState *mtstate, { HeapTuple tuple; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -257,6 +258,31 @@ ExecInsert(ModifyTableState *mtstate, * get information on the (current) result relation */ resultRelInfo = estate->es_result_relation_info; + + saved_resultRelInfo = resultRelInfo; + + if (mtstate->mt_partition_tree_root) + { + int i_leaf_partition; + ExprContext *econtext = GetPerTupleExprContext(estate); + TupleConversionMap *map; + + econtext->ecxt_scantuple = slot; + i_leaf_partition = ExecFindPartition(resultRelInfo, + mtstate->mt_partition_tree_root, + slot, + estate); + Assert(i_leaf_partition >= 0 && + i_leaf_partition < mtstate->mt_num_partitions); + + resultRelInfo = mtstate->mt_partitions + i_leaf_partition; + estate->es_result_relation_info = resultRelInfo; + + map = mtstate->mt_partition_tupconv_maps[i_leaf_partition]; + if (map) + tuple = do_convert_tuple(tuple, map); + } + resultRelationDesc = resultRelInfo->ri_RelationDesc; /* @@ -496,6 +522,12 @@ ExecInsert(ModifyTableState *mtstate, list_free(recheckIndexes); + if (saved_resultRelInfo) + { + resultRelInfo = saved_resultRelInfo; + estate->es_result_relation_info = resultRelInfo; + } + /* * Check any WITH CHECK OPTION constraints from parent views. 
We are * required to do this after testing all constraints and uniqueness @@ -1550,6 +1582,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) Plan *subplan; ListCell *l; int i; + Relation rel; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1640,6 +1673,79 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; + /* Build state for INSERT tuple routing: one ResultRelInfo and (if row types differ) one tuple conversion map per leaf partition */ + rel = mtstate->resultRelInfo->ri_RelationDesc; + if (operation == CMD_INSERT && + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + int i, + j, + num_leaf_parts; + List *leaf_part_oids; + ListCell *cell; + ResultRelInfo *leaf_rel_rri; + + mtstate->mt_partition_tree_root = RelationGetPartitionTreeNode(rel); + leaf_part_oids = get_leaf_partition_oids_v2(mtstate->mt_partition_tree_root); + num_leaf_parts = list_length(leaf_part_oids); + + mtstate->mt_num_partitions = num_leaf_parts; + mtstate->mt_partitions = (ResultRelInfo *) + palloc0(num_leaf_parts * sizeof(ResultRelInfo)); + mtstate->mt_partition_tupconv_maps = (TupleConversionMap **) + palloc0(num_leaf_parts * sizeof(TupleConversionMap *)); + + leaf_rel_rri = mtstate->mt_partitions; + i = j = 0; + foreach(cell, leaf_part_oids) + { + Relation leaf_rel; + + leaf_rel = heap_open(lfirst_oid(cell), RowExclusiveLock); + + /* + * Verify result relation is a valid target for the current + * operation + */ + CheckValidResultRel(leaf_rel, CMD_INSERT); + + InitResultRelInfo(leaf_rel_rri, + leaf_rel, + 1, /* dummy */ + false, /* no need for partition checks */ + estate->es_instrument); /* instrument options, not executor eflags */ + + /* Open partition indices (note: ON CONFLICT unsupported)*/ + if (leaf_rel_rri->ri_RelationDesc->rd_rel->relhasindex && + operation != CMD_DELETE && + leaf_rel_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_rel_rri, false); + + if (leaf_rel_rri->ri_FdwRoutine) + { + /* As many fdw_private's in fdwPrivLists as FDW partitions */ + List *fdw_private 
= (List *) list_nth(node->fdwPrivLists, j); + + leaf_rel_rri->ri_FdwRoutine->BeginForeignModify(mtstate, + leaf_rel_rri, + fdw_private, + 0, + eflags); + j++; + } + + if (!equalTupleDescs(RelationGetDescr(rel), + RelationGetDescr(leaf_rel))) + mtstate->mt_partition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(rel), + RelationGetDescr(leaf_rel), + gettext_noop("could not convert row type")); + + leaf_rel_rri++; + i++; + } + } + /* * Initialize any WITH CHECK OPTION constraints if needed. */ @@ -1957,6 +2063,23 @@ ExecEndModifyTable(ModifyTableState *node) resultRelInfo); } + /* Close all partitions and indices thereof */ + for (i = 0; i < node->mt_num_partitions; i++) + { + ResultRelInfo *resultRelInfo = node->mt_partitions + i; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + + if (resultRelInfo->ri_FdwRoutine && + resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL) + resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state, + resultRelInfo); + + if (node->mt_partition_tupconv_maps[i]) + pfree(node->mt_partition_tupconv_maps[i]); + } + /* * Free the exprcontext */ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 47158f6..32f4031 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -22,6 +22,7 @@ #include "access/stratnum.h" #include "access/sysattr.h" #include "catalog/pg_class.h" +#include "catalog/pg_partitioned_table_fn.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/extensible.h" @@ -6152,6 +6153,65 @@ make_modifytable(PlannerInfo *root, node->fdwPrivLists = fdw_private_list; node->fdwDirectModifyPlans = direct_modify_plans; + /* Collect insert plans for all FDW-managed partitions */ + if (node->operation == CMD_INSERT) + { + RangeTblEntry *rte, + **saved_simple_rte_array; + List *partition_oids; + + Assert(list_length(resultRelations) == 1); + rte = 
rt_fetch(linitial_int(resultRelations), root->parse->rtable); + Assert(rte->rtekind == RTE_RELATION); + + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + return node; + + partition_oids = get_leaf_partition_oids(rte->relid, NoLock); + + /* Discard any previous content which is useless anyway */ + fdw_private_list = NIL; + + /* To force FDW driver fetch the intended RTE */ + saved_simple_rte_array = root->simple_rte_array; + root->simple_rte_array = (RangeTblEntry **) + palloc0(2 * sizeof(RangeTblEntry *)); + foreach(lc, partition_oids) + { + Oid myoid = lfirst_oid(lc); + FdwRoutine *fdwroutine; + List *fdw_private; + + if (!oid_is_foreign_table(myoid)) + continue; + + fdwroutine = GetFdwRoutineByRelId(myoid); + if (fdwroutine && fdwroutine->PlanForeignModify) + { + RangeTblEntry *fdw_rte; + + fdw_rte = copyObject(rte); + fdw_rte->relid = myoid; + fdw_rte->relkind = RELKIND_FOREIGN_TABLE; + + /* Assumes PlanForeignModify() uses planner_rt_fetch(). */ + root->simple_rte_array[1] = fdw_rte; + + fdw_private = fdwroutine->PlanForeignModify(root, node, 1, 0); + pfree(fdw_rte); + } + else + fdw_private = NIL; + + fdw_private_list = lappend(fdw_private_list, fdw_private); + } + + pfree(root->simple_rte_array); + root->simple_rte_array = saved_simple_rte_array; + + node->fdwPrivLists = fdw_private_list; + } + return node; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 8036d3f..f8bfa4b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1214,7 +1214,12 @@ get_relation_constraints(PlannerInfo *root, } } - /* Append partition predicates, if any */ + /* + * Append partition predicates, if any. Note that we request the + * parent's quals *not* to be included (by passing false) because if the + * parent's quals cause it to be excluded, this relation will not be + * processed in the first place. 
+ */ pcqual = RelationGetPartitionQual(relation, false); if (pcqual) { @@ -1708,3 +1713,16 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) heap_close(relation, NoLock); return result; } + +bool +oid_is_foreign_table(Oid relid) +{ + Relation rel; + char relkind; + + rel = heap_open(relid, NoLock); + relkind = rel->rd_rel->relkind; + heap_close(rel, NoLock); + + return relkind == RELKIND_FOREIGN_TABLE; +} diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index eac86cc..9f87f57 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -25,6 +25,7 @@ #include "postgres.h" #include "access/sysattr.h" +#include "catalog/pg_partitioned_table_fn.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -797,8 +798,16 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) + { + /* Bail out if target relation is partitioned table */ + if (pstate->p_target_rangetblentry->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ON CONFLICT clause is not supported with partitioned tables"))); + qry->onConflict = transformOnConflictClause(pstate, stmt->onConflictClause); + } /* * If we have a RETURNING clause, we need to add the target relation to diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index d3e789a..badd566 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -14,6 +14,8 @@ #define PARTITION_H #include "fmgr.h" +#include "executor/tuptable.h" +#include "nodes/execnodes.h" #include "parser/parse_node.h" #include "utils/relcache.h" @@ -76,4 +78,9 @@ extern List *RelationGetPartitionQual(Relation rel, bool recurse); /* For tuple routing */ extern PartitionTreeNode RelationGetPartitionTreeNode(Relation rel); extern List *get_leaf_partition_oids_v2(PartitionTreeNode ptnode); + +extern int 
get_partition_for_tuple(PartitionTreeNode ptnode, + TupleTableSlot *slot, + EState *estate, + Oid *failed_at); #endif /* PARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 39521ed..93a9cf3 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -14,6 +14,7 @@ #ifndef EXECUTOR_H #define EXECUTOR_H +#include "catalog/partition.h" #include "executor/execdesc.h" #include "nodes/parsenodes.h" @@ -188,6 +189,7 @@ extern void CheckValidResultRel(Relation resultRel, CmdType operation); extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, Index resultRelationIndex, + bool load_partition_check, int instrument_options); extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); @@ -211,6 +213,10 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionTreeNode ptnode, + TupleTableSlot *slot, + EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index e35da66..39ca517 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/tupconvert.h" #include "executor/instrument.h" #include "lib/pairingheap.h" #include "nodes/params.h" @@ -1140,6 +1141,15 @@ typedef struct ModifyTableState * tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... 
projection * target */ + struct PartitionTreeNodeData *mt_partition_tree_root; + /* Partition descriptor node tree */ + ResultRelInfo *mt_partitions; /* Per leaf partition target + * relations */ + TupleConversionMap **mt_partition_tupconv_maps; + /* Per leaf partition + * tuple conversion map */ + int mt_num_partitions; /* Number of leaf partition target + * relations in the above array */ } ModifyTableState; /* ---------------- diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 125274e..fac606c 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -56,5 +56,6 @@ extern Selectivity join_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo); extern bool has_row_triggers(PlannerInfo *root, Index rti, CmdType event); +extern bool oid_is_foreign_table(Oid relid); #endif /* PLANCAT_H */ diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 89d5760..0f83bc1 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -222,6 +222,62 @@ DETAIL: Failing row contains (cc, 1). -- ok insert into part_EE_FF_1_10 values ('ff', 1); insert into part_EE_FF_10_20 values ('ff', 11); +-- Check tuple routing for partitioned tables +-- fail +insert into range_parted values ('a', 0); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Failing row contains (a, 0). +-- ok +insert into range_parted values ('a', 1); +insert into range_parted values ('a', 10); +-- fail +insert into range_parted values ('a', 20); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Failing row contains (a, 20). 
+-- ok +insert into range_parted values ('b', 1); +insert into range_parted values ('b', 10); +select tableoid::regclass, * from range_parted; + tableoid | a | b +----------------+---+---- + part_a_1_a_10 | a | 1 + part_a_1_a_10 | a | 1 + part_a_10_a_20 | a | 10 + part_b_1_b_10 | b | 1 + part_b_10_b_20 | b | 10 + part_b_10_b_20 | b | 10 +(6 rows) + +-- fail (no list partition defined which accepts nulls) +insert into list_parted (b) values (1); +ERROR: no partition of relation "list_parted" found for row +DETAIL: Failing row contains (null, 1). +create table part_nulls partition of list_parted for values in (null); +-- ok +insert into list_parted (b) values (1); +insert into list_parted (a) values ('aA'); +-- fail (partition of part_EE_FF not found) +insert into list_parted values ('EE', 0); +ERROR: no partition of relation "part_ee_ff" found for row +DETAIL: Failing row contains (EE, 0). +insert into part_EE_FF values ('EE', 0); +ERROR: no partition of relation "part_ee_ff" found for row +DETAIL: Failing row contains (EE, 0). 
+-- ok +insert into list_parted values ('EE', 1); +insert into part_EE_FF values ('EE', 10); +select tableoid::regclass, * from list_parted; + tableoid | a | b +------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_ee_ff_1_10 | ff | 1 + part_ee_ff_1_10 | EE | 1 + part_ee_ff_10_20 | ff | 11 + part_ee_ff_10_20 | EE | 10 + part_nulls | | 1 +(7 rows) + -- cleanup drop table range_parted cascade; NOTICE: drop cascades to 4 other objects @@ -230,9 +286,10 @@ drop cascades to table part_a_10_a_20 drop cascades to table part_b_1_b_10 drop cascades to table part_b_10_b_20 drop table list_parted cascade; -NOTICE: drop cascades to 5 other objects +NOTICE: drop cascades to 6 other objects DETAIL: drop cascades to table part_aa_bb drop cascades to table part_cc_dd drop cascades to table part_ee_ff drop cascades to table part_ee_ff_1_10 drop cascades to table part_ee_ff_10_20 +drop cascades to table part_nulls diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 4bf042e..d1b5a09 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -137,6 +137,34 @@ insert into part_EE_FF_1_10 values ('cc', 1); insert into part_EE_FF_1_10 values ('ff', 1); insert into part_EE_FF_10_20 values ('ff', 11); +-- Check tuple routing for partitioned tables + +-- fail +insert into range_parted values ('a', 0); +-- ok +insert into range_parted values ('a', 1); +insert into range_parted values ('a', 10); +-- fail +insert into range_parted values ('a', 20); +-- ok +insert into range_parted values ('b', 1); +insert into range_parted values ('b', 10); +select tableoid::regclass, * from range_parted; + +-- fail (no list partition defined which accepts nulls) +insert into list_parted (b) values (1); +create table part_nulls partition of list_parted for values in (null); +-- ok +insert into list_parted (b) values (1); +insert into list_parted (a) values ('aA'); +-- fail (partition of part_EE_FF not found) +insert into 
list_parted values ('EE', 0); +insert into part_EE_FF values ('EE', 0); +-- ok +insert into list_parted values ('EE', 1); +insert into part_EE_FF values ('EE', 10); +select tableoid::regclass, * from list_parted; + -- cleanup drop table range_parted cascade; drop table list_parted cascade; -- 1.7.1