diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 65a34a2..345ed17 100644 *** a/src/backend/optimizer/path/allpaths.c --- b/src/backend/optimizer/path/allpaths.c *************** remove_unused_subquery_outputs(Query *su *** 3306,3311 **** --- 3306,3315 ---- /* * If subquery has regular DISTINCT (not DISTINCT ON), we're wasting our * time: all its output columns must be used in the distinctClause. + * (Note: the latter is not necessarily true anymore, because planner.c + * might have found some of the DISTINCT columns to be redundant and + * dropped them. But they'd still have sortgroupref markings, so unless + * we improve the heuristic below, we would not recognize them as unused.) */ if (subquery->distinctClause && !subquery->hasDistinctOn) return; *************** remove_unused_subquery_outputs(Query *su *** 3348,3358 **** /* * If it has a sortgroupref number, it's used in some sort/group ! * clause so we'd better not remove it. Also, don't remove any ! * resjunk columns, since their reason for being has nothing to do ! * with anybody reading the subquery's output. (It's likely that ! * resjunk columns in a sub-SELECT would always have ressortgroupref ! * set, but even if they don't, it seems imprudent to remove them.) */ if (tle->ressortgroupref || tle->resjunk) continue; --- 3352,3365 ---- /* * If it has a sortgroupref number, it's used in some sort/group ! * clause so we'd better not remove it. (This is a conservative ! * heuristic, since it might not actually be used by any surviving ! * sort/group clause; but we don't bother to expend the cycles needed ! * for a more accurate test.) Also, don't remove any resjunk columns, ! * since their reason for being has nothing to do with anybody reading ! * the subquery's output. (It's likely that resjunk columns in a ! * sub-SELECT would always have ressortgroupref set, but even if they ! * don't, it seems imprudent to remove them.) 
*/ if (tle->ressortgroupref || tle->resjunk) continue; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 008492b..a9ccfed 100644 *** a/src/backend/optimizer/plan/planner.c --- b/src/backend/optimizer/plan/planner.c *************** static double preprocess_limit(PlannerIn *** 125,130 **** --- 125,133 ---- int64 *offset_est, int64 *count_est); static bool limit_needed(Query *parse); static void remove_useless_groupby_columns(PlannerInfo *root); + static void remove_useless_distinct_columns(PlannerInfo *root); + static List *remove_dependent_grouping_clauses(PlannerInfo *root, + List *clauselist); static List *preprocess_groupclause(PlannerInfo *root, List *force); static List *extract_rollup_sets(List *groupingSets); static List *reorder_grouping_sets(List *groupingSets, List *sortclause); *************** subquery_planner(PlannerGlobal *glob, Qu *** 965,970 **** --- 968,976 ---- /* Remove any redundant GROUP BY columns */ remove_useless_groupby_columns(root); + /* Likewise for redundant DISTINCT columns */ + remove_useless_distinct_columns(root); + /* * If we have any outer joins, try to reduce them to plain inner joins. * This step is most easily done after we've done expression *************** static void *** 2941,2967 **** remove_useless_groupby_columns(PlannerInfo *root) { Query *parse = root->parse; - Bitmapset **groupbyattnos; - Bitmapset **surplusvars; - ListCell *lc; - int relid; - - /* No chance to do anything if there are less than two GROUP BY items */ - if (list_length(parse->groupClause) < 2) - return; /* Don't fiddle with the GROUP BY clause if the query has grouping sets */ if (parse->groupingSets) return; /* ! * Scan the GROUP BY clause to find GROUP BY items that are simple Vars. ! * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k ! * that are GROUP BY items. */ ! groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * ! (list_length(parse->rtable) + 1)); ! 
foreach(lc, parse->groupClause) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); --- 2947,3032 ---- remove_useless_groupby_columns(PlannerInfo *root) { Query *parse = root->parse; /* Don't fiddle with the GROUP BY clause if the query has grouping sets */ if (parse->groupingSets) return; + parse->groupClause = + remove_dependent_grouping_clauses(root, parse->groupClause); + } + + /* + * remove_useless_distinct_columns + * Like remove_useless_groupby_columns, but for the DISTINCT clause + * + * If we have both a multi-member GROUP BY clause and a multi-member DISTINCT + * clause, this will do a lot of the same catalog lookup work that + * remove_useless_groupby_columns already did. That seems like an unlikely + * case, so for now we don't worry about it, but eventually it might be good + * to refactor to avoid that. + */ + static void + remove_useless_distinct_columns(PlannerInfo *root) + { + Query *parse = root->parse; + /* ! * Don't try to remove anything from a DISTINCT ON clause. For this case, ! * the distinctClauses are closely entwined with the ORDER BY clause, so ! * we'd better not meddle with them. You might think that we can just ! * apply the same optimizations to the ORDER BY too, but we can't since ! * removing an item there could affect the order of the query results. */ ! if (parse->hasDistinctOn) ! return; ! ! /* ! * Likewise, leave DISTINCT alone if the query has grouping sets. A ! * grouping set that omits a column forces that column to NULL in the ! * rows it emits, so a column that depends on the primary key in the base ! * table need not do so in the rows DISTINCT sees; dropping it could ! * merge rows that differ only in such a NULL. ! */ ! if (parse->groupingSets) ! return; ! ! parse->distinctClause = ! remove_dependent_grouping_clauses(root, parse->distinctClause); ! } ! ! /* ! * remove_dependent_grouping_clauses ! * Process clauselist (a list of SortGroupClause) and remove any items ! * that can be proven to be functionally dependent on other items. ! * We will have the same grouping semantics without them. ! * ! * If any item from the list can be removed, then a new list is built which ! * does not contain the removed items. If nothing can be removed then the ! * original list is returned. ! */ ! static List * ! 
remove_dependent_grouping_clauses(PlannerInfo *root, ! List *clauselist) ! { ! Query *parse = root->parse; ! Bitmapset **clauseattnos; ! Bitmapset **surplusvars; ! ListCell *lc; ! int relid; ! ! /* No chance of removing anything if there are fewer than two items */ ! if (list_length(clauselist) < 2) ! return clauselist; ! ! /* ! * Scan the clauselist to find items that are simple Vars. Fill ! * clauseattnos[k] with a bitmapset of the column attnos of RTE k that ! * appear in the clauselist. ! */ ! clauseattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * ! (list_length(parse->rtable) + 1)); ! foreach(lc, clauselist) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); *************** remove_useless_groupby_columns(PlannerIn *** 2971,2979 **** * Ignore non-Vars and Vars from other query levels. * * XXX in principle, stable expressions containing Vars could also be ! * removed, if all the Vars are functionally dependent on other GROUP ! * BY items. But it's not clear that such cases occur often enough to ! * be worth troubling over. */ if (!IsA(var, Var) || var->varlevelsup > 0) --- 3026,3034 ---- * Ignore non-Vars and Vars from other query levels. * * XXX in principle, stable expressions containing Vars could also be ! * removed, if all the Vars are functionally dependent on other items ! * in the clauselist. But it's not clear that such cases occur often ! * enough to be worth troubling over. */ if (!IsA(var, Var) || var->varlevelsup > 0) *************** remove_useless_groupby_columns(PlannerIn *** 2982,2996 **** /* OK, remember we have this Var */ relid = var->varno; Assert(relid <= list_length(parse->rtable)); ! groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], ! var->varattno - FirstLowInvalidHeapAttributeNumber); } /* * Consider each relation and see if it is possible to remove some of its ! * Vars from GROUP BY. For simplicity and speed, we do the actual removal ! 
* in a separate pass. Here, we just fill surplusvars[k] with a bitmapset ! * of the column attnos of RTE k that are removable GROUP BY items. */ surplusvars = NULL; /* don't allocate array unless required */ relid = 0; --- 3037,3055 ---- /* OK, remember we have this Var */ relid = var->varno; Assert(relid <= list_length(parse->rtable)); ! clauseattnos[relid] = bms_add_member(clauseattnos[relid], ! var->varattno - FirstLowInvalidHeapAttributeNumber); } /* * Consider each relation and see if it is possible to remove some of its ! * Vars from the clauselist. We can do so if they are functionally ! * dependent on other Vars from the same relation that are also in the ! * clauselist (independently of any other relations that are mentioned). ! * ! * For simplicity and speed, we do the actual removal in a separate pass. ! * Here, we just fill surplusvars[k] with a bitmapset of the column attnos ! * of RTE k that are removable clauselist items. */ surplusvars = NULL; /* don't allocate array unless required */ relid = 0; *************** remove_useless_groupby_columns(PlannerIn *** 3007,3014 **** if (rte->rtekind != RTE_RELATION) continue; ! /* Nothing to do unless this rel has multiple Vars in GROUP BY */ ! relattnos = groupbyattnos[relid]; if (bms_membership(relattnos) != BMS_MULTIPLE) continue; --- 3066,3073 ---- if (rte->rtekind != RTE_RELATION) continue; ! /* Nothing to do unless this rel has multiple Vars in clauselist */ ! relattnos = clauseattnos[relid]; if (bms_membership(relattnos) != BMS_MULTIPLE) continue; *************** remove_useless_groupby_columns(PlannerIn *** 3022,3028 **** /* * If the primary key is a proper subset of relattnos then we have ! * some items in the GROUP BY that can be removed. */ if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) { --- 3081,3087 ---- /* * If the primary key is a proper subset of relattnos then we have ! * some items in the clauselist that can be removed. 
*/ if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) { *************** remove_useless_groupby_columns(PlannerIn *** 3044,3058 **** } /* ! * If we found any surplus Vars, build a new GROUP BY clause without them. * (Note: this may leave some TLEs with unreferenced ressortgroupref * markings, but that's harmless.) */ if (surplusvars != NULL) { ! List *new_groupby = NIL; ! foreach(lc, parse->groupClause) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); --- 3103,3117 ---- } /* ! * If we found any surplus Vars, build a new clause list without them. * (Note: this may leave some TLEs with unreferenced ressortgroupref * markings, but that's harmless.) */ if (surplusvars != NULL) { ! List *new_clauselist = NIL; ! foreach(lc, clauselist) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); *************** remove_useless_groupby_columns(PlannerIn *** 3066,3076 **** var->varlevelsup > 0 || !bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber, surplusvars[var->varno])) ! new_groupby = lappend(new_groupby, sgc); } ! parse->groupClause = new_groupby; } } /* --- 3125,3138 ---- var->varlevelsup > 0 || !bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber, surplusvars[var->varno])) ! new_clauselist = lappend(new_clauselist, sgc); } ! 
return new_clauselist; } + + /* nothing to change, just return the old list */ + return clauselist; } /* diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index f85e913..2976c4b 100644 *** a/src/test/regress/expected/aggregates.out --- b/src/test/regress/expected/aggregates.out *************** explain (costs off) select * from t3 gro *** 1017,1022 **** --- 1017,1081 ---- -> Seq Scan on t3 (3 rows) + -- + -- Test removal of redundant DISTINCT columns + -- + -- Non-primary-key columns can be removed from DISTINCT clause + explain (costs off) select distinct a,b,c,d from t1; + QUERY PLAN + ---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t1 + (3 rows) + + -- No removal can happen if the complete PK is not present in DISTINCT clause + explain (costs off) select distinct a,c,d from t1; + QUERY PLAN + ---------------------- + HashAggregate + Group Key: a, c, d + -> Seq Scan on t1 + (3 rows) + + -- Test removal across multiple relations + explain (costs off) select distinct t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z + from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y; + QUERY PLAN + ------------------------------------------------------ + HashAggregate + Group Key: t1.a, t1.b, t2.x, t2.y + -> Hash Join + Hash Cond: ((t2.x = t1.a) AND (t2.y = t1.b)) + -> Seq Scan on t2 + -> Hash + -> Seq Scan on t1 + (7 rows) + + -- Test case where t1 can be optimized but not t2 + explain (costs off) select distinct t1.a,t1.b,t1.c,t1.d,t2.x,t2.z + from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y; + QUERY PLAN + ------------------------------------------------------ + HashAggregate + Group Key: t1.a, t1.b, t2.x, t2.z + -> Hash Join + Hash Cond: ((t2.x = t1.a) AND (t2.y = t1.b)) + -> Seq Scan on t2 + -> Hash + -> Seq Scan on t1 + (7 rows) + + -- Ensure we don't remove DISTINCT ON items + explain (costs off) select distinct on (a,b,c) d from t1 order by a,b,c,d; + QUERY PLAN + ------------------------------ + Unique + -> 
Sort + Sort Key: a, b, c, d + -> Seq Scan on t1 + (4 rows) + drop table t1; drop table t2; drop table t3; diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 84c6e9b..fd03559 100644 *** a/src/test/regress/expected/join.out --- b/src/test/regress/expected/join.out *************** select d.* from d left join (select * fr *** 4126,4147 **** -> Seq Scan on d (8 rows) ! -- similarly, but keying off a DISTINCT clause explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id; ! QUERY PLAN ! -------------------------------------- ! Merge Right Join ! Merge Cond: (b.id = d.a) ! -> Unique ! -> Sort ! Sort Key: b.id, b.c_id ! -> Seq Scan on b ! -> Sort ! Sort Key: d.a ! -> Seq Scan on d ! (9 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION --- 4126,4147 ---- -> Seq Scan on d (8 rows) ! -- similarly, but keying off a DISTINCT clause (again, removal of b.c_id ! -- from the DISTINCT step happens too late for join removal) explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id; ! QUERY PLAN ! --------------------------------------- ! Hash Left Join ! Hash Cond: (d.a = s.id) ! -> Seq Scan on d ! -> Hash ! -> Subquery Scan on s ! -> HashAggregate ! Group Key: b.id ! -> Seq Scan on b ! 
(8 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 506d044..1ee26ca 100644 *** a/src/test/regress/sql/aggregates.sql --- b/src/test/regress/sql/aggregates.sql *************** group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z; *** 362,367 **** --- 362,387 ---- -- Cannot optimize when PK is deferrable explain (costs off) select * from t3 group by a,b,c; + -- + -- Test removal of redundant DISTINCT columns + -- + -- Non-primary-key columns can be removed from DISTINCT clause + explain (costs off) select distinct a,b,c,d from t1; + + -- No removal can happen if the complete PK is not present in DISTINCT clause + explain (costs off) select distinct a,c,d from t1; + + -- Test removal across multiple relations + explain (costs off) select distinct t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z + from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y; + + -- Test case where t1 can be optimized but not t2 + explain (costs off) select distinct t1.a,t1.b,t1.c,t1.d,t2.x,t2.z + from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y; + + -- Ensure we don't remove DISTINCT ON items + explain (costs off) select distinct on (a,b,c) d from t1 order by a,b,c,d; + drop table t1; drop table t2; drop table t3; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index b1e05a3..31fd397 100644 *** a/src/test/regress/sql/join.sql --- b/src/test/regress/sql/join.sql *************** explain (costs off) *** 1360,1366 **** select d.* from d left join (select * from b group by b.id, b.c_id) s on d.a = s.id; ! -- similarly, but keying off a DISTINCT clause explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id; --- 1360,1367 ---- select d.* from d left join (select * from b group by b.id, b.c_id) s on d.a = s.id; ! -- similarly, but keying off a DISTINCT clause (again, removal of b.c_id ! 
-- from the DISTINCT step happens too late for join removal) explain (costs off) select d.* from d left join (select distinct * from b) s on d.a = s.id;