diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 4f93afdebc..ec4095bd2e 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -328,6 +328,12 @@ pathkeys_contained_in(List *keys1, List *keys2) return false; } +/*************************************************************/ +bool debug_group_by_reorder_by_pathkeys = true; +bool debug_group_by_match_order_by = true; +bool debug_cheapest_group_by = true; +/************************************************************/ + /* * Reorder GROUP BY pathkeys and clauses to match order of pathkeys. Function * returns new lists, original GROUP BY lists stay untouched. @@ -341,6 +347,9 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, ListCell *key; int n; + if (debug_group_by_reorder_by_pathkeys == false) + return 0; + if (pathkeys == NIL || *group_pathkeys == NIL) return 0; @@ -380,6 +389,157 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, return n; } +/* + * Order the tail of the list of group pathkeys by descending uniqueness, which + * speeds up sorting. Returns newly allocated lists; the old ones stay untouched. + * n_preordered is the length of the list head whose order must be preserved. 
+ */ +void +get_cheapest_group_keys_order(PlannerInfo *root, double nrows, + List *target_list, + List **group_pathkeys, List **group_clauses, + int n_preordered) +{ + struct + { + PathKey *pathkey; + SortGroupClause *sgc; + Node *pathkeyExpr; + } + *keys, tmp; + int nkeys = list_length(*group_pathkeys) - n_preordered; + List *pathkeyExprList = NIL, + *new_group_pathkeys = NIL, + *new_group_clauses = NIL; + ListCell *cell; + int i = 0, n_keys_to_est; + + if (nkeys < 2) + return; /* nothing to do */ + + /* + * Try to match the ORDER BY pathkeys in the hope that one sort is cheaper than + * two + */ + if (debug_group_by_match_order_by && + n_preordered == 0 && root->sort_pathkeys) + { + bool _save_debug_group_by_reorder_by_pathkeys = + debug_group_by_reorder_by_pathkeys; /* DEBUG ONLY, to be removed */ + + debug_group_by_reorder_by_pathkeys = true; + n_preordered = group_keys_reorder_by_pathkeys(root->sort_pathkeys, + group_pathkeys, + group_clauses); + debug_group_by_reorder_by_pathkeys = _save_debug_group_by_reorder_by_pathkeys; + + nkeys = list_length(*group_pathkeys) - n_preordered; + if (nkeys < 2) + return; /* nothing to do */ + } + + if (!debug_cheapest_group_by) + return; + + keys = palloc(nkeys * sizeof(*keys)); + + /* + * Collect information about each pathkey for subsequent use + */ + for_each_cell(cell, list_nth_cell(*group_pathkeys, n_preordered)) + { + PathKey *pathkey = (PathKey *) lfirst(cell); + + keys[i].pathkey = pathkey; + keys[i].sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref, + *group_clauses); + keys[i].pathkeyExpr = get_sortgroupclause_expr(keys[i].sgc, + target_list); + i++; + } + + /* + * Find the cheapest-to-sort order of columns. We first find the column + * with the largest number of groups, then a pair (its first column is already + * fixed by the first step), then a triple and so on. 
+ */ + for(n_keys_to_est = 1; n_keys_to_est <= nkeys - 1; n_keys_to_est++) + { + ListCell *tail_cell; + int best_i = 0; + double best_est_num_groups = -1; + + /* expand the list of columns and remember its last cell */ + pathkeyExprList = lappend(pathkeyExprList, NULL); + tail_cell = list_tail(pathkeyExprList); + + /* + * Find the best last column - best means the largest number of groups; + * previous columns are already chosen + */ + for(i = n_keys_to_est - 1; i < nkeys; i++) + { + double est_num_groups; + + lfirst(tail_cell) = keys[i].pathkeyExpr; + est_num_groups = estimate_num_groups(root, pathkeyExprList, + nrows, NULL); + + if (est_num_groups > best_est_num_groups) + { + best_est_num_groups = est_num_groups; + best_i = i; + } + } + + /* Save the best choice */ + lfirst(tail_cell) = keys[best_i].pathkeyExpr; + if (best_i != n_keys_to_est - 1) + { + tmp = keys[n_keys_to_est - 1]; + keys[n_keys_to_est - 1] = keys[best_i]; + keys[best_i] = tmp; + } + } + list_free(pathkeyExprList); + + /* + * Construct the result lists; the keys array is already ordered to get the cheapest + * sort + */ + i = 0; + foreach(cell, *group_pathkeys) + { + PathKey *pathkey; + SortGroupClause *sgc; + + if (i < n_preordered) + { + pathkey = (PathKey *) lfirst(cell); + sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref, + *group_clauses); + } + else + { + pathkey = keys[i - n_preordered].pathkey; + sgc = keys[i - n_preordered].sgc; + } + + new_group_pathkeys = lappend(new_group_pathkeys, pathkey); + new_group_clauses = lappend(new_group_clauses, sgc); + + i++; + } + + pfree(keys); + + /* Just append the rest of the GROUP BY clauses */ + new_group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses); + + *group_pathkeys = new_group_pathkeys; + *group_clauses = new_group_clauses; +} + /* * get_cheapest_path_for_pathkeys * Find the cheapest path (according to the specified criterion) that diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 
1e7809edf2..f0e23d3354 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -6184,7 +6184,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, bool is_sorted; List *group_pathkeys = root->group_pathkeys, *group_clauses = parse->groupClause; - int n_preordered_groups; + int n_preordered_groups = 0; if (parse->groupingSets) { @@ -6208,11 +6208,20 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, { /* Sort the cheapest-total path if it isn't already sorted */ if (!is_sorted) + { + if (!parse->groupingSets) + get_cheapest_group_keys_order(root, + path->rows, + extra->targetList, + &group_pathkeys, + &group_clauses, + n_preordered_groups); path = (Path *) create_sort_path(root, grouped_rel, path, group_pathkeys, -1.0); + } /* Now decide what to stick atop it */ if (parse->groupingSets) @@ -6286,6 +6295,12 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, { if (path != partially_grouped_rel->cheapest_total_path) continue; + get_cheapest_group_keys_order(root, + path->rows, + extra->targetList, + &group_pathkeys, + &group_clauses, + n_preordered_groups); path = (Path *) create_sort_path(root, grouped_rel, path, @@ -6560,11 +6575,19 @@ create_partial_grouping_paths(PlannerInfo *root, { /* Sort the cheapest partial path, if it isn't already */ if (!is_sorted) + { + get_cheapest_group_keys_order(root, + path->rows, + extra->targetList, + &group_pathkeys, + &group_clauses, + n_preordered_groups); path = (Path *) create_sort_path(root, partially_grouped_rel, path, group_pathkeys, -1.0); + } if (parse->hasAggs) add_path(partially_grouped_rel, (Path *) @@ -6611,11 +6634,19 @@ create_partial_grouping_paths(PlannerInfo *root, /* Sort the cheapest partial path, if it isn't already */ if (!is_sorted) + { + get_cheapest_group_keys_order(root, + path->rows, + extra->targetList, + &group_pathkeys, + &group_clauses, + n_preordered_groups); path = (Path *) create_sort_path(root, 
partially_grouped_rel, path, group_pathkeys, -1.0); + } if (parse->hasAggs) add_partial_path(partially_grouped_rel, (Path *) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fa3c8a7905..4175d252c0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1822,7 +1822,35 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, - +/*********************************************************/ + { + {"debug_group_by_reorder_by_pathkeys", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("enable reorder GROUP BY by pathkeys"), + NULL + }, + &debug_group_by_reorder_by_pathkeys, + true, + NULL, NULL, NULL + }, + { + {"debug_enable_group_by_match_order_by", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("enable matching GROUP BY by ORDER BY."), + NULL + }, + &debug_group_by_match_order_by, + true, + NULL, NULL, NULL + }, + { + {"debug_enable_cheapest_group_by", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("find a cheapest order of columns in GROUP BY."), + NULL + }, + &debug_cheapest_group_by, + true, + NULL, NULL, NULL + }, +/********************************************************/ /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 226b293622..ef91e0dae5 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -193,6 +193,17 @@ extern bool pathkeys_contained_in(List *keys1, List *keys2); extern int group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, List **group_clauses); +/*********************************************************/ +extern bool debug_group_by_reorder_by_pathkeys; +extern bool debug_group_by_match_order_by; +extern bool debug_cheapest_group_by; +/********************************************************/ +extern void get_cheapest_group_keys_order(PlannerInfo *root, + double nrows, + List *target_list, + List **group_pathkeys, + 
List **group_clauses, + int n_preordered); extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys, Relids required_outer, CostSelector cost_criterion, diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index e302dfbdce..31dcf70e47 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2071,19 +2071,145 @@ SELECT i/2 AS p, format('%60s', i%2) AS v, i/4 AS c, - i/8 AS d + i/8 AS d, + (random() * (10000/8))::int as e --the same as d but no correlation with p INTO btg FROM generate_series(1, 10000) i; -CREATE INDEX ON btg(p, v); VACUUM btg; ANALYZE btg; +-- GROUP BY optimization by reorder columns by frequency SET enable_hashagg=off; SET max_parallel_workers= 0; SET max_parallel_workers_per_gather = 0; +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, v; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, v + -> Sort + Sort Key: p, v + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, v + -> Sort + Sort Key: p, v + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, c; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, v, c + -> Sort + Sort Key: p, v, c + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: v, p, c + -> Sort + Sort Key: v, p, c + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c; + QUERY PLAN +------------------------------ + GroupAggregate + Group Key: p, v, d, c + -> Sort + Sort Key: p, v, d, c + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c; + QUERY PLAN 
+------------------------------ + GroupAggregate + Group Key: v, p, d, c + -> Sort + Sort Key: v, p, d, c + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c; + QUERY PLAN +------------------------------ + GroupAggregate + Group Key: p, v, d, c + -> Sort + Sort Key: p, v, d, c + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, d, e; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, d, e + -> Sort + Sort Key: p, d, e + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, e, d; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, e, d + -> Sort + Sort Key: p, e, d + -> Seq Scan on btg +(5 rows) + +CREATE STATISTICS btg_dep ON d, e, p FROM btg; +ANALYZE btg; +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, d, e; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, e, d + -> Sort + Sort Key: p, e, d + -> Seq Scan on btg +(5 rows) + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, e, d; + QUERY PLAN +----------------------------- + GroupAggregate + Group Key: p, e, d + -> Sort + Sort Key: p, e, d + -> Seq Scan on btg +(5 rows) + +-- GROUP BY optimization by reorder columns by index scan +CREATE INDEX ON btg(p, v); SET enable_seqscan=off; SET enable_bitmapscan=off; --- GROUP BY optimization by reorder columns by index scan +VACUUM btg; EXPLAIN (COSTS off) SELECT count(*) FROM btg GROUP BY p, v; QUERY PLAN diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index b983f9c506..3915a837f0 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -1140,7 +1140,7 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, pl QUERY PLAN -------------------------------------------------------------------------------- 
GroupAggregate - Group Key: t1.c, t2.c, t3.c + Group Key: t1.c, t3.c, t2.c -> Sort Sort Key: t1.c, t3.c -> Append @@ -1284,7 +1284,7 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, ph QUERY PLAN -------------------------------------------------------------------------------- GroupAggregate - Group Key: t1.c, t2.c, t3.c + Group Key: t1.c, t3.c, t2.c -> Sort Sort Key: t1.c, t3.c -> Append diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 054a381dad..a686a75fb0 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -244,9 +244,9 @@ EXPLAIN (COSTS off) QUERY PLAN ----------------------------------- GroupAggregate - Group Key: a, b, c, d + Group Key: a, d, c, b -> Sort - Sort Key: a, b, c, d + Sort Key: a, d, c, b -> Seq Scan on ndistinct (5 rows) @@ -255,9 +255,9 @@ EXPLAIN (COSTS off) QUERY PLAN ----------------------------------- GroupAggregate - Group Key: b, c, d + Group Key: b, d, c -> Sort - Sort Key: b, c, d + Sort Key: b, d, c -> Seq Scan on ndistinct (5 rows) @@ -281,9 +281,9 @@ EXPLAIN (COSTS off) QUERY PLAN ----------------------------------- GroupAggregate - Group Key: a, b + Group Key: b, a -> Sort - Sort Key: a, b + Sort Key: b, a -> Seq Scan on ndistinct (5 rows) @@ -292,9 +292,9 @@ EXPLAIN (COSTS off) QUERY PLAN ----------------------------------- GroupAggregate - Group Key: a, b, c + Group Key: b, a, c -> Sort - Sort Key: a, b, c + Sort Key: b, a, c -> Seq Scan on ndistinct (5 rows) @@ -303,9 +303,9 @@ EXPLAIN (COSTS off) QUERY PLAN ----------------------------------- GroupAggregate - Group Key: a, b, c, d + Group Key: d, b, a, c -> Sort - Sort Key: a, b, c, d + Sort Key: d, b, a, c -> Seq Scan on ndistinct (5 rows) diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 7ef703f3a7..e4415c8d84 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ 
-915,22 +915,65 @@ SELECT i/2 AS p, format('%60s', i%2) AS v, i/4 AS c, - i/8 AS d + i/8 AS d, + (random() * (10000/8))::int as e --the same as d but no correlation with p INTO btg FROM generate_series(1, 10000) i; -CREATE INDEX ON btg(p, v); VACUUM btg; ANALYZE btg; +-- GROUP BY optimization by reorder columns by frequency + SET enable_hashagg=off; SET max_parallel_workers= 0; SET max_parallel_workers_per_gather = 0; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, v; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, c; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, c ORDER BY v, p, c; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY v, p, d ,c; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY v, p, d, c ORDER BY p, v, d ,c; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, d, e; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, e, d; + +CREATE STATISTICS btg_dep ON d, e, p FROM btg; +ANALYZE btg; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, d, e; + +EXPLAIN (COSTS off) +SELECT count(*) FROM btg GROUP BY p, e, d; + + +-- GROUP BY optimization by reorder columns by index scan + +CREATE INDEX ON btg(p, v); SET enable_seqscan=off; SET enable_bitmapscan=off; +VACUUM btg; --- GROUP BY optimization by reorder columns by index scan EXPLAIN (COSTS off) SELECT count(*) FROM btg GROUP BY p, v;