From 20a213b27d337bed72256589f0267f30a7724fe8 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 27 Oct 2016 15:24:42 +0200 Subject: [PATCH 7/9] WIP: use ndistinct for selectivity estimation in clausesel.c --- src/backend/optimizer/path/clausesel.c | 382 ++++++++++++++++++++++++++------- 1 file changed, 299 insertions(+), 83 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index fddbcc4..da5c340 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -47,9 +47,10 @@ typedef struct RangeQueryClause static void addRangeClause(RangeQueryClause **rqlist, Node *clause, bool varonleft, bool isLTsel, Selectivity s2); -#define STATS_TYPE_FDEPS 0x01 -#define STATS_TYPE_MCV 0x02 -#define STATS_TYPE_HIST 0x04 +#define STATS_TYPE_NDIST 0x01 +#define STATS_TYPE_FDEPS 0x02 +#define STATS_TYPE_MCV 0x04 +#define STATS_TYPE_HIST 0x08 static bool clause_is_mv_compatible(Node *clause, Index relid, Bitmapset **attnums, int type); @@ -70,6 +71,10 @@ static List *clauselist_mv_split(PlannerInfo *root, Index relid, static Selectivity clauselist_mv_selectivity(PlannerInfo *root, List *clauses, MVStatisticInfo *mvstats); +static Selectivity clauselist_mv_selectivity_ndist(PlannerInfo *root, + Index relid, List *clauses, MVStatisticInfo *mvstats, + Index varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); + static Selectivity clauselist_mv_selectivity_deps(PlannerInfo *root, Index relid, List *clauses, MVStatisticInfo *mvstats, Index varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); @@ -282,6 +287,37 @@ clauselist_selectivity(PlannerInfo *root, } } + /* And finally, try to use ndistinct coefficients. */ + if (has_stats(stats, STATS_TYPE_NDIST) && + (count_mv_attnums(clauses, relid, STATS_TYPE_NDIST) >= 2)) + { + MVStatisticInfo *mvstat; + Bitmapset *mvattnums; + + /* collect attributes from the compatible conditions */ + mvattnums = collect_mv_attnums(clauses, relid, STATS_TYPE_NDIST); + + /* and search for the statistic covering the most attributes */ + mvstat = choose_mv_statistics(stats, mvattnums, STATS_TYPE_NDIST); + + if (mvstat != NULL) /* we have a matching stats */ + { + /* clauses compatible with multi-variate stats */ + List *mvclauses = NIL; + + /* split the clauselist into regular and mv-clauses */ + clauses = clauselist_mv_split(root, relid, clauses, &mvclauses, + mvstat, STATS_TYPE_NDIST); + + /* we've chosen the histogram to match the clauses */ + Assert(mvclauses != NIL); + + /* compute the multivariate stats (dependencies) */ + s1 *= clauselist_mv_selectivity_ndist(root, relid, mvclauses, mvstat, + varRelid, jointype, sjinfo); + } + } + /* * Initial scan over clauses. Anything that doesn't look like a potential * rangequery clause gets multiplied into s1 and forgotten. Anything that @@ -939,6 +975,261 @@ clause_selectivity(PlannerInfo *root, return s1; } + +/* + * estimate selectivity of clauses using multivariate statistic + * + * Perform estimation of the clauses using a MCV list. + * + * This assumes all the clauses are compatible with the selected statistics + * (e.g. only reference columns covered by the statistics, use supported + * operator, etc.). + * + * TODO: We may support some additional conditions, most importantly those + * matching multiple columns (e.g. "a = b" or "a < b"). + * + * TODO: Clamp the selectivity by min of the per-clause selectivities (i.e. the + * selectivity of the most restrictive clause), because that's the maximum + * we can ever get from ANDed list of clauses. This may probably prevent + * issues with hitting too many buckets and low precision histograms. + * + * TODO: We may remember the lowest frequency in the MCV list, and then later + * use it as a upper boundary for the selectivity (had there been a more + * frequent item, it'd be in the MCV list). This might improve cases with + * low-detail histograms. + * + * TODO: We may also derive some additional boundaries for the selectivity from + * the MCV list, because + * + * (a) if we have a "full equality condition" (one equality condition on + * each column of the statistic) and we found a match in the MCV list, + * then this is the final selectivity (and pretty accurate), + * + * (b) if we have a "full equality condition" and we haven't found a match + * in the MCV list, then the selectivity is below the lowest frequency + * found in the MCV list, + * + * TODO: When applying the clauses to the histogram/MCV list, we can do that + * from the most selective clauses first, because that'll eliminate the + * buckets/items sooner (so we'll be able to skip them without inspection, + * which is more expensive). But this requires really knowing the per-clause + * selectivities in advance, and that's not what we do now. + */ +static Selectivity +clauselist_mv_selectivity(PlannerInfo *root, List *clauses, MVStatisticInfo *mvstats) +{ + bool fullmatch = false; + Selectivity s1 = 0.0, + s2 = 0.0; + + /* + * Lowest frequency in the MCV list (may be used as an upper bound for + * full equality conditions that did not match any MCV item). + */ + Selectivity mcv_low = 0.0; + + /* + * TODO: Evaluate simple 1D selectivities, use the smallest one as an + * upper bound, product as lower bound, and sort the clauses in ascending + * order by selectivity (to optimize the MCV/histogram evaluation). + */ + + /* Evaluate the MCV first. */ + s1 = clauselist_mv_selectivity_mcvlist(root, clauses, mvstats, + &fullmatch, &mcv_low); + + /* + * If we got a full equality match on the MCV list, we're done (and the + * estimate is pretty good). + */ + if (fullmatch && (s1 > 0.0)) + return s1; + + /* + * TODO if (fullmatch) without matching MCV item, use the mcv_low + * selectivity as upper bound + */ + + s2 = clauselist_mv_selectivity_histogram(root, clauses, mvstats); + + /* TODO clamp to <= 1.0 (or more strictly, when possible) */ + return s1 + s2; +} + +static MVNDistinctItem * +find_widest_ndistinct_item(MVNDistinct ndistinct, Bitmapset *attnums, + int16 *attmap) +{ + int i; + MVNDistinctItem *widest = NULL; + + /* number of attnums in clauses */ + int nattnums = bms_num_members(attnums); + + /* with less than two attributes, we can bail out right away */ + if (nattnums < 2) + return NULL; + + /* + * Iterate over the MVNDistinctItem items and find the widest one from + * those fully-matched by clasuse. + */ + for (i = 0; i < ndistinct->nitems; i++) + { + int j; + bool full_match = true; + MVNDistinctItem *item = &ndistinct->items[i]; + + /* + * Skip items referencing more attributes than available clauses, + * as those can't be fully matched. + */ + if (item->nattrs > nattnums) + continue; + + /* We can skip items with fewer attributes than the best one. */ + if (widest && (widest->nattrs >= item->nattrs)) + continue; + + /* + * Check that the item actually is fully covered by clauses. We + * have to translate all attribute numbers. + */ + for (j = 0; j < item->nattrs; j++) + { + int attnum = attmap[item->attrs[j]]; + + if (! bms_is_member(attnum, attnums)) + { + full_match = false; + break; + } + } + + /* + * If the item is not fully matched by clauses, we can't use + * it for the estimation. + */ + if (! full_match) + continue; + + /* + * We have a fully-matched item, and we already know it has to + * be wider than the current one (otherwise we'd skip it before + * inspecting it at the very beginning). + */ + widest = item; + } + + return widest; +} + +static bool +attnum_in_ndistinct_item(MVNDistinctItem *item, int attnum, int16 *attmap) +{ + int j; + + for (j = 0; j < item->nattrs; j++) + { + if (attnum == attmap[item->attrs[j]]) + return true; + } + + return false; +} + +static Selectivity +clauselist_mv_selectivity_ndist(PlannerInfo *root, Index relid, + List *clauses, MVStatisticInfo *mvstats, + Index varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + ListCell *lc; + Selectivity s1 = 1.0; + MVNDistinct ndistinct; + MVNDistinctItem *item; + Bitmapset *attnums; + List *clauses_filtered = NIL; + + /* we should only get here if the statistics includes ndistinct */ + Assert(mvstats->ndist_enabled && mvstats->ndist_built); + + /* load the ndistinct items stored in the statistics */ + ndistinct = load_mv_ndistinct(mvstats->mvoid); + + /* collect attnums in the clauses */ + attnums = collect_mv_attnums(clauses, relid, STATS_TYPE_NDIST); + + Assert(bms_num_members(attnums) >= 2); + + /* + * Search for the widest ndistinct item (covering the most clauses), and + * then use it to estimate the number of entries. + */ + item = find_widest_ndistinct_item(ndistinct, attnums, + mvstats->stakeys->values); + + if (item) + { + /* + * We have an applicable item, so identify all covered clauses, and + * remove them from the list of clauses. + */ + foreach(lc, clauses) + { + Bitmapset *attnums_clause = NULL; + Node *clause = (Node *) lfirst(lc); + + /* + * XXX We need the attnum referenced by the clause, and this is the + * easiest way to get it (but maybe not the best one). At this point + * we should only see equality clauses, so just error out if we + * stumble upon something else. + */ + if (! clause_is_mv_compatible(clause, relid, &attnums_clause, + STATS_TYPE_NDIST)) + elog(ERROR, "clause not compatible with ndistinct stats"); + + /* + * We also expect only simple equality clauses, with a single Var. + * + * XXX This checks the number of attnums, not the number of Vars, + * but clause_is_mv_compatible only accepts (Var=Const) clauses. + */ + Assert(bms_num_members(attnums_clause) == 1); + + /* + * If the clause matches the selected ndistinct item, add it to + * the list of ndistinct clauses. + */ + if (!attnum_in_ndistinct_item(item, + bms_singleton_member(attnums_clause), + mvstats->stakeys->values)) + clauses_filtered = lappend(clauses_filtered, clause); + } + + /* Compute selectivity using the ndistinct item. */ + s1 *= (1.0 / item->ndistinct); + + /* + * Throw away the clauses matched by the ndistinct, so that we don't + * estimate them twice. + */ + clauses = clauses_filtered; + } + + /* And now simply multiply with selectivities of the remaining clauses. */ + foreach (lc, clauses) + { + Node *clause = (Node *) lfirst(lc); + + s1 *= clause_selectivity(root, clause, varRelid, jointype, sjinfo); + } + + return s1; +} + + /* * When applying functional dependencies, we start with the strongest ones * strongest dependencies. That is, we select the dependency that: @@ -1147,85 +1438,6 @@ clauselist_mv_selectivity_deps(PlannerInfo *root, Index relid, return s1; } -/* - * estimate selectivity of clauses using multivariate statistic - * - * Perform estimation of the clauses using a MCV list. - * - * This assumes all the clauses are compatible with the selected statistics - * (e.g. only reference columns covered by the statistics, use supported - * operator, etc.). - * - * TODO: We may support some additional conditions, most importantly those - * matching multiple columns (e.g. "a = b" or "a < b"). - * - * TODO: Clamp the selectivity by min of the per-clause selectivities (i.e. the - * selectivity of the most restrictive clause), because that's the maximum - * we can ever get from ANDed list of clauses. This may probably prevent - * issues with hitting too many buckets and low precision histograms. - * - * TODO: We may remember the lowest frequency in the MCV list, and then later - * use it as a upper boundary for the selectivity (had there been a more - * frequent item, it'd be in the MCV list). This might improve cases with - * low-detail histograms. - * - * TODO: We may also derive some additional boundaries for the selectivity from - * the MCV list, because - * - * (a) if we have a "full equality condition" (one equality condition on - * each column of the statistic) and we found a match in the MCV list, - * then this is the final selectivity (and pretty accurate), - * - * (b) if we have a "full equality condition" and we haven't found a match - * in the MCV list, then the selectivity is below the lowest frequency - * found in the MCV list, - * - * TODO: When applying the clauses to the histogram/MCV list, we can do that - * from the most selective clauses first, because that'll eliminate the - * buckets/items sooner (so we'll be able to skip them without inspection, - * which is more expensive). But this requires really knowing the per-clause - * selectivities in advance, and that's not what we do now. - */ -static Selectivity -clauselist_mv_selectivity(PlannerInfo *root, List *clauses, MVStatisticInfo *mvstats) -{ - bool fullmatch = false; - Selectivity s1 = 0.0, - s2 = 0.0; - - /* - * Lowest frequency in the MCV list (may be used as an upper bound for - * full equality conditions that did not match any MCV item). - */ - Selectivity mcv_low = 0.0; - - /* - * TODO: Evaluate simple 1D selectivities, use the smallest one as an - * upper bound, product as lower bound, and sort the clauses in ascending - * order by selectivity (to optimize the MCV/histogram evaluation). - */ - - /* Evaluate the MCV first. */ - s1 = clauselist_mv_selectivity_mcvlist(root, clauses, mvstats, - &fullmatch, &mcv_low); - - /* - * If we got a full equality match on the MCV list, we're done (and the - * estimate is pretty good). - */ - if (fullmatch && (s1 > 0.0)) - return s1; - - /* - * TODO if (fullmatch) without matching MCV item, use the mcv_low - * selectivity as upper bound - */ - - s2 = clauselist_mv_selectivity_histogram(root, clauses, mvstats); - - /* TODO clamp to <= 1.0 (or more strictly, when possible) */ - return s1 + s2; -} /* * Collect attributes from mv-compatible clauses. @@ -1409,7 +1621,8 @@ choose_mv_statistics(List *stats, Bitmapset *attnums, int types) int numattrs = info->stakeys->dim1; /* skip statistics not matching any of the requested types */ - if (! ((info->deps_built && (STATS_TYPE_FDEPS & types)) || + if (! ((info->ndist_built && (STATS_TYPE_NDIST & types)) || + (info->deps_built && (STATS_TYPE_FDEPS & types)) || (info->mcv_built && (STATS_TYPE_MCV & types)) || (info->hist_built && (STATS_TYPE_HIST & types)))) continue; @@ -1703,6 +1916,9 @@ clause_is_mv_compatible(Node *clause, Index relid, Bitmapset **attnums, int type static bool stats_type_matches(MVStatisticInfo *stat, int type) { + if ((type & STATS_TYPE_NDIST) && stat->ndist_built) + return true; + if ((type & STATS_TYPE_FDEPS) && stat->deps_built) return true; -- 2.5.5