From a5badc43aa37d249c562a4605478bb7c897b76f6 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 23 Oct 2016 17:37:27 +0200 Subject: [PATCH 4/9] PATCH: selectivity estimation using functional dependencies Use functional dependencies to correct selectivity estimates of equality clauses. For now this only works with regular WHERE conditions, not join clauses etc. Given two equality clauses (a = 1) AND (b = 2) we compute selectivity for each condition, and then combine them using formula P(a=1, b=2) = P(a=1) * [degree + (1 - degree) * P(b=2)] where 'degree' of the functional dependence (a => b) is a number between [0,1] measuring how much the knowledge of 'a' determines the value of 'b'. For 'degree=0' this degrades to independence, for 'degree=1' we get perfect functional dependency. Estimates of more than two clauses are computed recursively, so for example (a = 1) AND (b = 2) AND (c = 3) is first split into P(a=1, b=2, c=3) = P(a=1, b=2) * [d + (1-d) * P(c=3)] where 'd' is degree of (a,b => c) functional dependency. And then the first part of the estimate is computed recursively: P(a=1, b=2) = P(a=1) * [d + (1-d) * P(b=2)] where 'd' is degree of (a => b) dependency. The patch includes regression tests with functional dependencies on several synthetic datasets (random, perfectly correlated, etc.) 
--- doc/src/sgml/planstats.sgml | 178 +++++- src/backend/optimizer/path/clausesel.c | 781 +++++++++++++++++++++++++- src/backend/utils/mvstats/README.stats | 45 +- src/backend/utils/mvstats/common.c | 1 + src/backend/utils/mvstats/dependencies.c | 68 +++ src/include/utils/mvstats.h | 6 +- src/test/regress/expected/mv_dependencies.out | 28 +- src/test/regress/sql/mv_dependencies.sql | 19 +- 8 files changed, 1072 insertions(+), 54 deletions(-) diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index d5b975d..5436c8a 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -504,7 +504,7 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = 't'; EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------------------------------------------- Seq Scan on t (cost=0.00..170.00 rows=100 width=8) (actual time=0.031..2.870 rows=100 loops=1) Filter: (a = 1) @@ -527,7 +527,7 @@ EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1; EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------------------------- Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual time=0.033..3.006 rows=100 loops=1) Filter: ((a = 1) AND (b = 1)) @@ -547,11 +547,11 @@ EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1; Overestimates, i.e. errors in the opposite direction, are also possible. Consider for example the following combination of range conditions, each - matching + matching roughly half the rows. 
EXPLAIN ANALYZE SELECT * FROM t WHERE a <= 49 AND b > 49; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------------------------------------------ Seq Scan on t (cost=0.00..195.00 rows=2500 width=8) (actual time=1.607..1.607 rows=0 loops=1) Filter: ((a <= 49) AND (b > 49)) @@ -587,6 +587,176 @@ EXPLAIN ANALYZE SELECT * FROM t WHERE a <= 49 AND b > 49; sections. + + Functional Dependencies + + + The simplest type of multivariate statistics are functional dependencies, + used in definitions of database normal forms. When simplified, saying that + b is functionally dependent on a means that + knowledge of value of a is sufficient to determine value of + b. + + + + In normalized databases, only functional dependencies on primary keys + and super keys are allowed. In practice however many data sets are not + fully normalized, for example thanks to intentional denormalization for + performance reasons. The table t is an example of a data set + with functional dependencies. As a = b for all rows in the + table, a is functionally dependent on b and + b is functionally dependent on a. + + + + Functional dependencies directly affect accuracy of the estimates, as + conditions on the dependent column(s) do not restrict the result set, + and are often redundant, causing underestimates. In the first example, + either a = 1 or b = 1 is sufficient (however see + ). + + + + To inform the planner about the functional dependencies, or rather to + instruct it to search for them during ANALYZE, we can use + the CREATE STATISTICS command. 
+ + +CREATE STATISTICS s1 ON t (a,b) WITH (dependencies); +ANALYZE t; +EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=0.095..3.118 rows=100 loops=1) + Filter: ((a = 1) AND (b = 1)) + Rows Removed by Filter: 9900 + Planning time: 0.367 ms + Execution time: 3.380 ms +(5 rows) + + + As you can see, the estimate improved quite a bit, as the planner is now + aware of the functional dependencies and eliminates the second condition + when computing the estimates. + + + + Let's inspect multivariate statistics on a table, as defined by + CREATE STATISTICS and built by ANALYZE. If you're + using psql, the easiest way to list statistics on a table + is by using \d. + + +\d t + Table "public.t" + Column | Type | Modifiers +--------+---------+----------- + a | integer | + b | integer | +Statistics: + "public.s1" (dependencies) ON (a, b) + + + + + + Similarly to per-column statistics, multivariate statistics are stored in + a system catalog called pg_mv_statistic, but + there is also a more convenient view pg_mv_stats. + To inspect the statistics s1 defined above, + you may do this: + + +SELECT tablename, staname, attnums, depsbytes, depsinfo + FROM pg_mv_stats WHERE staname = 's1'; + + tablename | staname | attnums | depsbytes | depsinfo +-----------+---------+---------+-----------+---------------- + t | s1 | 1 2 | 32 | dependencies=2 +(1 row) + + + This shows that the statistic is defined on table t, + attnums lists attribute numbers of columns + (references pg_attribute). It also shows + ANALYZE found two functional dependencies, and size when + serialized into a bytea column. Inspecting the functional + dependencies is possible using pg_mv_stats_dependencies_show + function. 
+ + +SELECT pg_mv_stats_dependencies_show(stadeps) + FROM pg_mv_statistic WHERE staname = 's1'; + + pg_mv_stats_dependencies_show +------------------------------- + (1) => 2, (2) => 1 +(1 row) + + + Which confirms a is functionally dependent on b and + b is functionally dependent on a. + + + + Now let's quickly discuss how this knowledge is applied when estimating + the selectivity. The planner walks through the conditions and attempts + to identify which conditions are already implied by other conditions, + and eliminates them (but only for the estimation, all conditions will be + checked on tuples during execution). In the example query, either of + the conditions may get eliminated, improving the estimate. This happens + in clauselist_apply_dependencies in clausesel.c. + + + + Limitations of functional dependencies + + + The first limitation of functional dependencies is that they only work + with simple equality conditions, comparing columns and constant values. + It's not possible to use them to eliminate equality conditions comparing + two columns or a column to an expression, range clauses, LIKE + or any other type of condition. + + + + When eliminating the implied conditions, the planner assumes that the + conditions are compatible. Consider the following example, violating + this assumption: + + +EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=2.992..2.992 rows=0 loops=1) + Filter: ((a = 1) AND (b = 10)) + Rows Removed by Filter: 10000 + Planning time: 0.232 ms + Execution time: 3.033 ms +(5 rows) + + + There are no rows with this combination of values, however the planner + is unable to verify whether the values match - it only knows that + the columns are functionally dependent. 
+ + + + This assumption is more about queries executed on the database - in many + cases it's actually satisfied (e.g. when the GUI only allows selecting + compatible values). But if that's not the case, functional dependencies + may not be a viable option. + + + + For additional information about functional dependencies, see + src/backend/utils/mvstats/README.dependencies. + + + + + + diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index af2934a..cc79282 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -14,14 +14,19 @@ */ #include "postgres.h" +#include "access/sysattr.h" +#include "catalog/pg_operator.h" #include "nodes/makefuncs.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/plancat.h" +#include "optimizer/var.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" +#include "utils/mvstats.h" #include "utils/selfuncs.h" +#include "utils/typcache.h" /* @@ -41,6 +46,33 @@ typedef struct RangeQueryClause static void addRangeClause(RangeQueryClause **rqlist, Node *clause, bool varonleft, bool isLTsel, Selectivity s2); +#define STATS_TYPE_FDEPS 0x01 + +static bool clause_is_mv_compatible(Node *clause, Index relid, AttrNumber *attnum); + +static Bitmapset *collect_mv_attnums(List *clauses, Index relid); + +static int count_mv_attnums(List *clauses, Index relid); + +static int count_varnos(List *clauses, Index *relid); + +static MVStatisticInfo *choose_mv_statistics(List *mvstats, Bitmapset *attnums, + int types); + +static List *clauselist_mv_split(PlannerInfo *root, Index relid, + List *clauses, List **mvclauses, + MVStatisticInfo *mvstats, int types); + +static Selectivity clauselist_mv_selectivity_deps(PlannerInfo *root, + Index relid, List *clauses, MVStatisticInfo *mvstats, + Index varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); + +static bool has_stats(List *stats, int type); + +static List 
*find_stats(PlannerInfo *root, Index relid); + +static bool stats_type_matches(MVStatisticInfo *stat, int type); + /**************************************************************************** * ROUTINES TO COMPUTE SELECTIVITIES @@ -60,7 +92,19 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause, * subclauses. However, that's only right if the subclauses have independent * probabilities, and in reality they are often NOT independent. So, * we want to be smarter where we can. - + * + * The first thing we try to do is applying multivariate statistics, in a way + * that intends to minimize the overhead when there are no multivariate stats + * on the relation. Thus we do several simple (and inexpensive) checks first, + * to verify that suitable multivariate statistics exist. + * + * If we identify such multivariate statistics apply, we try to apply them. + * Currently we only have (soft) functional dependencies, so we try to reduce + * the list of clauses. + * + * Then we remove the clauses estimated using multivariate stats, and process + * the rest of the clauses using the regular per-column stats. + * * Currently, the only extra smarts we have is to recognize "range queries", * such as "x > 34 AND x < 42". Clauses are recognized as possible range * query components if they are restriction opclauses whose operators have @@ -99,15 +143,81 @@ clauselist_selectivity(PlannerInfo *root, RangeQueryClause *rqlist = NULL; ListCell *l; + /* processing mv stats */ + Oid relid = InvalidOid; + + /* list of multivariate stats on the relation */ + List *stats = NIL; + /* - * If there's exactly one clause, then no use in trying to match up pairs, - * so just go directly to clause_selectivity(). + * If there's exactly one clause, then multivariate statistics is futile + * at this level (we might be able to apply them later if it's AND/OR + * clause). So just go directly to clause_selectivity(). 
*/ if (list_length(clauses) == 1) return clause_selectivity(root, (Node *) linitial(clauses), varRelid, jointype, sjinfo); /* + * To fetch the statistics, we first need to determine the rel. Currently + * we only support estimates of simple restrictions referencing a single + * baserel (no join statistics). However set_baserel_size_estimates() sets + * varRelid=0 so we have to actually inspect the clauses by pull_varnos + * and see if there's just a single varno referenced. + * + * XXX Maybe there's a better way to find the relid? + */ + if ((count_varnos(clauses, &relid) == 1) && + ((varRelid == 0) || (varRelid == relid))) + stats = find_stats(root, relid); + + /* + * Check that there are multivariate statistics usable for selectivity + * estimation, i.e. anything except ndistinct coefficients. + * + * Also check the number of attributes in clauses that might be estimated + * using those statistics, and that there are at least two such attributes. + * It may easily happen that we won't be able to estimate the clauses using + * the multivariate statistics anyway, but verifying that is more expensive + * (so this cheap check should be worth it). + * + * If there are no such stats or not enough attributes, don't waste time; + * simply skip to estimation using the plain per-column stats. 
+ */ + if (has_stats(stats, STATS_TYPE_FDEPS) && + (count_mv_attnums(clauses, relid) >= 2)) + { + MVStatisticInfo *mvstat; + Bitmapset *mvattnums; + + /* collect attributes from the compatible conditions */ + mvattnums = collect_mv_attnums(clauses, relid); + + /* and search for the statistic covering the most attributes */ + mvstat = choose_mv_statistics(stats, mvattnums, STATS_TYPE_FDEPS); + + if (mvstat != NULL) /* we have matching stats */ + { + /* clauses compatible with multi-variate stats */ + List *mvclauses = NIL; + + /* split the clauselist into regular and mv-clauses */ + clauses = clauselist_mv_split(root, relid, clauses, &mvclauses, + mvstat, STATS_TYPE_FDEPS); + + /* Empty list of clauses is a clear sign something went wrong. */ + Assert(list_length(mvclauses)); + + /* we've chosen the statistics to match the clauses */ + Assert(mvclauses != NIL); + + /* compute the multivariate stats (dependencies) */ + s1 *= clauselist_mv_selectivity_deps(root, relid, mvclauses, mvstat, + varRelid, jointype, sjinfo); + } + } + + /* * Initial scan over clauses. Anything that doesn't look like a potential * rangequery clause gets multiplied into s1 and forgotten. Anything that * does gets inserted into an rqlist entry. @@ -763,3 +873,668 @@ clause_selectivity(PlannerInfo *root, return s1; } + +/* + * When applying functional dependencies, we start with the strongest + * dependencies. That is, we select the dependency that: + * + * (a) has all attributes covered by the clauses + * + * (b) has the most attributes + * + * (c) has the highest degree of validity + * + * TODO Explain why we select the dependencies this way. 
+ */ +static MVDependency +find_strongest_dependency(MVStatisticInfo *mvstats, MVDependencies dependencies, + Bitmapset *attnums) +{ + int i; + MVDependency strongest = NULL; + + /* number of attnums in clauses */ + int nattnums = bms_num_members(attnums); + + /* + * Iterate over the MVDependency items and find the strongest one from + * the fully-matched dependencies. We do the cheap checks first, before + * matching it against the attnums. + */ + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency dependency = dependencies->deps[i]; + + /* + * Skip dependencies referencing more attributes than available clauses, + * as those can't be fully matched. + */ + if (dependency->nattributes > nattnums) + continue; + + /* We can skip dependencies on fewer attributes than the best one. */ + if (strongest && (strongest->nattributes > dependency->nattributes)) + continue; + + /* And also weaker dependencies on the same number of attributes. */ + if (strongest && + (strongest->nattributes == dependency->nattributes) && + (strongest->degree > dependency->degree)) + continue; + + /* + * Check that the dependency actually is fully covered by clauses. + * If the dependency is not fully matched by clauses, we can't use + * it for the estimation. + */ + if (! dependency_is_fully_matched(dependency, attnums, + mvstats->stakeys->values)) + continue; + + /* + * We have a fully-matched dependency, and we already know it has to + * be at least as strong as the current one (otherwise we'd have skipped + * it in the cheap checks above, before inspecting it). + */ + strongest = dependency; + } + + return strongest; +} + +/* + * clauselist_mv_selectivity_deps + * estimate selectivity using functional dependencies + * + * Given equality clauses on attributes (a,b) we find the strongest dependency + * between them, i.e. either (a=>b) or (b=>a). 
Assuming (a=>b) is the selected + * dependency, we then combine the per-clause selectivities using the formula + * + * P(a,b) = P(a) * [f + (1-f)*P(b)] + * + * where 'f' is the degree of the dependency. + * + * With clauses on more than two attributes, the dependencies are applied + * recursively, starting with the widest/strongest dependencies. For example + * P(a,b,c) is first split like this: + * + * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] + * + * assuming (a,b=>c) is the strongest dependency. + */ +static Selectivity +clauselist_mv_selectivity_deps(PlannerInfo *root, Index relid, + List *clauses, MVStatisticInfo *mvstats, + Index varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + ListCell *lc; + Selectivity s1 = 1.0; + MVDependencies dependencies; + + Assert(mvstats->deps_enabled && mvstats->deps_built); + + /* load the dependency items stored in the statistics */ + dependencies = load_mv_dependencies(mvstats->mvoid); + + Assert(dependencies); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + */ + while (true) + { + Selectivity s2 = 1.0; + Bitmapset *attnums; + MVDependency dependency; + + /* clauses remaining after removing those on the "implied" attribute */ + List *clauses_filtered = NIL; + + attnums = collect_mv_attnums(clauses, relid); + + /* no point in looking for dependencies with fewer than 2 attributes */ + if (bms_num_members(attnums) < 2) + break; + + /* the widest/strongest dependency, fully matched by clauses */ + dependency = find_strongest_dependency(mvstats, dependencies, attnums); + + /* if no suitable dependency was found, we're done */ + if (! 
dependency) + break; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute, e.g. with dependency (a,b => c) we search for clauses + * on 'c'. We only really expect a single such clause, but in case + * there are more we simply multiply the selectivities as usual. + * + * XXX Maybe we should use the maximum, minimum or just error out? + */ + foreach(lc, clauses) + { + AttrNumber attnum_clause = InvalidAttrNumber; + Node *clause = (Node *) lfirst(lc); + + /* + * XXX We need the attnum referenced by the clause, and this is the + * easiest way to get it (but maybe not the best one). At this point + * we should only see equality clauses compatible with functional + * dependencies, so just error out if we stumble upon something else. + */ + if (! clause_is_mv_compatible(clause, relid, &attnum_clause)) + elog(ERROR, "clause not compatible with functional dependencies"); + + Assert(AttributeNumberIsValid(attnum_clause)); + + /* + * If the clause is not on the implied attribute, add it to the list + * of filtered clauses (for the next round) and continue with the + * next one. + */ + if (! dependency_implies_attribute(dependency, attnum_clause, + mvstats->stakeys->values)) + { + clauses_filtered = lappend(clauses_filtered, clause); + continue; + } + + /* + * Otherwise compute selectivity of the clause, and multiply it with + * other clauses on the same attribute. + * + * XXX Not sure if we need to worry about multiple clauses, though. + * Those are all equality clauses, and if they reference different + * constants, that's not going to work. + */ + s2 *= clause_selectivity(root, clause, varRelid, jointype, sjinfo); + } + + /* + * Now factor in the selectivity for all the "implied" clauses into the + * final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. 
+ */ + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + + /* And only keep the filtered clauses for the next round. */ + clauses = clauses_filtered; + } + + /* And now simply multiply with selectivities of the remaining clauses. */ + foreach (lc, clauses) + { + Node *clause = (Node *) lfirst(lc); + + s1 *= clause_selectivity(root, clause, varRelid, jointype, sjinfo); + } + + return s1; +} + +/* + * Collect attributes from mv-compatible clauses. + */ +static Bitmapset * +collect_mv_attnums(List *clauses, Index relid) +{ + Bitmapset *attnums = NULL; + ListCell *l; + + /* + * Walk through the clauses and identify the ones we can estimate using + * multivariate stats, and remember the relid/columns. We'll then + * cross-check if we have suitable stats, and only if needed we'll split + * the clauses into multivariate and regular lists. + * + * For now we're only interested in RestrictInfo nodes with nested OpExpr, + * using either a range or equality. + */ + foreach(l, clauses) + { + AttrNumber attnum; + Node *clause = (Node *) lfirst(l); + + /* ignore the result for now - we only need the info */ + if (clause_is_mv_compatible(clause, relid, &attnum)) + attnums = bms_add_member(attnums, attnum); + } + + /* + * If there are not at least two attributes referenced by the clause(s), + * we can throw everything out (as we'll revert to simple stats). + */ + if (bms_num_members(attnums) <= 1) + { + if (attnums != NULL) + pfree(attnums); + attnums = NULL; + } + + return attnums; +} + +/* + * Count the number of attributes in clauses compatible with multivariate stats. + */ +static int +count_mv_attnums(List *clauses, Index relid) +{ + int c; + Bitmapset *attnums = collect_mv_attnums(clauses, relid); + + c = bms_num_members(attnums); + + bms_free(attnums); + + return c; +} + +/* + * Count varnos referenced in the clauses, and if there's a single varno then + * return the index in 'relid'. 
+ */ +static int +count_varnos(List *clauses, Index *relid) +{ + int cnt; + Bitmapset *varnos = NULL; + + varnos = pull_varnos((Node *) clauses); + cnt = bms_num_members(varnos); + + /* if there's a single varno in the clauses, remember it */ + if (bms_num_members(varnos) == 1) + *relid = bms_singleton_member(varnos); + + bms_free(varnos); + + return cnt; +} + +static int +count_attnums_covered_by_stats(MVStatisticInfo *info, Bitmapset *attnums) +{ + int i; + int matches = 0; + int2vector *attrs = info->stakeys; + + /* count columns covered by the statistics */ + for (i = 0; i < attrs->dim1; i++) + if (bms_is_member(attrs->values[i], attnums)) + matches++; + + return matches; +} + +/* + * We're looking for statistics matching at least 2 attributes, referenced in + * clauses compatible with multivariate statistics. The current selection + * criteria is very simple - we choose the statistics referencing the most + * attributes. + * + * If there are multiple statistics referencing the same number of columns + * (from the clauses), the one with less source columns (as listed in the + * ADD STATISTICS when creating the statistics) wins. Else the first one wins. + * + * This is a very simple criteria, and has several weaknesses: + * + * (a) does not consider the accuracy of the statistics + * + * If there are two histograms built on the same set of columns, but one + * has 100 buckets and the other one has 1000 buckets (thus likely + * providing better estimates), this is not currently considered. + * + * (b) does not consider the type of statistics + * + * If there are three statistics - one containing just a MCV list, another + * one with just a histogram and a third one with both, we treat them equally. + * + * (c) does not consider the number of clauses + * + * As explained, only the number of referenced attributes counts, so if + * there are multiple clauses on a single attribute, this still counts as + * a single attribute. 
+ * + * (d) does not consider type of condition + * + * Some clauses may work better with some statistics - for example equality + * clauses probably work better with MCV lists than with histograms. But + * IS [NOT] NULL conditions may often work better with histograms (thanks + * to NULL-buckets). + * + * So for example with five WHERE conditions + * + * WHERE (a = 1) AND (b = 1) AND (c = 1) AND (d = 1) AND (e = 1) + * + * and statistics on (a,b), (a,b,e) and (a,b,c,d), the last one will be selected + * as it references the most columns. + * + * Once we have selected the multivariate statistics, we split the list of + * clauses into two parts - conditions that are compatible with the selected + * stats, and conditions are estimated using simple statistics. + * + * From the example above, conditions + * + * (a = 1) AND (b = 1) AND (c = 1) AND (d = 1) + * + * will be estimated using the multivariate statistics (a,b,c,d) while the last + * condition (e = 1) will get estimated using the regular ones. + * + * There are various alternative selection criteria (e.g. counting conditions + * instead of just referenced attributes), but eventually the best option should + * be to combine multiple statistics. But that's much harder to do correctly. + * + * TODO: Select multiple statistics and combine them when computing the estimate. + * + * TODO: This will probably have to consider compatibility of clauses, because + * 'dependencies' will probably work only with equality clauses. + */ +static MVStatisticInfo * +choose_mv_statistics(List *stats, Bitmapset *attnums, int types) +{ + ListCell *lc; + + MVStatisticInfo *choice = NULL; + + int current_matches = 2; /* goal #1: maximize */ + int current_dims = (MVSTATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */ + + /* + * Walk through the statistics (simple array with nmvstats elements) and + * for each one count the referenced attributes (encoded in the 'attnums' + * bitmap). 
+ */ + foreach(lc, stats) + { + MVStatisticInfo *info = (MVStatisticInfo *) lfirst(lc); + + /* columns matching this statistics */ + int matches = 0; + + /* size (number of dimensions) of this statistics */ + int numattrs = info->stakeys->dim1; + + /* skip statistics not matching any of the requested types */ + if (! (info->deps_built && (STATS_TYPE_FDEPS & types))) + continue; + + /* count columns covered by the statistics */ + matches = count_attnums_covered_by_stats(info, attnums); + + /* + * Use this statistics when it increases the number of matched clauses + * or when it matches the same number of attributes but is smaller + * (in terms of number of attributes covered). + */ + if ((matches > current_matches) || + ((matches == current_matches) && (current_dims > numattrs))) + { + choice = info; + current_matches = matches; + current_dims = numattrs; + } + } + + return choice; +} + + +/* + * clauselist_mv_split + * split the clause list into a part to be estimated using the provided + * statistics, and remaining clauses (estimated in some other way) + */ +static List * +clauselist_mv_split(PlannerInfo *root, Index relid, + List *clauses, List **mvclauses, + MVStatisticInfo *mvstats, int types) +{ + int i; + ListCell *l; + List *non_mvclauses = NIL; + + /* FIXME is there a better way to get info on int2vector? */ + int2vector *attrs = mvstats->stakeys; + int numattrs = mvstats->stakeys->dim1; + + Bitmapset *mvattnums = NULL; + + /* build bitmap of attributes, so we can do bms_is_subset later */ + for (i = 0; i < numattrs; i++) + mvattnums = bms_add_member(mvattnums, attrs->values[i]); + + /* erase the list of mv-compatible clauses */ + *mvclauses = NIL; + + foreach(l, clauses) + { + bool match = false; /* by default not mv-compatible */ + AttrNumber attnum = InvalidAttrNumber; + Node *clause = (Node *) lfirst(l); + + if (clause_is_mv_compatible(clause, relid, &attnum)) + { + /* are all the attributes part of the selected stats? 
*/ + if (bms_is_member(attnum, mvattnums)) + match = true; + } + + /* + * The clause matches the selected stats, so put it to the list of + * mv-compatible clauses. Otherwise, keep it in the list of 'regular' + * clauses (that may be selected later). + */ + if (match) + *mvclauses = lappend(*mvclauses, clause); + else + non_mvclauses = lappend(non_mvclauses, clause); + } + + /* + * Perform regular estimation using the clauses incompatible with the + * chosen histogram (or MV stats in general). + */ + return non_mvclauses; + +} + +typedef struct +{ + Index varno; /* relid we're interested in */ + Bitmapset *varattnos; /* attnums referenced by the clauses */ +} mv_compatible_context; + +/* + * Recursive walker that checks compatibility of the clause with multivariate + * statistics, and collects attnums from the Vars. + * + * XXX The original idea was to combine this with expression_tree_walker, but + * I've been unable to make that work - seems that does not quite allow + * checking the structure. Hence the explicit calls to the walker. + */ +static bool +mv_compatible_walker(Node *node, mv_compatible_context *context) +{ + if (node == NULL) + return false; + + if (IsA(node, RestrictInfo)) + { + RestrictInfo *rinfo = (RestrictInfo *) node; + + /* Pseudoconstants are not really interesting here. */ + if (rinfo->pseudoconstant) + return true; + + /* clauses referencing multiple varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return true; + + /* check the clause inside the RestrictInfo */ + return mv_compatible_walker((Node *) rinfo->clause, (void *) context); + } + + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + /* + * Also, the variable needs to reference the right relid (this might + * be unnecessary given the other checks, but let's be sure). + */ + if (var->varno != context->varno) + return true; + + /* Also skip system attributes (we don't allow stats on those). 
*/ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return true; + + /* Seems fine, so let's remember the attnum. */ + context->varattnos = bms_add_member(context->varattnos, var->varattno); + + return false; + } + + /* + * And finally the operator expressions - we only allow simple expressions + * with two arguments, where one is a Var and the other is a constant, and + * it's a simple comparison (which we detect using the estimator function). + */ + if (is_opclause(node)) + { + OpExpr *expr = (OpExpr *) node; + Var *var; + bool varonleft = true; + bool ok; + + /* + * Only expressions with two arguments are considered compatible. + * + * XXX Possibly unnecessary (can OpExpr have different arg count?). + */ + if (list_length(expr->args) != 2) + return true; + + /* see if it actually has the right structure (Var op Const) */ + ok = (NumRelids((Node *) expr) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + + /* unsupported structure (two variables or so) */ + if (!ok) + return true; + + /* + * If it's not an equality operator, just ignore the clause. + * Otherwise note the relid and attnum for the variable. This uses the + * function for estimating selectivity, not the operator directly (a + * bit awkward, but well ...). + */ + switch (get_oprrest(expr->opno)) + { + case F_EQSEL: + + /* equality conditions are compatible with all statistics */ + break; + + default: + + /* unknown estimator */ + return true; + } + + var = (varonleft) ? linitial(expr->args) : lsecond(expr->args); + + return mv_compatible_walker((Node *) var, context); + } + + /* Node not explicitly supported, so terminate */ + return true; +} + +/* + * Determines whether the clause is compatible with multivariate stats, + * and if it is, returns the attribute number of the column referenced + * by the clause (for the given relid) in 'attnum'. This is then + * used to fetch related multivariate statistics. 
+ * + * At this moment we only support basic conditions of the form + * + * variable OP constant + * + * where OP is one of [=,<,<=,>=,>] (which is however determined by + * looking at the associated function for estimating selectivity, just + * like with the single-dimensional case). + * + * TODO: Support 'OR clauses' - shouldn't be all that difficult to + * evaluate them using multivariate stats. + */ +static bool +clause_is_mv_compatible(Node *clause, Index relid, AttrNumber *attnum) +{ + mv_compatible_context context; + + context.varno = relid; + context.varattnos = NULL; /* no attnums */ + + if (mv_compatible_walker(clause, (void *) &context)) + return false; + + /* remember the newly collected attnums */ + *attnum = bms_singleton_member(context.varattnos); + + return true; +} + + +/* + * Check that the statistics matches at least one of the requested types. + */ +static bool +stats_type_matches(MVStatisticInfo *stat, int type) +{ + if ((type & STATS_TYPE_FDEPS) && stat->deps_built) + return true; + + return false; +} + +/* + * Check that there are stats with at least one of the requested types. + */ +static bool +has_stats(List *stats, int type) +{ + ListCell *s; + + foreach(s, stats) + { + MVStatisticInfo *stat = (MVStatisticInfo *) lfirst(s); + + /* terminate if we've found at least one matching statistics */ + if (stats_type_matches(stat, type)) + return true; + } + + return false; +} + +/* + * Lookups stats for the given baserel. + */ +static List * +find_stats(PlannerInfo *root, Index relid) +{ + Assert(root->simple_rel_array[relid] != NULL); + + return root->simple_rel_array[relid]->mvstatlist; +} diff --git a/src/backend/utils/mvstats/README.stats b/src/backend/utils/mvstats/README.stats index 30d60d6..814f39c 100644 --- a/src/backend/utils/mvstats/README.stats +++ b/src/backend/utils/mvstats/README.stats @@ -8,48 +8,9 @@ not true, resulting in estimation errors. 
Multivariate stats track different types of dependencies between the columns, hopefully improving the estimates. - -Types of statistics -------------------- - -Currently we only have two kinds of multivariate statistics - - (a) soft functional dependencies (README.dependencies) - - (b) ndistinct coefficients - - -Compatible clause types ------------------------ - -Each type of statistics may be used to estimate some subset of clause types. - - (a) functional dependencies - equality clauses (AND), possibly IS NULL - -Currently only simple operator clauses (Var op Const) are supported, but it's -possible to support more complex clause types, e.g. (Var op Var). - - -Complex clauses ---------------- - -We also support estimating more complex clauses - essentially AND/OR clauses -with (Var op Const) as leaves, as long as all the referenced attributes are -covered by a single statistics. - -For example this condition - - (a=1) AND ((b=2) OR ((c=3) AND (d=4))) - -may be estimated using statistics on (a,b,c,d). If we only have statistics on -(b,c,d) we may estimate the second part, and estimate (a=1) using simple stats. - -If we only have statistics on (a,b,c) we can't apply it at all at this point, -but it's worth pointing out clauselist_selectivity() works recursively and when -handling the second part (the OR-clause), we'll be able to apply the statistics. - -Note: The multi-statistics estimation patch also makes it possible to pass some -clauses as 'conditions' into the deeper parts of the expression tree. +Currently we only have one kind of multivariate statistics - soft functional +dependencies, and we use it to improve estimates of equality clauses. See +README.dependencies for details. 
Selectivity estimation diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c index 4b570a1..39e3b92 100644 --- a/src/backend/utils/mvstats/common.c +++ b/src/backend/utils/mvstats/common.c @@ -308,6 +308,7 @@ compare_scalars_partition(const void *a, const void *b, void *arg) return ApplySortComparator(da, false, db, false, ssup); } + /* initialize multi-dimensional sort */ MultiSortSupport multi_sort_init(int ndims) diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c index c6390e2..6bca03b 100644 --- a/src/backend/utils/mvstats/dependencies.c +++ b/src/backend/utils/mvstats/dependencies.c @@ -310,6 +310,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, int *dependency, * (c) -> b (c,a) -> b * (c) -> a (c,b) -> a * (b) -> a (b,c) -> a + * + * XXX Currently this builds redundant dependencies, because (a,b => c) and + * (b,a => c) are exactly the same thing, but both versions are generated + * and stored in the statistics. */ MVDependencies build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs, @@ -523,6 +527,87 @@ deserialize_mv_dependencies(bytea *data) } /* + * dependency_is_fully_matched + * checks that a functional dependency is fully matched given clauses on + * attributes (assuming the clauses are suitable equality clauses) + */ +bool +dependency_is_fully_matched(MVDependency dependency, Bitmapset *attnums, + int16 *attmap) +{ + int j; + + /* + * Check that the dependency actually is fully covered by clauses. We + * have to translate all attribute numbers through attmap first. + */ + for (j = 0; j < dependency->nattributes; j++) + { + int attnum = attmap[dependency->attributes[j]]; + + if (!
bms_is_member(attnum, attnums)) + return false; + } + + return true; +} + +/* + * dependency_implies_attribute + * check that the attnum is the attribute implied by the dependency (its last one) + */ +bool +dependency_implies_attribute(MVDependency dependency, AttrNumber attnum, + int16 *attmap) +{ + if (attnum == attmap[dependency->attributes[dependency->nattributes-1]]) + return true; + + return false; +} + +/* + * load_mv_dependencies + * loads functional dependencies for the statistics with the given OID + */ +MVDependencies +load_mv_dependencies(Oid mvoid) +{ + MVDependencies result; + bool isnull = false; + Datum deps; + HeapTuple htup; + + /* Look up the pg_mv_statistic entry for this statistics OID. */ + htup = SearchSysCache1(MVSTATOID, ObjectIdGetDatum(mvoid)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics %u", mvoid); + +#ifdef USE_ASSERT_CHECKING + { + Form_pg_mv_statistic mvstat = (Form_pg_mv_statistic) GETSTRUCT(htup); + + Assert(mvstat->deps_enabled && mvstat->deps_built); + } +#endif + + deps = SysCacheGetAttr(MVSTATOID, htup, + Anum_pg_mv_statistic_stadeps, &isnull); + + Assert(!isnull); + + /* + * The datum may point into the cached tuple, so deserialize it before + * releasing the cache entry. + */ + result = deserialize_mv_dependencies(DatumGetByteaP(deps)); + + ReleaseSysCache(htup); + + return result; +} + +/* * pg_dependencies_in - input routine for type pg_dependencies.
* * pg_dependencies is real enough to be a table column, but it has no operations diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h index e5a49bf..b230747 100644 --- a/src/include/utils/mvstats.h +++ b/src/include/utils/mvstats.h @@ -65,9 +65,13 @@ typedef struct MVDependenciesData typedef MVDependenciesData *MVDependencies; - +bool dependency_implies_attribute(MVDependency dependency, AttrNumber attnum, + int16 *attmap); +bool dependency_is_fully_matched(MVDependency dependency, Bitmapset *attnums, + int16 *attmap); MVNDistinct load_mv_ndistinct(Oid mvoid); +MVDependencies load_mv_dependencies(Oid mvoid); bytea *serialize_mv_ndistinct(MVNDistinct ndistinct); bytea *serialize_mv_dependencies(MVDependencies dependencies); diff --git a/src/test/regress/expected/mv_dependencies.out b/src/test/regress/expected/mv_dependencies.out index d442a16..cf57a67 100644 --- a/src/test/regress/expected/mv_dependencies.out +++ b/src/test/regress/expected/mv_dependencies.out @@ -55,8 +55,10 @@ SELECT deps_enabled, deps_built, stadeps TRUNCATE functional_dependencies; -- a => b, a => c, b => c +-- check explain (expect bitmap index scan, not plain index scan) INSERT INTO functional_dependencies - SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); + SELECT mod(i,400), mod(i,200), mod(i,100) FROM generate_series(1,30000) s(i); +CREATE INDEX fdeps_idx ON functional_dependencies (a, b); ANALYZE functional_dependencies; SELECT deps_enabled, deps_built, stadeps FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; @@ -65,6 +67,16 @@ SELECT deps_enabled, deps_built, stadeps t | t | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}] (1 row) +EXPLAIN (COSTS off) + SELECT * FROM functional_dependencies WHERE a = 10 AND b = 5; + QUERY PLAN +--------------------------------------------- + Bitmap Heap Scan on functional_dependencies + Recheck Cond: ((a = 10) AND (b = 5)) + -> 
Bitmap Index Scan on fdeps_idx + Index Cond: ((a = 10) AND (b = 5)) +(4 rows) + DROP TABLE functional_dependencies; -- varlena type (text) CREATE TABLE functional_dependencies ( @@ -110,8 +122,10 @@ SELECT deps_enabled, deps_built, stadeps TRUNCATE functional_dependencies; -- a => b, a => c, b => c +-- check explain (expect bitmap index scan, not plain index scan) INSERT INTO functional_dependencies - SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); + SELECT mod(i,400), mod(i,200), mod(i,100) FROM generate_series(1,30000) s(i); +CREATE INDEX fdeps_idx ON functional_dependencies (a, b); ANALYZE functional_dependencies; SELECT deps_enabled, deps_built, stadeps FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; @@ -120,6 +134,16 @@ SELECT deps_enabled, deps_built, stadeps t | t | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}] (1 row) +EXPLAIN (COSTS off) + SELECT * FROM functional_dependencies WHERE a = '10' AND b = '5'; + QUERY PLAN +------------------------------------------------------------ + Bitmap Heap Scan on functional_dependencies + Recheck Cond: ((a = '10'::text) AND (b = '5'::text)) + -> Bitmap Index Scan on fdeps_idx + Index Cond: ((a = '10'::text) AND (b = '5'::text)) +(4 rows) + DROP TABLE functional_dependencies; -- NULL values (mix of int and text columns) CREATE TABLE functional_dependencies ( diff --git a/src/test/regress/sql/mv_dependencies.sql b/src/test/regress/sql/mv_dependencies.sql index 43df798..49db649 100644 --- a/src/test/regress/sql/mv_dependencies.sql +++ b/src/test/regress/sql/mv_dependencies.sql @@ -53,13 +53,20 @@ SELECT deps_enabled, deps_built, stadeps TRUNCATE functional_dependencies; -- a => b, a => c, b => c +-- check explain (expect bitmap index scan, not plain index scan) INSERT INTO functional_dependencies - SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); + SELECT mod(i,400), mod(i,200), 
mod(i,100) FROM generate_series(1,30000) s(i); + +CREATE INDEX fdeps_idx ON functional_dependencies (a, b); + ANALYZE functional_dependencies; SELECT deps_enabled, deps_built, stadeps FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; +EXPLAIN (COSTS off) + SELECT * FROM functional_dependencies WHERE a = 10 AND b = 5; + DROP TABLE functional_dependencies; -- varlena type (text) @@ -96,6 +103,7 @@ TRUNCATE functional_dependencies; -- a => b, a => c INSERT INTO functional_dependencies SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i); + ANALYZE functional_dependencies; SELECT deps_enabled, deps_built, stadeps @@ -104,13 +112,20 @@ SELECT deps_enabled, deps_built, stadeps TRUNCATE functional_dependencies; -- a => b, a => c, b => c +-- check explain (expect bitmap index scan, not plain index scan) INSERT INTO functional_dependencies - SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); + SELECT mod(i,400), mod(i,200), mod(i,100) FROM generate_series(1,30000) s(i); + +CREATE INDEX fdeps_idx ON functional_dependencies (a, b); + ANALYZE functional_dependencies; SELECT deps_enabled, deps_built, stadeps FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; +EXPLAIN (COSTS off) + SELECT * FROM functional_dependencies WHERE a = '10' AND b = '5'; + DROP TABLE functional_dependencies; -- NULL values (mix of int and text columns) -- 2.5.5