From e04f7a0b43dc914d5b661723e1a4a14abc1df4ef Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 23 Oct 2016 17:36:25 +0200 Subject: [PATCH 3/9] PATCH: functional dependencies (only the ANALYZE part) - implementation of soft functional dependencies (ANALYZE etc.) - updates existing regression tests (new catalog etc.) - new regression test for functional dependencies - pg_dependencies data type (varlena-based) The algorithm detecting the dependencies is rather simple and probably needs improvements, so that it detects more complicated dependencies, and also validation of the math. The patch introduces pg_dependencies, a new varlena data type for storing a serialized version of functional dependencies. This is similar to what pg_ndistinct does for ndistinct coefficients. --- doc/src/sgml/catalogs.sgml | 30 ++ doc/src/sgml/ref/create_statistics.sgml | 42 +- src/backend/catalog/system_views.sql | 3 +- src/backend/commands/statscmds.c | 37 +- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 2 + src/backend/optimizer/util/plancat.c | 4 +- src/backend/parser/gram.y | 14 +- src/backend/utils/mvstats/Makefile | 2 +- src/backend/utils/mvstats/README.dependencies | 118 +++++ src/backend/utils/mvstats/common.c | 26 +- src/backend/utils/mvstats/dependencies.c | 622 ++++++++++++++++++++++++++ src/include/catalog/pg_cast.h | 4 + src/include/catalog/pg_mv_statistic.h | 14 +- src/include/catalog/pg_proc.h | 9 + src/include/catalog/pg_type.h | 4 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/relation.h | 2 + src/include/utils/builtins.h | 4 + src/include/utils/mvstats.h | 37 +- src/test/regress/expected/mv_dependencies.out | 147 ++++++ src/test/regress/expected/mv_ndistinct.out | 10 +- src/test/regress/expected/object_address.out | 2 +- src/test/regress/expected/opr_sanity.out | 3 +- src/test/regress/expected/rules.out | 3 +- src/test/regress/expected/type_sanity.out | 7 +- src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + 
src/test/regress/sql/mv_dependencies.sql | 139 ++++++ src/test/regress/sql/mv_ndistinct.sql | 10 +- src/test/regress/sql/object_address.sql | 2 +- 31 files changed, 1261 insertions(+), 41 deletions(-) create mode 100644 src/backend/utils/mvstats/README.dependencies create mode 100644 src/backend/utils/mvstats/dependencies.c create mode 100644 src/test/regress/expected/mv_dependencies.out create mode 100644 src/test/regress/sql/mv_dependencies.sql diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 2a7bd6c..852f573 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -4285,6 +4285,17 @@ + deps_enabled + bool + + + If true, functional dependencies will be computed for the combination of + columns, covered by the statistics. This does not mean the dependencies + are already computed, though. + + + + ndist_built bool @@ -4295,6 +4306,16 @@ + deps_built + bool + + + If true, functional depenedencies are already computed and available for + use during query estimation. + + + + stakeys int2vector pg_attribute.attnum @@ -4314,6 +4335,15 @@ + + stadeps + pg_dependencies + + + Functional dependencies, serialized as pg_dependencies type. + + + diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml index 9f6a65c..eaa39ee 100644 --- a/doc/src/sgml/ref/create_statistics.sgml +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -21,8 +21,9 @@ PostgreSQL documentation -CREATE STATISTICS [ IF NOT EXISTS ] statistics_name ON ( - column_name, column_name [, ...]) +CREATE STATISTICS [ IF NOT EXISTS ] statistics_name + WITH ( option [= value] [, ... ] ) + ON ( column_name, column_name [, ...]) FROM table_name @@ -99,6 +100,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na + + Parameters + + + statistics parameters + + + + The WITH clause can specify options + for statistics. The currently available parameters are listed below. 
+ + + + + + dependencies (boolean) + + + Enables functional dependencies for the statistics. + + + + + + ndistinct (boolean) + + + Enables ndistinct coefficients for the statistics. + + + + + + + @@ -119,7 +155,7 @@ CREATE TABLE t1 ( INSERT INTO t1 SELECT i/100, i/500 FROM generate_series(1,1000000) s(i); -CREATE STATISTICS s1 ON (a, b) FROM t1; +CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t1; ANALYZE t1; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 00ab440..216ece5 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -187,7 +187,8 @@ CREATE VIEW pg_mv_stats AS C.relname AS tablename, S.staname AS staname, S.stakeys AS attnums, - length(s.standist) AS ndistbytes + length(s.standist::bytea) AS ndistbytes, + length(S.stadeps::bytea) AS depsbytes FROM (pg_mv_statistic S JOIN pg_class C ON (C.oid = S.starelid)) LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace); diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index bde7e4b..af4f4d3 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -38,7 +38,9 @@ compare_int16(const void *a, const void *b) } /* - * Implements the CREATE STATISTICS name ON (columns) FROM table + * Implements the CREATE STATISTICS command with syntax: + * + * CREATE STATISTICS name WITH (options) ON (columns) FROM table * * We do require that the types support sorting (ltopr), although some * statistics might work with equality only. @@ -66,6 +68,10 @@ CreateStatistics(CreateStatsStmt *stmt) ObjectAddress parentobject, childobject; + /* by default build nothing */ + bool build_ndistinct = false, + build_dependencies = false; + Assert(IsA(stmt, CreateStatsStmt)); /* resolve the pieces of the name (namespace etc.) 
*/ @@ -151,6 +157,31 @@ CreateStatistics(CreateStatsStmt *stmt) (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("duplicate column name in statistics definition"))); + /* + * Parse the statistics options - currently only statistics types are + * recognized (ndistinct, dependencies). + */ + foreach(l, stmt->options) + { + DefElem *opt = (DefElem *) lfirst(l); + + if (strcmp(opt->defname, "ndistinct") == 0) + build_ndistinct = defGetBoolean(opt); + else if (strcmp(opt->defname, "dependencies") == 0) + build_dependencies = defGetBoolean(opt); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized STATISTICS option \"%s\"", + opt->defname))); + } + + /* Make sure there's at least one statistics type specified. */ + if (! (build_ndistinct || build_dependencies)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("no statistics type (ndistinct, dependencies) requested"))); + stakeys = buildint2vector(attnums, numcols); /* @@ -170,9 +201,11 @@ CreateStatistics(CreateStatsStmt *stmt) values[Anum_pg_mv_statistic_stakeys - 1] = PointerGetDatum(stakeys); /* enabled statistics */ - values[Anum_pg_mv_statistic_ndist_enabled - 1] = BoolGetDatum(true); + values[Anum_pg_mv_statistic_ndist_enabled - 1] = BoolGetDatum(build_ndistinct); + values[Anum_pg_mv_statistic_deps_enabled - 1] = BoolGetDatum(build_dependencies); nulls[Anum_pg_mv_statistic_standist - 1] = true; + nulls[Anum_pg_mv_statistic_stadeps - 1] = true; /* insert the tuple into pg_mv_statistic */ mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index dc42be0..6e465a7 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4357,6 +4357,7 @@ _copyCreateStatsStmt(const CreateStatsStmt *from) COPY_NODE_FIELD(defnames); COPY_NODE_FIELD(relation); COPY_NODE_FIELD(keys); + COPY_NODE_FIELD(options); COPY_SCALAR_FIELD(if_not_exists); return newnode; diff --git 
a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 57cc0b4..c72473b 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2202,9 +2202,11 @@ _outMVStatisticInfo(StringInfo str, const MVStatisticInfo *node) /* enabled statistics */ WRITE_BOOL_FIELD(ndist_enabled); + WRITE_BOOL_FIELD(deps_enabled); /* built/available statistics */ WRITE_BOOL_FIELD(ndist_built); + WRITE_BOOL_FIELD(deps_built); } static void diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index fc9ad93..8129143 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1287,7 +1287,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) mvstat = (Form_pg_mv_statistic) GETSTRUCT(htup); /* unavailable stats are not interesting for the planner */ - if (mvstat->ndist_built) + if (mvstat->deps_built || mvstat->ndist_built) { info = makeNode(MVStatisticInfo); @@ -1296,9 +1296,11 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) /* enabled statistics */ info->ndist_enabled = mvstat->ndist_enabled; + info->deps_enabled = mvstat->deps_enabled; /* built/available statistics */ info->ndist_built = mvstat->ndist_built; + info->deps_built = mvstat->deps_built; /* stakeys */ adatum = SysCacheGetAttr(MVSTATOID, htup, diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 475a8a6..f61765f 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -3756,21 +3756,23 @@ ExistingIndex: USING INDEX index_name { $$ = $3; } *****************************************************************************/ -CreateStatsStmt: CREATE STATISTICS any_name ON '(' columnList ')' FROM qualified_name +CreateStatsStmt: CREATE STATISTICS any_name opt_reloptions ON '(' columnList ')' FROM qualified_name { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $3; - n->relation = $9; - n->keys = $6; + n->relation = $10; + n->keys = $7; + n->options = $4; 
n->if_not_exists = false; $$ = (Node *)n; } - | CREATE STATISTICS IF_P NOT EXISTS any_name ON '(' columnList ')' FROM qualified_name + | CREATE STATISTICS IF_P NOT EXISTS any_name opt_reloptions ON '(' columnList ')' FROM qualified_name { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $6; - n->relation = $12; - n->keys = $9; + n->relation = $13; + n->keys = $10; + n->options = $7; n->if_not_exists = true; $$ = (Node *)n; } diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile index 7295d46..21fe7e5 100644 --- a/src/backend/utils/mvstats/Makefile +++ b/src/backend/utils/mvstats/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/mvstats top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = common.o mvdist.o +OBJS = common.o dependencies.o mvdist.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/mvstats/README.dependencies b/src/backend/utils/mvstats/README.dependencies new file mode 100644 index 0000000..908f094 --- /dev/null +++ b/src/backend/utils/mvstats/README.dependencies @@ -0,0 +1,118 @@ +Soft functional dependencies +============================ + +Functional dependencies are a concept well described in relational theory, +particularly in definition of normalization and "normal forms". Wikipedia +has a nice definition of a functional dependency [1]: + + In a given table, an attribute Y is said to have a functional dependency + on a set of attributes X (written X -> Y) if and only if each X value is + associated with precisely one Y value. For example, in an "Employee" + table that includes the attributes "Employee ID" and "Employee Date of + Birth", the functional dependency + + {Employee ID} -> {Employee Date of Birth} + + would hold. It follows from the previous two sentences that each + {Employee ID} is associated with precisely one {Employee Date of Birth}. 
+ + [1] https://en.wikipedia.org/wiki/Functional_dependency + +In practical terms, functional dependencies mean that a value in one column +determines values in some other column. Consider for example this trivial +table with two integer columns: + + CREATE TABLE t (a INT, b INT) + AS SELECT i, i/10 FROM generate_series(1,100000) s(i); + +Clearly, knowledge of the value in column 'a' is sufficient to determine the +value in column 'b', as it's simply (a/10). A more practical example may be +addresses, where the knowledge of a ZIP code (usually) determines city. Larger +cities may have multiple ZIP codes, so the dependency can't be reversed. + +Many datasets might be normalized not to contain such dependencies, but often +it's not practical for various reasons. In some cases it's actually a conscious +design choice to model the dataset in a denormalized way, either because of +performance or to make querying easier. + + +Soft dependencies +----------------- + +Real-world data sets often contain data errors, either because of data entry +mistakes (user mistyping the ZIP code) or perhaps issues in generating the +data (e.g. a ZIP code mistakenly assigned to two cities in different states). + +A strict implementation would either ignore dependencies in such cases, +rendering the approach mostly useless even for slightly noisy data sets, or +result in sudden changes in behavior depending on minor differences between +samples provided to ANALYZE. + +For this reason the statistics implement "soft" functional dependencies, +associating each functional dependency with a degree of validity (a number +between 0 and 1). This degree is then used to combine selectivities +in a smooth manner. + + +Mining dependencies (ANALYZE) +----------------------------- + +The current algorithm is fairly simple - generate all possible functional +dependencies, and for each one count the number of rows consistent with it. +Then use the fraction of rows (supporting/total) as the degree. 
+ +To count the rows consistent with the dependency (a => b): + + (a) Sort the data lexicographically, i.e. first by 'a' then 'b'. + + (b) For each group of rows with the same 'a' value, count the number of + distinct values in 'b'. + + (c) If there's a single distinct value in 'b', the rows are consistent with + the functional dependency. Otherwise they contradict it. + +The algorithm also requires a minimum size of the group to consider it +consistent (currently 3 rows in the sample). Small groups make it less likely +to break the consistency. + + +Clause reduction (planner/optimizer) +------------------------------------ + +Applying the functional dependencies is fairly simple - given a list of +equality clauses, we compute selectivities of each clause and then use the +degree to combine them using this formula: + + P(a=?,b=?) = P(a=?) * (d + (1-d) * P(b=?)) + +Where 'd' is the degree of functional dependence (a=>b). + +With more than two equality clauses, this process happens recursively. For +example for (a,b,c) we first use (a,b=>c) to break the computation into + + P(a=?,b=?,c=?) = P(a=?,b=?) * (d + (1-d)*P(c=?)) + +and then apply (a=>b) the same way on P(a=?,b=?). + + +Consistency of clauses +---------------------- + +Functional dependencies only express general dependencies between columns, +without referencing particular values. This assumes that the equality clauses +are in fact consistent with the functional dependency, i.e. that given a +dependency (a=>b), the value in (b=?) clause is the value determined by (a=?). +If that's not the case, the clauses are "inconsistent" with the functional +dependency and the result will be over-estimation. + +This may happen for example when using conditions on ZIP and city name with +mismatching values (ZIP for a different city), etc. In such a case the result +set will be empty, but we'll estimate the selectivity using the ZIP condition. 
+ +In this case the default estimation based on AVIA principle happens to work +better, but mostly by chance. + +This issue is the price for the simplicity of functional dependencies. If the +application frequently constructs queries with clauses inconsistent with +functional dependencies present in the data, the best solution is not to +use functional dependencies, but one of the more complex types of statistics. diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c index 7d2f3f3..4b570a1 100644 --- a/src/backend/utils/mvstats/common.c +++ b/src/backend/utils/mvstats/common.c @@ -21,7 +21,8 @@ static VacAttrStats **lookup_var_attr_stats(int2vector *attrs, static List *list_mv_stats(Oid relid); -static void update_mv_stats(Oid relid, MVNDistinct ndistinct, +static void update_mv_stats(Oid relid, + MVNDistinct ndistinct, MVDependencies dependencies, int2vector *attrs, VacAttrStats **stats); @@ -53,6 +54,7 @@ build_mv_stats(Relation onerel, double totalrows, int j; MVStatisticInfo *stat = (MVStatisticInfo *) lfirst(lc); MVNDistinct ndistinct = NULL; + MVDependencies deps = NULL; VacAttrStats **stats = NULL; int numatts = 0; @@ -89,8 +91,12 @@ build_mv_stats(Relation onerel, double totalrows, if (stat->ndist_enabled) ndistinct = build_mv_ndistinct(totalrows, numrows, rows, attrs, stats); + /* analyze functional dependencies between the columns */ + if (stat->deps_enabled) + deps = build_mv_dependencies(numrows, rows, attrs, stats); + /* store the statistics in the catalog */ - update_mv_stats(stat->mvoid, ndistinct, attrs, stats); + update_mv_stats(stat->mvoid, ndistinct, deps, attrs, stats); } } @@ -170,6 +176,8 @@ list_mv_stats(Oid relid) info->stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1); info->ndist_enabled = stats->ndist_enabled; info->ndist_built = stats->ndist_built; + info->deps_enabled = stats->deps_enabled; + info->deps_built = stats->deps_built; result = lappend(result, info); } @@ -191,7 +199,7 @@ 
list_mv_stats(Oid relid) * Serializes the statistics and stores them into the pg_mv_statistic tuple. */ static void -update_mv_stats(Oid mvoid, MVNDistinct ndistinct, +update_mv_stats(Oid mvoid, MVNDistinct ndistinct, MVDependencies dependencies, int2vector *attrs, VacAttrStats **stats) { HeapTuple stup, @@ -218,18 +226,29 @@ update_mv_stats(Oid mvoid, MVNDistinct ndistinct, values[Anum_pg_mv_statistic_standist-1] = PointerGetDatum(data); } + if (dependencies != NULL) + { + nulls[Anum_pg_mv_statistic_stadeps - 1] = false; + values[Anum_pg_mv_statistic_stadeps - 1] + = PointerGetDatum(serialize_mv_dependencies(dependencies)); + } + /* always replace the value (either by bytea or NULL) */ replaces[Anum_pg_mv_statistic_standist - 1] = true; + replaces[Anum_pg_mv_statistic_stadeps - 1] = true; /* always change the availability flags */ nulls[Anum_pg_mv_statistic_ndist_built - 1] = false; + nulls[Anum_pg_mv_statistic_deps_built - 1] = false; nulls[Anum_pg_mv_statistic_stakeys - 1] = false; /* use the new attnums, in case we removed some dropped ones */ replaces[Anum_pg_mv_statistic_ndist_built - 1] = true; + replaces[Anum_pg_mv_statistic_deps_built - 1] = true; replaces[Anum_pg_mv_statistic_stakeys - 1] = true; values[Anum_pg_mv_statistic_ndist_built - 1] = BoolGetDatum(ndistinct != NULL); + values[Anum_pg_mv_statistic_deps_built - 1] = BoolGetDatum(dependencies != NULL); values[Anum_pg_mv_statistic_stakeys - 1] = PointerGetDatum(attrs); @@ -370,6 +389,7 @@ multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b, &mss->ssup[dim]); } +/* compare all the dimensions in a given range (inclusive) */ int multi_sort_compare_dims(int start, int end, const SortItem *a, const SortItem *b, diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c new file mode 100644 index 0000000..c6390e2 --- /dev/null +++ b/src/backend/utils/mvstats/dependencies.c @@ -0,0 +1,622 @@ 
+/*------------------------------------------------------------------------- + * + * dependencies.c + * POSTGRES multivariate functional dependencies + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/mvstats/dependencies.c + * + *------------------------------------------------------------------------- + */ + +#include "common.h" + +#include "utils/bytea.h" +#include "utils/lsyscache.h" + +/* + * Internal state for DependencyGenerator of dependencies. Dependencies are similar to + * k-permutations of n elements, except that the order does not matter for the + * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent. + */ +typedef struct DependencyGeneratorData +{ + int k; /* size of the dependency */ + int current; /* next dependency to return (index) */ + int ndependencies; /* number of dependencies generated */ + int *dependencies; /* array of pre-generated dependencies */ +} DependencyGeneratorData; + +typedef DependencyGeneratorData *DependencyGenerator; + +static void +generate_dependencies_recurse(DependencyGenerator state, + int n, int index, int start, int *current) +{ + /* + * The generator handles the first (k-1) elements differently from + * the last element. + */ + if (index < (state->k - 1)) + { + int i; + + /* + * The first (k-1) values have to be in ascending order, which we + * generate recursively. + */ + + for (i = start; i < n; i++) + { + current[index] = i; + generate_dependencies_recurse(state, n, (index+1), (i+1), current); + } + } + else + { + int i; + + /* + * the last element is the implied value, which does not respect the + * ascending order. We just need to check that the value is not in the + * first (k-1) elements. 
+ */ + + for (i = 0; i < n; i++) + { + int j; + bool match = false; + + current[index] = i; + + for (j = 0; j < index; j++) + { + if (current[j] == i) + { + match = true; + break; + } + } + + /* + * If the value is not found in the first part of the dependency, + * we're done. + */ + if (! match) + { + state->dependencies + = (int*)repalloc(state->dependencies, + state->k * (state->ndependencies + 1) * sizeof(int)); + memcpy(&state->dependencies[(state->k * state->ndependencies)], + current, state->k * sizeof(int)); + state->ndependencies++; + } + } + } +} + +/* generate all dependencies (k-permutations of n elements) */ +static void +generate_dependencies(DependencyGenerator state, int n) +{ + int *current = (int *) palloc0(sizeof(int) * state->k); + + generate_dependencies_recurse(state, n, 0, 0, current); + + pfree(current); +} + +/* + * initialize the DependencyGenerator of variations, and prebuild the variations + * + * This pre-builds all the variations. We could also generate them in + * DependencyGenerator_next(), but this seems simpler. 
+ */ +static DependencyGenerator +DependencyGenerator_init(int2vector *attrs, int k) +{ + int n = attrs->dim1; + DependencyGenerator state; + + Assert((n >= k) && (k > 0)); + + /* allocate the state and, separately, the growable array of dependencies */ + state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); + state->dependencies = (int*)palloc(k * sizeof(int)); + + state->ndependencies = 0; + state->current = 0; + state->k = k; + + /* now actually pre-generate all the variations */ + generate_dependencies(state, n); + + return state; +} + +/* free the DependencyGenerator state, including the array of dependencies */ +static void +DependencyGenerator_free(DependencyGenerator state) +{ + pfree(state->dependencies); + pfree(state); +} + +/* return the next pre-generated dependency (k ints), or NULL when exhausted */ +static int * +DependencyGenerator_next(DependencyGenerator state, int2vector *attrs) +{ + if (state->current == state->ndependencies) + return NULL; + + return &state->dependencies[state->k * state->current++]; +} + + +/* + * validates functional dependency on the data + * + * An actual work horse of detecting functional dependencies. Given a variation + * of k attributes, it checks that the first (k-1) are sufficient to determine + * the last one. + */ +static double +dependency_degree(int numrows, HeapTuple *rows, int k, int *dependency, + VacAttrStats **stats, int2vector *attrs) +{ + int i, + j; + int nvalues = numrows * k; + MultiSortSupport mss; + SortItem *items; + Datum *values; + bool *isnull; + + /* + * XXX Maybe the threshold should be somehow related to the number of + * distinct values in the combination of columns we're analyzing. Assuming + * the distribution is uniform, we can estimate the average group size and + * use it as a threshold, similarly to what we do for MCV lists. 
+ */ + int min_group_size = 3; + + /* counters valid within a group */ + int group_size = 0; + int n_violations = 0; + + /* total number of rows supporting (consistent with) the dependency */ + int n_supporting_rows = 0; + + /* Make sure we have at least two input attributes. */ + Assert(k >= 2); + + /* sort info for all attributes columns */ + mss = multi_sort_init(k); + + /* data for the sort */ + items = (SortItem *) palloc0(numrows * sizeof(SortItem)); + values = (Datum *) palloc0(sizeof(Datum) * nvalues); + isnull = (bool *) palloc0(sizeof(bool) * nvalues); + + /* fix the pointers to values/isnull */ + for (i = 0; i < numrows; i++) + { + items[i].values = &values[i * k]; + items[i].isnull = &isnull[i * k]; + } + + /* + * Verify the dependency (a,b,...)->z, using a rather simple algorithm: + * + * (a) sort the data lexicographically + * + * (b) split the data into groups by first (k-1) columns + * + * (c) for each group count different values in the last column + */ + + /* prepare the sort function for the first dimension, and SortItem array */ + for (i = 0; i < k; i++) + { + multi_sort_add_dimension(mss, i, dependency[i], stats); + + /* accumulate all the data for both columns into an array and sort it */ + for (j = 0; j < numrows; j++) + { + items[j].values[i] + = heap_getattr(rows[j], attrs->values[dependency[i]], + stats[i]->tupDesc, &items[j].isnull[i]); + } + } + + /* sort the items so that we can detect the groups */ + qsort_arg((void *) items, numrows, sizeof(SortItem), + multi_sort_compare, mss); + + /* + * Walk through the sorted array, split it into rows according to the + * first (k-1) columns. If there's a single value in the last column, we + * count the group as 'supporting' the functional dependency. Otherwise we + * count it as contradicting. + * + * We also require a group to have a minimum number of rows to be + * considered useful for supporting the dependency. Contradicting groups + * may be of any size, though. 
+ * + * XXX The minimum size requirement makes it impossible to identify case + * when both columns are unique (or nearly unique), and therefore + * trivially functionally dependent. + */ + + /* start with the first row forming a group */ + group_size = 1; + + for (i = 1; i <= numrows; i++) + { + /* + * Check if the group ended, which may be either because we processed + * all the items (i==numrows), or because the i-th item is not equal + * to the preceding one. + */ + if ((i == numrows) || + (multi_sort_compare_dims(0, (k - 2), &items[i - 1], &items[i], mss) != 0)) + { + /* + * Do accounting for the preceding group, and reset counters. + * + * If there were no contradicting rows in the group, count the + * rows as supporting. + */ + if ((n_violations == 0) && (group_size >= min_group_size)) + n_supporting_rows += group_size; + + /* current values start a new group */ + n_violations = 0; + group_size = 0; + } + /* first colums match, but the last one does not (so contradicting) */ + else if (multi_sort_compare_dim((k - 1), &items[i - 1], &items[i], mss) != 0) + n_violations += 1; + + group_size += 1; + } + + pfree(items); + pfree(values); + pfree(isnull); + pfree(mss); + + /* Compute the 'degree of validity' as (supporting/total). */ + return (n_supporting_rows * 1.0 / numrows); +} + +/* + * detects functional dependencies between groups of columns + * + * Generates all possible subsets of columns (variations) and checks if the + * last one is determined by the preceding ones. 
For example given 3 columns, + * there are 12 variations (6 for variations on 2 columns, 6 for 3 columns): + * + * two columns three columns + * ----------- ------------- + * (a) -> c (a,b) -> c + * (b) -> c (b,a) -> c + * (a) -> b (a,c) -> b + * (c) -> b (c,a) -> b + * (c) -> a (c,b) -> a + * (b) -> a (b,c) -> a + */ +MVDependencies +build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs, + VacAttrStats **stats) +{ + int i; + int k; + int numattrs = attrs->dim1; + + /* result */ + MVDependencies dependencies = NULL; + + Assert(numattrs >= 2); + + /* + * We'll try build functional dependencies starting from the smallest ones + * covering just 2 columns, to the largest ones, covering all columns + * included int the statistics. We start from the smallest ones because we + * want to be able to skip already implied ones. + */ + for (k = 2; k <= numattrs; k++) + { + int *dependency; /* array with k elements */ + + /* prepare a DependencyGenerator of variation */ + DependencyGenerator DependencyGenerator = DependencyGenerator_init(attrs, k); + + /* generate all possible variations of k values (out of n) */ + while ((dependency = DependencyGenerator_next(DependencyGenerator, attrs))) + { + double degree; + MVDependency d; + + /* compute how valid the dependency seems */ + degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); + + /* if the dependency seems entirely invalid, don't bother storing it */ + if (degree == 0.0) + continue; + + d = (MVDependency) palloc0(offsetof(MVDependencyData, attributes) + +k * sizeof(int)); + + /* copy the dependency (and keep the indexes into stakeys) */ + d->degree = degree; + d->nattributes = k; + for (i = 0; i < k; i++) + d->attributes[i] = dependency[i]; + + /* initialize the list of dependencies */ + if (dependencies == NULL) + { + dependencies + = (MVDependencies) palloc0(sizeof(MVDependenciesData)); + + dependencies->magic = MVSTAT_DEPS_MAGIC; + dependencies->type = MVSTAT_DEPS_TYPE_BASIC; + 
dependencies->ndeps = 0; + } + + dependencies->ndeps++; + dependencies = (MVDependencies) repalloc(dependencies, + offsetof(MVDependenciesData, deps) + +dependencies->ndeps * sizeof(MVDependency)); + + dependencies->deps[dependencies->ndeps - 1] = d; + } + + /* we're done with variations of k elements, so free the DependencyGenerator */ + DependencyGenerator_free(DependencyGenerator); + } + + return dependencies; +} + + +/* + * serialize list of dependencies into a bytea + */ +bytea * +serialize_mv_dependencies(MVDependencies dependencies) +{ + int i; + bytea *output; + char *tmp; + Size len; + + /* we need to store ndeps, with a number of attributes for each one */ + len = VARHDRSZ + offsetof(MVDependenciesData, deps) + + dependencies->ndeps * offsetof(MVDependencyData, attributes); + + /* and also include space for the actual attribute numbers and degrees */ + for (i = 0; i < dependencies->ndeps; i++) + len += (sizeof(int16) * dependencies->deps[i]->nattributes); + + output = (bytea *) palloc0(len); + SET_VARSIZE(output, len); + + tmp = VARDATA(output); + + /* first, store the number of dimensions / items */ + memcpy(tmp, dependencies, offsetof(MVDependenciesData, deps)); + tmp += offsetof(MVDependenciesData, deps); + + /* store number of attributes and attribute numbers for each dependency */ + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency d = dependencies->deps[i]; + + memcpy(tmp, d, offsetof(MVDependencyData, attributes)); + tmp += offsetof(MVDependencyData, attributes); + + memcpy(tmp, d->attributes, sizeof(int16) * d->nattributes); + tmp += sizeof(int16) * d->nattributes; + + Assert(tmp <= ((char *) output + len)); + } + + return output; +} + +/* + * Reads serialized dependencies into MVDependencies structure. 
+ */ +MVDependencies +deserialize_mv_dependencies(bytea *data) +{ + int i; + Size expected_size; + MVDependencies dependencies; + char *tmp; + + if (data == NULL) + return NULL; + + if (VARSIZE_ANY_EXHDR(data) < offsetof(MVDependenciesData, deps)) + elog(ERROR, "invalid MVDependencies size %zu (expected at least %zu)", + VARSIZE_ANY_EXHDR(data), offsetof(MVDependenciesData, deps)); + + /* read the MVDependencies header */ + dependencies = (MVDependencies) palloc0(sizeof(MVDependenciesData)); + + /* initialize pointer to the data part (skip the varlena header) */ + tmp = VARDATA_ANY(data); + + /* get the header and perform basic sanity checks */ + memcpy(dependencies, tmp, offsetof(MVDependenciesData, deps)); + tmp += offsetof(MVDependenciesData, deps); + + if (dependencies->magic != MVSTAT_DEPS_MAGIC) + elog(ERROR, "invalid dependency magic %d (expected %d)", + dependencies->magic, MVSTAT_DEPS_MAGIC); + + if (dependencies->type != MVSTAT_DEPS_TYPE_BASIC) + elog(ERROR, "invalid dependency type %d (expected %d)", + dependencies->type, MVSTAT_DEPS_TYPE_BASIC); + + Assert(dependencies->ndeps > 0); + + /* minimum bytea size: header plus ndeps dependencies of two attributes each */ + expected_size = offsetof(MVDependenciesData, deps) + + dependencies->ndeps * (offsetof(MVDependencyData, attributes) + + sizeof(int16) * 2); + + if (VARSIZE_ANY_EXHDR(data) < expected_size) + elog(ERROR, "invalid dependencies size %zu (expected at least %zu)", + VARSIZE_ANY_EXHDR(data), expected_size); + + /* allocate space for the array of pointers to the dependencies */ + dependencies = repalloc(dependencies, offsetof(MVDependenciesData, deps) + +(dependencies->ndeps * sizeof(MVDependency))); + + for (i = 0; i < dependencies->ndeps; i++) + { + double degree; + int k; + MVDependency d; + + /* degree of validity */ + memcpy(&degree, tmp, sizeof(double)); + tmp += sizeof(double); + + /* number of attributes */ + memcpy(&k, tmp, sizeof(int)); + tmp += sizeof(int); + + /* is the number of attributes valid? 
*/ + Assert((k >= 2) && (k <= MVSTATS_MAX_DIMENSIONS)); + + /* now that we know the number of attributes, allocate the dependency */ + d = (MVDependency) palloc0(offsetof(MVDependencyData, attributes) + + (k * sizeof(int))); + + d->degree = degree; + d->nattributes = k; + + /* copy attribute numbers */ + memcpy(d->attributes, tmp, sizeof(int16) * d->nattributes); + tmp += sizeof(int16) * d->nattributes; + + dependencies->deps[i] = d; + + /* still within the bytea */ + Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); + } + + /* we should have consumed the whole bytea exactly */ + Assert(tmp == ((char *) data + VARSIZE_ANY(data))); + + return dependencies; +} + +/* + * pg_dependencies_in - input routine for type pg_dependencies. + * + * pg_dependencies is real enough to be a table column, but it has no operations + * of its own, and disallows input too + * + * XXX This is inspired by what pg_node_tree does. + */ +Datum +pg_dependencies_in(PG_FUNCTION_ARGS) +{ + /* + * pg_node_list stores the data in binary form and parsing text input is + * not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_dependencies"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_dependencies - output routine for type pg_dependencies. + * + * histograms are serialized into a bytea value, so we simply call byteaout() + * to serialize the value into text. But it'd be nice to serialize that into + * a meaningful representation (e.g. for inspection by people). 
+ */ +Datum +pg_dependencies_out(PG_FUNCTION_ARGS) +{ + int i, j; + char *ret; + StringInfoData str; + + bytea *data = PG_GETARG_BYTEA_PP(0); + + MVDependencies dependencies = deserialize_mv_dependencies(data); + + /* the whole list of dependencies is enclosed in square brackets */ + initStringInfo(&str); + appendStringInfoString(&str, "["); + + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency dependency = dependencies->deps[i]; + + if (i > 0) + appendStringInfoString(&str, ", "); + + appendStringInfoString(&str, "{"); + + /* comma-separated attributes, with an arrow in front of the last one */ + for (j = 0; j < dependency->nattributes; j++) + { + if (j == dependency->nattributes-1) + appendStringInfoString(&str, " => "); + else if (j > 0) + appendStringInfoString(&str, ", "); + + appendStringInfo(&str, "%d", dependency->attributes[j]); + } + + appendStringInfo(&str, " : %f", dependency->degree); + + appendStringInfoString(&str, "}"); + } + + appendStringInfoString(&str, "]"); + + /* return a copy of the buffer contents and release the buffer itself */ + ret = pstrdup(str.data); + pfree(str.data); + + PG_RETURN_CSTRING(ret); +} + +/* + * pg_dependencies_recv - binary input routine for type pg_dependencies. + */ +Datum +pg_dependencies_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_dependencies"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_dependencies_send - binary output routine for type pg_dependencies. + * + * XXX Dependencies are serialized into a bytea value, so let's just send that.
+ */ +Datum +pg_dependencies_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h index bf39d43..22fa4b8 100644 --- a/src/include/catalog/pg_cast.h +++ b/src/include/catalog/pg_cast.h @@ -258,6 +258,10 @@ DATA(insert ( 194 25 0 i b )); DATA(insert ( 3353 17 0 i b )); DATA(insert ( 3353 25 0 i i )); +/* pg_dependencies can be coerced to, but not from, bytea and text */ +DATA(insert ( 3358 17 0 i b )); +DATA(insert ( 3358 25 0 i i )); + /* * Datetime category */ diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h index fad80a3..e119cb7 100644 --- a/src/include/catalog/pg_mv_statistic.h +++ b/src/include/catalog/pg_mv_statistic.h @@ -38,9 +38,11 @@ CATALOG(pg_mv_statistic,3381) /* statistics requested to build */ bool ndist_enabled; /* build ndist coefficient? */ + bool deps_enabled; /* analyze dependencies? */ /* statistics that are available (if requested) */ bool ndist_built; /* ndistinct coeff built */ + bool deps_built; /* dependencies were built */ /* * variable-length fields start here, but we allow direct access to @@ -50,6 +52,7 @@ CATALOG(pg_mv_statistic,3381) #ifdef CATALOG_VARLEN pg_ndistinct standist; /* ndistinct coeff (serialized) */ + pg_dependencies stadeps; /* dependencies (serialized) */ #endif } FormData_pg_mv_statistic; @@ -65,14 +68,17 @@ typedef FormData_pg_mv_statistic *Form_pg_mv_statistic; * compiler constants for pg_mv_statistic * ---------------- */ -#define Natts_pg_mv_statistic 8 +#define Natts_pg_mv_statistic 11 #define Anum_pg_mv_statistic_starelid 1 #define Anum_pg_mv_statistic_staname 2 #define Anum_pg_mv_statistic_stanamespace 3 #define Anum_pg_mv_statistic_staowner 4 #define Anum_pg_mv_statistic_ndist_enabled 5 -#define Anum_pg_mv_statistic_ndist_built 6 -#define Anum_pg_mv_statistic_stakeys 7 -#define Anum_pg_mv_statistic_standist 8 +#define Anum_pg_mv_statistic_deps_enabled 6 +#define
Anum_pg_mv_statistic_ndist_built 7 +#define Anum_pg_mv_statistic_deps_built 8 +#define Anum_pg_mv_statistic_stakeys 9 +#define Anum_pg_mv_statistic_standist 10 +#define Anum_pg_mv_statistic_stadeps 11 #endif /* PG_MV_STATISTIC_H */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 940a991..b1f7b75 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2735,6 +2735,15 @@ DESCR("I/O"); DATA(insert OID = 3357 ( pg_ndistinct_send PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3353" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_send _null_ _null_ _null_ )); DESCR("I/O"); +DATA(insert OID = 3359 ( pg_dependencies_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3358 "2275" _null_ _null_ _null_ _null_ _null_ pg_dependencies_in _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3360 ( pg_dependencies_out PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2275 "3358" _null_ _null_ _null_ _null_ _null_ pg_dependencies_out _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3361 ( pg_dependencies_recv PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 3358 "2281" _null_ _null_ _null_ _null_ _null_ pg_dependencies_recv _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3362 ( pg_dependencies_send PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3358" _null_ _null_ _null_ _null_ _null_ pg_dependencies_send _null_ _null_ _null_ )); +DESCR("I/O"); + DATA(insert OID = 1928 ( pg_stat_get_numscans PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ )); DESCR("statistics: number of scans done for table/index"); DATA(insert OID = 1929 ( pg_stat_get_tuples_returned PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ )); diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 9c9caf3..da637d4 100644 --- a/src/include/catalog/pg_type.h +++ 
b/src/include/catalog/pg_type.h @@ -368,6 +368,10 @@ DATA(insert OID = 3353 ( pg_ndistinct PGNSP PGUID -1 f b S f t \054 0 0 0 pg_nd DESCR("multivariate ndistinct coefficients"); #define PGNDISTINCTOID 3353 +DATA(insert OID = 3358 ( pg_dependencies PGNSP PGUID -1 f b S f t \054 0 0 0 pg_dependencies_in pg_dependencies_out pg_dependencies_recv pg_dependencies_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ )); +DESCR("multivariate functional dependencies"); +#define PGDEPENDENCIESOID 3358 + DATA(insert OID = 32 ( pg_ddl_command PGNSP PGUID SIZEOF_POINTER t p P f t \054 0 0 0 pg_ddl_command_in pg_ddl_command_out pg_ddl_command_recv pg_ddl_command_send - - - ALIGNOF_POINTER p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("internal type for passing CollectedCommand"); #define PGDDLCOMMANDOID 32 diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 18e1dd1..fe4b93a 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -617,6 +617,7 @@ typedef struct CreateStatsStmt List *defnames; /* qualified name (list of Value strings) */ RangeVar *relation; /* relation to build statistics on */ List *keys; /* String nodes naming referenced column(s) */ + List *options; /* list of DefElem nodes */ bool if_not_exists; /* do nothing if statistics already exists */ } CreateStatsStmt; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 7a55151..56957e8 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -681,9 +681,11 @@ typedef struct MVStatisticInfo RelOptInfo *rel; /* back-link to index's table */ /* enabled statistics */ + bool deps_enabled; /* functional dependencies enabled */ bool ndist_enabled; /* ndistinct coefficient enabled */ /* built/available statistics */ + bool deps_built; /* functional dependencies built */ bool ndist_built; /* ndistinct coefficient built */ /* columns in the statistics (attnums) */ diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 262ee94..9ffd80c 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -73,6 +73,10 @@ extern Datum pg_ndistinct_in(PG_FUNCTION_ARGS); extern Datum pg_ndistinct_out(PG_FUNCTION_ARGS); extern Datum pg_ndistinct_recv(PG_FUNCTION_ARGS); extern Datum pg_ndistinct_send(PG_FUNCTION_ARGS); +extern Datum pg_dependencies_in(PG_FUNCTION_ARGS); +extern Datum pg_dependencies_out(PG_FUNCTION_ARGS); +extern Datum pg_dependencies_recv(PG_FUNCTION_ARGS); +extern Datum pg_dependencies_send(PG_FUNCTION_ARGS); /* regexp.c */ extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive, diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h index 0660c59..e5a49bf 100644 --- a/src/include/utils/mvstats.h +++ b/src/include/utils/mvstats.h @@ -39,16 +39,49 @@ typedef struct MVNDistinctData { typedef MVNDistinctData *MVNDistinct; +#define MVSTAT_DEPS_MAGIC 0xB4549A2C /* marks serialized bytea */ +#define MVSTAT_DEPS_TYPE_BASIC 1 /* basic dependencies type */ + +/* + * Functional dependencies, tracking column-level relationships (values + * in one column determine values in another one). 
+ */ +typedef struct MVDependencyData +{ + double degree; /* degree of validity (0-1) */ + int nattributes; /* number of attributes (at least two) */ + int16 attributes[FLEXIBLE_ARRAY_MEMBER]; /* attribute numbers (the last one is printed as the implied one) */ +} MVDependencyData; + +typedef MVDependencyData *MVDependency; + +typedef struct MVDependenciesData +{ + uint32 magic; /* magic constant marker (MVSTAT_DEPS_MAGIC) */ + uint32 type; /* type of MV Dependencies (BASIC) */ + int32 ndeps; /* number of dependencies */ + MVDependency deps[FLEXIBLE_ARRAY_MEMBER]; /* dependencies (ndeps pointers) */ +} MVDependenciesData; + +typedef MVDependenciesData *MVDependencies; + + + MVNDistinct load_mv_ndistinct(Oid mvoid); bytea *serialize_mv_ndistinct(MVNDistinct ndistinct); +bytea *serialize_mv_dependencies(MVDependencies dependencies); /* deserialization of stats (serialization is private to analyze) */ MVNDistinct deserialize_mv_ndistinct(bytea *data); - +MVDependencies deserialize_mv_dependencies(bytea *data); MVNDistinct build_mv_ndistinct(double totalrows, int numrows, HeapTuple *rows, - int2vector *attrs, VacAttrStats **stats); + int2vector *attrs, VacAttrStats **stats); + +MVDependencies build_mv_dependencies(int numrows, HeapTuple *rows, + int2vector *attrs, + VacAttrStats **stats); void build_mv_stats(Relation onerel, double totalrows, int numrows, HeapTuple *rows, diff --git a/src/test/regress/expected/mv_dependencies.out b/src/test/regress/expected/mv_dependencies.out new file mode 100644 index 0000000..d442a16 --- /dev/null +++ b/src/test/regress/expected/mv_dependencies.out @@ -0,0 +1,147 @@ +-- data type passed by value +CREATE TABLE functional_dependencies ( + a INT, + b INT, + c INT +); +-- unknown column +CREATE STATISTICS s1 WITH (dependencies) ON (unknown_column) FROM functional_dependencies; +ERROR: column "unknown_column" referenced in statistics does not exist +-- single column +CREATE STATISTICS s1 WITH (dependencies) ON (a) FROM functional_dependencies; +ERROR: statistics require at least 2 columns +-- single column, duplicated +CREATE
STATISTICS s1 WITH (dependencies) ON (a,a) FROM functional_dependencies; +ERROR: duplicate column name in statistics definition +-- two columns, one duplicated +CREATE STATISTICS s1 WITH (dependencies) ON (a, a, b) FROM functional_dependencies; +ERROR: duplicate column name in statistics definition +-- correct command +CREATE STATISTICS s1 WITH (dependencies) ON (a, b, c) FROM functional_dependencies; +-- random data (no functional dependencies) +INSERT INTO functional_dependencies + SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+--------- + t | f | +(1 row) + +TRUNCATE functional_dependencies; +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.999900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}] +(1 row) + +TRUNCATE functional_dependencies; +-- a => b, a => c +INSERT INTO functional_dependencies + SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 
: 0.494900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}] +(1 row) + +TRUNCATE functional_dependencies; +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}] +(1 row) + +DROP TABLE functional_dependencies; +-- varlena type (text) +CREATE TABLE functional_dependencies ( + a TEXT, + b TEXT, + c TEXT +); +CREATE STATISTICS s2 WITH (dependencies) ON (a, b, c) FROM functional_dependencies; +-- random data (no functional dependencies) +INSERT INTO functional_dependencies + SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+--------- + t | f | +(1 row) + +TRUNCATE functional_dependencies; +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.999900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}] +(1 row) + +TRUNCATE 
functional_dependencies; +-- a => b, a => c +INSERT INTO functional_dependencies + SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.494900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}] +(1 row) + +TRUNCATE functional_dependencies; +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps +--------------+------------+----------------------------------------------------------------------------------------------------------------- + t | t | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}] +(1 row) + +DROP TABLE functional_dependencies; +-- NULL values (mix of int and text columns) +CREATE TABLE functional_dependencies ( + a INT, + b TEXT, + c INT, + d TEXT +); +CREATE STATISTICS s3 WITH (dependencies) ON (a, b, c, d) FROM functional_dependencies; +INSERT INTO functional_dependencies + SELECT + mod(i, 100), + (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END), + mod(i, 400), + (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END) + FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + deps_enabled | deps_built | stadeps 
+--------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + t | t | [{1 => 0 : 1.000000}, {2 => 0 : 1.000000}, {2 => 1 : 1.000000}, {3 => 0 : 1.000000}, {3 => 1 : 0.996700}, {0, 2 => 1 : 1.000000}, {0, 3 => 1 : 0.996700}, {1, 2 => 0 : 1.000000}, {1, 3 => 0 : 1.000000}, {2, 3 => 0 : 1.000000}, {2, 3 => 1 : 1.000000}, {0, 2, 3 => 1 : 1.000000}, {1, 2, 3 => 0 : 1.000000}] +(1 row) + +DROP TABLE functional_dependencies; diff --git a/src/test/regress/expected/mv_ndistinct.out b/src/test/regress/expected/mv_ndistinct.out index 5f55091..06a7634 100644 --- a/src/test/regress/expected/mv_ndistinct.out +++ b/src/test/regress/expected/mv_ndistinct.out @@ -6,19 +6,19 @@ CREATE TABLE ndistinct ( d INT ); -- unknown column -CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (unknown_column) FROM ndistinct; ERROR: column "unknown_column" referenced in statistics does not exist -- single column -CREATE STATISTICS s10 ON (a) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a) FROM ndistinct; ERROR: statistics require at least 2 columns -- single column, duplicated -CREATE STATISTICS s10 ON (a,a) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a,a) FROM ndistinct; ERROR: duplicate column name in statistics definition -- two columns, one duplicated -CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a, a, b) FROM ndistinct; ERROR: duplicate column name in statistics definition -- correct command -CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a, b, c) FROM ndistinct; -- perfectly correlated groups INSERT INTO ndistinct SELECT i/100, i/100, i/100 FROM 
generate_series(1,10000) s(i); diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 2b5c022..f574554 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -38,7 +38,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); -CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable; +CREATE STATISTICS addr_nsp.gentable_stat WITH (ndistinct) ON (a,b) FROM addr_nsp.gentable; -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); ERROR: unrecognized object type "stone" diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 9a26205..db1cf8a 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -818,11 +818,12 @@ WHERE c.castmethod = 'b' AND character varying | character | 0 | i pg_node_tree | text | 0 | i pg_ndistinct | bytea | 0 | i + pg_dependencies | bytea | 0 | i cidr | inet | 0 | i xml | text | 0 | a xml | character varying | 0 | a xml | character | 0 | a -(8 rows) +(9 rows) -- **************** pg_conversion **************** -- Look for illegal values in pg_conversion fields. 
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 2c54779..39179a6 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1380,7 +1380,8 @@ pg_mv_stats| SELECT n.nspname AS schemaname, c.relname AS tablename, s.staname, s.stakeys AS attnums, - length((s.standist)::text) AS ndistbytes + length((s.standist)::bytea) AS ndistbytes, + length((s.stadeps)::bytea) AS depsbytes FROM ((pg_mv_statistic s JOIN pg_class c ON ((c.oid = s.starelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))); diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index 6281cef..b0b40ca 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -67,12 +67,13 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%' (SELECT 1 FROM pg_type as p2 WHERE p2.typname = ('_' || p1.typname)::name AND p2.typelem = p1.oid and p1.typarray = p2.oid); - oid | typname -------+-------------- + oid | typname +------+----------------- 194 | pg_node_tree 3353 | pg_ndistinct + 3358 | pg_dependencies 210 | smgr -(3 rows) +(4 rows) -- Make sure typarray points to a varlena array type of our own base SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 0273ea6..fda9166 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -118,4 +118,4 @@ test: event_trigger test: stats # run tests of multivariate stats -test: mv_ndistinct +test: mv_ndistinct mv_dependencies diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f7f3a14..90d74d2 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -172,3 +172,4 @@ test: xml test: event_trigger test: stats test: mv_ndistinct +test: mv_dependencies diff --git 
a/src/test/regress/sql/mv_dependencies.sql b/src/test/regress/sql/mv_dependencies.sql new file mode 100644 index 0000000..43df798 --- /dev/null +++ b/src/test/regress/sql/mv_dependencies.sql @@ -0,0 +1,139 @@ +-- data type passed by value +CREATE TABLE functional_dependencies ( + a INT, + b INT, + c INT +); + +-- unknown column +CREATE STATISTICS s1 WITH (dependencies) ON (unknown_column) FROM functional_dependencies; + +-- single column +CREATE STATISTICS s1 WITH (dependencies) ON (a) FROM functional_dependencies; + +-- single column, duplicated +CREATE STATISTICS s1 WITH (dependencies) ON (a,a) FROM functional_dependencies; + +-- two columns, one duplicated +CREATE STATISTICS s1 WITH (dependencies) ON (a, a, b) FROM functional_dependencies; + +-- correct command +CREATE STATISTICS s1 WITH (dependencies) ON (a, b, c) FROM functional_dependencies; + +-- random data (no functional dependencies) +INSERT INTO functional_dependencies + SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i); + +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i); + +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c +INSERT INTO functional_dependencies + SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); 
+ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +DROP TABLE functional_dependencies; + +-- varlena type (text) +CREATE TABLE functional_dependencies ( + a TEXT, + b TEXT, + c TEXT +); + +CREATE STATISTICS s2 WITH (dependencies) ON (a, b, c) FROM functional_dependencies; + +-- random data (no functional dependencies) +INSERT INTO functional_dependencies + SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i); + +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i); + +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c +INSERT INTO functional_dependencies + SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i); +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +TRUNCATE functional_dependencies; + +-- a => b, a => c, b => c +INSERT INTO functional_dependencies + SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i); +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +DROP TABLE functional_dependencies; + +-- NULL values (mix of int and text columns) +CREATE TABLE functional_dependencies ( + a INT, + b TEXT, + c INT, + d TEXT +); + +CREATE STATISTICS s3 WITH (dependencies) ON (a, b, c, d) FROM functional_dependencies; + +INSERT INTO functional_dependencies + SELECT + mod(i, 100), + 
(CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END), + mod(i, 400), + (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END) + FROM generate_series(1,10000) s(i); + +ANALYZE functional_dependencies; + +SELECT deps_enabled, deps_built, stadeps + FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass; + +DROP TABLE functional_dependencies; diff --git a/src/test/regress/sql/mv_ndistinct.sql b/src/test/regress/sql/mv_ndistinct.sql index 5cef254..43024ca 100644 --- a/src/test/regress/sql/mv_ndistinct.sql +++ b/src/test/regress/sql/mv_ndistinct.sql @@ -7,19 +7,19 @@ CREATE TABLE ndistinct ( ); -- unknown column -CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (unknown_column) FROM ndistinct; -- single column -CREATE STATISTICS s10 ON (a) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a) FROM ndistinct; -- single column, duplicated -CREATE STATISTICS s10 ON (a,a) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a,a) FROM ndistinct; -- two columns, one duplicated -CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a, a, b) FROM ndistinct; -- correct command -CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; +CREATE STATISTICS s10 WITH (ndistinct) ON (a, b, c) FROM ndistinct; -- perfectly correlated groups INSERT INTO ndistinct diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 791b942..902599b 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -41,7 +41,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); -CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable; +CREATE STATISTICS addr_nsp.gentable_stat WITH (ndistinct) ON (a,b) 
FROM addr_nsp.gentable; -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); -- 2.5.5