From e04f7a0b43dc914d5b661723e1a4a14abc1df4ef Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Sun, 23 Oct 2016 17:36:25 +0200
Subject: [PATCH 3/9] PATCH: functional dependencies (only the ANALYZE part)

- implementation of soft functional dependencies (ANALYZE etc.)
- updates existing regression tests (new catalog etc.)
- new regression test for functional dependencies
- pg_dependencies data type (varlena-based)

The algorithm detecting the dependencies is rather simple and probably
needs improvements, so that it detects more complicated dependencies,
and also validation of the math.

The patch introduces pg_dependencies, a new varlena data type for
storing serialized version of functional dependencies. This is similar
to what pg_ndistinct does for ndistinct coefficients.
---
 doc/src/sgml/catalogs.sgml                    |  30 ++
 doc/src/sgml/ref/create_statistics.sgml       |  42 +-
 src/backend/catalog/system_views.sql          |   3 +-
 src/backend/commands/statscmds.c              |  37 +-
 src/backend/nodes/copyfuncs.c                 |   1 +
 src/backend/nodes/outfuncs.c                  |   2 +
 src/backend/optimizer/util/plancat.c          |   4 +-
 src/backend/parser/gram.y                     |  14 +-
 src/backend/utils/mvstats/Makefile            |   2 +-
 src/backend/utils/mvstats/README.dependencies | 118 +++++
 src/backend/utils/mvstats/common.c            |  26 +-
 src/backend/utils/mvstats/dependencies.c      | 622 ++++++++++++++++++++++++++
 src/include/catalog/pg_cast.h                 |   4 +
 src/include/catalog/pg_mv_statistic.h         |  14 +-
 src/include/catalog/pg_proc.h                 |   9 +
 src/include/catalog/pg_type.h                 |   4 +
 src/include/nodes/parsenodes.h                |   1 +
 src/include/nodes/relation.h                  |   2 +
 src/include/utils/builtins.h                  |   4 +
 src/include/utils/mvstats.h                   |  37 +-
 src/test/regress/expected/mv_dependencies.out | 147 ++++++
 src/test/regress/expected/mv_ndistinct.out    |  10 +-
 src/test/regress/expected/object_address.out  |   2 +-
 src/test/regress/expected/opr_sanity.out      |   3 +-
 src/test/regress/expected/rules.out           |   3 +-
 src/test/regress/expected/type_sanity.out     |   7 +-
 src/test/regress/parallel_schedule            |   2 +-
 src/test/regress/serial_schedule              |   1 +
 src/test/regress/sql/mv_dependencies.sql      | 139 ++++++
 src/test/regress/sql/mv_ndistinct.sql         |  10 +-
 src/test/regress/sql/object_address.sql       |   2 +-
 31 files changed, 1261 insertions(+), 41 deletions(-)
 create mode 100644 src/backend/utils/mvstats/README.dependencies
 create mode 100644 src/backend/utils/mvstats/dependencies.c
 create mode 100644 src/test/regress/expected/mv_dependencies.out
 create mode 100644 src/test/regress/sql/mv_dependencies.sql
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 2a7bd6c..852f573 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -4285,6 +4285,17 @@
      </row>
 
      <row>
+      <entry><structfield>deps_enabled</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>
+       If true, functional dependencies will be computed for the combination of
+       columns, covered by the statistics. This does not mean the dependencies
+       are already computed, though.
+      </entry>
+     </row>
+
+     <row>
       <entry><structfield>ndist_built</structfield></entry>
       <entry><type>bool</type></entry>
       <entry></entry>
@@ -4295,6 +4306,16 @@
      </row>
 
      <row>
+      <entry><structfield>deps_built</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>
+       If true, functional dependencies are already computed and available for
+       use during query estimation.
+      </entry>
+     </row>
+
+     <row>
       <entry><structfield>stakeys</structfield></entry>
       <entry><type>int2vector</type></entry>
       <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
@@ -4314,6 +4335,15 @@
       </entry>
      </row>
 
+     <row>
+      <entry><structfield>stadeps</structfield></entry>
+      <entry><type>pg_dependencies</type></entry>
+      <entry></entry>
+      <entry>
+       Functional dependencies, serialized as <structname>pg_dependencies</> type.
+      </entry>
+     </row>
+
     </tbody>
    </tgroup>
   </table>
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index 9f6a65c..eaa39ee 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -21,8 +21,9 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
-CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (
-  <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable>
+  WITH ( <replaceable class="PARAMETER">option</replaceable> [= <replaceable class="PARAMETER">value</replaceable>] [, ... ] )
+  ON ( <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
   FROM <replaceable class="PARAMETER">table_name</replaceable>
 </synopsis>
 
@@ -99,6 +100,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
 
   </variablelist>
 
+  <refsect2 id="SQL-CREATESTATISTICS-parameters">
+   <title id="SQL-CREATESTATISTICS-parameters-title">Parameters</title>
+
+ <indexterm zone="sql-createstatistics-parameters">
+  <primary>statistics parameters</primary>
+ </indexterm>
+
+   <para>
+    The <literal>WITH</> clause can specify <firstterm>options</>
+    for statistics. The currently available parameters are listed below.
+   </para>
+
+   <variablelist>
+
+   <varlistentry>
+    <term><literal>dependencies</> (<type>boolean</>)</term>
+    <listitem>
+     <para>
+      Enables functional dependencies for the statistics.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><literal>ndistinct</> (<type>boolean</>)</term>
+    <listitem>
+     <para>
+      Enables ndistinct coefficients for the statistics.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   </variablelist>
+
+  </refsect2>
  </refsect1>
 
  <refsect1 id="SQL-CREATESTATISTICS-examples">
@@ -119,7 +155,7 @@ CREATE TABLE t1 (
 INSERT INTO t1 SELECT i/100, i/500
                  FROM generate_series(1,1000000) s(i);
 
-CREATE STATISTICS s1 ON (a, b) FROM t1;
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t1;
 
 ANALYZE t1;
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 00ab440..216ece5 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -187,7 +187,8 @@ CREATE VIEW pg_mv_stats AS
         C.relname AS tablename,
         S.staname AS staname,
         S.stakeys AS attnums,
-        length(s.standist) AS ndistbytes
+        length(s.standist::bytea) AS ndistbytes,
+        length(S.stadeps::bytea) AS depsbytes
     FROM (pg_mv_statistic S JOIN pg_class C ON (C.oid = S.starelid))
         LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
 
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index bde7e4b..af4f4d3 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -38,7 +38,9 @@ compare_int16(const void *a, const void *b)
 }
 
 /*
- * Implements the CREATE STATISTICS name ON (columns) FROM table
+ * Implements the CREATE STATISTICS command with syntax:
+ *
+ *    CREATE STATISTICS name WITH (options) ON (columns) FROM table
  *
  * We do require that the types support sorting (ltopr), although some
  * statistics might work with  equality only.
@@ -66,6 +68,10 @@ CreateStatistics(CreateStatsStmt *stmt)
 	ObjectAddress parentobject,
 				childobject;
 
+	/* by default build nothing */
+	bool		build_ndistinct = false,
+				build_dependencies = false;
+
 	Assert(IsA(stmt, CreateStatsStmt));
 
 	/* resolve the pieces of the name (namespace etc.) */
@@ -151,6 +157,31 @@ CreateStatistics(CreateStatsStmt *stmt)
 					(errcode(ERRCODE_UNDEFINED_COLUMN),
 			  errmsg("duplicate column name in statistics definition")));
 
+	/*
+	 * Parse the statistics options - currently only statistics types are
+	 * recognized (ndistinct, dependencies).
+	 */
+	foreach(l, stmt->options)
+	{
+		DefElem    *opt = (DefElem *) lfirst(l);
+
+		if (strcmp(opt->defname, "ndistinct") == 0)
+			build_ndistinct = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "dependencies") == 0)
+			build_dependencies = defGetBoolean(opt);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("unrecognized STATISTICS option \"%s\"",
+							opt->defname)));
+	}
+
+	/* Make sure there's at least one statistics type specified. */
+	if (! (build_ndistinct || build_dependencies))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("no statistics type (ndistinct, dependencies) requested")));
+
 	stakeys = buildint2vector(attnums, numcols);
 
 	/*
@@ -170,9 +201,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 	values[Anum_pg_mv_statistic_stakeys - 1] = PointerGetDatum(stakeys);
 
 	/* enabled statistics */
-	values[Anum_pg_mv_statistic_ndist_enabled - 1] = BoolGetDatum(true);
+	values[Anum_pg_mv_statistic_ndist_enabled - 1] = BoolGetDatum(build_ndistinct);
+	values[Anum_pg_mv_statistic_deps_enabled - 1] = BoolGetDatum(build_dependencies);
 
 	nulls[Anum_pg_mv_statistic_standist - 1] = true;
+	nulls[Anum_pg_mv_statistic_stadeps - 1] = true;
 
 	/* insert the tuple into pg_mv_statistic */
 	mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock);
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index dc42be0..6e465a7 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -4357,6 +4357,7 @@ _copyCreateStatsStmt(const CreateStatsStmt *from)
 	COPY_NODE_FIELD(defnames);
 	COPY_NODE_FIELD(relation);
 	COPY_NODE_FIELD(keys);
+	COPY_NODE_FIELD(options);
 	COPY_SCALAR_FIELD(if_not_exists);
 
 	return newnode;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 57cc0b4..c72473b 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2202,9 +2202,11 @@ _outMVStatisticInfo(StringInfo str, const MVStatisticInfo *node)
 
 	/* enabled statistics */
 	WRITE_BOOL_FIELD(ndist_enabled);
+	WRITE_BOOL_FIELD(deps_enabled);
 
 	/* built/available statistics */
 	WRITE_BOOL_FIELD(ndist_built);
+	WRITE_BOOL_FIELD(deps_built);
 }
 
 static void
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index fc9ad93..8129143 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1287,7 +1287,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
 		mvstat = (Form_pg_mv_statistic) GETSTRUCT(htup);
 
 		/* unavailable stats are not interesting for the planner */
-		if (mvstat->ndist_built)
+		if (mvstat->deps_built || mvstat->ndist_built)
 		{
 			info = makeNode(MVStatisticInfo);
 
@@ -1296,9 +1296,11 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
 
 			/* enabled statistics */
 			info->ndist_enabled = mvstat->ndist_enabled;
+			info->deps_enabled = mvstat->deps_enabled;
 
 			/* built/available statistics */
 			info->ndist_built = mvstat->ndist_built;
+			info->deps_built = mvstat->deps_built;
 
 			/* stakeys */
 			adatum = SysCacheGetAttr(MVSTATOID, htup,
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 475a8a6..f61765f 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -3756,21 +3756,23 @@ ExistingIndex:   USING INDEX index_name				{ $$ = $3; }
  *****************************************************************************/
 
 
-CreateStatsStmt:	CREATE STATISTICS any_name ON '(' columnList ')' FROM qualified_name
+CreateStatsStmt:	CREATE STATISTICS any_name opt_reloptions ON '(' columnList ')' FROM qualified_name
 						{
 							CreateStatsStmt *n = makeNode(CreateStatsStmt);
 							n->defnames = $3;
-							n->relation = $9;
-							n->keys = $6;
+							n->relation = $10;
+							n->keys = $7;
+							n->options = $4;
 							n->if_not_exists = false;
 							$$ = (Node *)n;
 						}
-					| CREATE STATISTICS IF_P NOT EXISTS any_name ON '(' columnList ')' FROM qualified_name
+					| CREATE STATISTICS IF_P NOT EXISTS any_name opt_reloptions ON '(' columnList ')' FROM qualified_name
 						{
 							CreateStatsStmt *n = makeNode(CreateStatsStmt);
 							n->defnames = $6;
-							n->relation = $12;
-							n->keys = $9;
+							n->relation = $13;
+							n->keys = $10;
+							n->options = $7;
 							n->if_not_exists = true;
 							$$ = (Node *)n;
 						}
diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile
index 7295d46..21fe7e5 100644
--- a/src/backend/utils/mvstats/Makefile
+++ b/src/backend/utils/mvstats/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mvstats
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = common.o mvdist.o
+OBJS = common.o dependencies.o mvdist.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mvstats/README.dependencies b/src/backend/utils/mvstats/README.dependencies
new file mode 100644
index 0000000..908f094
--- /dev/null
+++ b/src/backend/utils/mvstats/README.dependencies
@@ -0,0 +1,118 @@
+Soft functional dependencies
+============================
+
+Functional dependencies are a concept well described in relational theory,
+particularly in definition of normalization and "normal forms". Wikipedia
+has a nice definition of a functional dependency [1]:
+
+    In a given table, an attribute Y is said to have a functional dependency
+    on a set of attributes X (written X -> Y) if and only if each X value is
+    associated with precisely one Y value. For example, in an "Employee"
+    table that includes the attributes "Employee ID" and "Employee Date of
+    Birth", the functional dependency
+
+        {Employee ID} -> {Employee Date of Birth}
+
+    would hold. It follows from the previous two sentences that each
+    {Employee ID} is associated with precisely one {Employee Date of Birth}.
+
+    [1] https://en.wikipedia.org/wiki/Functional_dependency
+
+In practical terms, functional dependencies mean that a value in one column
+determines values in some other column. Consider for example this trivial
+table with two integer columns:
+
+    CREATE TABLE t (a INT, b INT)
+        AS SELECT i, i/10 FROM generate_series(1,100000) s(i);
+
+Clearly, knowledge of the value in column 'a' is sufficient to determine the
+value in column 'b', as it's simply (a/10). A more practical example may be
+addresses, where the knowledge of a ZIP code (usually) determines city. Larger
+cities may have multiple ZIP codes, so the dependency can't be reversed.
+
+Many datasets might be normalized not to contain such dependencies, but often
+it's not practical for various reasons. In some cases it's actually a conscious
+design choice to model the dataset in denormalized way, either because of
+performance or to make querying easier.
+
+
+soft dependencies
+-----------------
+
+Real-world data sets often contain data errors, either because of data entry
+mistakes (user mistyping the ZIP code) or perhaps issues in generating the
+data (e.g. a ZIP code mistakenly assigned to two cities in different states).
+
+A strict implementation would either ignore dependencies in such cases,
+rendering the approach mostly useless even for slightly noisy data sets, or
+result in sudden changes in behavior depending on minor differences between
+samples provided to ANALYZE.
+
+For this reason the statistics implements "soft" functional dependencies,
+associating each functional dependency with a degree of validity (a number
+between 0 and 1). This degree is then used to combine selectivities in a
+smooth manner.
+
+
+Mining dependencies (ANALYZE)
+-----------------------------
+
+The current algorithm is fairly simple - generate all possible functional
+dependencies, and for each one count the number of rows consistent with it.
+Then use the fraction of rows (supporting/total) as the degree.
+
+To count the rows consistent with the dependency (a => b):
+
+ (a) Sort the data lexicographically, i.e. first by 'a' then 'b'.
+
+ (b) For each group of rows with the same 'a' value, count the number of
+     distinct values in 'b'.
+
+ (c) If there's a single distinct value in 'b', the rows are consistent with
+     the functional dependency. Otherwise they contradict it.
+
+The algorithm also requires a minimum size of the group to consider it
+consistent (currently 3 rows in the sample). Small groups make it less likely
+to break the consistency.
+
+
+Clause reduction (planner/optimizer)
+------------------------------------
+
+Applying the functional dependencies is fairly simple - given a list of
+equality clauses, we compute selectivities of each clause and then use the
+degree to combine them using this formula
+
+    P(a=?,b=?) = P(a=?) * (d + (1-d) * P(b=?))
+
+Where 'd' is the degree of functional dependence (a=>b).
+
+With more than two equality clauses, this process happens recursively. For
+example for (a,b,c) we first use (a,b=>c) to break the computation into
+
+    P(a=?,b=?,c=?) = P(a=?,b=?) * (d + (1-d)*P(c=?))
+
+and then apply (a=>b) the same way on P(a=?,b=?).
+
+
+Consistency of clauses
+----------------------
+
+Functional dependencies only express general dependencies between columns,
+without referencing particular values. This assumes that the equality clauses
+are in fact consistent with the functional dependency, i.e. that given a
+dependency (a=>b), the value in (b=?) clause is the value determined by (a=?).
+If that's not the case, the clauses are "inconsistent" with the functional
+dependency and the result will be over-estimation.
+
+This may happen for example when using conditions on ZIP and city name with
+mismatching values (ZIP for a different city), etc. In such case the result
+set will be empty, but we'll estimate the selectivity using the ZIP condition.
+
+In this case the default estimation based on AVIA principle happens to work
+better, but mostly by chance.
+
+This issue is the price for the simplicity of functional dependencies. If the
+application frequently constructs queries with clauses inconsistent with
+functional dependencies present in the data, the best solution is not to
+use functional dependencies, but one of the more complex types of statistics.
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
index 7d2f3f3..4b570a1 100644
--- a/src/backend/utils/mvstats/common.c
+++ b/src/backend/utils/mvstats/common.c
@@ -21,7 +21,8 @@ static VacAttrStats **lookup_var_attr_stats(int2vector *attrs,
 
 static List *list_mv_stats(Oid relid);
 
-static void update_mv_stats(Oid relid, MVNDistinct ndistinct,
+static void update_mv_stats(Oid relid,
+					  MVNDistinct ndistinct, MVDependencies dependencies,
 					  int2vector *attrs, VacAttrStats **stats);
 
 
@@ -53,6 +54,7 @@ build_mv_stats(Relation onerel, double totalrows,
 		int			j;
 		MVStatisticInfo *stat = (MVStatisticInfo *) lfirst(lc);
 		MVNDistinct	ndistinct = NULL;
+		MVDependencies deps = NULL;
 
 		VacAttrStats **stats = NULL;
 		int			numatts = 0;
@@ -89,8 +91,12 @@ build_mv_stats(Relation onerel, double totalrows,
 		if (stat->ndist_enabled)
 			ndistinct = build_mv_ndistinct(totalrows, numrows, rows, attrs, stats);
 
+		/* analyze functional dependencies between the columns */
+		if (stat->deps_enabled)
+			deps = build_mv_dependencies(numrows, rows, attrs, stats);
+
 		/* store the statistics in the catalog */
-		update_mv_stats(stat->mvoid, ndistinct, attrs, stats);
+		update_mv_stats(stat->mvoid, ndistinct, deps, attrs, stats);
 	}
 }
 
@@ -170,6 +176,8 @@ list_mv_stats(Oid relid)
 		info->stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
 		info->ndist_enabled = stats->ndist_enabled;
 		info->ndist_built = stats->ndist_built;
+		info->deps_enabled = stats->deps_enabled;
+		info->deps_built = stats->deps_built;
 
 		result = lappend(result, info);
 	}
@@ -191,7 +199,7 @@ list_mv_stats(Oid relid)
  *	Serializes the statistics and stores them into the pg_mv_statistic tuple.
  */
 static void
-update_mv_stats(Oid mvoid, MVNDistinct ndistinct,
+update_mv_stats(Oid mvoid, MVNDistinct ndistinct, MVDependencies dependencies,
 				int2vector *attrs, VacAttrStats **stats)
 {
 	HeapTuple	stup,
@@ -218,18 +226,29 @@ update_mv_stats(Oid mvoid, MVNDistinct ndistinct,
 		values[Anum_pg_mv_statistic_standist-1] = PointerGetDatum(data);
 	}
 
+	if (dependencies != NULL)
+	{
+		nulls[Anum_pg_mv_statistic_stadeps - 1] = false;
+		values[Anum_pg_mv_statistic_stadeps - 1]
+			= PointerGetDatum(serialize_mv_dependencies(dependencies));
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_mv_statistic_standist - 1] = true;
+	replaces[Anum_pg_mv_statistic_stadeps - 1] = true;
 
 	/* always change the availability flags */
 	nulls[Anum_pg_mv_statistic_ndist_built - 1] = false;
+	nulls[Anum_pg_mv_statistic_deps_built - 1] = false;
 	nulls[Anum_pg_mv_statistic_stakeys - 1] = false;
 
 	/* use the new attnums, in case we removed some dropped ones */
 	replaces[Anum_pg_mv_statistic_ndist_built - 1] = true;
+	replaces[Anum_pg_mv_statistic_deps_built - 1] = true;
 	replaces[Anum_pg_mv_statistic_stakeys - 1] = true;
 
 	values[Anum_pg_mv_statistic_ndist_built - 1] = BoolGetDatum(ndistinct != NULL);
+	values[Anum_pg_mv_statistic_deps_built - 1] = BoolGetDatum(dependencies != NULL);
 
 	values[Anum_pg_mv_statistic_stakeys - 1] = PointerGetDatum(attrs);
 
@@ -370,6 +389,7 @@ multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
 							   &mss->ssup[dim]);
 }
 
+/* compare all the dimensions in a given range (inclusive) */
 int
 multi_sort_compare_dims(int start, int end,
 						const SortItem *a, const SortItem *b,
diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c
new file mode 100644
index 0000000..c6390e2
--- /dev/null
+++ b/src/backend/utils/mvstats/dependencies.c
@@ -0,0 +1,622 @@
+/*-------------------------------------------------------------------------
+ *
+ * dependencies.c
+ *	  POSTGRES multivariate functional dependencies
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/dependencies.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+
+#include "utils/bytea.h"
+#include "utils/lsyscache.h"
+
+/*
+ * Internal state of the dependency generator. Dependencies are similar to
+ * k-permutations of n elements, except that the order does not matter for the
+ * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
+ */
+typedef struct DependencyGeneratorData
+{
+	int		k;					/* size of the dependency */
+	int		current;			/* next dependency to return (index) */
+	int		ndependencies;		/* number of dependencies generated */
+	int	   *dependencies;		/* pre-generated dependencies, k ints each */
+} DependencyGeneratorData;
+
+typedef DependencyGeneratorData *DependencyGenerator;
+
+static void
+generate_dependencies_recurse(DependencyGenerator state,
+							  int n, int index, int start, int *current)
+{
+	/*
+	 * Fills position 'index' of 'current'. The first (k-1) elements are
+	 * handled differently from the last (implied) element.
+	 */
+	if (index < (state->k - 1))
+	{
+		int i;
+
+		/*
+		 * The first (k-1) values have to be in ascending order, which we
+		 * generate recursively (the next position starts at i+1).
+		 */
+
+		for (i = start; i < n; i++)
+		{
+			current[index] = i;
+			generate_dependencies_recurse(state, n, (index+1), (i+1), current);
+		}
+	}
+	else
+	{
+		int i;
+
+		/*
+		 * The last element is the implied value, which does not respect the
+		 * ascending order. We just need to check that the value is not in the
+		 * first (k-1) elements.
+		 */
+
+		for (i = 0; i < n; i++)
+		{
+			int		j;
+			bool	match = false;
+
+			current[index] = i;
+
+			for (j = 0; j < index; j++)
+			{
+				if (current[j] == i)
+				{
+					match = true;
+					break;
+				}
+			}
+
+			/*
+			 * If the value is not found in the first part of the dependency,
+			 * the dependency is complete - append a copy to the result array.
+			 */
+			if (! match)
+			{
+				state->dependencies
+					= (int*)repalloc(state->dependencies,
+									 state->k * (state->ndependencies + 1) * sizeof(int));
+				memcpy(&state->dependencies[(state->k * state->ndependencies)],
+					   current, state->k * sizeof(int));
+				state->ndependencies++;
+			}
+		}
+	}
+}
+
+/* generate all dependencies of size state->k over n elements into state */
+static void
+generate_dependencies(DependencyGenerator state, int n)
+{
+	int	   *current = (int *) palloc0(sizeof(int) * state->k);
+
+	generate_dependencies_recurse(state, n, 0, 0, current);
+
+	pfree(current);
+}
+
+/*
+ * initialize the DependencyGenerator, and pre-build all variations of size k
+ *
+ * All the variations are generated eagerly here. We could also generate them
+ * in DependencyGenerator_next(), but this seems simpler.
+ */
+static DependencyGenerator
+DependencyGenerator_init(int2vector *attrs, int k)
+{
+	int			n = attrs->dim1;
+	DependencyGenerator state;
+
+	Assert((n >= k) && (k > 0));
+
+	/* allocate the state, plus a separate (growable) array of dependencies */
+	state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
+	state->dependencies = (int*)palloc(k * sizeof(int));
+
+	state->ndependencies = 0;
+	state->current = 0;
+	state->k = k;
+
+	/* now actually pre-generate all the variations */
+	generate_dependencies(state, n);
+
+	return state;
+}
+
+/* free the DependencyGenerator state, including the dependencies array */
+static void
+DependencyGenerator_free(DependencyGenerator state)
+{
+	pfree(state->dependencies);	/* separate (repalloc'd) chunk, not part of state */
+	pfree(state);
+}
+
+/* return the next pre-built dependency (k ints), or NULL when exhausted */
+static int *
+DependencyGenerator_next(DependencyGenerator state, int2vector *attrs)
+{
+	if (state->current == state->ndependencies)
+		return NULL;
+
+	return &state->dependencies[state->k * state->current++];
+}
+
+
+/*
+ * validates functional dependency on the data
+ *
+ * The actual workhorse of detecting functional dependencies. Given a variation
+ * of k attributes, it computes the fraction of sampled rows consistent with
+ * the first (k-1) attributes determining the last one.
+ */
+static double
+dependency_degree(int numrows, HeapTuple *rows, int k, int *dependency,
+				  VacAttrStats **stats, int2vector *attrs)
+{
+	int			i,
+				j;
+	int			nvalues = numrows * k;
+	MultiSortSupport mss;
+	SortItem   *items;
+	Datum	   *values;
+	bool	   *isnull;
+
+	/*
+	 * XXX Maybe the threshold should be somehow related to the number of
+	 * distinct values in the combination of columns we're analyzing. Assuming
+	 * the distribution is uniform, we can estimate the average group size and
+	 * use it as a threshold, similarly to what we do for MCV lists.
+	 */
+	int			min_group_size = 3;
+
+	/* counters valid within a group */
+	int			group_size = 0;
+	int			n_violations = 0;
+
+	/* total number of rows supporting (consistent with) the dependency */
+	int			n_supporting_rows = 0;
+
+	/* Make sure we have at least two input attributes. */
+	Assert(k >= 2);
+
+	/* sort info for all k attributes of the dependency */
+	mss = multi_sort_init(k);
+
+	/* data for the sort */
+	items = (SortItem *) palloc0(numrows * sizeof(SortItem));
+	values = (Datum *) palloc0(sizeof(Datum) * nvalues);
+	isnull = (bool *) palloc0(sizeof(bool) * nvalues);
+
+	/* fix the pointers to values/isnull */
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * k];
+		items[i].isnull = &isnull[i * k];
+	}
+
+	/*
+	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
+	 *
+	 * (a) sort the data lexicographically
+	 *
+	 * (b) split the data into groups by first (k-1) columns
+	 *
+	 * (c) for each group count different values in the last column
+	 */
+
+	/* prepare the sort function for each dimension, and the SortItem array */
+	for (i = 0; i < k; i++)
+	{
+		multi_sort_add_dimension(mss, i, dependency[i], stats);
+
+		/* fetch the values of this attribute for all the sampled rows */
+		for (j = 0; j < numrows; j++)
+		{
+			items[j].values[i]
+				= heap_getattr(rows[j], attrs->values[dependency[i]],
+							   stats[i]->tupDesc, &items[j].isnull[i]);
+		}
+	}
+
+	/* sort the items so that we can detect the groups */
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+			  multi_sort_compare, mss);
+
+	/*
+	 * Walk through the sorted array, split it into groups according to the
+	 * first (k-1) columns. If there's a single value in the last column, we
+	 * count the group as 'supporting' the functional dependency. Otherwise we
+	 * count it as contradicting.
+	 *
+	 * We also require a group to have a minimum number of rows to be
+	 * considered useful for supporting the dependency. Contradicting groups
+	 * may be of any size, though.
+	 *
+	 * XXX The minimum size requirement makes it impossible to identify the
+	 * case when both columns are unique (or nearly unique), and therefore
+	 * trivially functionally dependent.
+	 */
+
+	/* start with the first row forming a group */
+	group_size = 1;
+
+	for (i = 1; i <= numrows; i++)
+	{
+		/*
+		 * Check if the group ended, which may be either because we processed
+		 * all the items (i==numrows), or because the i-th item is not equal
+		 * to the preceding one.
+		 */
+		if ((i == numrows) ||
+			(multi_sort_compare_dims(0, (k - 2), &items[i - 1], &items[i], mss) != 0))
+		{
+			/*
+			 * Do accounting for the preceding group, and reset counters.
+			 *
+			 * If there were no contradicting rows in the group, count the
+			 * rows as supporting.
+			 */
+			if ((n_violations == 0) && (group_size >= min_group_size))
+				n_supporting_rows += group_size;
+
+			/* current values start a new group */
+			n_violations = 0;
+			group_size = 0;
+		}
+		/* first columns match, but the last one does not (so contradicting) */
+		else if (multi_sort_compare_dim((k - 1), &items[i - 1], &items[i], mss) != 0)
+			n_violations += 1;
+
+		group_size += 1;
+	}
+
+	pfree(items);
+	pfree(values);
+	pfree(isnull);
+	pfree(mss);
+
+	/* Degree of validity = supporting/total. NOTE(review): assumes numrows > 0. */
+	return (n_supporting_rows * 1.0 / numrows);
+}
+
+/*
+ * detects functional dependencies between groups of columns
+ *
+ * Generates all possible variations of columns and checks if the last one is
+ * determined by the preceding ones (whose order does not matter). For example
+ * given 3 columns, there are 9 variations (6 on 2 columns, 3 on 3 columns):
+ *
+ *	   two columns			  three columns
+ *	   -----------			  -------------
+ *	   (a) -> b				  (a,b) -> c
+ *	   (a) -> c				  (a,c) -> b
+ *	   (b) -> a				  (b,c) -> a
+ *	   (b) -> c
+ *	   (c) -> a
+ *	   (c) -> b
+ */
+MVDependencies
+build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats)
+{
+	int			i;
+	int			k;
+	int			numattrs = attrs->dim1;
+
+	/* result */
+	MVDependencies dependencies = NULL;
+
+	Assert(numattrs >= 2);
+
+	/*
+	 * We'll try to build functional dependencies starting from the smallest
+	 * ones covering just 2 columns, to the largest ones, covering all columns
+	 * included in the statistics. We start from the smallest ones because we
+	 * want to be able to skip already implied ones.
+	 */
+	for (k = 2; k <= numattrs; k++)
+	{
+		int		   *dependency; /* array with k elements */
+
+		/* prepare a generator of variations of k columns */
+		DependencyGenerator DependencyGenerator = DependencyGenerator_init(attrs, k);
+
+		/* generate all possible variations of k values (out of n) */
+		while ((dependency = DependencyGenerator_next(DependencyGenerator, attrs)))
+		{
+			double			degree;
+			MVDependency	d;
+
+			/* compute how valid the dependency seems */
+			degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+
+			/* if the dependency seems entirely invalid, don't bother storing it */
+			if (degree == 0.0)
+				continue;
+
+			d = (MVDependency) palloc0(offsetof(MVDependencyData, attributes)
+									   +k * sizeof(int));
+
+			/* copy the dependency (and keep the indexes into stakeys) */
+			d->degree = degree;
+			d->nattributes = k;
+			for (i = 0; i < k; i++)
+				d->attributes[i] = dependency[i];
+
+			/* initialize the list of dependencies */
+			if (dependencies == NULL)
+			{
+				dependencies
+					= (MVDependencies) palloc0(sizeof(MVDependenciesData));
+
+				dependencies->magic = MVSTAT_DEPS_MAGIC;
+				dependencies->type = MVSTAT_DEPS_TYPE_BASIC;
+				dependencies->ndeps = 0;
+			}
+
+			/* grow the array by one slot and append the new dependency */
+			dependencies->ndeps++;
+			dependencies = (MVDependencies) repalloc(dependencies,
+										   offsetof(MVDependenciesData, deps)
+								+dependencies->ndeps * sizeof(MVDependency));
+
+			dependencies->deps[dependencies->ndeps - 1] = d;
+		}
+
+		/* we're done with variations of k elements, so free the DependencyGenerator */
+		DependencyGenerator_free(DependencyGenerator);
+	}
+
+	return dependencies;
+}
+
+
+/*
+ * Serialize dependencies into a bytea: fixed header, then each dependency.
+ */
+bytea *
+serialize_mv_dependencies(MVDependencies dependencies)
+{
+	int			i;
+	bytea	   *output;
+	char	   *tmp;
+	Size		len;
+
+	/* varlena header + fixed header + fixed part of each dependency */
+	len = VARHDRSZ + offsetof(MVDependenciesData, deps) +
+		  dependencies->ndeps * offsetof(MVDependencyData, attributes);
+
+	/* and also include space for the int16 attributes of each dependency */
+	for (i = 0; i < dependencies->ndeps; i++)
+		len += (sizeof(int16) * dependencies->deps[i]->nattributes);
+
+	output = (bytea *) palloc0(len);
+	SET_VARSIZE(output, len);
+
+	tmp = VARDATA(output);
+
+	/* first, copy the fixed header (everything before the deps[] array) */
+	memcpy(tmp, dependencies, offsetof(MVDependenciesData, deps));
+	tmp += offsetof(MVDependenciesData, deps);
+
+	/* then, per dependency: the part before attributes[], then attributes */
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		MVDependency d = dependencies->deps[i];
+
+		memcpy(tmp, d, offsetof(MVDependencyData, attributes));
+		tmp += offsetof(MVDependencyData, attributes);
+
+		memcpy(tmp, d->attributes, sizeof(int16) * d->nattributes);
+		tmp += sizeof(int16) * d->nattributes;
+
+		Assert(tmp <= ((char *) output + len));
+	}
+
+	return output;
+}
+
+/*
+ * deserialize_mv_dependencies
+ *		Rebuild an MVDependencies structure from its serialized bytea form.
+ *
+ * Returns NULL for NULL input; raises ERROR on a bad size, magic or type.
+ */
+MVDependencies
+deserialize_mv_dependencies(bytea *data)
+{
+	int			i;
+	Size		expected_size;
+	MVDependencies dependencies;
+	char	   *tmp;
+
+	if (data == NULL)
+		return NULL;
+
+	if (VARSIZE_ANY_EXHDR(data) < offsetof(MVDependenciesData, deps))
+		elog(ERROR, "invalid MVDependencies size %zu (expected at least %zu)",
+			 VARSIZE_ANY_EXHDR(data), offsetof(MVDependenciesData, deps));
+
+	/* allocate the MVDependencies struct (the deps[] array comes later) */
+	dependencies = (MVDependencies) palloc0(sizeof(MVDependenciesData));
+
+	/* initialize pointer to the data part (skip the varlena header) */
+	tmp = VARDATA_ANY(data);
+
+	/* get the header and perform basic sanity checks */
+	memcpy(dependencies, tmp, offsetof(MVDependenciesData, deps));
+	tmp += offsetof(MVDependenciesData, deps);
+
+	if (dependencies->magic != MVSTAT_DEPS_MAGIC)
+		elog(ERROR, "invalid dependency magic %u (expected %u)",
+			 dependencies->magic, MVSTAT_DEPS_MAGIC);
+
+	if (dependencies->type != MVSTAT_DEPS_TYPE_BASIC)
+		elog(ERROR, "invalid dependency type %u (expected %d)",
+			 dependencies->type, MVSTAT_DEPS_TYPE_BASIC);
+
+	Assert(dependencies->ndeps > 0);
+
+	/* minimum bytea size: each dependency has at least two attributes */
+	expected_size = offsetof(MVDependenciesData, deps) +
+		dependencies->ndeps * (offsetof(MVDependencyData, attributes) +
+							   sizeof(int16) * 2);
+
+	if (VARSIZE_ANY_EXHDR(data) < expected_size)
+		elog(ERROR, "invalid dependencies size %zu (expected at least %zu)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/* make room for the array of ndeps dependency pointers */
+	dependencies = repalloc(dependencies, offsetof(MVDependenciesData, deps)
+							+(dependencies->ndeps * sizeof(MVDependency)));
+
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		double		degree;
+		int			k;
+		MVDependency d;
+
+		/* fixed part: degree of validity, then the number of attributes */
+		memcpy(&degree, tmp, sizeof(double));
+		tmp += sizeof(double);
+		memcpy(&k, tmp, sizeof(int));
+		tmp += sizeof(int);
+
+		/* is the number of attributes valid? */
+		Assert((k >= 2) && (k <= MVSTATS_MAX_DIMENSIONS));
+
+		/* now that we know the number of attributes, allocate the dependency */
+		d = (MVDependency) palloc0(offsetof(MVDependencyData, attributes) +
+								   (k * sizeof(int16)));
+
+		d->degree = degree;
+		d->nattributes = k;
+
+		/* copy the int16 attribute entries and store the dependency */
+		memcpy(d->attributes, tmp, sizeof(int16) * d->nattributes);
+		tmp += sizeof(int16) * d->nattributes;
+		dependencies->deps[i] = d;
+
+		/* still within the bytea */
+		Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
+	}
+
+	/* we should have consumed the whole bytea exactly */
+	Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
+
+	return dependencies;
+}
+
+/*
+ * pg_dependencies_in		- input routine for type pg_dependencies.
+ *
+ * pg_dependencies is real enough to be a table column, but it has no operations
+ * of its own, and disallows input too: this function always raises an error.
+ *
+ * XXX This is inspired by what pg_node_tree does.
+ */
+Datum
+pg_dependencies_in(PG_FUNCTION_ARGS)
+{
+	/*
+	 * pg_dependencies stores the data in binary form and parsing text input
+	 * is not needed, so disallow this.
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+/*
+ * pg_dependencies_out		- output routine for type pg_dependencies.
+ *
+ * Deserializes the value and prints the dependencies as a human-readable
+ * list, e.g. for inspection by people:
+ *
+ *	   [{0 => 1 : 0.999900}, {0, 1 => 2 : 0.999900}]
+ *
+ * Within each item the last attribute is the one determined by the others,
+ * and the number after the colon is the degree of validity.
+ */
+Datum
+pg_dependencies_out(PG_FUNCTION_ARGS)
+{
+	int			i,
+				j;
+	StringInfoData str;
+	bytea	   *data = PG_GETARG_BYTEA_PP(0);
+	MVDependencies dependencies = deserialize_mv_dependencies(data);
+
+	initStringInfo(&str);
+	appendStringInfoChar(&str, '[');
+
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		MVDependency dependency = dependencies->deps[i];
+
+		if (i > 0)
+			appendStringInfoString(&str, ", ");
+
+		appendStringInfoChar(&str, '{');
+
+		for (j = 0; j < dependency->nattributes; j++)
+		{
+			/* " => " goes before the last attribute, ", " between the rest */
+			if (j == dependency->nattributes - 1)
+				appendStringInfoString(&str, " => ");
+			else if (j > 0)
+				appendStringInfoString(&str, ", ");
+
+			appendStringInfo(&str, "%d", dependency->attributes[j]);
+		}
+
+		appendStringInfo(&str, " : %f", dependency->degree);
+		appendStringInfoChar(&str, '}');
+	}
+
+	appendStringInfoChar(&str, ']');
+
+	/* the buffer is already palloc'd, so hand it back without copying */
+	PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * pg_dependencies_recv		- binary input routine for type pg_dependencies (always rejects input).
+ */
+Datum
+pg_dependencies_recv(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+/*
+ * pg_dependencies_send		- binary output routine for type pg_dependencies.
+ *
+ * XXX Dependencies are serialized into a bytea value, so let's just send that.
+ */
+Datum
+pg_dependencies_send(PG_FUNCTION_ARGS)
+{
+	return byteasend(fcinfo);
+}
diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h
index bf39d43..22fa4b8 100644
--- a/src/include/catalog/pg_cast.h
+++ b/src/include/catalog/pg_cast.h
@@ -258,6 +258,10 @@ DATA(insert (  194	 25    0 i b ));
 DATA(insert (  3353	 17    0 i b ));
 DATA(insert (  3353	 25    0 i i ));
 
+/* pg_dependencies can be coerced to, but not from, bytea and text */
+DATA(insert (  3358	 17    0 i b ));
+DATA(insert (  3358	 25    0 i i ));
+
 /*
  * Datetime category
  */
diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h
index fad80a3..e119cb7 100644
--- a/src/include/catalog/pg_mv_statistic.h
+++ b/src/include/catalog/pg_mv_statistic.h
@@ -38,9 +38,11 @@ CATALOG(pg_mv_statistic,3381)
 
 	/* statistics requested to build */
 	bool		ndist_enabled;	/* build ndist coefficient? */
+	bool		deps_enabled;	/* analyze dependencies? */
 
 	/* statistics that are available (if requested) */
 	bool		ndist_built;	/* ndistinct coeff built */
+	bool		deps_built;		/* dependencies were built */
 
 	/*
 	 * variable-length fields start here, but we allow direct access to
@@ -50,6 +52,7 @@ CATALOG(pg_mv_statistic,3381)
 
 #ifdef CATALOG_VARLEN
 	pg_ndistinct		standist;		/* ndistinct coeff (serialized) */
+	pg_dependencies		stadeps;		/* dependencies (serialized) */
 #endif
 
 } FormData_pg_mv_statistic;
@@ -65,14 +68,17 @@ typedef FormData_pg_mv_statistic *Form_pg_mv_statistic;
  *		compiler constants for pg_mv_statistic
  * ----------------
  */
-#define Natts_pg_mv_statistic					8
+#define Natts_pg_mv_statistic					11
 #define Anum_pg_mv_statistic_starelid			1
 #define Anum_pg_mv_statistic_staname			2
 #define Anum_pg_mv_statistic_stanamespace		3
 #define Anum_pg_mv_statistic_staowner			4
 #define Anum_pg_mv_statistic_ndist_enabled		5
-#define Anum_pg_mv_statistic_ndist_built		6
-#define Anum_pg_mv_statistic_stakeys			7
-#define Anum_pg_mv_statistic_standist			8
+#define Anum_pg_mv_statistic_deps_enabled		6
+#define Anum_pg_mv_statistic_ndist_built		7
+#define Anum_pg_mv_statistic_deps_built			8
+#define Anum_pg_mv_statistic_stakeys			9
+#define Anum_pg_mv_statistic_standist			10
+#define Anum_pg_mv_statistic_stadeps			11
 
 #endif   /* PG_MV_STATISTIC_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 940a991..b1f7b75 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2735,6 +2735,15 @@ DESCR("I/O");
 DATA(insert OID = 3357 (  pg_ndistinct_send	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3353" _null_ _null_ _null_ _null_ _null_	pg_ndistinct_send _null_ _null_ _null_ ));
 DESCR("I/O");
 
+DATA(insert OID = 3359 (  pg_dependencies_in	PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3358 "2275" _null_ _null_ _null_ _null_ _null_ pg_dependencies_in _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3360 (  pg_dependencies_out	PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2275 "3358" _null_ _null_ _null_ _null_ _null_ pg_dependencies_out _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3361 (  pg_dependencies_recv	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 3358 "2281" _null_ _null_ _null_ _null_ _null_ pg_dependencies_recv _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3362 (  pg_dependencies_send	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3358" _null_ _null_ _null_ _null_ _null_	pg_dependencies_send _null_ _null_ _null_ ));
+DESCR("I/O");
+
 DATA(insert OID = 1928 (  pg_stat_get_numscans			PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
 DATA(insert OID = 1929 (  pg_stat_get_tuples_returned	PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ ));
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
index 9c9caf3..da637d4 100644
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -368,6 +368,10 @@ DATA(insert OID = 3353 ( pg_ndistinct		PGNSP PGUID -1 f b S f t \054 0 0 0 pg_nd
 DESCR("multivariate ndistinct coefficients");
 #define PGNDISTINCTOID	3353
 
+DATA(insert OID = 3358 ( pg_dependencies		PGNSP PGUID -1 f b S f t \054 0 0 0 pg_dependencies_in pg_dependencies_out pg_dependencies_recv pg_dependencies_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ ));
+DESCR("multivariate functional dependencies");
+#define PGDEPENDENCIESOID	3358
+
 DATA(insert OID = 32 ( pg_ddl_command	PGNSP PGUID SIZEOF_POINTER t p P f t \054 0 0 0 pg_ddl_command_in pg_ddl_command_out pg_ddl_command_recv pg_ddl_command_send - - - ALIGNOF_POINTER p f 0 -1 0 0 _null_ _null_ _null_ ));
 DESCR("internal type for passing CollectedCommand");
 #define PGDDLCOMMANDOID 32
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 18e1dd1..fe4b93a 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -617,6 +617,7 @@ typedef struct CreateStatsStmt
 	List	   *defnames;		/* qualified name (list of Value strings) */
 	RangeVar   *relation;		/* relation to build statistics on */
 	List	   *keys;			/* String nodes naming referenced column(s) */
+	List	   *options;		/* list of DefElem nodes */
 	bool		if_not_exists;	/* do nothing if statistics already exists */
 } CreateStatsStmt;
 
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 7a55151..56957e8 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -681,9 +681,11 @@ typedef struct MVStatisticInfo
 	RelOptInfo *rel;			/* back-link to index's table */
 
 	/* enabled statistics */
+	bool		deps_enabled;	/* functional dependencies enabled */
 	bool		ndist_enabled;	/* ndistinct coefficient enabled */
 
 	/* built/available statistics */
+	bool		deps_built;		/* functional dependencies built */
 	bool		ndist_built;	/* ndistinct coefficient built */
 
 	/* columns in the statistics (attnums) */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 262ee94..9ffd80c 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -73,6 +73,10 @@ extern Datum pg_ndistinct_in(PG_FUNCTION_ARGS);
 extern Datum pg_ndistinct_out(PG_FUNCTION_ARGS);
 extern Datum pg_ndistinct_recv(PG_FUNCTION_ARGS);
 extern Datum pg_ndistinct_send(PG_FUNCTION_ARGS);
+extern Datum pg_dependencies_in(PG_FUNCTION_ARGS);
+extern Datum pg_dependencies_out(PG_FUNCTION_ARGS);
+extern Datum pg_dependencies_recv(PG_FUNCTION_ARGS);
+extern Datum pg_dependencies_send(PG_FUNCTION_ARGS);
 
 /* regexp.c */
 extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
index 0660c59..e5a49bf 100644
--- a/src/include/utils/mvstats.h
+++ b/src/include/utils/mvstats.h
@@ -39,16 +39,49 @@ typedef struct MVNDistinctData {
 typedef MVNDistinctData *MVNDistinct;
 
 
+#define MVSTAT_DEPS_MAGIC		0xB4549A2C		/* marks serialized bytea */
+#define MVSTAT_DEPS_TYPE_BASIC	1		/* basic dependencies type */
+
+/*
+ * A single functional dependency: the first (nattributes - 1) entries of
+ * attributes[] determine the last one, to the given degree of validity.
+ */
+typedef struct MVDependencyData
+{
+	double		degree;			/* degree of validity (0-1) */
+	int			nattributes;	/* number of entries in attributes[] */
+	int16		attributes[FLEXIBLE_ARRAY_MEMBER];	/* columns, determined one last */
+} MVDependencyData;
+
+typedef MVDependencyData *MVDependency;
+
+typedef struct MVDependenciesData
+{
+	uint32		magic;			/* magic constant marker (MVSTAT_DEPS_MAGIC) */
+	uint32		type;			/* type of MV Dependencies (MVSTAT_DEPS_TYPE_BASIC) */
+	int32		ndeps;			/* number of entries in deps[] */
+	MVDependency deps[FLEXIBLE_ARRAY_MEMBER];	/* pointers to the dependencies */
+} MVDependenciesData;
+
+typedef MVDependenciesData *MVDependencies;
+
+
+
 MVNDistinct		load_mv_ndistinct(Oid mvoid);
 
 bytea *serialize_mv_ndistinct(MVNDistinct ndistinct);
+bytea *serialize_mv_dependencies(MVDependencies dependencies);
 
 /* deserialization of stats (serialization is private to analyze) */
 MVNDistinct deserialize_mv_ndistinct(bytea *data);
-
+MVDependencies deserialize_mv_dependencies(bytea *data);
 
 MVNDistinct build_mv_ndistinct(double totalrows, int numrows, HeapTuple *rows,
-				 int2vector *attrs, VacAttrStats **stats);
+							   int2vector *attrs, VacAttrStats **stats);
+
+MVDependencies build_mv_dependencies(int numrows, HeapTuple *rows,
+					  int2vector *attrs,
+					  VacAttrStats **stats);
 
 void build_mv_stats(Relation onerel, double totalrows,
 			   int numrows, HeapTuple *rows,
diff --git a/src/test/regress/expected/mv_dependencies.out b/src/test/regress/expected/mv_dependencies.out
new file mode 100644
index 0000000..d442a16
--- /dev/null
+++ b/src/test/regress/expected/mv_dependencies.out
@@ -0,0 +1,147 @@
+-- data type passed by value
+CREATE TABLE functional_dependencies (
+    a INT,
+    b INT,
+    c INT
+);
+-- unknown column
+CREATE STATISTICS s1 WITH (dependencies) ON (unknown_column) FROM functional_dependencies;
+ERROR:  column "unknown_column" referenced in statistics does not exist
+-- single column
+CREATE STATISTICS s1 WITH (dependencies) ON (a) FROM functional_dependencies;
+ERROR:  statistics require at least 2 columns
+-- single column, duplicated
+CREATE STATISTICS s1 WITH (dependencies) ON (a,a) FROM functional_dependencies;
+ERROR:  duplicate column name in statistics definition
+-- two columns, one duplicated
+CREATE STATISTICS s1 WITH (dependencies) ON (a, a, b) FROM functional_dependencies;
+ERROR:  duplicate column name in statistics definition
+-- correct command
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b, c) FROM functional_dependencies;
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | stadeps 
+--------------+------------+---------
+ t            | f          | 
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.999900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}]
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.494900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}]
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}]
+(1 row)
+
+DROP TABLE functional_dependencies;
+-- varlena type (text)
+CREATE TABLE functional_dependencies (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+CREATE STATISTICS s2 WITH (dependencies) ON (a, b, c) FROM functional_dependencies;
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | stadeps 
+--------------+------------+---------
+ t            | f          | 
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.999900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}]
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 0.999900}, {0 => 2 : 0.999900}, {1 => 2 : 0.494900}, {0, 1 => 2 : 0.999900}, {0, 2 => 1 : 0.999900}]
+(1 row)
+
+TRUNCATE functional_dependencies;
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                     stadeps                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------
+ t            | t          | [{0 => 1 : 1.000000}, {0 => 2 : 1.000000}, {1 => 2 : 1.000000}, {0, 1 => 2 : 1.000000}, {0, 2 => 1 : 1.000000}]
+(1 row)
+
+DROP TABLE functional_dependencies;
+-- NULL values (mix of int and text columns)
+CREATE TABLE functional_dependencies (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+CREATE STATISTICS s3 WITH (dependencies) ON (a, b, c, d) FROM functional_dependencies;
+INSERT INTO functional_dependencies
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |                                                                                                                                                     stadeps                                                                                                                                                     
+--------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ t            | t          | [{1 => 0 : 1.000000}, {2 => 0 : 1.000000}, {2 => 1 : 1.000000}, {3 => 0 : 1.000000}, {3 => 1 : 0.996700}, {0, 2 => 1 : 1.000000}, {0, 3 => 1 : 0.996700}, {1, 2 => 0 : 1.000000}, {1, 3 => 0 : 1.000000}, {2, 3 => 0 : 1.000000}, {2, 3 => 1 : 1.000000}, {0, 2, 3 => 1 : 1.000000}, {1, 2, 3 => 0 : 1.000000}]
+(1 row)
+
+DROP TABLE functional_dependencies;
diff --git a/src/test/regress/expected/mv_ndistinct.out b/src/test/regress/expected/mv_ndistinct.out
index 5f55091..06a7634 100644
--- a/src/test/regress/expected/mv_ndistinct.out
+++ b/src/test/regress/expected/mv_ndistinct.out
@@ -6,19 +6,19 @@ CREATE TABLE ndistinct (
     d INT
 );
 -- unknown column
-CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (unknown_column) FROM ndistinct;
 ERROR:  column "unknown_column" referenced in statistics does not exist
 -- single column
-CREATE STATISTICS s10 ON (a) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a) FROM ndistinct;
 ERROR:  statistics require at least 2 columns
 -- single column, duplicated
-CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a,a) FROM ndistinct;
 ERROR:  duplicate column name in statistics definition
 -- two columns, one duplicated
-CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a, a, b) FROM ndistinct;
 ERROR:  duplicate column name in statistics definition
 -- correct command
-CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a, b, c) FROM ndistinct;
 -- perfectly correlated groups
 INSERT INTO ndistinct
      SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i);
diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out
index 2b5c022..f574554 100644
--- a/src/test/regress/expected/object_address.out
+++ b/src/test/regress/expected/object_address.out
@@ -38,7 +38,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
 	TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
 CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
-CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable;
+CREATE STATISTICS addr_nsp.gentable_stat WITH (ndistinct) ON (a,b) FROM addr_nsp.gentable;
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
 ERROR:  unrecognized object type "stone"
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9a26205..db1cf8a 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -818,11 +818,12 @@ WHERE c.castmethod = 'b' AND
  character varying | character         |        0 | i
  pg_node_tree      | text              |        0 | i
  pg_ndistinct      | bytea             |        0 | i
+ pg_dependencies   | bytea             |        0 | i
  cidr              | inet              |        0 | i
  xml               | text              |        0 | a
  xml               | character varying |        0 | a
  xml               | character         |        0 | a
-(8 rows)
+(9 rows)
 
 -- **************** pg_conversion ****************
 -- Look for illegal values in pg_conversion fields.
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 2c54779..39179a6 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1380,7 +1380,8 @@ pg_mv_stats| SELECT n.nspname AS schemaname,
     c.relname AS tablename,
     s.staname,
     s.stakeys AS attnums,
-    length((s.standist)::text) AS ndistbytes
+    length((s.standist)::bytea) AS ndistbytes,
+    length((s.stadeps)::bytea) AS depsbytes
    FROM ((pg_mv_statistic s
      JOIN pg_class c ON ((c.oid = s.starelid)))
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)));
diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out
index 6281cef..b0b40ca 100644
--- a/src/test/regress/expected/type_sanity.out
+++ b/src/test/regress/expected/type_sanity.out
@@ -67,12 +67,13 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%'
     (SELECT 1 FROM pg_type as p2
      WHERE p2.typname = ('_' || p1.typname)::name AND
            p2.typelem = p1.oid and p1.typarray = p2.oid);
- oid  |   typname    
-------+--------------
+ oid  |     typname     
+------+-----------------
   194 | pg_node_tree
  3353 | pg_ndistinct
+ 3358 | pg_dependencies
   210 | smgr
-(3 rows)
+(4 rows)
 
 -- Make sure typarray points to a varlena array type of our own base
 SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 0273ea6..fda9166 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -118,4 +118,4 @@ test: event_trigger
 test: stats
 
 # run tests of multivariate stats
-test: mv_ndistinct
+test: mv_ndistinct mv_dependencies
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index f7f3a14..90d74d2 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -172,3 +172,4 @@ test: xml
 test: event_trigger
 test: stats
 test: mv_ndistinct
+test: mv_dependencies
diff --git a/src/test/regress/sql/mv_dependencies.sql b/src/test/regress/sql/mv_dependencies.sql
new file mode 100644
index 0000000..43df798
--- /dev/null
+++ b/src/test/regress/sql/mv_dependencies.sql
@@ -0,0 +1,139 @@
+-- data type passed by value
+CREATE TABLE functional_dependencies (
+    a INT,
+    b INT,
+    c INT
+);
+
+-- unknown column
+CREATE STATISTICS s1 WITH (dependencies) ON (unknown_column) FROM functional_dependencies;
+
+-- single column
+CREATE STATISTICS s1 WITH (dependencies) ON (a) FROM functional_dependencies;
+
+-- single column, duplicated
+CREATE STATISTICS s1 WITH (dependencies) ON (a,a) FROM functional_dependencies;
+
+-- two columns, one duplicated
+CREATE STATISTICS s1 WITH (dependencies) ON (a, a, b) FROM functional_dependencies;
+
+-- correct command
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b, c) FROM functional_dependencies;
+
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+DROP TABLE functional_dependencies;
+
+-- varlena type (text)
+CREATE TABLE functional_dependencies (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+
+CREATE STATISTICS s2 WITH (dependencies) ON (a, b, c) FROM functional_dependencies;
+
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+DROP TABLE functional_dependencies;
+
+-- NULL values (mix of int and text columns)
+CREATE TABLE functional_dependencies (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+
+CREATE STATISTICS s3 WITH (dependencies) ON (a, b, c, d) FROM functional_dependencies;
+
+INSERT INTO functional_dependencies
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, stadeps
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+DROP TABLE functional_dependencies;
diff --git a/src/test/regress/sql/mv_ndistinct.sql b/src/test/regress/sql/mv_ndistinct.sql
index 5cef254..43024ca 100644
--- a/src/test/regress/sql/mv_ndistinct.sql
+++ b/src/test/regress/sql/mv_ndistinct.sql
@@ -7,19 +7,19 @@ CREATE TABLE ndistinct (
 );
 
 -- unknown column
-CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (unknown_column) FROM ndistinct;
 
 -- single column
-CREATE STATISTICS s10 ON (a) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a) FROM ndistinct;
 
 -- single column, duplicated
-CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a,a) FROM ndistinct;
 
 -- two columns, one duplicated
-CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a, a, b) FROM ndistinct;
 
 -- correct command
-CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
+CREATE STATISTICS s10 WITH (ndistinct) ON (a, b, c) FROM ndistinct;
 
 -- perfectly correlated groups
 INSERT INTO ndistinct
diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql
index 791b942..902599b 100644
--- a/src/test/regress/sql/object_address.sql
+++ b/src/test/regress/sql/object_address.sql
@@ -41,7 +41,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
 	TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
 CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
-CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable;
+CREATE STATISTICS addr_nsp.gentable_stat WITH (ndistinct) ON (a,b) FROM addr_nsp.gentable;
 
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
-- 
2.5.5