From f63aad8ee45cf410ed7e3cc716287f99bcc72072 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= <yizhi.fzh@alibaba-inc.com>
Date: Wed, 6 May 2020 15:44:15 +0800
Subject: [PATCH v7 3/6] Refactor existing uniqueness related code to use
 UniqueKey

The calls to remove_useless_joins and reduce_unique_semijoins are moved from
query_planner to make_one_rel, since they now need uniquekeys to be built.
---
 .../postgres_fdw/expected/postgres_fdw.out    |  32 ++--
 contrib/postgres_fdw/sql/postgres_fdw.sql     |   1 +
 src/backend/optimizer/path/allpaths.c         |  18 ++-
 src/backend/optimizer/plan/analyzejoins.c     | 137 ++++--------------
 src/backend/optimizer/plan/planmain.c         |  13 --
 src/test/regress/expected/join.out            |  59 ++++----
 src/test/regress/sql/join.sql                 |  16 +-
 7 files changed, 97 insertions(+), 179 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 90db550b92..d71b6cc556 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -1378,6 +1378,7 @@ SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t1
 (8 rows)
 
 -- d. test deparsing rowmarked relations as subqueries
+-- YYY:  The inner table in the plan is "T 3", which has a primary key on c1, that's why we have the new added "Inner Unique: true".
 EXPLAIN (VERBOSE, COSTS OFF)
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNER JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (TRUE) ORDER BY t1.c1, ss.a, ss.b FOR UPDATE OF t1;
                                                                                                                                                                                              QUERY PLAN                                                                                                                                                                                             
@@ -1386,6 +1387,7 @@ SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNE
    Output: "T 3".c1, ft4.c1, ft5.c1, "T 3".ctid, ft4.*, ft5.*
    ->  Nested Loop
          Output: "T 3".c1, ft4.c1, ft5.c1, "T 3".ctid, ft4.*, ft5.*
+         Inner Unique: true
          ->  Foreign Scan
                Output: ft4.c1, ft4.*, ft5.c1, ft5.*
                Relations: (public.ft4) FULL JOIN (public.ft5)
@@ -1410,7 +1412,7 @@ SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNE
                ->  Seq Scan on "S 1"."T 3"
                      Output: "T 3".c1, "T 3".ctid
                      Filter: ("T 3".c1 = 50)
-(28 rows)
+(29 rows)
 
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNER JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (TRUE) ORDER BY t1.c1, ss.a, ss.b FOR UPDATE OF t1;
  c1 | a  | b  
@@ -2902,22 +2904,20 @@ select sum(c1%3), sum(distinct c1%3 order by c1%3) filter (where c1%3 < 2), c2 f
 -- Outer query is aggregation query
 explain (verbose, costs off)
 select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1;
-                                                          QUERY PLAN                                                          
-------------------------------------------------------------------------------------------------------------------------------
- Unique
+                                                       QUERY PLAN                                                       
+------------------------------------------------------------------------------------------------------------------------
+ Sort
    Output: ((SubPlan 1))
-   ->  Sort
-         Output: ((SubPlan 1))
-         Sort Key: ((SubPlan 1))
-         ->  Foreign Scan
-               Output: (SubPlan 1)
-               Relations: Aggregate on (public.ft2 t2)
-               Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0))
-               SubPlan 1
-                 ->  Foreign Scan on public.ft1 t1
-                       Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10))))
-                       Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6))
-(13 rows)
+   Sort Key: ((SubPlan 1))
+   ->  Foreign Scan
+         Output: (SubPlan 1)
+         Relations: Aggregate on (public.ft2 t2)
+         Remote SQL: SELECT count(*) FILTER (WHERE ((c2 = 6) AND ("C 1" < 10))) FROM "S 1"."T 1" WHERE (((c2 % 6) = 0))
+         SubPlan 1
+           ->  Foreign Scan on public.ft1 t1
+                 Output: (count(*) FILTER (WHERE ((t2.c2 = 6) AND (t2.c1 < 10))))
+                 Remote SQL: SELECT NULL FROM "S 1"."T 1" WHERE (("C 1" = 6))
+(11 rows)
 
 select distinct (select count(*) filter (where t2.c2 = 6 and t2.c1 < 10) from ft1 t1 where t1.c1 = 6) from ft2 t2 where t2.c2 % 6 = 0 order by 1;
  count 
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 83971665e3..a42cfa134d 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -453,6 +453,7 @@ EXPLAIN (VERBOSE, COSTS OFF)
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t1 FULL JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (t1.c1 = ss.a) ORDER BY t1.c1, ss.a, ss.b;
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t1 FULL JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (t1.c1 = ss.a) ORDER BY t1.c1, ss.a, ss.b;
 -- d. test deparsing rowmarked relations as subqueries
+-- YYY:  The inner table in the plan is "T 3", which has a primary key on c1, that's why we have the new added "Inner Unique: true".
 EXPLAIN (VERBOSE, COSTS OFF)
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNER JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (TRUE) ORDER BY t1.c1, ss.a, ss.b FOR UPDATE OF t1;
 SELECT t1.c1, ss.a, ss.b FROM (SELECT c1 FROM "S 1"."T 3" WHERE c1 = 50) t1 INNER JOIN (SELECT t2.c1, t3.c1 FROM (SELECT c1 FROM ft4 WHERE c1 between 50 and 60) t2 FULL JOIN (SELECT c1 FROM ft5 WHERE c1 between 50 and 60) t3 ON (t2.c1 = t3.c1) WHERE t2.c1 IS NULL OR t2.c1 IS NOT NULL) ss(a, b) ON (TRUE) ORDER BY t1.c1, ss.a, ss.b FOR UPDATE OF t1;
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 96bbd3d1cf..166e3bd0ba 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -39,6 +39,7 @@
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/plancat.h"
+#include "optimizer/planmain.h"
 #include "optimizer/planner.h"
 #include "optimizer/restrictinfo.h"
 #include "optimizer/tlist.h"
@@ -222,13 +223,24 @@ make_one_rel(PlannerInfo *root, List *joinlist)
 	set_base_rel_pathlists(root);
 
 	/*
-	 * Generate access paths for the entire join tree.
+	 * Remove any useless outer joins.  Ideally this would be done during
+	 * jointree preprocessing, but the necessary information isn't available
+	 * until we've built baserel data structures, classified qual clauses,
+	 * and built the uniquekeys.
 	 */
-	rel = make_rel_from_joinlist(root, joinlist);
+	joinlist = remove_useless_joins(root, joinlist);
+
+	/*
+	 * Also, reduce any semijoins with unique inner rels to plain inner joins.
+	 * Likewise, this can't be done until now for lack of needed info.
+	 */
+	reduce_unique_semijoins(root);
 
 	/*
-	 * The result should join all and only the query's base rels.
+	 * Generate access paths for the entire join tree.
 	 */
+	rel = make_rel_from_joinlist(root, joinlist);
+
 	Assert(bms_equal(rel->relids, root->all_baserels));
 
 	return rel;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index d0ff660284..d80bff65d2 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -439,6 +439,8 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids)
 	 * There may be references to the rel in root->fkey_list, but if so,
 	 * match_foreign_keys_to_quals() will get rid of them.
 	 */
+
+	root->all_baserels = bms_del_member(root->all_baserels, relid);
 }
 
 /*
@@ -584,39 +586,7 @@ reduce_unique_semijoins(PlannerInfo *root)
 static bool
 rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 {
-	/* We only know about baserels ... */
-	if (rel->reloptkind != RELOPT_BASEREL)
-		return false;
-	if (rel->rtekind == RTE_RELATION)
-	{
-		/*
-		 * For a plain relation, we only know how to prove uniqueness by
-		 * reference to unique indexes.  Make sure there's at least one
-		 * suitable unique index.  It must be immediately enforced, and if
-		 * it's a partial index, it must match the query.  (Keep these
-		 * conditions in sync with relation_has_unique_index_for!)
-		 */
-		ListCell   *lc;
-
-		foreach(lc, rel->indexlist)
-		{
-			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
-
-			if (ind->unique && ind->immediate &&
-				(ind->indpred == NIL || ind->predOK))
-				return true;
-		}
-	}
-	else if (rel->rtekind == RTE_SUBQUERY)
-	{
-		Query	   *subquery = root->simple_rte_array[rel->relid]->subquery;
-
-		/* Check if the subquery has any qualities that support distinctness */
-		if (query_supports_distinctness(subquery))
-			return true;
-	}
-	/* We have no proof rules for any other rtekinds. */
-	return false;
+	return rel->uniquekeys != NIL;
 }
 
 /*
@@ -640,83 +610,33 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 static bool
 rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list)
 {
-	/*
-	 * We could skip a couple of tests here if we assume all callers checked
-	 * rel_supports_distinctness first, but it doesn't seem worth taking any
-	 * risk for.
-	 */
-	if (rel->reloptkind != RELOPT_BASEREL)
-		return false;
-	if (rel->rtekind == RTE_RELATION)
-	{
-		/*
-		 * Examine the indexes to see if we have a matching unique index.
-		 * relation_has_unique_index_for automatically adds any usable
-		 * restriction clauses for the rel, so we needn't do that here.
-		 */
-		if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL))
-			return true;
-	}
-	else if (rel->rtekind == RTE_SUBQUERY)
-	{
-		Index		relid = rel->relid;
-		Query	   *subquery = root->simple_rte_array[relid]->subquery;
-		List	   *colnos = NIL;
-		List	   *opids = NIL;
-		ListCell   *l;
 
-		/*
-		 * Build the argument lists for query_is_distinct_for: a list of
-		 * output column numbers that the query needs to be distinct over, and
-		 * a list of equality operators that the output columns need to be
-		 * distinct according to.
-		 *
-		 * (XXX we are not considering restriction clauses attached to the
-		 * subquery; is that worth doing?)
-		 */
-		foreach(l, clause_list)
+	ListCell	*lc1, *lc2,  *lc3;
+	foreach(lc1,  rel->uniquekeys)
+	{
+		UniqueKey *uqk = lfirst_node(UniqueKey, lc1);
+		bool all_uqk_exprs_found = true;
+		foreach(lc2, uqk->exprs)
 		{
-			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
-			Oid			op;
-			Var		   *var;
-
-			/*
-			 * Get the equality operator we need uniqueness according to.
-			 * (This might be a cross-type operator and thus not exactly the
-			 * same operator the subquery would consider; that's all right
-			 * since query_is_distinct_for can resolve such cases.)  The
-			 * caller's mergejoinability test should have selected only
-			 * OpExprs.
-			 */
-			op = castNode(OpExpr, rinfo->clause)->opno;
-
-			/* caller identified the inner side for us */
-			if (rinfo->outer_is_left)
-				var = (Var *) get_rightop(rinfo->clause);
-			else
-				var = (Var *) get_leftop(rinfo->clause);
-
-			/*
-			 * We may ignore any RelabelType node above the operand.  (There
-			 * won't be more than one, since eval_const_expressions() has been
-			 * applied already.)
-			 */
-			if (var && IsA(var, RelabelType))
-				var = (Var *) ((RelabelType *) var)->arg;
-
-			/*
-			 * If inner side isn't a Var referencing a subquery output column,
-			 * this clause doesn't help us.
-			 */
-			if (!var || !IsA(var, Var) ||
-				var->varno != relid || var->varlevelsup != 0)
-				continue;
-
-			colnos = lappend_int(colnos, var->varattno);
-			opids = lappend_oid(opids, op);
+			Node *uq_expr = lfirst(lc2);
+			bool find_uq_exprs_in_clause_list = false;
+			foreach(lc3, clause_list)
+			{
+				RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc3);
+				Node *clause_expr = rinfo->outer_is_left ? get_rightop(rinfo->clause): get_leftop(rinfo->clause) ;
+				if (equal(uq_expr, clause_expr))
+				{
+					find_uq_exprs_in_clause_list = true;
+					break;
+				}
+			}
+			if (!find_uq_exprs_in_clause_list)
+			{
+				all_uqk_exprs_found = false;
+				break;
+			}
 		}
-
-		if (query_is_distinct_for(subquery, colnos, opids))
+		if (all_uqk_exprs_found)
 			return true;
 	}
 	return false;
@@ -972,6 +892,9 @@ innerrel_is_unique(PlannerInfo *root,
 	MemoryContext old_context;
 	ListCell   *lc;
 
+	if (relation_is_onerow(innerrel))
+		return true;
+
 	/* Certainly can't prove uniqueness when there are no joinclauses */
 	if (restrictlist == NIL)
 		return false;
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 62dfc6d44a..6ad73cb57b 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -213,19 +213,6 @@ query_planner(PlannerInfo *root,
 	 */
 	fix_placeholder_input_needed_levels(root);
 
-	/*
-	 * Remove any useless outer joins.  Ideally this would be done during
-	 * jointree preprocessing, but the necessary information isn't available
-	 * until we've built baserel data structures and classified qual clauses.
-	 */
-	joinlist = remove_useless_joins(root, joinlist);
-
-	/*
-	 * Also, reduce any semijoins with unique inner rels to plain inner joins.
-	 * Likewise, this can't be done until now for lack of needed info.
-	 */
-	reduce_unique_semijoins(root);
-
 	/*
 	 * Now distribute "placeholders" to base rels as needed.  This has to be
 	 * done after join removal because removal could change whether a
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index a46b1573bd..8378936eda 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4349,11 +4349,11 @@ reset enable_nestloop;
 --
 begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
-CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
+CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int, d int);
 CREATE TEMP TABLE c (id int PRIMARY KEY);
 CREATE TEMP TABLE d (a int, b int);
 INSERT INTO a VALUES (0, 0), (1, NULL);
-INSERT INTO b VALUES (0, 0), (1, NULL);
+INSERT INTO b VALUES (0, 0, 1), (1, NULL, 1);
 INSERT INTO c VALUES (0), (1);
 INSERT INTO d VALUES (1,3), (2,2), (3,1);
 -- all three cases should be optimizable into a simple seqscan
@@ -4411,40 +4411,37 @@ select d.* from d left join (select distinct * from b) s
 (1 row)
 
 -- join removal is not possible when the GROUP BY contains a column that is
--- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
--- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
--- but this happens too late for join removal in the outer plan level.)
+-- not in the join condition.
 explain (costs off)
-select d.* from d left join (select * from b group by b.id, b.c_id) s
-  on d.a = s.id;
-                QUERY PLAN                
-------------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Group
-         Group Key: b.id
-         ->  Index Scan using b_pkey on b
-   ->  Sort
-         Sort Key: d.a
-         ->  Seq Scan on d
+select d.* from d left join (select d, c_id from b group by b.d, b.c_id) s
+  on d.a = s.d;
+                 QUERY PLAN                 
+--------------------------------------------
+ Hash Left Join
+   Hash Cond: (d.a = s.d)
+   ->  Seq Scan on d
+   ->  Hash
+         ->  Subquery Scan on s
+               ->  HashAggregate
+                     Group Key: b.d, b.c_id
+                     ->  Seq Scan on b
 (8 rows)
 
 -- similarly, but keying off a DISTINCT clause
 explain (costs off)
-select d.* from d left join (select distinct * from b) s
-  on d.a = s.id;
-              QUERY PLAN              
---------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Unique
-         ->  Sort
-               Sort Key: b.id, b.c_id
-               ->  Seq Scan on b
-   ->  Sort
-         Sort Key: d.a
-         ->  Seq Scan on d
-(9 rows)
+select d.* from d left join (select distinct c_id, d from b) s
+  on d.a = s.d;
+                 QUERY PLAN                 
+--------------------------------------------
+ Hash Left Join
+   Hash Cond: (d.a = s.d)
+   ->  Seq Scan on d
+   ->  Hash
+         ->  Subquery Scan on s
+               ->  HashAggregate
+                     Group Key: b.c_id, b.d
+                     ->  Seq Scan on b
+(8 rows)
 
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 1403e0ffe7..3312542411 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1479,11 +1479,11 @@ reset enable_nestloop;
 begin;
 
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
-CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
+CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int, d int);
 CREATE TEMP TABLE c (id int PRIMARY KEY);
 CREATE TEMP TABLE d (a int, b int);
 INSERT INTO a VALUES (0, 0), (1, NULL);
-INSERT INTO b VALUES (0, 0), (1, NULL);
+INSERT INTO b VALUES (0, 0, 1), (1, NULL, 1);
 INSERT INTO c VALUES (0), (1);
 INSERT INTO d VALUES (1,3), (2,2), (3,1);
 
@@ -1512,17 +1512,15 @@ select d.* from d left join (select distinct * from b) s
   on d.a = s.id and d.b = s.c_id;
 
 -- join removal is not possible when the GROUP BY contains a column that is
--- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
--- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
--- but this happens too late for join removal in the outer plan level.)
+-- not in the join condition.
 explain (costs off)
-select d.* from d left join (select * from b group by b.id, b.c_id) s
-  on d.a = s.id;
+select d.* from d left join (select d, c_id from b group by b.d, b.c_id) s
+  on d.a = s.d;
 
 -- similarly, but keying off a DISTINCT clause
 explain (costs off)
-select d.* from d left join (select distinct * from b) s
-  on d.a = s.id;
+select d.* from d left join (select distinct c_id, d from b) s
+  on d.a = s.d;
 
 -- check join removal works when uniqueness of the join condition is enforced
 -- by a UNION
-- 
2.21.0