diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index db1792b..c32ff2b 100644 *** a/src/backend/utils/adt/selfuncs.c --- b/src/backend/utils/adt/selfuncs.c *************** eqjoinsel_semi(Oid operator, *** 2570,2575 **** --- 2570,2585 ---- memset(&sslot2, 0, sizeof(sslot2)); /* + * If we lack any fact-based estimate for nd2, it seems best to set it + * equal to the inner relation size estimate, ie, assume the inner side of + * the semijoin is unique. This may lead to overestimating the size of + * the join, but that's usually better than an underestimate. We don't + * make any comparable assumption for the outer side, though. + */ + if (isdefault2) + nd2 = inner_rel->rows; + + /* * We clamp nd2 to be not more than what we estimate the inner relation's * size to be. This is intuitively somewhat reasonable since obviously * there can't be more than that many distinct values coming from the *************** eqjoinsel_semi(Oid operator, *** 2583,2606 **** * We can apply this clamping both with respect to the base relation from * which the join variable comes (if there is just one), and to the * immediate inner input relation of the current join. - * - * If we clamp, we can treat nd2 as being a non-default estimate; it's not - * great, maybe, but it didn't come out of nowhere either. This is most - * helpful when the inner relation is empty and consequently has no stats. */ if (vardata2->rel) { ! if (nd2 >= vardata2->rel->rows) ! { nd2 = vardata2->rel->rows; - isdefault2 = false; - } } ! if (nd2 >= inner_rel->rows) ! { nd2 = inner_rel->rows; - isdefault2 = false; - } if (HeapTupleIsValid(vardata1->statsTuple)) { --- 2593,2606 ---- * We can apply this clamping both with respect to the base relation from * which the join variable comes (if there is just one), and to the * immediate inner input relation of the current join. */ if (vardata2->rel) { ! if (nd2 > vardata2->rel->rows) nd2 = vardata2->rel->rows; } ! if (nd2 > inner_rel->rows) nd2 = inner_rel->rows; if (HeapTupleIsValid(vardata1->statsTuple)) { *************** eqjoinsel_semi(Oid operator, *** 2701,2723 **** * the uncertain rows that a fraction nd2/nd1 have join partners. We * can discount the known-matched MCVs from the distinct-values counts * before doing the division. - * - * Crude as the above is, it's completely useless if we don't have - * reliable ndistinct values for both sides. Hence, if either nd1 or - * nd2 is default, punt and assume half of the uncertain rows have - * join partners. */ ! if (!isdefault1 && !isdefault2) ! { ! nd1 -= nmatches; ! nd2 -= nmatches; ! if (nd1 <= nd2 || nd2 < 0) ! uncertainfrac = 1.0; ! else ! uncertainfrac = nd2 / nd1; ! } else ! uncertainfrac = 0.5; uncertain = 1.0 - matchfreq1 - nullfrac1; CLAMP_PROBABILITY(uncertain); selec = matchfreq1 + uncertainfrac * uncertain; --- 2701,2713 ---- * the uncertain rows that a fraction nd2/nd1 have join partners. We * can discount the known-matched MCVs from the distinct-values counts * before doing the division. */ ! nd1 -= nmatches; ! nd2 -= nmatches; ! if (nd1 <= nd2 || nd2 < 0) ! uncertainfrac = 1.0; else ! uncertainfrac = nd2 / nd1; uncertain = 1.0 - matchfreq1 - nullfrac1; CLAMP_PROBABILITY(uncertain); selec = matchfreq1 + uncertainfrac * uncertain; *************** eqjoinsel_semi(Oid operator, *** 2730,2744 **** */ double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; ! if (!isdefault1 && !isdefault2) ! { ! if (nd1 <= nd2 || nd2 < 0) ! selec = 1.0 - nullfrac1; ! else ! selec = (nd2 / nd1) * (1.0 - nullfrac1); ! } else ! selec = 0.5 * (1.0 - nullfrac1); } free_attstatsslot(&sslot1); --- 2720,2729 ---- */ double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; ! if (nd1 <= nd2 || nd2 < 0) ! selec = 1.0 - nullfrac1; else ! selec = (nd2 / nd1) * (1.0 - nullfrac1); } free_attstatsslot(&sslot1); diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index f47449b..eaffbad 100644 *** a/src/test/regress/expected/join.out --- b/src/test/regress/expected/join.out *************** where not exists ( *** 2431,2440 **** ); QUERY PLAN --------------------------------------------------------- ! Hash Anti Join ! Hash Cond: (t1.c1 = t2.c2) -> Seq Scan on tt4x t1 ! -> Hash -> Merge Right Join Merge Cond: (t5.c1 = t3.c2) -> Merge Join --- 2431,2440 ---- ); QUERY PLAN --------------------------------------------------------- ! Nested Loop Anti Join ! Join Filter: (t1.c1 = t2.c2) -> Seq Scan on tt4x t1 ! -> Materialize -> Merge Right Join Merge Cond: (t5.c1 = t3.c2) -> Merge Join