diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c new file mode 100644 index 5f21fcb..c6b1af9 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1766,6 +1766,12 @@ static void compute_scalar_stats(VacAttr double totalrows); static int compare_scalars(const void *a, const void *b, void *arg); static int compare_mcvs(const void *a, const void *b); +static int analyze_mcv_list(int *mcv_counts, + int num_mcv, + double stadistinct, + double stanullfrac, + int samplerows, + double totalrows); /* @@ -2184,9 +2190,7 @@ compute_distinct_stats(VacAttrStatsP sta * we are able to generate a complete MCV list (all the values in the * sample will fit, and we think these are all the ones in the table), * then do so. Otherwise, store only those values that are - * significantly more common than the (estimated) average. We set the - * threshold rather arbitrarily at 25% more than average, with at - * least 2 instances in the sample. + * significantly more common than the values not in the list. * * Note: the first of these cases is meant to address columns with * small, fixed sets of possible values, such as boolean or enum @@ -2195,8 +2199,7 @@ compute_distinct_stats(VacAttrStatsP sta * so and thus provide the planner with complete information. But if * the MCV list is not complete, it's generally worth being more * selective, and not just filling it all the way up to the stats - * target. So for an incomplete list, we try to take only MCVs that - * are significantly more common than average. + * target. */ if (track_cnt < track_max && toowide_cnt == 0 && stats->stadistinct > 0 && @@ -2207,28 +2210,22 @@ compute_distinct_stats(VacAttrStatsP sta } else { - double ndistinct_table = stats->stadistinct; - double avgcount, - mincount; + int *mcv_counts; - /* Re-extract estimate of # distinct nonnull values in table */ - if (ndistinct_table < 0) - ndistinct_table = -ndistinct_table * totalrows; - /* estimate # occurrences in sample of a typical nonnull value */ - avgcount = (double) nonnull_cnt / ndistinct_table; - /* set minimum threshold count to store a value */ - mincount = avgcount * 1.25; - if (mincount < 2) - mincount = 2; + /* Incomplete list; decide how many values are worth keeping */ if (num_mcv > track_cnt) num_mcv = track_cnt; - for (i = 0; i < num_mcv; i++) + + if (num_mcv > 0) { - if (track[i].count < mincount) - { - num_mcv = i; - break; - } + mcv_counts = (int *) palloc(num_mcv * sizeof(int)); + for (i = 0; i < num_mcv; i++) + mcv_counts[i] = track[i].count; + + num_mcv = analyze_mcv_list(mcv_counts, num_mcv, + stats->stadistinct, + stats->stanullfrac, + samplerows, totalrows); } } @@ -2558,14 +2555,7 @@ compute_scalar_stats(VacAttrStatsP stats * we are able to generate a complete MCV list (all the values in the * sample will fit, and we think these are all the ones in the table), * then do so. Otherwise, store only those values that are - * significantly more common than the (estimated) average. We set the - * threshold rather arbitrarily at 25% more than average, with at - * least 2 instances in the sample. Also, we won't suppress values - * that have a frequency of at least 1/K where K is the intended - * number of histogram bins; such values might otherwise cause us to - * emit duplicate histogram bin boundaries. (We might end up with - * duplicate histogram entries anyway, if the distribution is skewed; - * but we prefer to treat such values as MCVs if at all possible.) + * significantly more common than the values not in the list. * * Note: the first of these cases is meant to address columns with * small, fixed sets of possible values, such as boolean or enum @@ -2574,8 +2564,7 @@ compute_scalar_stats(VacAttrStatsP stats * so and thus provide the planner with complete information. But if * the MCV list is not complete, it's generally worth being more * selective, and not just filling it all the way up to the stats - * target. So for an incomplete list, we try to take only MCVs that - * are significantly more common than average. + * target. */ if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 && @@ -2586,33 +2575,22 @@ compute_scalar_stats(VacAttrStatsP stats } else { - double ndistinct_table = stats->stadistinct; - double avgcount, - mincount, - maxmincount; + int *mcv_counts; - /* Re-extract estimate of # distinct nonnull values in table */ - if (ndistinct_table < 0) - ndistinct_table = -ndistinct_table * totalrows; - /* estimate # occurrences in sample of a typical nonnull value */ - avgcount = (double) nonnull_cnt / ndistinct_table; - /* set minimum threshold count to store a value */ - mincount = avgcount * 1.25; - if (mincount < 2) - mincount = 2; - /* don't let threshold exceed 1/K, however */ - maxmincount = (double) values_cnt / (double) num_bins; - if (mincount > maxmincount) - mincount = maxmincount; + /* Incomplete list; decide how many values are worth keeping */ if (num_mcv > track_cnt) num_mcv = track_cnt; - for (i = 0; i < num_mcv; i++) + + if (num_mcv > 0) { - if (track[i].count < mincount) - { - num_mcv = i; - break; - } + mcv_counts = (int *) palloc(num_mcv * sizeof(int)); + for (i = 0; i < num_mcv; i++) + mcv_counts[i] = track[i].count; + + num_mcv = analyze_mcv_list(mcv_counts, num_mcv, + stats->stadistinct, + stats->stanullfrac, + samplerows, totalrows); } } @@ -2878,3 +2856,103 @@ compare_mcvs(const void *a, const void * return da - db; } + +/* + * Analyze the list of common values in the sample and decide how many are + * worth storing in the table's MCV list. + * + * mcv_counts is assumed to be a list of the counts of the most common values + * seen in the sample, starting with the most common. The return value is the + * number that are significantly more common than the values not in the list, + * and are therefore deemed worth storing in the table's MCV list. + */ +static int +analyze_mcv_list(int *mcv_counts, + int num_mcv, + double stadistinct, + double stanullfrac, + int samplerows, + double totalrows) +{ + double ndistinct_table; + double sumcount; + int i; + + /* + * If the entire table was sampled, keep the whole list. This also + * protects us against division by zero in the code below. + */ + if (samplerows == totalrows || totalrows <= 1.0) + return num_mcv; + + /* Re-extract the estimated number of distinct nonnull values in table */ + ndistinct_table = stadistinct; + if (ndistinct_table < 0) + ndistinct_table = -ndistinct_table * totalrows; + + /* + * Exclude the least common values from the MCV list, if they are not + * significantly more common than the estimated selectivity they would + * have if they weren't in the list. All non-MCV values are assumed to be + * equally common, after taking into account the frequencies of all the + * the values in the MCV list and the number of nulls (c.f. eqsel()). + * + * Here sumcount tracks the total count of all but the last (least common) + * value in the MCV list, allowing us to determine the effect of excluding + * that value from the list. + */ + sumcount = 0.0; + for (i = 0; i < num_mcv - 1; i++) + sumcount += mcv_counts[i]; + + while (num_mcv > 0) + { + double selec, + otherdistinct, + N, + n, + K, + variance, + stddev; + + /* + * Estimated selectivity of the least common value, if it weren't in + * the MCV list (c.f. eqsel()). + */ + selec = 1.0 - sumcount / samplerows - stanullfrac; + if (selec < 0.0) + selec = 0.0; + if (selec > 1.0) + selec = 1.0; + otherdistinct = ndistinct_table - (num_mcv - 1); + if (otherdistinct > 1) + selec /= otherdistinct; + + /* + * If the value is kept in the MCV list, its population frequency is + * assumed to equal its sample frequency, and the distribution of the + * value's count in the sample is a hypergeomtric distribution with + * the following standard deviation. + */ + N = totalrows; + n = samplerows; + K = N * mcv_counts[num_mcv - 1] / n; + variance = n * K * (N - K) * (N - n) / (N * N * (N - 1)); + stddev = sqrt(variance); + + /* + * If the value is significantly more common than the non-MCV + * selectivity would suggest, keep it, and all the other more common + * values. + */ + if (mcv_counts[num_mcv - 1] > selec * samplerows + 2 * stddev) + break; + + /* Otherwise discard it and consider the next least common value */ + num_mcv--; + if (num_mcv == 0) + break; + sumcount -= mcv_counts[num_mcv - 1]; + } + return num_mcv; +}