@@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats,
2055
2055
/*
2056
2056
* Our track list includes every value in the sample, and every
2057
2057
* value appeared more than once. Assume the column has just
2058
- * these values.
2058
+ * these values. (This case is meant to address columns with
2059
+ * small, fixed sets of possible values, such as boolean or enum
2060
+ * columns. If there are any values that appear just once in the
2061
+ * sample, including too-wide values, we should assume that the
2062
+ * column is not of that kind.)
2059
2063
*/
2060
2064
stats -> stadistinct = track_cnt ;
2061
2065
}
@@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats,
2123
2127
* significantly more common than the (estimated) average. We set the
2124
2128
* threshold rather arbitrarily at 25% more than average, with at
2125
2129
* least 2 instances in the sample.
2130
+ *
2131
+ * Note: the first of these cases is meant to address columns with
2132
+ * small, fixed sets of possible values, such as boolean or enum
2133
+ * columns. If we can *completely* represent the column population by
2134
+ * an MCV list that will fit into the stats target, then we should do
2135
+ * so and thus provide the planner with complete information. But if
2136
+ * the MCV list is not complete, it's generally worth being more
2137
+ * selective, and not just filling it all the way up to the stats
2138
+ * target. So for an incomplete list, we try to take only MCVs that
2139
+ * are significantly more common than average.
2126
2140
*/
2127
2141
if (track_cnt < track_max && toowide_cnt == 0 &&
2128
2142
stats -> stadistinct > 0 &&
@@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats,
2416
2430
{
2417
2431
/*
2418
2432
* Every value in the sample appeared more than once. Assume the
2419
- * column has just these values.
2433
+ * column has just these values. (This case is meant to address
2434
+ * columns with small, fixed sets of possible values, such as
2435
+ * boolean or enum columns. If there are any values that appear
2436
+ * just once in the sample, including too-wide values, we should
2437
+ * assume that the column is not of that kind.)
2420
2438
*/
2421
2439
stats -> stadistinct = ndistinct ;
2422
2440
}
@@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats,
2485
2503
* emit duplicate histogram bin boundaries. (We might end up with
2486
2504
* duplicate histogram entries anyway, if the distribution is skewed;
2487
2505
* but we prefer to treat such values as MCVs if at all possible.)
2506
+ *
2507
+ * Note: the first of these cases is meant to address columns with
2508
+ * small, fixed sets of possible values, such as boolean or enum
2509
+ * columns. If we can *completely* represent the column population by
2510
+ * an MCV list that will fit into the stats target, then we should do
2511
+ * so and thus provide the planner with complete information. But if
2512
+ * the MCV list is not complete, it's generally worth being more
2513
+ * selective, and not just filling it all the way up to the stats
2514
+ * target. So for an incomplete list, we try to take only MCVs that
2515
+ * are significantly more common than average.
2488
2516
*/
2489
2517
if (track_cnt == ndistinct && toowide_cnt == 0 &&
2490
2518
stats -> stadistinct > 0 &&
0 commit comments