Skip to content

Commit 3505862

Browse files
committed
Further repair of eqjoinsel ndistinct-clamping logic.
Examination of examples provided by Mark Kirkwood and others has convinced me that actually commit 7f3eba3 was quite a few bricks shy of a load. The useful part of that patch was clamping ndistinct for the inner side of a semi or anti join, and the reason why that's needed is that it's the only way that restriction clauses eliminating rows from the inner relation can affect the estimated size of the join result. I had not clearly understood why the clamping was appropriate, and so mis-extrapolated to conclude that we should clamp ndistinct for the outer side too, as well as for both sides of regular joins. These latter actions were all wrong, and are reverted with this patch. In addition, the clamping logic is now made to affect the behavior of both paths in eqjoinsel_semi, with or without MCV lists to compare. When we have MCVs, we suppose that the most common values are the ones that are most likely to survive the decimation resulting from a lower restriction clause, so we think of the clamping as eliminating non-MCV values, or potentially even the least-common MCVs for the inner relation. Back-patch to 8.4, same as previous fixes in this area.
1 parent e724b96 commit 3505862

File tree

1 file changed

+50
-58
lines changed

1 file changed

+50
-58
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 50 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
139139
FmgrInfo *opproc, bool isgt,
140140
Datum constval, Oid consttype);
141141
static double eqjoinsel_inner(Oid operator,
142-
VariableStatData *vardata1, VariableStatData *vardata2,
143-
RelOptInfo *rel1, RelOptInfo *rel2);
142+
VariableStatData *vardata1, VariableStatData *vardata2);
144143
static double eqjoinsel_semi(Oid operator,
145144
VariableStatData *vardata1, VariableStatData *vardata2,
146-
RelOptInfo *rel1, RelOptInfo *rel2);
145+
RelOptInfo *inner_rel);
147146
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
148147
Datum lobound, Datum hibound, Oid boundstypid,
149148
double *scaledlobound, double *scaledhibound);
@@ -1993,47 +1992,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
19931992
VariableStatData vardata1;
19941993
VariableStatData vardata2;
19951994
bool join_is_reversed;
1996-
RelOptInfo *rel1;
1997-
RelOptInfo *rel2;
1995+
RelOptInfo *inner_rel;
19981996

19991997
get_join_variables(root, args, sjinfo,
20001998
&vardata1, &vardata2, &join_is_reversed);
20011999

2002-
/*
2003-
* Identify the join's direct input relations. We use the min lefthand
2004-
* and min righthand as the inputs, even though the join might actually
2005-
* get done with larger input relations. The min inputs are guaranteed to
2006-
* have been formed by now, though, and always using them ensures
2007-
* consistency of estimates.
2008-
*/
2009-
if (!join_is_reversed)
2010-
{
2011-
rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
2012-
rel2 = find_join_input_rel(root, sjinfo->min_righthand);
2013-
}
2014-
else
2015-
{
2016-
rel1 = find_join_input_rel(root, sjinfo->min_righthand);
2017-
rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
2018-
}
2019-
20202000
switch (sjinfo->jointype)
20212001
{
20222002
case JOIN_INNER:
20232003
case JOIN_LEFT:
20242004
case JOIN_FULL:
2025-
selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
2026-
rel1, rel2);
2005+
selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
20272006
break;
20282007
case JOIN_SEMI:
20292008
case JOIN_ANTI:
2009+
/*
2010+
* Look up the join's inner relation. min_righthand is sufficient
2011+
* information because neither SEMI nor ANTI joins permit any
2012+
* reassociation into or out of their RHS, so the righthand will
2013+
* always be exactly that set of rels.
2014+
*/
2015+
inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
2016+
20302017
if (!join_is_reversed)
20312018
selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
2032-
rel1, rel2);
2019+
inner_rel);
20332020
else
20342021
selec = eqjoinsel_semi(get_commutator(operator),
20352022
&vardata2, &vardata1,
2036-
rel2, rel1);
2023+
inner_rel);
20372024
break;
20382025
default:
20392026
/* other values not expected here */
@@ -2059,8 +2046,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
20592046
*/
20602047
static double
20612048
eqjoinsel_inner(Oid operator,
2062-
VariableStatData *vardata1, VariableStatData *vardata2,
2063-
RelOptInfo *rel1, RelOptInfo *rel2)
2049+
VariableStatData *vardata1, VariableStatData *vardata2)
20642050
{
20652051
double selec;
20662052
double nd1;
@@ -2254,26 +2240,10 @@ eqjoinsel_inner(Oid operator,
22542240
* XXX Can we be smarter if we have an MCV list for just one side? It
22552241
* seems that if we assume equal distribution for the other side, we
22562242
* end up with the same answer anyway.
2257-
*
2258-
* An additional hack we use here is to clamp the nd1 and nd2 values
2259-
* to not more than what we are estimating the input relation sizes to
2260-
* be, providing a crude correction for the selectivity of restriction
2261-
* clauses on those relations. (We don't do that in the other path
2262-
* since there we are comparing the nd values to stats for the whole
2263-
* relations.) We can apply this clamp both with respect to the base
2264-
* relations from which the join variables come, and to the immediate
2265-
* input relations of the current join.
22662243
*/
22672244
double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
22682245
double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
22692246

2270-
if (vardata1->rel)
2271-
nd1 = Min(nd1, vardata1->rel->rows);
2272-
nd1 = Min(nd1, rel1->rows);
2273-
if (vardata2->rel)
2274-
nd2 = Min(nd2, vardata2->rel->rows);
2275-
nd2 = Min(nd2, rel2->rows);
2276-
22772247
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
22782248
if (nd1 > nd2)
22792249
selec /= nd1;
@@ -2300,7 +2270,7 @@ eqjoinsel_inner(Oid operator,
23002270
static double
23012271
eqjoinsel_semi(Oid operator,
23022272
VariableStatData *vardata1, VariableStatData *vardata2,
2303-
RelOptInfo *rel1, RelOptInfo *rel2)
2273+
RelOptInfo *inner_rel)
23042274
{
23052275
double selec;
23062276
double nd1;
@@ -2321,6 +2291,25 @@ eqjoinsel_semi(Oid operator,
23212291
nd1 = get_variable_numdistinct(vardata1);
23222292
nd2 = get_variable_numdistinct(vardata2);
23232293

2294+
/*
2295+
* We clamp nd2 to be not more than what we estimate the inner relation's
2296+
* size to be. This is intuitively somewhat reasonable since obviously
2297+
* there can't be more than that many distinct values coming from the
2298+
* inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2299+
* likewise) is that this is the only pathway by which restriction clauses
2300+
* applied to the inner rel will affect the join result size estimate,
2301+
* since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2302+
* only the outer rel's size. If we clamped nd1 we'd be double-counting
2303+
* the selectivity of outer-rel restrictions.
2304+
*
2305+
* We can apply this clamping both with respect to the base relation from
2306+
* which the join variable comes (if there is just one), and to the
2307+
* immediate inner input relation of the current join.
2308+
*/
2309+
if (vardata2->rel)
2310+
nd2 = Min(nd2, vardata2->rel->rows);
2311+
nd2 = Min(nd2, inner_rel->rows);
2312+
23242313
if (HeapTupleIsValid(vardata1->statsTuple))
23252314
{
23262315
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
@@ -2365,11 +2354,21 @@ eqjoinsel_semi(Oid operator,
23652354
uncertainfrac,
23662355
uncertain;
23672356
int i,
2368-
nmatches;
2357+
nmatches,
2358+
clamped_nvalues2;
2359+
2360+
/*
2361+
* The clamping above could have resulted in nd2 being less than
2362+
* nvalues2; in which case, we assume that precisely the nd2 most
2363+
* common values in the relation will appear in the join input, and so
2364+
* compare to only the first nd2 members of the MCV list. Of course
2365+
* this is frequently wrong, but it's the best bet we can make.
2366+
*/
2367+
clamped_nvalues2 = Min(nvalues2, nd2);
23692368

23702369
fmgr_info(get_opcode(operator), &eqproc);
23712370
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
2372-
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
2371+
hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
23732372

23742373
/*
23752374
* Note we assume that each MCV will match at most one member of the
@@ -2382,7 +2381,7 @@ eqjoinsel_semi(Oid operator,
23822381
{
23832382
int j;
23842383

2385-
for (j = 0; j < nvalues2; j++)
2384+
for (j = 0; j < clamped_nvalues2; j++)
23862385
{
23872386
if (hasmatch2[j])
23882387
continue;
@@ -2426,7 +2425,7 @@ eqjoinsel_semi(Oid operator,
24262425
{
24272426
nd1 -= nmatches;
24282427
nd2 -= nmatches;
2429-
if (nd1 <= nd2 || nd2 <= 0)
2428+
if (nd1 <= nd2 || nd2 < 0)
24302429
uncertainfrac = 1.0;
24312430
else
24322431
uncertainfrac = nd2 / nd1;
@@ -2447,14 +2446,7 @@ eqjoinsel_semi(Oid operator,
24472446

24482447
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
24492448
{
2450-
if (vardata1->rel)
2451-
nd1 = Min(nd1, vardata1->rel->rows);
2452-
nd1 = Min(nd1, rel1->rows);
2453-
if (vardata2->rel)
2454-
nd2 = Min(nd2, vardata2->rel->rows);
2455-
nd2 = Min(nd2, rel2->rows);
2456-
2457-
if (nd1 <= nd2 || nd2 <= 0)
2449+
if (nd1 <= nd2 || nd2 < 0)
24582450
selec = 1.0 - nullfrac1;
24592451
else
24602452
selec = (nd2 / nd1) * (1.0 - nullfrac1);

0 commit comments

Comments
 (0)