@@ -139,11 +139,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
139
139
FmgrInfo * opproc , bool isgt ,
140
140
Datum constval , Oid consttype );
141
141
static double eqjoinsel_inner (Oid operator ,
142
- VariableStatData * vardata1 , VariableStatData * vardata2 ,
143
- RelOptInfo * rel1 , RelOptInfo * rel2 );
142
+ VariableStatData * vardata1 , VariableStatData * vardata2 );
144
143
static double eqjoinsel_semi (Oid operator ,
145
144
VariableStatData * vardata1 , VariableStatData * vardata2 ,
146
- RelOptInfo * rel1 , RelOptInfo * rel2 );
145
+ RelOptInfo * inner_rel );
147
146
static bool convert_to_scalar (Datum value , Oid valuetypid , double * scaledvalue ,
148
147
Datum lobound , Datum hibound , Oid boundstypid ,
149
148
double * scaledlobound , double * scaledhibound );
@@ -1993,47 +1992,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
1993
1992
VariableStatData vardata1 ;
1994
1993
VariableStatData vardata2 ;
1995
1994
bool join_is_reversed ;
1996
- RelOptInfo * rel1 ;
1997
- RelOptInfo * rel2 ;
1995
+ RelOptInfo * inner_rel ;
1998
1996
1999
1997
get_join_variables (root , args , sjinfo ,
2000
1998
& vardata1 , & vardata2 , & join_is_reversed );
2001
1999
2002
- /*
2003
- * Identify the join's direct input relations. We use the min lefthand
2004
- * and min righthand as the inputs, even though the join might actually
2005
- * get done with larger input relations. The min inputs are guaranteed to
2006
- * have been formed by now, though, and always using them ensures
2007
- * consistency of estimates.
2008
- */
2009
- if (!join_is_reversed )
2010
- {
2011
- rel1 = find_join_input_rel (root , sjinfo -> min_lefthand );
2012
- rel2 = find_join_input_rel (root , sjinfo -> min_righthand );
2013
- }
2014
- else
2015
- {
2016
- rel1 = find_join_input_rel (root , sjinfo -> min_righthand );
2017
- rel2 = find_join_input_rel (root , sjinfo -> min_lefthand );
2018
- }
2019
-
2020
2000
switch (sjinfo -> jointype )
2021
2001
{
2022
2002
case JOIN_INNER :
2023
2003
case JOIN_LEFT :
2024
2004
case JOIN_FULL :
2025
- selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 ,
2026
- rel1 , rel2 );
2005
+ selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 );
2027
2006
break ;
2028
2007
case JOIN_SEMI :
2029
2008
case JOIN_ANTI :
2009
+ /*
2010
+ * Look up the join's inner relation. min_righthand is sufficient
2011
+ * information because neither SEMI nor ANTI joins permit any
2012
+ * reassociation into or out of their RHS, so the righthand will
2013
+ * always be exactly that set of rels.
2014
+ */
2015
+ inner_rel = find_join_input_rel (root , sjinfo -> min_righthand );
2016
+
2030
2017
if (!join_is_reversed )
2031
2018
selec = eqjoinsel_semi (operator , & vardata1 , & vardata2 ,
2032
- rel1 , rel2 );
2019
+ inner_rel );
2033
2020
else
2034
2021
selec = eqjoinsel_semi (get_commutator (operator ),
2035
2022
& vardata2 , & vardata1 ,
2036
- rel2 , rel1 );
2023
+ inner_rel );
2037
2024
break ;
2038
2025
default :
2039
2026
/* other values not expected here */
@@ -2059,8 +2046,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
2059
2046
*/
2060
2047
static double
2061
2048
eqjoinsel_inner (Oid operator ,
2062
- VariableStatData * vardata1 , VariableStatData * vardata2 ,
2063
- RelOptInfo * rel1 , RelOptInfo * rel2 )
2049
+ VariableStatData * vardata1 , VariableStatData * vardata2 )
2064
2050
{
2065
2051
double selec ;
2066
2052
double nd1 ;
@@ -2254,26 +2240,10 @@ eqjoinsel_inner(Oid operator,
2254
2240
* XXX Can we be smarter if we have an MCV list for just one side? It
2255
2241
* seems that if we assume equal distribution for the other side, we
2256
2242
* end up with the same answer anyway.
2257
- *
2258
- * An additional hack we use here is to clamp the nd1 and nd2 values
2259
- * to not more than what we are estimating the input relation sizes to
2260
- * be, providing a crude correction for the selectivity of restriction
2261
- * clauses on those relations. (We don't do that in the other path
2262
- * since there we are comparing the nd values to stats for the whole
2263
- * relations.) We can apply this clamp both with respect to the base
2264
- * relations from which the join variables come, and to the immediate
2265
- * input relations of the current join.
2266
2243
*/
2267
2244
double nullfrac1 = stats1 ? stats1 -> stanullfrac : 0.0 ;
2268
2245
double nullfrac2 = stats2 ? stats2 -> stanullfrac : 0.0 ;
2269
2246
2270
- if (vardata1 -> rel )
2271
- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2272
- nd1 = Min (nd1 , rel1 -> rows );
2273
- if (vardata2 -> rel )
2274
- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2275
- nd2 = Min (nd2 , rel2 -> rows );
2276
-
2277
2247
selec = (1.0 - nullfrac1 ) * (1.0 - nullfrac2 );
2278
2248
if (nd1 > nd2 )
2279
2249
selec /= nd1 ;
@@ -2300,7 +2270,7 @@ eqjoinsel_inner(Oid operator,
2300
2270
static double
2301
2271
eqjoinsel_semi (Oid operator ,
2302
2272
VariableStatData * vardata1 , VariableStatData * vardata2 ,
2303
- RelOptInfo * rel1 , RelOptInfo * rel2 )
2273
+ RelOptInfo * inner_rel )
2304
2274
{
2305
2275
double selec ;
2306
2276
double nd1 ;
@@ -2321,6 +2291,25 @@ eqjoinsel_semi(Oid operator,
2321
2291
nd1 = get_variable_numdistinct (vardata1 );
2322
2292
nd2 = get_variable_numdistinct (vardata2 );
2323
2293
2294
+ /*
2295
+ * We clamp nd2 to be not more than what we estimate the inner relation's
2296
+ * size to be. This is intuitively somewhat reasonable since obviously
2297
+ * there can't be more than that many distinct values coming from the
2298
+ * inner rel. The reason for the asymmetry (i.e., that we don't clamp nd1
2299
+ * likewise) is that this is the only pathway by which restriction clauses
2300
+ * applied to the inner rel will affect the join result size estimate,
2301
+ * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2302
+ * only the outer rel's size. If we clamped nd1 we'd be double-counting
2303
+ * the selectivity of outer-rel restrictions.
2304
+ *
2305
+ * We can apply this clamping both with respect to the base relation from
2306
+ * which the join variable comes (if there is just one), and to the
2307
+ * immediate inner input relation of the current join.
2308
+ */
2309
+ if (vardata2 -> rel )
2310
+ nd2 = Min (nd2 , vardata2 -> rel -> rows );
2311
+ nd2 = Min (nd2 , inner_rel -> rows );
2312
+
2324
2313
if (HeapTupleIsValid (vardata1 -> statsTuple ))
2325
2314
{
2326
2315
stats1 = (Form_pg_statistic ) GETSTRUCT (vardata1 -> statsTuple );
@@ -2365,11 +2354,21 @@ eqjoinsel_semi(Oid operator,
2365
2354
uncertainfrac ,
2366
2355
uncertain ;
2367
2356
int i ,
2368
- nmatches ;
2357
+ nmatches ,
2358
+ clamped_nvalues2 ;
2359
+
2360
+ /*
2361
+ * The clamping above could have resulted in nd2 being less than
2362
+ * nvalues2; in which case, we assume that precisely the nd2 most
2363
+ * common values in the relation will appear in the join input, and so
2364
+ * compare to only the first nd2 members of the MCV list. Of course
2365
+ * this is frequently wrong, but it's the best bet we can make.
2366
+ */
2367
+ clamped_nvalues2 = Min (nvalues2 , nd2 );
2369
2368
2370
2369
fmgr_info (get_opcode (operator ), & eqproc );
2371
2370
hasmatch1 = (bool * ) palloc0 (nvalues1 * sizeof (bool ));
2372
- hasmatch2 = (bool * ) palloc0 (nvalues2 * sizeof (bool ));
2371
+ hasmatch2 = (bool * ) palloc0 (clamped_nvalues2 * sizeof (bool ));
2373
2372
2374
2373
/*
2375
2374
* Note we assume that each MCV will match at most one member of the
@@ -2382,7 +2381,7 @@ eqjoinsel_semi(Oid operator,
2382
2381
{
2383
2382
int j ;
2384
2383
2385
- for (j = 0 ; j < nvalues2 ; j ++ )
2384
+ for (j = 0 ; j < clamped_nvalues2 ; j ++ )
2386
2385
{
2387
2386
if (hasmatch2 [j ])
2388
2387
continue ;
@@ -2426,7 +2425,7 @@ eqjoinsel_semi(Oid operator,
2426
2425
{
2427
2426
nd1 -= nmatches ;
2428
2427
nd2 -= nmatches ;
2429
- if (nd1 <= nd2 || nd2 <= 0 )
2428
+ if (nd1 <= nd2 || nd2 < 0 )
2430
2429
uncertainfrac = 1.0 ;
2431
2430
else
2432
2431
uncertainfrac = nd2 / nd1 ;
@@ -2447,14 +2446,7 @@ eqjoinsel_semi(Oid operator,
2447
2446
2448
2447
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT )
2449
2448
{
2450
- if (vardata1 -> rel )
2451
- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2452
- nd1 = Min (nd1 , rel1 -> rows );
2453
- if (vardata2 -> rel )
2454
- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2455
- nd2 = Min (nd2 , rel2 -> rows );
2456
-
2457
- if (nd1 <= nd2 || nd2 <= 0 )
2449
+ if (nd1 <= nd2 || nd2 < 0 )
2458
2450
selec = 1.0 - nullfrac1 ;
2459
2451
else
2460
2452
selec = (nd2 / nd1 ) * (1.0 - nullfrac1 );
0 commit comments