Skip to content

Commit 1cb108e

Browse files
committed
Dig down into sub-selects to look for column statistics.
If a sub-select's output column is a simple Var, recursively look for statistics applying to that Var, and use them if available. The need for this was foreseen ages ago, but we didn't have enough infrastructure to do it with reasonable speed until just now. We punt and stick with default estimates if the subquery uses set operations, GROUP BY, or DISTINCT, since those operations would change the underlying column statistics (particularly, the relative frequencies of different values) beyond recognition. This means that the types of sub-selects for which this improvement applies are fairly limited, since most subqueries satisfying those restrictions would have gotten flattened into the parent query anyway. But it does help for some cases, such as subqueries with ORDER BY or LIMIT.
1 parent 698df33 commit 1cb108e

File tree

1 file changed

+108
-33
lines changed

1 file changed

+108
-33
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 108 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
168168
int rangelo, int rangehi);
169169
static char *convert_string_datum(Datum value, Oid typid);
170170
static double convert_timevalue_to_scalar(Datum value, Oid typid);
171+
static void examine_simple_variable(PlannerInfo *root, Var *var,
172+
VariableStatData *vardata);
171173
static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
172174
Oid sortop, Datum *min, Datum *max);
173175
static bool get_actual_variable_range(PlannerInfo *root,
@@ -4153,46 +4155,16 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
41534155
(varRelid == 0 || varRelid == ((Var *) basenode)->varno))
41544156
{
41554157
Var *var = (Var *) basenode;
4156-
RangeTblEntry *rte;
41574158

4159+
/* Set up result fields other than the stats tuple */
41584160
vardata->var = basenode; /* return Var without relabeling */
41594161
vardata->rel = find_base_rel(root, var->varno);
41604162
vardata->atttype = var->vartype;
41614163
vardata->atttypmod = var->vartypmod;
41624164
vardata->isunique = has_unique_index(vardata->rel, var->varattno);
41634165

4164-
rte = root->simple_rte_array[var->varno];
4165-
4166-
if (get_relation_stats_hook &&
4167-
(*get_relation_stats_hook) (root, rte, var->varattno, vardata))
4168-
{
4169-
/*
4170-
* The hook took control of acquiring a stats tuple. If it did
4171-
* supply a tuple, it'd better have supplied a freefunc.
4172-
*/
4173-
if (HeapTupleIsValid(vardata->statsTuple) &&
4174-
!vardata->freefunc)
4175-
elog(ERROR, "no function provided to release variable stats with");
4176-
}
4177-
else if (rte->rtekind == RTE_RELATION)
4178-
{
4179-
vardata->statsTuple = SearchSysCache3(STATRELATTINH,
4180-
ObjectIdGetDatum(rte->relid),
4181-
Int16GetDatum(var->varattno),
4182-
BoolGetDatum(rte->inh));
4183-
vardata->freefunc = ReleaseSysCache;
4184-
}
4185-
else
4186-
{
4187-
/*
4188-
* XXX This means the Var comes from a JOIN or sub-SELECT. Later
4189-
* add code to dig down into the join etc and see if we can trace
4190-
* the variable to something with stats. (But beware of
4191-
* sub-SELECTs with DISTINCT/GROUP BY/etc. Perhaps there are no
4192-
* cases where this would really be useful, because we'd have
4193-
* flattened the subselect if it is??)
4194-
*/
4195-
}
4166+
/* Try to locate some stats */
4167+
examine_simple_variable(root, var, vardata);
41964168

41974169
return;
41984170
}
@@ -4334,6 +4306,109 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
43344306
}
43354307
}
43364308

4309+
/*
4310+
* examine_simple_variable
4311+
* Handle a simple Var for examine_variable
4312+
*
4313+
* This is split out as a subroutine so that we can recurse to deal with
4314+
* Vars referencing subqueries.
4315+
*
4316+
* We already filled in all the fields of *vardata except for the stats tuple.
4317+
*/
4318+
static void
4319+
examine_simple_variable(PlannerInfo *root, Var *var,
4320+
VariableStatData *vardata)
4321+
{
4322+
RangeTblEntry *rte = root->simple_rte_array[var->varno];
4323+
4324+
Assert(IsA(rte, RangeTblEntry));
4325+
4326+
if (get_relation_stats_hook &&
4327+
(*get_relation_stats_hook) (root, rte, var->varattno, vardata))
4328+
{
4329+
/*
4330+
* The hook took control of acquiring a stats tuple. If it did supply
4331+
* a tuple, it'd better have supplied a freefunc.
4332+
*/
4333+
if (HeapTupleIsValid(vardata->statsTuple) &&
4334+
!vardata->freefunc)
4335+
elog(ERROR, "no function provided to release variable stats with");
4336+
}
4337+
else if (rte->rtekind == RTE_RELATION)
4338+
{
4339+
/*
4340+
* Plain table or parent of an inheritance appendrel, so look up the
4341+
* column in pg_statistic
4342+
*/
4343+
vardata->statsTuple = SearchSysCache3(STATRELATTINH,
4344+
ObjectIdGetDatum(rte->relid),
4345+
Int16GetDatum(var->varattno),
4346+
BoolGetDatum(rte->inh));
4347+
vardata->freefunc = ReleaseSysCache;
4348+
}
4349+
else if (rte->rtekind == RTE_SUBQUERY && !rte->inh)
4350+
{
4351+
/*
4352+
* Plain subquery (not one that was converted to an appendrel).
4353+
*
4354+
* Punt if subquery uses set operations, GROUP BY, or DISTINCT --- any
4355+
* of these will mash underlying columns' stats beyond recognition.
4356+
* (Set ops are particularly nasty; if we forged ahead, we would
4357+
* return stats relevant to only the leftmost subselect...)
4358+
*/
4359+
Query *subquery = rte->subquery;
4360+
RelOptInfo *rel;
4361+
TargetEntry *ste;
4362+
4363+
if (subquery->setOperations ||
4364+
subquery->groupClause ||
4365+
subquery->distinctClause)
4366+
return;
4367+
4368+
/*
4369+
* OK, fetch RelOptInfo for subquery. Note that we don't change the
4370+
* rel returned in vardata, since caller expects it to be a rel of the
4371+
* caller's query level. Because we might already be recursing, we
4372+
* can't use that rel pointer either, but have to look up the Var's
4373+
* rel afresh.
4374+
*/
4375+
rel = find_base_rel(root, var->varno);
4376+
4377+
/* Subquery should have been planned already */
4378+
Assert(rel->subroot && IsA(rel->subroot, PlannerInfo));
4379+
4380+
/* Get the subquery output expression referenced by the upper Var */
4381+
ste = get_tle_by_resno(subquery->targetList, var->varattno);
4382+
if (ste == NULL || ste->resjunk)
4383+
elog(ERROR, "subquery %s does not have attribute %d",
4384+
rte->eref->aliasname, var->varattno);
4385+
var = (Var *) ste->expr;
4386+
4387+
/* Can only handle a simple Var of subquery's query level */
4388+
if (var && IsA(var, Var) &&
4389+
var->varlevelsup == 0)
4390+
{
4391+
/*
4392+
* OK, recurse into the subquery. Note that the original setting
4393+
* of vardata->isunique (which will surely be false) is left
4394+
* unchanged in this situation. That's what we want, since even
4395+
* if the underlying column is unique, the subquery may have
4396+
* joined to other tables in a way that creates duplicates.
4397+
*/
4398+
examine_simple_variable(rel->subroot, var, vardata);
4399+
}
4400+
}
4401+
else
4402+
{
4403+
/*
4404+
* Otherwise, the Var comes from a FUNCTION, VALUES, or CTE RTE. (We
4405+
* won't see RTE_JOIN here because join alias Vars have already been
4406+
* flattened.) There's not much we can do with function outputs, but
4407+
* maybe someday try to be smarter about VALUES and/or CTEs.
4408+
*/
4409+
}
4410+
}
4411+
43374412
/*
43384413
* get_variable_numdistinct
43394414
* Estimate the number of distinct values of a variable.

0 commit comments

Comments
 (0)