Skip to content

Commit 2686ee1

Browse files
Collect and use multi-column dependency stats
Follow on patch in the multi-variate statistics patch series. CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t; ANALYZE; will collect dependency stats on (a, b) and then use the measured dependency in subsequent query planning. Commit 7b504eb added CREATE STATISTICS with n-distinct coefficients. These are now specified using the mutually exclusive option WITH (ndistinct). Author: Tomas Vondra, David Rowley Reviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haas and many other comments and contributions Discussion: https://postgr.es/m/56f40b20-c464-fad2-ff39-06b668fac47c@2ndquadrant.com
1 parent 00b6b6f commit 2686ee1

31 files changed

+2035
-79
lines changed

contrib/file_fdw/file_fdw.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,7 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel,
10131013
baserel->baserestrictinfo,
10141014
0,
10151015
JOIN_INNER,
1016+
NULL,
10161017
NULL);
10171018

10181019
nrows = clamp_row_est(nrows);

contrib/postgres_fdw/postgres_fdw.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ postgresGetForeignRelSize(PlannerInfo *root,
591591
fpinfo->local_conds,
592592
baserel->relid,
593593
JOIN_INNER,
594+
NULL,
594595
NULL);
595596

596597
cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
@@ -2572,6 +2573,7 @@ estimate_path_cost_size(PlannerInfo *root,
25722573
local_param_join_conds,
25732574
foreignrel->relid,
25742575
JOIN_INNER,
2576+
NULL,
25752577
NULL);
25762578
local_sel *= fpinfo->local_conds_sel;
25772579

@@ -4455,6 +4457,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
44554457
fpinfo->local_conds,
44564458
0,
44574459
JOIN_INNER,
4460+
NULL,
44584461
NULL);
44594462
cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
44604463

@@ -4465,7 +4468,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
44654468
if (!fpinfo->use_remote_estimate)
44664469
fpinfo->joinclause_sel = clauselist_selectivity(root, fpinfo->joinclauses,
44674470
0, fpinfo->jointype,
4468-
extra->sjinfo);
4471+
extra->sjinfo, NULL);
44694472

44704473
/* Estimate costs for bare join relation */
44714474
estimate_path_cost_size(root, joinrel, NIL, NIL, &rows,

doc/src/sgml/catalogs.sgml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4339,6 +4339,15 @@
43394339
</entry>
43404340
</row>
43414341

4342+
<row>
4343+
<entry><structfield>stadependencies</structfield></entry>
4344+
<entry><type>pg_dependencies</type></entry>
4345+
<entry></entry>
4346+
<entry>
4347+
Functional dependencies, serialized as <structname>pg_dependencies</> type.
4348+
</entry>
4349+
</row>
4350+
43424351
</tbody>
43434352
</tgroup>
43444353
</table>

doc/src/sgml/planstats.sgml

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,160 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
446446
in <filename>src/backend/utils/adt/selfuncs.c</filename>.
447447
</para>
448448

449+
<sect2 id="functional-dependencies">
450+
<title>Functional Dependencies</title>
451+
452+
<para>
453+
The simplest type of extended statistics are functional dependencies,
454+
used in definitions of database normal forms. When simplified, saying that
455+
<literal>b</> is functionally dependent on <literal>a</> means that
456+
knowledge of value of <literal>a</> is sufficient to determine value of
457+
<literal>b</>.
458+
</para>
459+
460+
<para>
461+
In normalized databases, only functional dependencies on primary keys
462+
and superkeys are allowed. However, in practice, many data sets are not
463+
fully normalized, for example, due to intentional denormalization for
464+
performance reasons.
465+
</para>
466+
467+
<para>
468+
Functional dependencies directly affect accuracy of the estimates, as
469+
conditions on the dependent column(s) do not restrict the result set,
470+
resulting in underestimates.
471+
</para>
472+
473+
<para>
474+
To inform the planner about the functional dependencies, we collect
475+
measurements of dependency during <command>ANALYZE</>. Assessing
476+
dependency between all sets of columns would be prohibitively
477+
expensive, so we limit our search to potential dependencies defined
478+
using the <command>CREATE STATISTICS</> command.
479+
480+
<programlisting>
481+
CREATE TABLE t (a INT, b INT);
482+
INSERT INTO t SELECT i/100, i/100 FROM generate_series(1,10000) s(i);
483+
CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;
484+
ANALYZE t;
485+
EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
486+
QUERY PLAN
487+
-------------------------------------------------------------------------------------------------
488+
Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=0.095..3.118 rows=100 loops=1)
489+
Filter: ((a = 1) AND (b = 1))
490+
Rows Removed by Filter: 9900
491+
Planning time: 0.367 ms
492+
Execution time: 3.380 ms
493+
(5 rows)
494+
</programlisting>
495+
496+
The planner is now aware of the functional dependencies and considers
497+
them when computing the selectivity of the second condition. Running
498+
the query without the statistics would lead to quite different estimates.
499+
500+
<programlisting>
501+
DROP STATISTICS s1;
502+
EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
503+
QUERY PLAN
504+
-----------------------------------------------------------------------------------------------
505+
Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual time=0.000..6.379 rows=100 loops=1)
506+
Filter: ((a = 1) AND (b = 1))
507+
Rows Removed by Filter: 9900
508+
Planning time: 0.000 ms
509+
Execution time: 6.379 ms
510+
(5 rows)
511+
</programlisting>
512+
</para>
513+
514+
<para>
515+
If no dependency exists, the collected statistics do not influence the
516+
query plan. The only effect is to slow down <command>ANALYZE</>. Should
517+
partial dependencies exist these will also be stored and applied
518+
during planning.
519+
</para>
520+
521+
<para>
522+
Similarly to per-column statistics, extended statistics are stored in
523+
a system catalog called <structname>pg_statistic_ext</structname>, but
524+
there is also a more convenient view <structname>pg_stats_ext</structname>.
525+
To inspect the statistics <literal>s1</literal> defined above,
526+
you may do this:
527+
528+
<programlisting>
529+
SELECT tablename, staname, attnums, depsbytes
530+
FROM pg_stats_ext WHERE staname = 's1';
531+
tablename | staname | attnums | depsbytes
532+
-----------+---------+---------+-----------
533+
t | s1 | 1 2 | 40
534+
(1 row)
535+
</programlisting>
536+
537+
This shows that the statistics are defined on table <structname>t</>,
538+
<structfield>attnums</structfield> lists attribute numbers of columns
539+
(references <structname>pg_attribute</structname>). It also shows
540+
the length in bytes of the functional dependencies, as found by
541+
<command>ANALYZE</> when serialized into a <literal>bytea</> column.
542+
</para>
543+
544+
<para>
545+
When computing the selectivity, the planner inspects all conditions and
546+
attempts to identify which conditions are already implied by other
547+
conditions. The selectivity estimates from any redundant conditions are
548+
ignored from a selectivity point of view. In the example query above,
549+
the selectivity estimates for either of the conditions may be eliminated,
550+
thus improving the overall estimate.
551+
</para>
552+
553+
<sect3 id="functional-dependencies-limitations">
554+
<title>Limitations of functional dependencies</title>
555+
556+
<para>
557+
Functional dependencies are a very simple type of statistics, and
558+
as such have several limitations. The first limitation is that they
559+
only work with simple equality conditions, comparing columns and constant
560+
values. It's not possible to use them to eliminate equality conditions
561+
comparing two columns or a column to an expression, range clauses,
562+
<literal>LIKE</> or any other type of conditions.
563+
</para>
564+
565+
<para>
566+
When eliminating the implied conditions, the planner assumes that the
567+
conditions are compatible. Consider the following example, violating
568+
this assumption:
569+
570+
<programlisting>
571+
EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10;
572+
QUERY PLAN
573+
-----------------------------------------------------------------------------------------------
574+
Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=2.992..2.992 rows=0 loops=1)
575+
Filter: ((a = 1) AND (b = 10))
576+
Rows Removed by Filter: 10000
577+
Planning time: 0.232 ms
578+
Execution time: 3.033 ms
579+
(5 rows)
580+
</programlisting>
581+
582+
While there are no rows with such combination of values, the planner
583+
is unable to verify whether the values match - it only knows that
584+
the columns are functionally dependent.
585+
</para>
586+
587+
<para>
588+
This assumption is more about queries executed on the database - in many
589+
cases, it's actually satisfied (e.g. when the GUI only allows selecting
590+
compatible values). But if that's not the case, functional dependencies
591+
may not be a viable option.
592+
</para>
593+
594+
<para>
595+
For additional information about functional dependencies, see
596+
<filename>src/backend/statistics/README.dependencies</>.
597+
</para>
598+
599+
</sect3>
600+
601+
</sect2>
602+
449603
</sect1>
450604

451605
</chapter>

doc/src/sgml/ref/create_statistics.sgml

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ PostgreSQL documentation
2121

2222
<refsynopsisdiv>
2323
<synopsis>
24-
CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (
25-
<replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
24+
CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable>
25+
WITH ( <replaceable class="PARAMETER">option</replaceable> [= <replaceable class="PARAMETER">value</replaceable>] [, ... ] )
26+
ON ( <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
2627
FROM <replaceable class="PARAMETER">table_name</replaceable>
2728
</synopsis>
2829

@@ -94,6 +95,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
9495

9596
</variablelist>
9697

98+
<refsect2 id="SQL-CREATESTATISTICS-parameters">
99+
<title id="SQL-CREATESTATISTICS-parameters-title">Parameters</title>
100+
101+
<indexterm zone="sql-createstatistics-parameters">
102+
<primary>statistics parameters</primary>
103+
</indexterm>
104+
105+
<para>
106+
The <literal>WITH</> clause can specify <firstterm>options</>
107+
for the statistics. Available options are listed below.
108+
</para>
109+
110+
<variablelist>
111+
112+
<varlistentry>
113+
<term><literal>dependencies</> (<type>boolean</>)</term>
114+
<listitem>
115+
<para>
116+
Enables functional dependencies for the statistics.
117+
</para>
118+
</listitem>
119+
</varlistentry>
120+
121+
<varlistentry>
122+
<term><literal>ndistinct</> (<type>boolean</>)</term>
123+
<listitem>
124+
<para>
125+
Enables ndistinct coefficients for the statistics.
126+
</para>
127+
</listitem>
128+
</varlistentry>
129+
130+
</variablelist>
131+
132+
</refsect2>
97133
</refsect1>
98134

99135
<refsect1>
@@ -122,7 +158,7 @@ CREATE TABLE t1 (
122158
INSERT INTO t1 SELECT i/100, i/500
123159
FROM generate_series(1,1000000) s(i);
124160

125-
CREATE STATISTICS s1 ON (a, b) FROM t1;
161+
CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t1;
126162

127163
ANALYZE t1;
128164

src/backend/catalog/system_views.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS
192192
C.relname AS tablename,
193193
S.staname AS staname,
194194
S.stakeys AS attnums,
195-
length(s.standistinct) AS ndistbytes
195+
length(s.standistinct::bytea) AS ndistbytes,
196+
length(S.stadependencies::bytea) AS depsbytes
196197
FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
197198
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
198199

src/backend/commands/statscmds.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)
6262
Oid relid;
6363
ObjectAddress parentobject,
6464
childobject;
65-
Datum types[1]; /* only ndistinct defined now */
65+
Datum types[2]; /* one for each possible type of statistics */
6666
int ntypes;
6767
ArrayType *staenabled;
6868
bool build_ndistinct;
69+
bool build_dependencies;
6970
bool requested_type = false;
7071

7172
Assert(IsA(stmt, CreateStatsStmt));
@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)
159160
errmsg("statistics require at least 2 columns")));
160161

161162
/*
162-
* Sort the attnums, which makes detecting duplicies somewhat easier, and
163+
* Sort the attnums, which makes detecting duplicities somewhat easier, and
163164
* it does not hurt (it does not affect the efficiency, unlike for
164165
* indexes, for example).
165166
*/
@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)
182183
* recognized.
183184
*/
184185
build_ndistinct = false;
186+
build_dependencies = false;
185187
foreach(l, stmt->options)
186188
{
187189
DefElem *opt = (DefElem *) lfirst(l);
@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)
191193
build_ndistinct = defGetBoolean(opt);
192194
requested_type = true;
193195
}
196+
else if (strcmp(opt->defname, "dependencies") == 0)
197+
{
198+
build_dependencies = defGetBoolean(opt);
199+
requested_type = true;
200+
}
194201
else
195202
ereport(ERROR,
196203
(errcode(ERRCODE_SYNTAX_ERROR),
@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)
199206
}
200207
/* If no statistic type was specified, build them all. */
201208
if (!requested_type)
209+
{
202210
build_ndistinct = true;
211+
build_dependencies = true;
212+
}
203213

204214
/* construct the char array of enabled statistic types */
205215
ntypes = 0;
206216
if (build_ndistinct)
207217
types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
218+
if (build_dependencies)
219+
types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
208220
Assert(ntypes > 0);
209221
staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');
210222

@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)
222234

223235
/* no statistics build yet */
224236
nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
237+
nulls[Anum_pg_statistic_ext_stadependencies - 1] = true;
225238

226239
/* insert it into pg_statistic_ext */
227240
statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);

0 commit comments

Comments
 (0)