Skip to content

Commit e28033f

Browse files
author
Richard Guo
committed
Ignore nullingrels when looking up statistics
When looking up statistical data about an expression, we do not need to concern ourselves with the outer joins that could null the Vars/PHVs contained in the expression. Accounting for nullingrels in the expression could cause estimate_num_groups to count the same Var multiple times if it's marked with different nullingrels. This is incorrect, and could lead to "ERROR: corrupt MVNDistinct entry" when searching for multivariate n-distinct. Furthermore, the nullingrels could prevent us from matching an expression to expressional index columns or to the expressions in extended statistics, leading to inaccurate estimates. To fix, strip out all the nullingrels from the expression before we look up statistical data about it. There is one ensuing plan change in the regression tests, but it looks reasonable and does not compromise its original purpose. This patch could result in plan changes, but it fixes an actual bug, so back-patch to v16 where the outer-join-aware-Var infrastructure was introduced. Author: Richard Guo Discussion: https://postgr.es/m/CAMbWs4-2Z4k+nFTiZe0Qbu5n8juUWenDAtMzi98bAZQtwHx0-w@mail.gmail.com
1 parent d93bb81 commit e28033f

File tree

3 files changed

+63
-5
lines changed

3 files changed

+63
-5
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@
121121
#include "parser/parse_clause.h"
122122
#include "parser/parse_relation.h"
123123
#include "parser/parsetree.h"
124+
#include "rewrite/rewriteManip.h"
124125
#include "statistics/statistics.h"
125126
#include "storage/bufmgr.h"
126127
#include "utils/acl.h"
@@ -3306,6 +3307,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
33063307

33073308
ndistinct = get_variable_numdistinct(vardata, &isdefault);
33083309

3310+
/*
3311+
* The nullingrels bits within the var could cause the same var to be
3312+
* counted multiple times if it's marked with different nullingrels. They
3313+
* could also prevent us from matching the var to the expressions in
3314+
* extended statistics (see estimate_multivariate_ndistinct). So strip
3315+
* them out first.
3316+
*/
3317+
var = remove_nulling_relids(var, root->outer_join_rels, NULL);
3318+
33093319
foreach(lc, varinfos)
33103320
{
33113321
varinfo = (GroupVarInfo *) lfirst(lc);
@@ -5025,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
50255035
{
50265036
Node *basenode;
50275037
Relids varnos;
5038+
Relids basevarnos;
50285039
RelOptInfo *onerel;
50295040

50305041
/* Make sure we don't return dangling pointers in vardata */
@@ -5066,18 +5077,20 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
50665077
* relation are considered "real" vars.
50675078
*/
50685079
varnos = pull_varnos(root, basenode);
5080+
basevarnos = bms_difference(varnos, root->outer_join_rels);
50695081

50705082
onerel = NULL;
50715083

5072-
if (bms_is_empty(varnos))
5084+
if (bms_is_empty(basevarnos))
50735085
{
50745086
/* No Vars at all ... must be pseudo-constant clause */
50755087
}
50765088
else
50775089
{
50785090
int relid;
50795091

5080-
if (bms_get_singleton_member(varnos, &relid))
5092+
/* Check if the expression is in vars of a single base relation */
5093+
if (bms_get_singleton_member(basevarnos, &relid))
50815094
{
50825095
if (varRelid == 0 || varRelid == relid)
50835096
{
@@ -5107,7 +5120,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
51075120
}
51085121
}
51095122

5110-
bms_free(varnos);
5123+
bms_free(basevarnos);
51115124

51125125
vardata->var = node;
51135126
vardata->atttype = exprType(node);
@@ -5132,6 +5145,14 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
51325145
ListCell *slist;
51335146
Oid userid;
51345147

5148+
/*
5149+
* The nullingrels bits within the expression could prevent us from
5150+
* matching it to expressional index columns or to the expressions in
5151+
* extended statistics. So strip them out first.
5152+
*/
5153+
if (bms_overlap(varnos, root->outer_join_rels))
5154+
node = remove_nulling_relids(node, root->outer_join_rels, NULL);
5155+
51355156
/*
51365157
* Determine the user ID to use for privilege checks: either
51375158
* onerel->userid if it's set (e.g., in case we're accessing the table
@@ -5402,6 +5423,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
54025423
}
54035424
}
54045425
}
5426+
5427+
bms_free(varnos);
54055428
}
54065429

54075430
/*

src/test/regress/expected/join.out

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2573,10 +2573,11 @@ where t1.f1 = coalesce(t2.f1, 1);
25732573
-> Materialize
25742574
-> Seq Scan on int4_tbl t2
25752575
Filter: (f1 > 1)
2576-
-> Seq Scan on int4_tbl t3
2576+
-> Materialize
2577+
-> Seq Scan on int4_tbl t3
25772578
-> Materialize
25782579
-> Seq Scan on int4_tbl t4
2579-
(13 rows)
2580+
(14 rows)
25802581

25812582
explain (costs off)
25822583
select * from int4_tbl t1
@@ -8217,3 +8218,24 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
82178218

82188219
RESET enable_seqscan;
82198220
RESET enable_indexscan;
8221+
-- Test that we do not account for nullingrels when looking up statistics
8222+
CREATE TABLE group_tbl (a INT, b INT);
8223+
INSERT INTO group_tbl SELECT 1, 1;
8224+
CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
8225+
ANALYZE group_tbl;
8226+
EXPLAIN (COSTS OFF)
8227+
SELECT 1 FROM group_tbl t1
8228+
LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
8229+
GROUP BY s.c1, s.c2;
8230+
QUERY PLAN
8231+
--------------------------------------------
8232+
Group
8233+
Group Key: t2.a, (COALESCE(t2.a))
8234+
-> Sort
8235+
Sort Key: t2.a, (COALESCE(t2.a))
8236+
-> Nested Loop Left Join
8237+
-> Seq Scan on group_tbl t1
8238+
-> Seq Scan on group_tbl t2
8239+
(7 rows)
8240+
8241+
DROP TABLE group_tbl;

src/test/regress/sql/join.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3033,3 +3033,16 @@ SELECT * FROM rescan_bhs t1 LEFT JOIN rescan_bhs t2 ON t1.a IN
30333033

30343034
RESET enable_seqscan;
30353035
RESET enable_indexscan;
3036+
3037+
-- Test that we do not account for nullingrels when looking up statistics
3038+
CREATE TABLE group_tbl (a INT, b INT);
3039+
INSERT INTO group_tbl SELECT 1, 1;
3040+
CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl;
3041+
ANALYZE group_tbl;
3042+
3043+
EXPLAIN (COSTS OFF)
3044+
SELECT 1 FROM group_tbl t1
3045+
LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE
3046+
GROUP BY s.c1, s.c2;
3047+
3048+
DROP TABLE group_tbl;

0 commit comments

Comments
 (0)