Skip to content

Commit 6985592

Browse files
committed
Split out into a separate function the code in grouping_planner() that
decides whether to use hashed grouping instead of sort-plus-uniq grouping. The function needs an annoyingly large number of parameters, but this still seems like a win for legibility, since it removes over a hundred lines from grouping_planner (which is still too big :-().
1 parent 313de22 commit 6985592

File tree

1 file changed

+156
-140
lines changed

1 file changed

+156
-140
lines changed

src/backend/optimizer/plan/planner.c

Lines changed: 156 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.182 2005/04/06 16:34:05 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.183 2005/04/10 19:50:08 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -58,6 +58,10 @@ static Node *preprocess_expression(Query *parse, Node *expr, int kind);
5858
static void preprocess_qual_conditions(Query *parse, Node *jtnode);
5959
static Plan *inheritance_planner(Query *parse, List *inheritlist);
6060
static Plan *grouping_planner(Query *parse, double tuple_fraction);
61+
static bool choose_hashed_grouping(Query *parse, double tuple_fraction,
62+
Path *cheapest_path, Path *sorted_path,
63+
List *sort_pathkeys, List *group_pathkeys,
64+
double dNumGroups, AggClauseCounts *agg_counts);
6165
static bool hash_safe_grouping(Query *parse);
6266
static List *make_subplanTargetList(Query *parse, List *tlist,
6367
AttrNumber **groupColIdx, bool *need_tlist_eval);
@@ -920,34 +924,25 @@ grouping_planner(Query *parse, double tuple_fraction)
920924
sort_pathkeys = canonicalize_pathkeys(parse, sort_pathkeys);
921925

922926
/*
923-
* Consider whether we might want to use hashed grouping.
927+
* If grouping, estimate the number of groups. (We can't do this
928+
* until after running query_planner(), either.) Then decide
929+
* whether we want to use hashed grouping.
924930
*/
925931
if (parse->groupClause)
926932
{
927933
List *groupExprs;
928934
double cheapest_path_rows;
929-
int cheapest_path_width;
930935

931936
/*
932-
* Beware in this section of the possibility that
933-
* cheapest_path->parent is NULL. This could happen if user
934-
* does something silly like SELECT 'foo' GROUP BY 1;
937+
* Beware of the possibility that cheapest_path->parent is NULL.
938+
* This could happen if user does something silly like
939+
* SELECT 'foo' GROUP BY 1;
935940
*/
936941
if (cheapest_path->parent)
937-
{
938942
cheapest_path_rows = cheapest_path->parent->rows;
939-
cheapest_path_width = cheapest_path->parent->width;
940-
}
941943
else
942-
{
943944
cheapest_path_rows = 1; /* assume non-set result */
944-
cheapest_path_width = 100; /* arbitrary */
945-
}
946945

947-
/*
948-
* Always estimate the number of groups. We can't do this
949-
* until after running query_planner(), either.
950-
*/
951946
groupExprs = get_sortgrouplist_exprs(parse->groupClause,
952947
parse->targetList);
953948
dNumGroups = estimate_num_groups(parse,
@@ -956,130 +951,11 @@ grouping_planner(Query *parse, double tuple_fraction)
956951
/* Also want it as a long int --- but 'ware overflow! */
957952
numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
958953

959-
/*
960-
* Check can't-do-it conditions, including whether the
961-
* grouping operators are hashjoinable.
962-
*
963-
* Executor doesn't support hashed aggregation with DISTINCT
964-
* aggregates. (Doing so would imply storing *all* the input
965-
* values in the hash table, which seems like a certain
966-
* loser.)
967-
*/
968-
if (!enable_hashagg || !hash_safe_grouping(parse))
969-
use_hashed_grouping = false;
970-
else if (agg_counts.numDistinctAggs != 0)
971-
use_hashed_grouping = false;
972-
else
973-
{
974-
/*
975-
* Use hashed grouping if (a) we think we can fit the
976-
* hashtable into work_mem, *and* (b) the estimated cost
977-
* is no more than doing it the other way. While avoiding
978-
* the need for sorted input is usually a win, the fact
979-
* that the output won't be sorted may be a loss; so we
980-
* need to do an actual cost comparison.
981-
*/
982-
Size hashentrysize;
983-
984-
/* Estimate per-hash-entry space at tuple width... */
985-
hashentrysize = cheapest_path_width;
986-
/* plus space for pass-by-ref transition values... */
987-
hashentrysize += agg_counts.transitionSpace;
988-
/* plus the per-hash-entry overhead */
989-
hashentrysize += hash_agg_entry_size(agg_counts.numAggs);
990-
991-
if (hashentrysize * dNumGroups <= work_mem * 1024L)
992-
{
993-
/*
994-
* Okay, do the cost comparison. We need to consider
995-
* cheapest_path + hashagg [+ final sort] versus
996-
* either cheapest_path [+ sort] + group or agg [+
997-
* final sort] or presorted_path + group or agg [+
998-
* final sort] where brackets indicate a step that may
999-
* not be needed. We assume query_planner() will have
1000-
* returned a presorted path only if it's a winner
1001-
* compared to cheapest_path for this purpose.
1002-
*
1003-
* These path variables are dummies that just hold cost
1004-
* fields; we don't make actual Paths for these steps.
1005-
*/
1006-
Path hashed_p;
1007-
Path sorted_p;
1008-
1009-
cost_agg(&hashed_p, parse,
1010-
AGG_HASHED, agg_counts.numAggs,
1011-
numGroupCols, dNumGroups,
1012-
cheapest_path->startup_cost,
1013-
cheapest_path->total_cost,
1014-
cheapest_path_rows);
1015-
/* Result of hashed agg is always unsorted */
1016-
if (sort_pathkeys)
1017-
cost_sort(&hashed_p, parse, sort_pathkeys,
1018-
hashed_p.total_cost,
1019-
dNumGroups,
1020-
cheapest_path_width);
1021-
1022-
if (sorted_path)
1023-
{
1024-
sorted_p.startup_cost = sorted_path->startup_cost;
1025-
sorted_p.total_cost = sorted_path->total_cost;
1026-
current_pathkeys = sorted_path->pathkeys;
1027-
}
1028-
else
1029-
{
1030-
sorted_p.startup_cost = cheapest_path->startup_cost;
1031-
sorted_p.total_cost = cheapest_path->total_cost;
1032-
current_pathkeys = cheapest_path->pathkeys;
1033-
}
1034-
if (!pathkeys_contained_in(group_pathkeys,
1035-
current_pathkeys))
1036-
{
1037-
cost_sort(&sorted_p, parse, group_pathkeys,
1038-
sorted_p.total_cost,
1039-
cheapest_path_rows,
1040-
cheapest_path_width);
1041-
current_pathkeys = group_pathkeys;
1042-
}
1043-
if (parse->hasAggs)
1044-
cost_agg(&sorted_p, parse,
1045-
AGG_SORTED, agg_counts.numAggs,
1046-
numGroupCols, dNumGroups,
1047-
sorted_p.startup_cost,
1048-
sorted_p.total_cost,
1049-
cheapest_path_rows);
1050-
else
1051-
cost_group(&sorted_p, parse,
1052-
numGroupCols, dNumGroups,
1053-
sorted_p.startup_cost,
1054-
sorted_p.total_cost,
1055-
cheapest_path_rows);
1056-
/* The Agg or Group node will preserve ordering */
1057-
if (sort_pathkeys &&
1058-
!pathkeys_contained_in(sort_pathkeys,
1059-
current_pathkeys))
1060-
{
1061-
cost_sort(&sorted_p, parse, sort_pathkeys,
1062-
sorted_p.total_cost,
1063-
dNumGroups,
1064-
cheapest_path_width);
1065-
}
1066-
1067-
/*
1068-
* Now make the decision using the top-level tuple
1069-
* fraction. First we have to convert an absolute
1070-
* count (LIMIT) into fractional form.
1071-
*/
1072-
if (tuple_fraction >= 1.0)
1073-
tuple_fraction /= dNumGroups;
1074-
1075-
if (compare_fractional_path_costs(&hashed_p, &sorted_p,
1076-
tuple_fraction) < 0)
1077-
{
1078-
/* Hashed is cheaper, so use it */
1079-
use_hashed_grouping = true;
1080-
}
1081-
}
1082-
}
954+
use_hashed_grouping =
955+
choose_hashed_grouping(parse, tuple_fraction,
956+
cheapest_path, sorted_path,
957+
sort_pathkeys, group_pathkeys,
958+
dNumGroups, &agg_counts);
1083959
}
1084960

1085961
/*
@@ -1331,6 +1207,146 @@ grouping_planner(Query *parse, double tuple_fraction)
13311207
return result_plan;
13321208
}
13331209

1210+
/*
1211+
* choose_hashed_grouping - should we use hashed grouping?
1212+
*/
1213+
static bool
1214+
choose_hashed_grouping(Query *parse, double tuple_fraction,
1215+
Path *cheapest_path, Path *sorted_path,
1216+
List *sort_pathkeys, List *group_pathkeys,
1217+
double dNumGroups, AggClauseCounts *agg_counts)
1218+
{
1219+
int numGroupCols = list_length(parse->groupClause);
1220+
double cheapest_path_rows;
1221+
int cheapest_path_width;
1222+
Size hashentrysize;
1223+
List *current_pathkeys;
1224+
Path hashed_p;
1225+
Path sorted_p;
1226+
1227+
/*
1228+
* Check can't-do-it conditions, including whether the grouping operators
1229+
* are hashjoinable.
1230+
*
1231+
* Executor doesn't support hashed aggregation with DISTINCT aggregates.
1232+
* (Doing so would imply storing *all* the input values in the hash table,
1233+
* which seems like a certain loser.)
1234+
*/
1235+
if (!enable_hashagg)
1236+
return false;
1237+
if (agg_counts->numDistinctAggs != 0)
1238+
return false;
1239+
if (!hash_safe_grouping(parse))
1240+
return false;
1241+
1242+
/*
1243+
* Don't do it if it doesn't look like the hashtable will fit into
1244+
* work_mem.
1245+
*
1246+
* Beware here of the possibility that cheapest_path->parent is NULL.
1247+
* This could happen if user does something silly like
1248+
* SELECT 'foo' GROUP BY 1;
1249+
*/
1250+
if (cheapest_path->parent)
1251+
{
1252+
cheapest_path_rows = cheapest_path->parent->rows;
1253+
cheapest_path_width = cheapest_path->parent->width;
1254+
}
1255+
else
1256+
{
1257+
cheapest_path_rows = 1; /* assume non-set result */
1258+
cheapest_path_width = 100; /* arbitrary */
1259+
}
1260+
1261+
/* Estimate per-hash-entry space at tuple width... */
1262+
hashentrysize = cheapest_path_width;
1263+
/* plus space for pass-by-ref transition values... */
1264+
hashentrysize += agg_counts->transitionSpace;
1265+
/* plus the per-hash-entry overhead */
1266+
hashentrysize += hash_agg_entry_size(agg_counts->numAggs);
1267+
1268+
if (hashentrysize * dNumGroups > work_mem * 1024L)
1269+
return false;
1270+
1271+
/*
1272+
* See if the estimated cost is no more than doing it the other way.
1273+
* While avoiding the need for sorted input is usually a win, the fact
1274+
* that the output won't be sorted may be a loss; so we need to do an
1275+
* actual cost comparison.
1276+
*
1277+
* We need to consider
1278+
* cheapest_path + hashagg [+ final sort]
1279+
* versus either
1280+
* cheapest_path [+ sort] + group or agg [+ final sort]
1281+
* or
1282+
* presorted_path + group or agg [+ final sort]
1283+
* where brackets indicate a step that may not be needed. We assume
1284+
* query_planner() will have returned a presorted path only if it's a
1285+
* winner compared to cheapest_path for this purpose.
1286+
*
1287+
* These path variables are dummies that just hold cost fields; we don't
1288+
* make actual Paths for these steps.
1289+
*/
1290+
cost_agg(&hashed_p, parse, AGG_HASHED, agg_counts->numAggs,
1291+
numGroupCols, dNumGroups,
1292+
cheapest_path->startup_cost, cheapest_path->total_cost,
1293+
cheapest_path_rows);
1294+
/* Result of hashed agg is always unsorted */
1295+
if (sort_pathkeys)
1296+
cost_sort(&hashed_p, parse, sort_pathkeys, hashed_p.total_cost,
1297+
dNumGroups, cheapest_path_width);
1298+
1299+
if (sorted_path)
1300+
{
1301+
sorted_p.startup_cost = sorted_path->startup_cost;
1302+
sorted_p.total_cost = sorted_path->total_cost;
1303+
current_pathkeys = sorted_path->pathkeys;
1304+
}
1305+
else
1306+
{
1307+
sorted_p.startup_cost = cheapest_path->startup_cost;
1308+
sorted_p.total_cost = cheapest_path->total_cost;
1309+
current_pathkeys = cheapest_path->pathkeys;
1310+
}
1311+
if (!pathkeys_contained_in(group_pathkeys,
1312+
current_pathkeys))
1313+
{
1314+
cost_sort(&sorted_p, parse, group_pathkeys, sorted_p.total_cost,
1315+
cheapest_path_rows, cheapest_path_width);
1316+
current_pathkeys = group_pathkeys;
1317+
}
1318+
1319+
if (parse->hasAggs)
1320+
cost_agg(&sorted_p, parse, AGG_SORTED, agg_counts->numAggs,
1321+
numGroupCols, dNumGroups,
1322+
sorted_p.startup_cost, sorted_p.total_cost,
1323+
cheapest_path_rows);
1324+
else
1325+
cost_group(&sorted_p, parse, numGroupCols, dNumGroups,
1326+
sorted_p.startup_cost, sorted_p.total_cost,
1327+
cheapest_path_rows);
1328+
/* The Agg or Group node will preserve ordering */
1329+
if (sort_pathkeys &&
1330+
!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
1331+
cost_sort(&sorted_p, parse, sort_pathkeys, sorted_p.total_cost,
1332+
dNumGroups, cheapest_path_width);
1333+
1334+
/*
1335+
* Now make the decision using the top-level tuple fraction. First we
1336+
* have to convert an absolute count (LIMIT) into fractional form.
1337+
*/
1338+
if (tuple_fraction >= 1.0)
1339+
tuple_fraction /= dNumGroups;
1340+
1341+
if (compare_fractional_path_costs(&hashed_p, &sorted_p,
1342+
tuple_fraction) < 0)
1343+
{
1344+
/* Hashed is cheaper, so use it */
1345+
return true;
1346+
}
1347+
return false;
1348+
}
1349+
13341350
/*
13351351
* hash_safe_grouping - are grouping operators hashable?
13361352
*

0 commit comments

Comments
 (0)