Skip to content

Commit 0452b46

Browse files
committed
Explore alternative orderings of group-by pathkeys during optimization.
When evaluating a query with a multi-column GROUP BY clause, we can minimize sort operations or avoid them if we synchronize the order of GROUP BY clauses with the ORDER BY sort clause or sort order, which comes from the underlying query tree. Grouping does not imply any ordering, so we can compare the keys in arbitrary order, and a Hash Agg leverages this. But for Group Agg, we simply compared keys in the order specified in the query. This commit explores alternative ordering of the keys, trying to find a cheaper one. The ordering of group keys may interact with other parts of the query, some of which may not be known while planning the grouping. For example, there may be an explicit ORDER BY clause or some other ordering-dependent operation higher up in the query, and using the same ordering may allow using either incremental sort or even eliminating the sort entirely. The patch always keeps the ordering specified in the query, assuming the user might have additional insights. This introduces a new GUC enable_group_by_reordering so that the optimization may be disabled if needed. Discussion: https://postgr.es/m/7c79e6a5-8597-74e8-0671-1c39d124c9d6%40sigaev.ru Author: Andrei Lepikhov, Teodor Sigaev Reviewed-by: Tomas Vondra, Claudio Freire, Gavin Flower, Dmitry Dolgov Reviewed-by: Robert Haas, Pavel Borisov, David Rowley, Zhihong Yu Reviewed-by: Tom Lane, Alexander Korotkov, Richard Guo, Alena Rybakina
1 parent 7ab80ac commit 0452b46

File tree

11 files changed

+770
-223
lines changed

11 files changed

+770
-223
lines changed

src/backend/optimizer/path/equivclass.c

+12-1
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,18 @@ get_eclass_for_sort_expr(PlannerInfo *root,
652652

653653
if (opcintype == cur_em->em_datatype &&
654654
equal(expr, cur_em->em_expr))
655-
return cur_ec; /* Match! */
655+
{
656+
/*
657+
* Match!
658+
*
659+
* Copy the sortref if it wasn't set yet. That may happen if
660+
* the ec was constructed from a WHERE clause, i.e. it doesn't
661+
* have a target reference at all.
662+
*/
663+
if (cur_ec->ec_sortref == 0 && sortref > 0)
664+
cur_ec->ec_sortref = sortref;
665+
return cur_ec;
666+
}
656667
}
657668
}
658669

src/backend/optimizer/path/pathkeys.c

+252
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,15 @@
2222
#include "nodes/makefuncs.h"
2323
#include "nodes/nodeFuncs.h"
2424
#include "nodes/plannodes.h"
25+
#include "optimizer/cost.h"
2526
#include "optimizer/optimizer.h"
2627
#include "optimizer/pathnode.h"
2728
#include "optimizer/paths.h"
2829
#include "partitioning/partbounds.h"
2930
#include "utils/lsyscache.h"
3031

32+
/* Consider reordering of GROUP BY keys? */
33+
bool enable_group_by_reordering = true;
3134

3235
static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
3336
static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
@@ -350,6 +353,202 @@ pathkeys_contained_in(List *keys1, List *keys2)
350353
return false;
351354
}
352355

356+
/*
357+
* group_keys_reorder_by_pathkeys
358+
* Reorder GROUP BY keys to match the input pathkeys.
359+
*
360+
* Function returns new lists (pathkeys and clauses), original GROUP BY lists
361+
* stay untouched.
362+
*
363+
* Returns the number of GROUP BY keys with a matching pathkey.
364+
*/
365+
static int
366+
group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
367+
List **group_clauses,
368+
int num_groupby_pathkeys)
369+
{
370+
List *new_group_pathkeys = NIL,
371+
*new_group_clauses = NIL;
372+
ListCell *lc;
373+
int n;
374+
375+
if (pathkeys == NIL || *group_pathkeys == NIL)
376+
return 0;
377+
378+
/*
379+
* Walk the pathkeys (determining ordering of the input path) and see if
380+
* there's a matching GROUP BY key. If we find one, we append it to the
381+
* list, and do the same for the clauses.
382+
*
383+
* Once we find the first pathkey without a matching GROUP BY key, the
384+
* rest of the pathkeys are useless and can't be used to evaluate the
385+
* grouping, so we abort the loop and ignore the remaining pathkeys.
386+
*/
387+
foreach(lc, pathkeys)
388+
{
389+
PathKey *pathkey = (PathKey *) lfirst(lc);
390+
SortGroupClause *sgc;
391+
392+
/*
393+
* Pathkeys are built in a way that allows simply comparing pointers.
394+
* Give up if we can't find the matching pointer. Also give up if
395+
* there is no sortclause reference for some reason.
396+
*/
397+
if (foreach_current_index(lc) >= num_groupby_pathkeys ||
398+
!list_member_ptr(*group_pathkeys, pathkey) ||
399+
pathkey->pk_eclass->ec_sortref == 0)
400+
break;
401+
402+
/*
403+
* Since 1349d27 pathkey coming from underlying node can be in the
404+
* root->group_pathkeys but not in the processed_groupClause. So, we
405+
* should be careful here.
406+
*/
407+
sgc = get_sortgroupref_clause_noerr(pathkey->pk_eclass->ec_sortref,
408+
*group_clauses);
409+
if (!sgc)
410+
/* The grouping clause does not cover this pathkey */
411+
break;
412+
413+
/*
414+
* Sort group clause should have an ordering operator as long as there
415+
* is an associated pathkey.
416+
*/
417+
Assert(OidIsValid(sgc->sortop));
418+
419+
new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
420+
new_group_clauses = lappend(new_group_clauses, sgc);
421+
}
422+
423+
/* remember the number of pathkeys with a matching GROUP BY key */
424+
n = list_length(new_group_pathkeys);
425+
426+
/* append the remaining group pathkeys (will be treated as not sorted) */
427+
*group_pathkeys = list_concat_unique_ptr(new_group_pathkeys,
428+
*group_pathkeys);
429+
*group_clauses = list_concat_unique_ptr(new_group_clauses,
430+
*group_clauses);
431+
432+
return n;
433+
}
434+
435+
/*
436+
* pathkeys_are_duplicate
437+
* Check if give pathkeys are already contained the list of
438+
* PathKeyInfo's.
439+
*/
440+
static bool
441+
pathkeys_are_duplicate(List *infos, List *pathkeys)
442+
{
443+
ListCell *lc;
444+
445+
foreach(lc, infos)
446+
{
447+
PathKeyInfo *info = lfirst_node(PathKeyInfo, lc);
448+
449+
if (compare_pathkeys(pathkeys, info->pathkeys) == PATHKEYS_EQUAL)
450+
return true;
451+
}
452+
return false;
453+
}
454+
455+
/*
456+
* get_useful_group_keys_orderings
457+
* Determine which orderings of GROUP BY keys are potentially interesting.
458+
*
459+
* Returns a list of PathKeyInfo items, each representing an interesting
460+
* ordering of GROUP BY keys. Each item stores pathkeys and clauses in the
461+
* matching order.
462+
*
463+
* The function considers (and keeps) multiple GROUP BY orderings:
464+
*
465+
* - the original ordering, as specified by the GROUP BY clause,
466+
* - GROUP BY keys reordered to match 'path' ordering (as much as possible),
467+
* - GROUP BY keys to match target ORDER BY clause (as much as possible).
468+
*/
469+
List *
470+
get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
471+
{
472+
Query *parse = root->parse;
473+
List *infos = NIL;
474+
PathKeyInfo *info;
475+
476+
List *pathkeys = root->group_pathkeys;
477+
List *clauses = root->processed_groupClause;
478+
479+
/* always return at least the original pathkeys/clauses */
480+
info = makeNode(PathKeyInfo);
481+
info->pathkeys = pathkeys;
482+
info->clauses = clauses;
483+
infos = lappend(infos, info);
484+
485+
/*
486+
* Should we try generating alternative orderings of the group keys? If
487+
* not, we produce only the order specified in the query, i.e. the
488+
* optimization is effectively disabled.
489+
*/
490+
if (!enable_group_by_reordering)
491+
return infos;
492+
493+
/*
494+
* Grouping sets have own and more complex logic to decide the ordering.
495+
*/
496+
if (parse->groupingSets)
497+
return infos;
498+
499+
/*
500+
* If the path is sorted in some way, try reordering the group keys to
501+
* match the path as much of the ordering as possible. Then thanks to
502+
* incremental sort we would get this sort as cheap as possible.
503+
*/
504+
if (path->pathkeys &&
505+
!pathkeys_contained_in(path->pathkeys, root->group_pathkeys))
506+
{
507+
int n;
508+
509+
n = group_keys_reorder_by_pathkeys(path->pathkeys, &pathkeys, &clauses,
510+
root->num_groupby_pathkeys);
511+
512+
if (n > 0 &&
513+
(enable_incremental_sort || n == root->num_groupby_pathkeys) &&
514+
!pathkeys_are_duplicate(infos, pathkeys))
515+
{
516+
info = makeNode(PathKeyInfo);
517+
info->pathkeys = pathkeys;
518+
info->clauses = clauses;
519+
520+
infos = lappend(infos, info);
521+
}
522+
}
523+
524+
/*
525+
* Try reordering pathkeys to minimize the sort cost (this time consider
526+
* the ORDER BY clause).
527+
*/
528+
if (root->sort_pathkeys &&
529+
!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
530+
{
531+
int n;
532+
533+
n = group_keys_reorder_by_pathkeys(root->sort_pathkeys, &pathkeys,
534+
&clauses,
535+
root->num_groupby_pathkeys);
536+
537+
if (n > 0 &&
538+
(enable_incremental_sort || n == list_length(root->sort_pathkeys)) &&
539+
!pathkeys_are_duplicate(infos, pathkeys))
540+
{
541+
info = makeNode(PathKeyInfo);
542+
info->pathkeys = pathkeys;
543+
info->clauses = clauses;
544+
545+
infos = lappend(infos, info);
546+
}
547+
}
548+
549+
return infos;
550+
}
551+
353552
/*
354553
* pathkeys_count_contained_in
355554
* Same as pathkeys_contained_in, but also sets length of longest
@@ -1939,6 +2138,54 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
19392138
return n_common_pathkeys;
19402139
}
19412140

2141+
/*
2142+
* pathkeys_useful_for_grouping
2143+
* Count the number of pathkeys that are useful for grouping (instead of
2144+
* explicit sort)
2145+
*
2146+
* Group pathkeys could be reordered to benefit from the ordering. The
2147+
* ordering may not be "complete" and may require incremental sort, but that's
2148+
* fine. So we simply count prefix pathkeys with a matching group key, and
2149+
* stop once we find the first pathkey without a match.
2150+
*
2151+
* So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b)
2152+
* pathkeys are useful for grouping, and we might do incremental sort to get
2153+
* path ordered by (a,b,e).
2154+
*
2155+
* This logic is necessary to retain paths with ordering not matching grouping
2156+
* keys directly, without the reordering.
2157+
*
2158+
* Returns the length of pathkey prefix with matching group keys.
2159+
*/
2160+
static int
2161+
pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
2162+
{
2163+
ListCell *key;
2164+
int n = 0;
2165+
2166+
/* no special ordering requested for grouping */
2167+
if (root->group_pathkeys == NIL)
2168+
return 0;
2169+
2170+
/* unordered path */
2171+
if (pathkeys == NIL)
2172+
return 0;
2173+
2174+
/* walk the pathkeys and search for matching group key */
2175+
foreach(key, pathkeys)
2176+
{
2177+
PathKey *pathkey = (PathKey *) lfirst(key);
2178+
2179+
/* no matching group key, we're done */
2180+
if (!list_member_ptr(root->group_pathkeys, pathkey))
2181+
break;
2182+
2183+
n++;
2184+
}
2185+
2186+
return n;
2187+
}
2188+
19422189
/*
19432190
* truncate_useless_pathkeys
19442191
* Shorten the given pathkey list to just the useful pathkeys.
@@ -1953,6 +2200,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
19532200

19542201
nuseful = pathkeys_useful_for_merging(root, rel, pathkeys);
19552202
nuseful2 = pathkeys_useful_for_ordering(root, pathkeys);
2203+
if (nuseful2 > nuseful)
2204+
nuseful = nuseful2;
2205+
nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
19562206
if (nuseful2 > nuseful)
19572207
nuseful = nuseful2;
19582208

@@ -1988,6 +2238,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel)
19882238
{
19892239
if (rel->joininfo != NIL || rel->has_eclass_joins)
19902240
return true; /* might be able to use pathkeys for merging */
2241+
if (root->group_pathkeys != NIL)
2242+
return true; /* might be able to use pathkeys for grouping */
19912243
if (root->query_pathkeys != NIL)
19922244
return true; /* might be able to use them for ordering */
19932245
return false; /* definitely useless */

0 commit comments

Comments
 (0)