Skip to content

Commit d26559d

Browse files
committed
Teach tuplesort.c about "top N" sorting, in which only the first N tuples
need be returned. We keep a heap of the current best N tuples and sift-up new tuples into it as we scan the input. For M input tuples this means only about M*log(N) comparisons instead of M*log(M), not to mention a lot less workspace when N is small --- avoiding spill-to-disk for large M is actually the most attractive thing about it. Patch includes planner and executor support for invoking this facility in ORDER BY ... LIMIT queries. Greg Stark, with some editorialization by moi.
1 parent 0fef38d commit d26559d

File tree

13 files changed

+464
-58
lines changed

13 files changed

+464
-58
lines changed

src/backend/executor/nodeLimit.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.29 2007/01/05 22:19:28 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.30 2007/05/04 01:13:43 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -280,6 +280,37 @@ recompute_limits(LimitState *node)
280280
/* Reset position to start-of-scan */
281281
node->position = 0;
282282
node->subSlot = NULL;
283+
284+
/*
285+
* If we have a COUNT, and our input is a Sort node, notify it that it can
286+
* use bounded sort.
287+
*
288+
* This is a bit of a kluge, but we don't have any more-abstract way of
289+
* communicating between the two nodes; and it doesn't seem worth trying
290+
* to invent one without some more examples of special communication needs.
291+
*
292+
* Note: it is the responsibility of nodeSort.c to react properly to
293+
* changes of these parameters. If we ever do redesign this, it'd be
294+
* a good idea to integrate this signaling with the parameter-change
295+
* mechanism.
296+
*/
297+
if (IsA(outerPlanState(node), SortState))
298+
{
299+
SortState *sortState = (SortState *) outerPlanState(node);
300+
int64 tuples_needed = node->count + node->offset;
301+
302+
/* negative test checks for overflow */
303+
if (node->noCount || tuples_needed < 0)
304+
{
305+
/* make sure flag gets reset if needed upon rescan */
306+
sortState->bounded = false;
307+
}
308+
else
309+
{
310+
sortState->bounded = true;
311+
sortState->bound = tuples_needed;
312+
}
313+
}
283314
}
284315

285316
/* ----------------------------------------------------------------

src/backend/executor/nodeSort.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.60 2007/01/09 02:14:11 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.61 2007/05/04 01:13:44 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -89,6 +89,8 @@ ExecSort(SortState *node)
8989
plannode->nullsFirst,
9090
work_mem,
9191
node->randomAccess);
92+
if (node->bounded)
93+
tuplesort_set_bound(tuplesortstate, node->bound);
9294
node->tuplesortstate = (void *) tuplesortstate;
9395

9496
/*
@@ -119,6 +121,8 @@ ExecSort(SortState *node)
119121
* finally set the sorted flag to true
120122
*/
121123
node->sort_Done = true;
124+
node->bounded_Done = node->bounded;
125+
node->bound_Done = node->bound;
122126
SO1_printf("ExecSort: %s\n", "sorting done");
123127
}
124128

@@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
167171
EXEC_FLAG_BACKWARD |
168172
EXEC_FLAG_MARK)) != 0;
169173

174+
sortstate->bounded = false;
170175
sortstate->sort_Done = false;
171176
sortstate->tuplesortstate = NULL;
172177

@@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt)
307312

308313
/*
309314
* If subnode is to be rescanned then we forget previous sort results; we
310-
* have to re-read the subplan and re-sort.
315+
* have to re-read the subplan and re-sort. Also must re-sort if the
316+
* bounded-sort parameters changed or we didn't select randomAccess.
311317
*
312318
* Otherwise we can just rewind and rescan the sorted output.
313319
*/
314320
if (((PlanState *) node)->lefttree->chgParam != NULL ||
321+
node->bounded != node->bounded_Done ||
322+
node->bound != node->bound_Done ||
315323
!node->randomAccess)
316324
{
317325
node->sort_Done = false;

src/backend/optimizer/path/costsize.c

Lines changed: 60 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
* Portions Copyright (c) 1994, Regents of the University of California
5555
*
5656
* IDENTIFICATION
57-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $
57+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
5858
*
5959
*-------------------------------------------------------------------------
6060
*/
@@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
922922
* disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
923923
* cpu = comparison_cost * t * log2(t)
924924
*
925+
* If the sort is bounded (i.e., only the first k result tuples are needed)
926+
* and k tuples can fit into work_mem, we use a heap method that keeps only
927+
* k tuples in the heap; this will require about t*log2(k) tuple comparisons.
928+
*
925929
* The disk traffic is assumed to be 3/4ths sequential and 1/4th random
926930
* accesses (XXX can't we refine that guess?)
927931
*
@@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
932936
* 'input_cost' is the total cost for reading the input data
933937
* 'tuples' is the number of tuples in the relation
934938
* 'width' is the average tuple width in bytes
939+
* 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
935940
*
936941
* NOTE: some callers currently pass NIL for pathkeys because they
937942
* can't conveniently supply the sort keys. Since this routine doesn't
@@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
942947
*/
943948
void
944949
cost_sort(Path *path, PlannerInfo *root,
945-
List *pathkeys, Cost input_cost, double tuples, int width)
950+
List *pathkeys, Cost input_cost, double tuples, int width,
951+
double limit_tuples)
946952
{
947953
Cost startup_cost = input_cost;
948954
Cost run_cost = 0;
949-
double nbytes = relation_byte_size(tuples, width);
955+
double input_bytes = relation_byte_size(tuples, width);
956+
double output_bytes;
957+
double output_tuples;
950958
long work_mem_bytes = work_mem * 1024L;
951959

952960
if (!enable_sort)
@@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
959967
if (tuples < 2.0)
960968
tuples = 2.0;
961969

962-
/*
963-
* CPU costs
964-
*
965-
* Assume about two operator evals per tuple comparison and N log2 N
966-
* comparisons
967-
*/
968-
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
970+
/* Do we have a useful LIMIT? */
971+
if (limit_tuples > 0 && limit_tuples < tuples)
972+
{
973+
output_tuples = limit_tuples;
974+
output_bytes = relation_byte_size(output_tuples, width);
975+
}
976+
else
977+
{
978+
output_tuples = tuples;
979+
output_bytes = input_bytes;
980+
}
969981

970-
/* disk costs */
971-
if (nbytes > work_mem_bytes)
982+
if (output_bytes > work_mem_bytes)
972983
{
973-
double npages = ceil(nbytes / BLCKSZ);
974-
double nruns = (nbytes / work_mem_bytes) * 0.5;
984+
/*
985+
* We'll have to use a disk-based sort of all the tuples
986+
*/
987+
double npages = ceil(input_bytes / BLCKSZ);
988+
double nruns = (input_bytes / work_mem_bytes) * 0.5;
975989
double mergeorder = tuplesort_merge_order(work_mem_bytes);
976990
double log_runs;
977991
double npageaccesses;
978992

993+
/*
994+
* CPU costs
995+
*
996+
* Assume about two operator evals per tuple comparison and N log2 N
997+
* comparisons
998+
*/
999+
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
1000+
1001+
/* Disk costs */
1002+
9791003
/* Compute logM(r) as log(r) / log(M) */
9801004
if (nruns > mergeorder)
9811005
log_runs = ceil(log(nruns) / log(mergeorder));
@@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
9861010
startup_cost += npageaccesses *
9871011
(seq_page_cost * 0.75 + random_page_cost * 0.25);
9881012
}
1013+
else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
1014+
{
1015+
/*
1016+
* We'll use a bounded heap-sort keeping just K tuples in memory,
1017+
* for a total number of tuple comparisons of N log2 K; but the
1018+
* constant factor is a bit higher than for quicksort. Tweak it
1019+
* so that the cost curve is continuous at the crossover point.
1020+
*/
1021+
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
1022+
}
1023+
else
1024+
{
1025+
/* We'll use plain quicksort on all the input tuples */
1026+
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
1027+
}
9891028

9901029
/*
9911030
* Also charge a small amount (arbitrarily set equal to operator cost) per
992-
* extracted tuple.
1031+
* extracted tuple. Note it's correct to use tuples not output_tuples
1032+
* here --- the upper LIMIT will pro-rate the run cost so we'd be double
1033+
* counting the LIMIT otherwise.
9931034
*/
9941035
run_cost += cpu_operator_cost * tuples;
9951036

@@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
14311472
outersortkeys,
14321473
outer_path->total_cost,
14331474
outer_path_rows,
1434-
outer_path->parent->width);
1475+
outer_path->parent->width,
1476+
-1.0);
14351477
startup_cost += sort_path.startup_cost;
14361478
run_cost += (sort_path.total_cost - sort_path.startup_cost)
14371479
* outerscansel;
@@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
14501492
innersortkeys,
14511493
inner_path->total_cost,
14521494
inner_path_rows,
1453-
inner_path->parent->width);
1495+
inner_path->parent->width,
1496+
-1.0);
14541497
startup_cost += sort_path.startup_cost;
14551498
run_cost += (sort_path.total_cost - sort_path.startup_cost)
14561499
* innerscansel * rescanratio;

src/backend/optimizer/plan/createplan.c

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
*
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.229 2007/04/21 21:01:44 tgl Exp $
13+
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
1414
*
1515
*-------------------------------------------------------------------------
1616
*/
@@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist,
122122
Plan *lefttree, Plan *righttree,
123123
JoinType jointype);
124124
static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
125-
AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst);
125+
AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
126+
double limit_tuples);
126127

127128

128129
/*
@@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root,
15791580
outer_plan = (Plan *)
15801581
make_sort_from_pathkeys(root,
15811582
outer_plan,
1582-
best_path->outersortkeys);
1583+
best_path->outersortkeys,
1584+
-1.0);
15831585
outerpathkeys = best_path->outersortkeys;
15841586
}
15851587
else
@@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root,
15911593
inner_plan = (Plan *)
15921594
make_sort_from_pathkeys(root,
15931595
inner_plan,
1594-
best_path->innersortkeys);
1596+
best_path->innersortkeys,
1597+
-1.0);
15951598
innerpathkeys = best_path->innersortkeys;
15961599
}
15971600
else
@@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist,
25892592
* make_sort --- basic routine to build a Sort plan node
25902593
*
25912594
* Caller must have built the sortColIdx, sortOperators, and nullsFirst
2592-
* arrays already.
2595+
* arrays already. limit_tuples is as for cost_sort (in particular, pass
2596+
* -1 if no limit)
25932597
*/
25942598
static Sort *
25952599
make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
2596-
AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst)
2600+
AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
2601+
double limit_tuples)
25972602
{
25982603
Sort *node = makeNode(Sort);
25992604
Plan *plan = &node->plan;
@@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
26032608
cost_sort(&sort_path, root, NIL,
26042609
lefttree->total_cost,
26052610
lefttree->plan_rows,
2606-
lefttree->plan_width);
2611+
lefttree->plan_width,
2612+
limit_tuples);
26072613
plan->startup_cost = sort_path.startup_cost;
26082614
plan->total_cost = sort_path.total_cost;
26092615
plan->targetlist = lefttree->targetlist;
@@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
26642670
*
26652671
* 'lefttree' is the node which yields input tuples
26662672
* 'pathkeys' is the list of pathkeys by which the result is to be sorted
2673+
* 'limit_tuples' is the bound on the number of output tuples;
2674+
* -1 if no bound
26672675
*
26682676
* We must convert the pathkey information into arrays of sort key column
26692677
* numbers and sort operator OIDs.
@@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
26752683
* adding a Result node just to do the projection.
26762684
*/
26772685
Sort *
2678-
make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
2686+
make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
2687+
double limit_tuples)
26792688
{
26802689
List *tlist = lefttree->targetlist;
26812690
ListCell *i;
@@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
28102819
Assert(numsortkeys > 0);
28112820

28122821
return make_sort(root, lefttree, numsortkeys,
2813-
sortColIdx, sortOperators, nullsFirst);
2822+
sortColIdx, sortOperators, nullsFirst, limit_tuples);
28142823
}
28152824

28162825
/*
@@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree)
28592868
Assert(numsortkeys > 0);
28602869

28612870
return make_sort(root, lefttree, numsortkeys,
2862-
sortColIdx, sortOperators, nullsFirst);
2871+
sortColIdx, sortOperators, nullsFirst, -1.0);
28632872
}
28642873

28652874
/*
@@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root,
29192928
Assert(numsortkeys > 0);
29202929

29212930
return make_sort(root, lefttree, numsortkeys,
2922-
sortColIdx, sortOperators, nullsFirst);
2931+
sortColIdx, sortOperators, nullsFirst, -1.0);
29232932
}
29242933

29252934
Material *

src/backend/optimizer/plan/planmain.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
*
1515
*
1616
* IDENTIFICATION
17-
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.100 2007/04/21 21:01:45 tgl Exp $
17+
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.101 2007/05/04 01:13:44 tgl Exp $
1818
*
1919
*-------------------------------------------------------------------------
2020
*/
@@ -47,6 +47,8 @@
4747
* tlist is the target list the query should produce
4848
* (this is NOT necessarily root->parse->targetList!)
4949
* tuple_fraction is the fraction of tuples we expect will be retrieved
50+
* limit_tuples is a hard limit on number of tuples to retrieve,
51+
* or -1 if no limit
5052
*
5153
* Output parameters:
5254
* *cheapest_path receives the overall-cheapest path for the query
@@ -74,9 +76,13 @@
7476
* from the plan to be retrieved
7577
* tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
7678
* expected to be retrieved (ie, a LIMIT specification)
79+
* Note that a nonzero tuple_fraction could come from outer context; it is
80+
* therefore not redundant with limit_tuples. We use limit_tuples to determine
81+
* whether a bounded sort can be used at runtime.
7782
*/
7883
void
79-
query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
84+
query_planner(PlannerInfo *root, List *tlist,
85+
double tuple_fraction, double limit_tuples,
8086
Path **cheapest_path, Path **sorted_path,
8187
double *num_groups)
8288
{
@@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
354360
/* Figure cost for sorting */
355361
cost_sort(&sort_path, root, root->query_pathkeys,
356362
cheapestpath->total_cost,
357-
final_rel->rows, final_rel->width);
363+
final_rel->rows, final_rel->width,
364+
limit_tuples);
358365
}
359366

360367
if (compare_fractional_path_costs(sortedpath, &sort_path,

0 commit comments

Comments
 (0)