postgres
diff --git a/‎src/backend/executor/nodeLimit.c
Lines changed: 32 additions & 1 deletion b/‎src/backend/executor/nodeLimit.c
Lines changed: 32 additions & 1 deletion
diff --git a/‎src/backend/executor/nodeSort.c
Lines changed: 10 additions & 2 deletions b/‎src/backend/executor/nodeSort.c
Lines changed: 10 additions & 2 deletions
diff --git a/‎src/backend/optimizer/path/costsize.c
Lines changed: 60 additions & 17 deletions b/‎src/backend/optimizer/path/costsize.c
Lines changed: 60 additions & 17 deletions
diff --git a/‎src/backend/optimizer/plan/createplan.c
Lines changed: 20 additions & 11 deletions b/‎src/backend/optimizer/plan/createplan.c
Lines changed: 20 additions & 11 deletions
diff --git a/‎src/backend/optimizer/plan/planmain.c
Lines changed: 10 additions & 3 deletions b/‎src/backend/optimizer/plan/planmain.c
Lines changed: 10 additions & 3 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.29 2007/01/05 22:19:28 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.30 2007/05/04 01:13:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -280,6 +280,37 @@ recompute_limits(LimitState *node)
 	/* Reset position to start-of-scan */
 	node->position = 0;
 	node->subSlot = NULL;
+
+	/*
+	 * If we have a COUNT, and our input is a Sort node, notify it that it can
+	 * use bounded sort.
+	 *
+	 * This is a bit of a kluge, but we don't have any more-abstract way of
+	 * communicating between the two nodes; and it doesn't seem worth trying
+	 * to invent one without some more examples of special communication needs.
+	 *
+	 * Note: it is the responsibility of nodeSort.c to react properly to
+	 * changes of these parameters.  If we ever do redesign this, it'd be
+	 * a good idea to integrate this signaling with the parameter-change
+	 * mechanism.
+	 */
+	if (IsA(outerPlanState(node), SortState))
+	{
+		SortState *sortState = (SortState *) outerPlanState(node);
+		int64 tuples_needed = node->count + node->offset;
+
+		/* negative test checks for overflow */
+		if (node->noCount || tuples_needed < 0)
+		{
+			/* make sure flag gets reset if needed upon rescan */
+			sortState->bounded = false;
+		}
+		else
+		{
+			sortState->bounded = true;
+			sortState->bound = tuples_needed;
+		}
+	}
 }
 
 /* ----------------------------------------------------------------
 
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.60 2007/01/09 02:14:11 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.61 2007/05/04 01:13:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -89,6 +89,8 @@ ExecSort(SortState *node)
 											  plannode->nullsFirst,
 											  work_mem,
 											  node->randomAccess);
+		if (node->bounded)
+			tuplesort_set_bound(tuplesortstate, node->bound);
 		node->tuplesortstate = (void *) tuplesortstate;
 
 		/*
@@ -119,6 +121,8 @@ ExecSort(SortState *node)
 		 * finally set the sorted flag to true
 		 */
 		node->sort_Done = true;
+		node->bounded_Done = node->bounded;
+		node->bound_Done = node->bound;
 		SO1_printf("ExecSort: %s\n", "sorting done");
 	}
 
@@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
 										 EXEC_FLAG_BACKWARD |
 										 EXEC_FLAG_MARK)) != 0;
 
+	sortstate->bounded = false;
 	sortstate->sort_Done = false;
 	sortstate->tuplesortstate = NULL;
 
@@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt)
 
 	/*
 	 * If subnode is to be rescanned then we forget previous sort results; we
-	 * have to re-read the subplan and re-sort.
+	 * have to re-read the subplan and re-sort.  Also must re-sort if the
+	 * bounded-sort parameters changed or we didn't select randomAccess.
 	 *
 	 * Otherwise we can just rewind and rescan the sorted output.
 	 */
 	if (((PlanState *) node)->lefttree->chgParam != NULL ||
+		node->bounded != node->bounded_Done ||
+		node->bound != node->bound_Done ||
 		!node->randomAccess)
 	{
 		node->sort_Done = false;
 
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  *		disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
  *		cpu = comparison_cost * t * log2(t)
  *
+ * If the sort is bounded (i.e., only the first k result tuples are needed)
+ * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
+ *
  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
  * accesses (XXX can't we refine that guess?)
  *
@@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  * 'input_cost' is the total cost for reading the input data
  * 'tuples' is the number of tuples in the relation
  * 'width' is the average tuple width in bytes
+ * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
  *
  * NOTE: some callers currently pass NIL for pathkeys because they
  * can't conveniently supply the sort keys.  Since this routine doesn't
@@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
  */
 void
 cost_sort(Path *path, PlannerInfo *root,
-		  List *pathkeys, Cost input_cost, double tuples, int width)
+		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  double limit_tuples)
 {
 	Cost		startup_cost = input_cost;
 	Cost		run_cost = 0;
-	double		nbytes = relation_byte_size(tuples, width);
+	double		input_bytes = relation_byte_size(tuples, width);
+	double		output_bytes;
+	double		output_tuples;
 	long		work_mem_bytes = work_mem * 1024L;
 
 	if (!enable_sort)
@@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/*
-	 * CPU costs
-	 *
-	 * Assume about two operator evals per tuple comparison and N log2 N
-	 * comparisons
-	 */
-	startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+	/* Do we have a useful LIMIT? */
+	if (limit_tuples > 0 && limit_tuples < tuples)
+	{
+		output_tuples = limit_tuples;
+		output_bytes = relation_byte_size(output_tuples, width);
+	}
+	else
+	{
+		output_tuples = tuples;
+		output_bytes = input_bytes;
+	}
 
-	/* disk costs */
-	if (nbytes > work_mem_bytes)
+	if (output_bytes > work_mem_bytes)
 	{
-		double		npages = ceil(nbytes / BLCKSZ);
-		double		nruns = (nbytes / work_mem_bytes) * 0.5;
+		/*
+		 * We'll have to use a disk-based sort of all the tuples
+		 */
+		double		npages = ceil(input_bytes / BLCKSZ);
+		double		nruns = (input_bytes / work_mem_bytes) * 0.5;
 		double		mergeorder = tuplesort_merge_order(work_mem_bytes);
 		double		log_runs;
 		double		npageaccesses;
 
+		/*
+		 * CPU costs
+		 *
+		 * Assume about two operator evals per tuple comparison and N log2 N
+		 * comparisons
+		 */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+
+		/* Disk costs */
+
 		/* Compute logM(r) as log(r) / log(M) */
 		if (nruns > mergeorder)
 			log_runs = ceil(log(nruns) / log(mergeorder));
@@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
 		startup_cost += npageaccesses *
 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
 	}
+	else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+	{
+		/*
+		 * We'll use a bounded heap-sort keeping just K tuples in memory,
+		 * for a total number of tuple comparisons of N log2 K; but the
+		 * constant factor is a bit higher than for quicksort.  Tweak it
+		 * so that the cost curve is continuous at the crossover point.
+		 */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+	}
+	else
+	{
+		/* We'll use plain quicksort on all the input tuples */
+		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+	}
 
 	/*
 	 * Also charge a small amount (arbitrarily set equal to operator cost) per
-	 * extracted tuple.
+	 * extracted tuple.  Note it's correct to use tuples not output_tuples
+	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+	 * counting the LIMIT otherwise.
 	 */
 	run_cost += cpu_operator_cost * tuples;
 
@@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
 				  outersortkeys,
 				  outer_path->total_cost,
 				  outer_path_rows,
-				  outer_path->parent->width);
+				  outer_path->parent->width,
+				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
 			* outerscansel;
@@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
 				  innersortkeys,
 				  inner_path->total_cost,
 				  inner_path_rows,
-				  inner_path->parent->width);
+				  inner_path->parent->width,
+				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
 			* innerscansel * rescanratio;
 
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.229 2007/04/21 21:01:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist,
 			   Plan *lefttree, Plan *righttree,
 			   JoinType jointype);
 static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
-		  AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst);
+		  AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+		  double limit_tuples);
 
 
 /*
@@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root,
 		outer_plan = (Plan *)
 			make_sort_from_pathkeys(root,
 									outer_plan,
-									best_path->outersortkeys);
+									best_path->outersortkeys,
+									-1.0);
 		outerpathkeys = best_path->outersortkeys;
 	}
 	else
@@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root,
 		inner_plan = (Plan *)
 			make_sort_from_pathkeys(root,
 									inner_plan,
-									best_path->innersortkeys);
+									best_path->innersortkeys,
+									-1.0);
 		innerpathkeys = best_path->innersortkeys;
 	}
 	else
@@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist,
  * make_sort --- basic routine to build a Sort plan node
  *
  * Caller must have built the sortColIdx, sortOperators, and nullsFirst
- * arrays already.
+ * arrays already.  limit_tuples is as for cost_sort (in particular, pass
+ * -1 if no limit)
  */
 static Sort *
 make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
-		  AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst)
+		  AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+		  double limit_tuples)
 {
 	Sort	   *node = makeNode(Sort);
 	Plan	   *plan = &node->plan;
@@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
 	cost_sort(&sort_path, root, NIL,
 			  lefttree->total_cost,
 			  lefttree->plan_rows,
-			  lefttree->plan_width);
+			  lefttree->plan_width,
+			  limit_tuples);
 	plan->startup_cost = sort_path.startup_cost;
 	plan->total_cost = sort_path.total_cost;
 	plan->targetlist = lefttree->targetlist;
@@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
  *
  *	  'lefttree' is the node which yields input tuples
  *	  'pathkeys' is the list of pathkeys by which the result is to be sorted
+ *	  'limit_tuples' is the bound on the number of output tuples;
+ *				-1 if no bound
  *
  * We must convert the pathkey information into arrays of sort key column
  * numbers and sort operator OIDs.
@@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
  * adding a Result node just to do the projection.
  */
 Sort *
-make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
+make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
+						double limit_tuples)
 {
 	List	   *tlist = lefttree->targetlist;
 	ListCell   *i;
@@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
 	Assert(numsortkeys > 0);
 
 	return make_sort(root, lefttree, numsortkeys,
-					 sortColIdx, sortOperators, nullsFirst);
+					 sortColIdx, sortOperators, nullsFirst, limit_tuples);
 }
 
 /*
@@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree)
 	Assert(numsortkeys > 0);
 
 	return make_sort(root, lefttree, numsortkeys,
-					 sortColIdx, sortOperators, nullsFirst);
+					 sortColIdx, sortOperators, nullsFirst, -1.0);
 }
 
 /*
@@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root,
 	Assert(numsortkeys > 0);
 
 	return make_sort(root, lefttree, numsortkeys,
-					 sortColIdx, sortOperators, nullsFirst);
+					 sortColIdx, sortOperators, nullsFirst, -1.0);
 }
 
 Material *
 
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.100 2007/04/21 21:01:45 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.101 2007/05/04 01:13:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -47,6 +47,8 @@
  * tlist is the target list the query should produce
  *		(this is NOT necessarily root->parse->targetList!)
  * tuple_fraction is the fraction of tuples we expect will be retrieved
+ * limit_tuples is a hard limit on number of tuples to retrieve,
+ *		or -1 if no limit
  *
  * Output parameters:
  * *cheapest_path receives the overall-cheapest path for the query
@@ -74,9 +76,13 @@
  *		from the plan to be retrieved
  *	  tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
  *		expected to be retrieved (ie, a LIMIT specification)
+ * Note that a nonzero tuple_fraction could come from outer context; it is
+ * therefore not redundant with limit_tuples.  We use limit_tuples to determine
+ * whether a bounded sort can be used at runtime.
  */
 void
-query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
+query_planner(PlannerInfo *root, List *tlist,
+			  double tuple_fraction, double limit_tuples,
 			  Path **cheapest_path, Path **sorted_path,
 			  double *num_groups)
 {
@@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
 			/* Figure cost for sorting */
 			cost_sort(&sort_path, root, root->query_pathkeys,
 					  cheapestpath->total_cost,
-					  final_rel->rows, final_rel->width);
+					  final_rel->rows, final_rel->width,
+					  limit_tuples);
 		}
 
 		if (compare_fractional_path_costs(sortedpath, &sort_path,