Skip to content

Commit 2415ad9

Browse files
committed
Teach tuplestore.c to throw away data before the "mark" point when the caller
is using mark/restore but not rewind or backward-scan capability. Insert a materialize plan node between a mergejoin and its inner child if the inner child is a sort that is expected to spill to disk. The materialize shields the sort from the need to do mark/restore and thereby allows it to perform its final merge pass on-the-fly; while the materialize itself is normally cheap since it won't spill to disk unless the number of tuples with equal key values exceeds work_mem. Greg Stark, with some kibitzing from Tom Lane.
1 parent 3963574 commit 2415ad9

File tree

8 files changed

+236
-40
lines changed

8 files changed

+236
-40
lines changed

src/backend/executor/nodeMaterial.c

+21-15
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.58 2007/01/05 22:19:28 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.59 2007/05/21 17:57:33 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -56,10 +56,10 @@ ExecMaterial(MaterialState *node)
5656
/*
5757
* If first time through, and we need a tuplestore, initialize it.
5858
*/
59-
if (tuplestorestate == NULL && node->randomAccess)
59+
if (tuplestorestate == NULL && node->eflags != 0)
6060
{
6161
tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
62-
62+
tuplestore_set_eflags(tuplestorestate, node->eflags);
6363
node->tuplestorestate = (void *) tuplestorestate;
6464
}
6565

@@ -162,14 +162,14 @@ ExecInitMaterial(Material *node, EState *estate, int eflags)
162162
matstate->ss.ps.state = estate;
163163

164164
/*
165-
* We must have random access to the subplan output to do backward scan or
166-
* mark/restore. We also prefer to materialize the subplan output if we
167-
* might be called on to rewind and replay it many times. However, if none
168-
* of these cases apply, we can skip storing the data.
165+
* We must have a tuplestore buffering the subplan output to do backward
166+
* scan or mark/restore. We also prefer to materialize the subplan output
167+
* if we might be called on to rewind and replay it many times. However,
168+
* if none of these cases apply, we can skip storing the data.
169169
*/
170-
matstate->randomAccess = (eflags & (EXEC_FLAG_REWIND |
171-
EXEC_FLAG_BACKWARD |
172-
EXEC_FLAG_MARK)) != 0;
170+
matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
171+
EXEC_FLAG_BACKWARD |
172+
EXEC_FLAG_MARK));
173173

174174
matstate->eof_underlying = false;
175175
matstate->tuplestorestate = NULL;
@@ -255,7 +255,7 @@ ExecEndMaterial(MaterialState *node)
255255
void
256256
ExecMaterialMarkPos(MaterialState *node)
257257
{
258-
Assert(node->randomAccess);
258+
Assert(node->eflags & EXEC_FLAG_MARK);
259259

260260
/*
261261
* if we haven't materialized yet, just return.
@@ -275,7 +275,7 @@ ExecMaterialMarkPos(MaterialState *node)
275275
void
276276
ExecMaterialRestrPos(MaterialState *node)
277277
{
278-
Assert(node->randomAccess);
278+
Assert(node->eflags & EXEC_FLAG_MARK);
279279

280280
/*
281281
* if we haven't materialized yet, just return.
@@ -300,7 +300,7 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
300300
{
301301
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
302302

303-
if (node->randomAccess)
303+
if (node->eflags != 0)
304304
{
305305
/*
306306
* If we haven't materialized yet, just return. If outerplan' chgParam
@@ -312,15 +312,21 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
312312

313313
/*
314314
* If subnode is to be rescanned then we forget previous stored
315-
* results; we have to re-read the subplan and re-store.
315+
* results; we have to re-read the subplan and re-store. Also,
316+
* if we told tuplestore it needn't support rescan, we lose and
317+
* must re-read. (This last should not happen in common cases;
318+
* else our caller lied by not passing EXEC_FLAG_REWIND to us.)
316319
*
317320
* Otherwise we can just rewind and rescan the stored output. The
318321
* state of the subnode does not change.
319322
*/
320-
if (((PlanState *) node)->lefttree->chgParam != NULL)
323+
if (((PlanState *) node)->lefttree->chgParam != NULL ||
324+
(node->eflags & EXEC_FLAG_REWIND) == 0)
321325
{
322326
tuplestore_end((Tuplestorestate *) node->tuplestorestate);
323327
node->tuplestorestate = NULL;
328+
if (((PlanState *) node)->lefttree->chgParam == NULL)
329+
ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
324330
node->eof_underlying = false;
325331
}
326332
else

src/backend/executor/nodeMergejoin.c

+37-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.87 2007/02/02 00:07:03 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.88 2007/05/21 17:57:33 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -706,6 +706,9 @@ ExecMergeJoin(MergeJoinState *node)
706706
}
707707
else
708708
{
709+
/* Mark before advancing, if wanted */
710+
if (node->mj_ExtraMarks)
711+
ExecMarkPos(innerPlan);
709712
/* Stay in same state to fetch next inner tuple */
710713
if (doFillInner)
711714
{
@@ -830,6 +833,9 @@ ExecMergeJoin(MergeJoinState *node)
830833
* now we get the next inner tuple, if any. If there's none,
831834
* advance to next outer tuple (which may be able to join to
832835
* previously marked tuples).
836+
*
837+
* NB: must NOT do "extraMarks" here, since we may need to
838+
* return to previously marked tuples.
833839
*/
834840
innerTupleSlot = ExecProcNode(innerPlan);
835841
node->mj_InnerTupleSlot = innerTupleSlot;
@@ -1140,6 +1146,9 @@ ExecMergeJoin(MergeJoinState *node)
11401146
break;
11411147

11421148
/*
1149+
* SKIPOUTER_ADVANCE: advance over an outer tuple that is
1150+
* known not to join to any inner tuple.
1151+
*
11431152
* Before advancing, we check to see if we must emit an
11441153
* outer-join fill tuple for this outer tuple.
11451154
*/
@@ -1204,6 +1213,9 @@ ExecMergeJoin(MergeJoinState *node)
12041213
break;
12051214

12061215
/*
1216+
* SKIPINNER_ADVANCE: advance over an inner tuple that is
1217+
* known not to join to any outer tuple.
1218+
*
12071219
* Before advancing, we check to see if we must emit an
12081220
* outer-join fill tuple for this inner tuple.
12091221
*/
@@ -1225,6 +1237,10 @@ ExecMergeJoin(MergeJoinState *node)
12251237
return result;
12261238
}
12271239

1240+
/* Mark before advancing, if wanted */
1241+
if (node->mj_ExtraMarks)
1242+
ExecMarkPos(innerPlan);
1243+
12281244
/*
12291245
* now we get the next inner tuple, if any
12301246
*/
@@ -1295,6 +1311,10 @@ ExecMergeJoin(MergeJoinState *node)
12951311
return result;
12961312
}
12971313

1314+
/* Mark before advancing, if wanted */
1315+
if (node->mj_ExtraMarks)
1316+
ExecMarkPos(innerPlan);
1317+
12981318
/*
12991319
* now we get the next inner tuple, if any
13001320
*/
@@ -1425,6 +1445,22 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags)
14251445
innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
14261446
eflags | EXEC_FLAG_MARK);
14271447

1448+
/*
1449+
* For certain types of inner child nodes, it is advantageous to issue
1450+
* MARK every time we advance past an inner tuple we will never return
1451+
* to. For other types, MARK on a tuple we cannot return to is a waste
1452+
* of cycles. Detect which case applies and set mj_ExtraMarks if we
1453+
* want to issue "unnecessary" MARK calls.
1454+
*
1455+
* Currently, only Material wants the extra MARKs, and it will be helpful
1456+
* only if eflags doesn't specify REWIND.
1457+
*/
1458+
if (IsA(innerPlan(node), Material) &&
1459+
(eflags & EXEC_FLAG_REWIND) == 0)
1460+
mergestate->mj_ExtraMarks = true;
1461+
else
1462+
mergestate->mj_ExtraMarks = false;
1463+
14281464
#define MERGEJOIN_NSLOTS 4
14291465

14301466
/*

src/backend/optimizer/path/costsize.c

+18-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
* Portions Copyright (c) 1994, Regents of the University of California
5555
*
5656
* IDENTIFICATION
57-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
57+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.183 2007/05/21 17:57:33 tgl Exp $
5858
*
5959
*-------------------------------------------------------------------------
6060
*/
@@ -1038,6 +1038,23 @@ cost_sort(Path *path, PlannerInfo *root,
10381038
path->total_cost = startup_cost + run_cost;
10391039
}
10401040

1041+
/*
1042+
* sort_exceeds_work_mem
1043+
* Given a finished Sort plan node, detect whether it is expected to
1044+
* spill to disk (ie, will need more than work_mem workspace)
1045+
*
1046+
* This assumes there will be no available LIMIT.
1047+
*/
1048+
bool
1049+
sort_exceeds_work_mem(Sort *sort)
1050+
{
1051+
double input_bytes = relation_byte_size(sort->plan.plan_rows,
1052+
sort->plan.plan_width);
1053+
long work_mem_bytes = work_mem * 1024L;
1054+
1055+
return (input_bytes > work_mem_bytes);
1056+
}
1057+
10411058
/*
10421059
* cost_material
10431060
* Determines and returns the cost of materializing a relation, including

src/backend/optimizer/plan/createplan.c

+25-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
*
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
13+
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.231 2007/05/21 17:57:34 tgl Exp $
1414
*
1515
*-------------------------------------------------------------------------
1616
*/
@@ -1600,6 +1600,30 @@ create_mergejoin_plan(PlannerInfo *root,
16001600
else
16011601
innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
16021602

1603+
/*
1604+
* If inner plan is a sort that is expected to spill to disk, add a
1605+
* materialize node to shield it from the need to handle mark/restore.
1606+
* This will allow it to perform the last merge pass on-the-fly, while
1607+
* in most cases not requiring the materialize to spill to disk.
1608+
*
1609+
* XXX really, Sort oughta do this for itself, probably, to avoid the
1610+
* overhead of a separate plan node.
1611+
*/
1612+
if (IsA(inner_plan, Sort) &&
1613+
sort_exceeds_work_mem((Sort *) inner_plan))
1614+
{
1615+
Plan *matplan = (Plan *) make_material(inner_plan);
1616+
1617+
/*
1618+
* We assume the materialize will not spill to disk, and therefore
1619+
* charge just cpu_tuple_cost per tuple.
1620+
*/
1621+
copy_plan_costsize(matplan, inner_plan);
1622+
matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
1623+
1624+
inner_plan = matplan;
1625+
}
1626+
16031627
/*
16041628
* Compute the opfamily/strategy/nullsfirst arrays needed by the executor.
16051629
* The information is in the pathkeys for the two inputs, but we need to

0 commit comments

Comments
 (0)