Skip to content

Commit c352ea2

Browse files
committed
Further cleanup of gistsplit.c.
After further reflection I was unconvinced that the existing coding is guaranteed to return valid union datums in every code path for multi-column indexes. Fix that by forcing a gistunionsubkey() call at the end of the recursion. Having done that, we can remove some clearly-redundant calls elsewhere. This should be a little faster for multi-column indexes (since the previous coding would uselessly do such a call for each column while unwinding the recursion), as well as much harder to break. Also, simplify the handling of cases where one side or the other of a primary split contains only don't-care tuples. The previous coding used a very ugly hack in removeDontCares() that essentially forced one random tuple to be treated as non-don't-care, providing a random initial choice of seed datum for the secondary split. It seems unlikely that that method will give better-than-random splits. Instead, treat such a split as degenerate and just let the next column determine the split, the same way that we handle fully degenerate cases where the two sides produce identical union datums.
1 parent db3d7e9 commit c352ea2

File tree

1 file changed

+93
-64
lines changed

1 file changed

+93
-64
lines changed

src/backend/access/gist/gistsplit.c

Lines changed: 93 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -162,23 +162,16 @@ findDontCares(Relation r, GISTSTATE *giststate, GISTENTRY *valvec,
162162
* Remove tuples that are marked don't-cares from the tuple index array a[]
163163
* of length *len. This is applied separately to the spl_left and spl_right
164164
* arrays.
165-
*
166-
* Corner case: we do not wish to reduce the index array to zero length.
167-
* (If we did, then the union key for this side would be null, and having just
168-
* one of spl_ldatum_exists and spl_rdatum_exists be TRUE might confuse
169-
* user-defined PickSplit methods.) To avoid that, we'll forcibly redefine
170-
* one tuple as non-don't-care if necessary. Hence, we must be able to adjust
171-
* caller's NumDontCare count.
172165
*/
173166
static void
174-
removeDontCares(OffsetNumber *a, int *len, bool *dontcare, int *NumDontCare)
167+
removeDontCares(OffsetNumber *a, int *len, const bool *dontcare)
175168
{
176169
int origlen,
177-
curlen,
170+
newlen,
178171
i;
179172
OffsetNumber *curwpos;
180173

181-
origlen = curlen = *len;
174+
origlen = newlen = *len;
182175
curwpos = a;
183176
for (i = 0; i < origlen; i++)
184177
{
@@ -190,18 +183,11 @@ removeDontCares(OffsetNumber *a, int *len, bool *dontcare, int *NumDontCare)
190183
*curwpos = ai;
191184
curwpos++;
192185
}
193-
else if (curlen == 1)
194-
{
195-
/* corner case: don't let array become empty */
196-
dontcare[ai] = FALSE; /* mark item as non-dont-care */
197-
*NumDontCare -= 1;
198-
i--; /* reprocess item on next iteration */
199-
}
200186
else
201-
curlen--;
187+
newlen--;
202188
}
203189

204-
*len = curlen;
190+
*len = newlen;
205191
}
206192

207193
/*
@@ -304,8 +290,14 @@ supportSecondarySplit(Relation r, GISTSTATE *giststate, int attno,
304290
penalty2;
305291

306292
/*
307-
* there is only one previously defined union, so we just choose swap
308-
* or not by lowest penalty
293+
* There is only one previously defined union, so we just choose swap
294+
* or not by lowest penalty for that side. We can only get here if a
295+
* secondary split happened to have all NULLs in its column in the
296+
* tuples that the outer recursion level had assigned to one side.
297+
* (Note that the null checks in gistSplitByKey don't prevent the
298+
* case, because they'll only be checking tuples that were considered
299+
* don't-cares at the outer recursion level, not the tuples that went
300+
* into determining the passed-down left and right union keys.)
309301
*/
310302
penalty1 = gistpenalty(giststate, attno, entry1, false, &entrySL, false);
311303
penalty2 = gistpenalty(giststate, attno, entry1, false, &entrySR, false);
@@ -405,13 +397,19 @@ genericPickSplit(GISTSTATE *giststate, GistEntryVector *entryvec, GIST_SPLITVEC
405397
* two vectors.
406398
*
407399
* Returns FALSE if split is complete (there are no more index columns, or
408-
* there is no need to consider them). Note that in this case the union
409-
* keys for all columns must be computed here.
410-
* Returns TRUE and v->spl_dontcare = NULL if left and right unions of attno
411-
* column are the same, so we should split on next column instead.
400+
* there is no need to consider them because split is optimal already).
401+
*
402+
* Returns TRUE and v->spl_dontcare = NULL if the picksplit result is
403+
* degenerate (all tuples seem to be don't-cares), so we should just
404+
* disregard this column and split on the next column(s) instead.
405+
*
412406
* Returns TRUE and v->spl_dontcare != NULL if there are don't-care tuples
413407
* that could be relocated based on the next column(s). The don't-care
414408
* tuples have been removed from the split and must be reinserted by caller.
409+
* There is at least one non-don't-care tuple on each side of the split,
410+
* and union keys for all columns are updated to include just those tuples.
411+
*
412+
* A TRUE result implies there is at least one more index column.
415413
*/
416414
static bool
417415
gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVector *v,
@@ -483,49 +481,57 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
483481

484482
/*
485483
* If index columns remain, then consider whether we can improve the split
486-
* by using them. Even if we can't, we must compute union keys for those
487-
* columns before we can return FALSE.
484+
* by using them.
488485
*/
489486
v->spl_dontcare = NULL;
490487

491488
if (attno + 1 < giststate->tupdesc->natts)
492489
{
493490
int NumDontCare;
494491

492+
/*
493+
* Make a quick check to see if left and right union keys are equal;
494+
* if so, the split is certainly degenerate, so tell caller to
495+
* re-split with the next column.
496+
*/
495497
if (gistKeyIsEQ(giststate, attno, sv->spl_ldatum, sv->spl_rdatum))
496-
{
497-
/*
498-
* Left and right union keys are equal, so we can get better split
499-
* by considering next column.
500-
*/
501498
return true;
502-
}
503499

504500
/*
505-
* Locate don't-care tuples, if any
501+
* Locate don't-care tuples, if any. If there are none, the split is
502+
* optimal, so just fall out and return false.
506503
*/
507504
v->spl_dontcare = (bool *) palloc0(sizeof(bool) * (entryvec->n + 1));
508505

509506
NumDontCare = findDontCares(r, giststate, entryvec->vector, v, attno);
510507

511-
if (NumDontCare == 0)
508+
if (NumDontCare > 0)
512509
{
513510
/*
514-
* There are no don't-cares, so just compute the union keys for
515-
* remaining columns and we're done.
511+
* Remove don't-cares from spl_left[] and spl_right[].
516512
*/
517-
gistunionsubkey(giststate, itup, v);
518-
}
519-
else
520-
{
513+
removeDontCares(sv->spl_left, &sv->spl_nleft, v->spl_dontcare);
514+
removeDontCares(sv->spl_right, &sv->spl_nright, v->spl_dontcare);
515+
521516
/*
522-
* Remove don't-cares from spl_left[] and spl_right[]. NOTE: this
523-
* could reduce NumDontCare to zero.
517+
* If all tuples on either side were don't-cares, the split is
518+
* degenerate, and we're best off to ignore it and split on the
519+
* next column. (We used to try to press on with a secondary
520+
* split by forcing a random tuple on each side to be treated as
521+
* non-don't-care, but it seems unlikely that that technique
522+
* really gives a better result. Note that we don't want to try a
523+
* secondary split with empty left or right primary split sides,
524+
* because then there is no union key on that side for the
525+
* PickSplit function to try to expand, so it can have no good
526+
* figure of merit for what it's doing. Also note that this check
527+
* ensures we can't produce a bogus one-side-only split in the
528+
* NumDontCare == 1 special case below.)
524529
*/
525-
removeDontCares(sv->spl_left, &sv->spl_nleft,
526-
v->spl_dontcare, &NumDontCare);
527-
removeDontCares(sv->spl_right, &sv->spl_nright,
528-
v->spl_dontcare, &NumDontCare);
530+
if (sv->spl_nleft == 0 || sv->spl_nright == 0)
531+
{
532+
v->spl_dontcare = NULL;
533+
return true;
534+
}
529535

530536
/*
531537
* Recompute union keys, considering only non-don't-care tuples.
@@ -541,7 +547,9 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
541547
/*
542548
* If there's only one don't-care tuple then we can't do a
543549
* PickSplit on it, so just choose whether to send it left or
544-
* right by comparing penalties.
550+
* right by comparing penalties. We needed the
551+
* gistunionsubkey step anyway so that we have appropriate
552+
* union keys for figuring the penalties.
545553
*/
546554
OffsetNumber toMove;
547555

@@ -556,13 +564,14 @@ gistUserPicksplit(Relation r, GistEntryVector *entryvec, int attno, GistSplitVec
556564
/* ... and assign it to cheaper side */
557565
placeOne(r, giststate, v, itup[toMove - 1], toMove, attno + 1);
558566

559-
/* recompute the union keys including this tuple */
560-
v->spl_dontcare = NULL;
561-
gistunionsubkey(giststate, itup, v);
567+
/*
568+
* At this point the union keys are wrong, but we don't care
569+
* because we're done splitting. The outermost recursion
570+
* level of gistSplitByKey will fix things before returning.
571+
*/
562572
}
563-
else if (NumDontCare > 1)
573+
else
564574
return true;
565-
/* else NumDontCare is now zero; handle same as above */
566575
}
567576
}
568577

@@ -648,7 +657,7 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
648657
*/
649658
v->spl_risnull[attno] = v->spl_lisnull[attno] = TRUE;
650659

651-
if (attno + 1 < r->rd_att->natts)
660+
if (attno + 1 < giststate->tupdesc->natts)
652661
gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
653662
else
654663
gistSplitHalf(&v->splitVector, len);
@@ -673,28 +682,31 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
673682
else
674683
v->splitVector.spl_left[v->splitVector.spl_nleft++] = i;
675684

676-
/* Must compute union keys for this and any following columns */
677-
v->spl_dontcare = NULL;
678-
gistunionsubkey(giststate, itup, v);
685+
/* Compute union keys, unless outer recursion level will handle it */
686+
if (attno == 0 && giststate->tupdesc->natts == 1)
687+
{
688+
v->spl_dontcare = NULL;
689+
gistunionsubkey(giststate, itup, v);
690+
}
679691
}
680692
else
681693
{
682694
/*
683-
* all keys are not-null, so apply user-defined PickSplit method
695+
* All keys are not-null, so apply user-defined PickSplit method
684696
*/
685697
if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate))
686698
{
687699
/*
688700
* Splitting on attno column is not optimal, so consider
689701
* redistributing don't-care tuples according to the next column
690702
*/
691-
Assert(attno + 1 < r->rd_att->natts);
703+
Assert(attno + 1 < giststate->tupdesc->natts);
692704

693705
if (v->spl_dontcare == NULL)
694706
{
695707
/*
696-
* Simple case: left and right keys for attno column are
697-
* equal, so just split according to the next column.
708+
* This split was actually degenerate, so ignore it altogether
709+
* and just split according to the next column.
698710
*/
699711
gistSplitByKey(r, page, itup, len, giststate, v, attno + 1);
700712
}
@@ -741,10 +753,27 @@ gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len,
741753
backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1];
742754

743755
v->splitVector = backupSplit;
744-
745-
/* recompute left and right union datums */
746-
gistunionsubkey(giststate, itup, v);
747756
}
748757
}
749758
}
759+
760+
/*
761+
* If we're handling a multicolumn index, at the end of the recursion
762+
* recompute the left and right union datums for all index columns. This
763+
* makes sure we hand back correct union datums in all corner cases,
764+
* including when we haven't processed all columns to start with, or when
765+
* a secondary split moved "don't care" tuples from one side to the other
766+
* (we really shouldn't assume that that didn't change the union datums).
767+
*
768+
* Note: when we're in an internal recursion (attno > 0), we do not worry
769+
* about whether the union datums we return with are sensible, since
770+
* calling levels won't care. Also, in a single-column index, we expect
771+
* that PickSplit (or the special cases above) produced correct union
772+
* datums.
773+
*/
774+
if (attno == 0 && giststate->tupdesc->natts > 1)
775+
{
776+
v->spl_dontcare = NULL;
777+
gistunionsubkey(giststate, itup, v);
778+
}
750779
}

0 commit comments

Comments
 (0)