Skip to content

Commit d2528e5

Browse files
committed
Back-patch recent fixes for gistchoose and gistRelocateBuildBuffersOnSplit.
This back-ports commits c8ba697 and e5db11c, which fix one definite and one speculative bug in gistchoose, and make the code a lot more intelligible as well. In 9.2 only, this also affects the largely-copied-and-pasted logic in gistRelocateBuildBuffersOnSplit. The impact of the bugs was that the functions might make poor decisions as to which index tree branch to push a new entry down into, resulting in GiST index bloat and poor performance. The fixes rectify these decisions for future insertions, but a REINDEX would be needed to clean up any existing index bloat. Alexander Korotkov, Robert Haas, Tom Lane
1 parent d561fc5 commit d2528e5

File tree

2 files changed

+141
-46
lines changed

2 files changed

+141
-46
lines changed

src/backend/access/gist/gistbuildbuffers.c

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -625,60 +625,107 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
625625
}
626626

627627
/*
628-
* Loop through all index tuples on the buffer on the splitted page,
629-
* moving them to buffers on the new pages.
628+
* Loop through all index tuples in the buffer of the page being split,
629+
* moving them to buffers for the new pages. We try to move each tuple to
630+
* the page that will result in the lowest penalty for the leading column
631+
* or, in the case of a tie, the lowest penalty for the earliest column
632+
* that is not tied.
633+
*
634+
* The page searching logic is very similar to gistchoose().
630635
*/
631636
while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
632637
{
633-
float sum_grow,
634-
which_grow[INDEX_MAX_KEYS];
638+
float best_penalty[INDEX_MAX_KEYS];
635639
int i,
636640
which;
637641
IndexTuple newtup;
638642
RelocationBufferInfo *targetBufferInfo;
639643

640-
/*
641-
* Choose which page this tuple should go to.
642-
*/
643644
gistDeCompressAtt(giststate, r,
644645
itup, NULL, (OffsetNumber) 0, entry, isnull);
645646

646-
which = -1;
647-
*which_grow = -1.0f;
648-
sum_grow = 1.0f;
647+
/* default to using first page (shouldn't matter) */
648+
which = 0;
649+
650+
/*
651+
* best_penalty[j] is the best penalty we have seen so far for column
652+
* j, or -1 when we haven't yet examined column j. Array entries to
653+
* the right of the first -1 are undefined.
654+
*/
655+
best_penalty[0] = -1;
649656

650-
for (i = 0; i < splitPagesCount && sum_grow; i++)
657+
/*
658+
* Loop over possible target pages, looking for one to move this tuple
659+
* to.
660+
*/
661+
for (i = 0; i < splitPagesCount; i++)
651662
{
652-
int j;
653663
RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
664+
bool zero_penalty;
665+
int j;
654666

655-
sum_grow = 0.0f;
667+
zero_penalty = true;
668+
669+
/* Loop over index attributes. */
656670
for (j = 0; j < r->rd_att->natts; j++)
657671
{
658672
float usize;
659673

674+
/* Compute penalty for this column. */
660675
usize = gistpenalty(giststate, j,
661676
&splitPageInfo->entry[j],
662677
splitPageInfo->isnull[j],
663678
&entry[j], isnull[j]);
679+
if (usize > 0)
680+
zero_penalty = false;
664681

665-
if (which_grow[j] < 0 || usize < which_grow[j])
682+
if (best_penalty[j] < 0 || usize < best_penalty[j])
666683
{
684+
/*
685+
* New best penalty for column. Tentatively select this
686+
* page as the target, and record the best penalty. Then
687+
* reset the next column's penalty to "unknown" (and
688+
* indirectly, the same for all the ones to its right).
689+
* This will force us to adopt this page's penalty values
690+
* as the best for all the remaining columns during
691+
* subsequent loop iterations.
692+
*/
667693
which = i;
668-
which_grow[j] = usize;
669-
if (j < r->rd_att->natts - 1 && i == 0)
670-
which_grow[j + 1] = -1;
671-
sum_grow += which_grow[j];
694+
best_penalty[j] = usize;
695+
696+
if (j < r->rd_att->natts - 1)
697+
best_penalty[j + 1] = -1;
698+
}
699+
else if (best_penalty[j] == usize)
700+
{
701+
/*
702+
* The current page is exactly as good for this column as
703+
* the best page seen so far. The next iteration of this
704+
* loop will compare the next column.
705+
*/
672706
}
673-
else if (which_grow[j] == usize)
674-
sum_grow += usize;
675707
else
676708
{
677-
sum_grow = 1;
709+
/*
710+
* The current page is worse for this column than the best
711+
* page seen so far. Skip the remaining columns and move
712+
* on to the next page, if any.
713+
*/
714+
zero_penalty = false; /* so outer loop won't exit */
678715
break;
679716
}
680717
}
718+
719+
/*
720+
* If we find a page with zero penalty for all columns, there's no
721+
* need to examine remaining pages; just break out of the loop and
722+
* return it.
723+
*/
724+
if (zero_penalty)
725+
break;
681726
}
727+
728+
/* OK, "which" is the page index to push the tuple to */
682729
targetBufferInfo = &relocationBuffersInfos[which];
683730

684731
/* Push item to selected node buffer */

src/backend/access/gist/gistutil.c

Lines changed: 73 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -363,72 +363,120 @@ gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *gis
363363
}
364364

365365
/*
366-
* find entry with lowest penalty
366+
* Search an upper index page for the entry with lowest penalty for insertion
367+
* of the new index key contained in "it".
368+
*
369+
* Returns the index of the page entry to insert into.
367370
*/
368371
OffsetNumber
369372
gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
370373
GISTSTATE *giststate)
371374
{
375+
OffsetNumber result;
372376
OffsetNumber maxoff;
373377
OffsetNumber i;
374-
OffsetNumber which;
375-
float sum_grow,
376-
which_grow[INDEX_MAX_KEYS];
378+
float best_penalty[INDEX_MAX_KEYS];
377379
GISTENTRY entry,
378380
identry[INDEX_MAX_KEYS];
379381
bool isnull[INDEX_MAX_KEYS];
380382

381-
maxoff = PageGetMaxOffsetNumber(p);
382-
*which_grow = -1.0;
383-
which = InvalidOffsetNumber;
384-
sum_grow = 1;
383+
Assert(!GistPageIsLeaf(p));
384+
385385
gistDeCompressAtt(giststate, r,
386386
it, NULL, (OffsetNumber) 0,
387387
identry, isnull);
388388

389+
/* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
390+
result = FirstOffsetNumber;
391+
392+
/*
393+
* The index may have multiple columns, and there's a penalty value for
394+
* each column. The penalty associated with a column that appears earlier
395+
* in the index definition is strictly more important than the penalty of
396+
* a column that appears later in the index definition.
397+
*
398+
* best_penalty[j] is the best penalty we have seen so far for column j,
399+
* or -1 when we haven't yet examined column j. Array entries to the
400+
* right of the first -1 are undefined.
401+
*/
402+
best_penalty[0] = -1;
403+
404+
/*
405+
* Loop over tuples on page.
406+
*/
407+
maxoff = PageGetMaxOffsetNumber(p);
389408
Assert(maxoff >= FirstOffsetNumber);
390-
Assert(!GistPageIsLeaf(p));
391409

392-
for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i))
410+
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
393411
{
394-
int j;
395412
IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
413+
bool zero_penalty;
414+
int j;
396415

397-
sum_grow = 0;
416+
zero_penalty = true;
417+
418+
/* Loop over index attributes. */
398419
for (j = 0; j < r->rd_att->natts; j++)
399420
{
400421
Datum datum;
401422
float usize;
402423
bool IsNull;
403424

425+
/* Compute penalty for this column. */
404426
datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull);
405427
gistdentryinit(giststate, j, &entry, datum, r, p, i,
406428
FALSE, IsNull);
407429
usize = gistpenalty(giststate, j, &entry, IsNull,
408430
&identry[j], isnull[j]);
431+
if (usize > 0)
432+
zero_penalty = false;
409433

410-
if (which_grow[j] < 0 || usize < which_grow[j])
434+
if (best_penalty[j] < 0 || usize < best_penalty[j])
435+
{
436+
/*
437+
* New best penalty for column. Tentatively select this tuple
438+
* as the target, and record the best penalty. Then reset the
439+
* next column's penalty to "unknown" (and indirectly, the
440+
* same for all the ones to its right). This will force us to
441+
* adopt this tuple's penalty values as the best for all the
442+
* remaining columns during subsequent loop iterations.
443+
*/
444+
result = i;
445+
best_penalty[j] = usize;
446+
447+
if (j < r->rd_att->natts - 1)
448+
best_penalty[j + 1] = -1;
449+
}
450+
else if (best_penalty[j] == usize)
411451
{
412-
which = i;
413-
which_grow[j] = usize;
414-
if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber)
415-
which_grow[j + 1] = -1;
416-
sum_grow += which_grow[j];
452+
/*
453+
* The current tuple is exactly as good for this column as the
454+
* best tuple seen so far. The next iteration of this loop
455+
* will compare the next column.
456+
*/
417457
}
418-
else if (which_grow[j] == usize)
419-
sum_grow += usize;
420458
else
421459
{
422-
sum_grow = 1;
460+
/*
461+
* The current tuple is worse for this column than the best
462+
* tuple seen so far. Skip the remaining columns and move on
463+
* to the next tuple, if any.
464+
*/
465+
zero_penalty = false; /* so outer loop won't exit */
423466
break;
424467
}
425468
}
426-
}
427469

428-
if (which == InvalidOffsetNumber)
429-
which = FirstOffsetNumber;
470+
/*
471+
* If we find a tuple with zero penalty for all columns, there's no
472+
* need to examine remaining tuples; just break out of the loop and
473+
* return it.
474+
*/
475+
if (zero_penalty)
476+
break;
477+
}
430478

431-
return which;
479+
return result;
432480
}
433481

434482
/*

0 commit comments

Comments
 (0)