Skip to content

Commit fd83c83

Browse files
committed
Fix deadlock in GIN vacuum introduced by 218f515
Before 218f515 if posting tree page is about to be deleted, then the whole posting tree is locked by LockBufferForCleanup() on root preventing all the concurrent inserts. 218f515 reduced locking to the subtree containing page to be deleted. However, due to concurrent parent split, inserter doesn't always holds pins on all the pages constituting path from root to the target leaf page. That could cause a deadlock between GIN vacuum process and GIN inserter. And we didn't find non-invasive way to fix this. This commit reverts VACUUM behavior to lock the whole posting tree before delete any page. However, we keep another useful change by 218f515: the tree is locked only if there are pages to be deleted. Reported-by: Chen Huajun Diagnosed-by: Chen Huajun, Andrey Borodin, Peter Geoghegan Discussion: https://postgr.es/m/31a702a.14dd.166c1366ac1.Coremail.chjischj%40163.com Author: Alexander Korotkov, based on ideas from Andrey Borodin and Peter Geoghegan Reviewed-by: Andrey Borodin Backpatch-through: 10
1 parent 77d4d88 commit fd83c83

File tree

2 files changed

+73
-90
lines changed

2 files changed

+73
-90
lines changed

src/backend/access/gin/README

+4-11
Original file line numberDiff line numberDiff line change
@@ -314,17 +314,10 @@ deleted.
314314
The previous paragraph's reasoning only applies to searches, and only to
315315
posting trees. To protect from inserters following a downlink to a deleted
316316
page, vacuum simply locks out all concurrent insertions to the posting tree,
317-
by holding a super-exclusive lock on the parent page of subtree with deletable
318-
pages. Inserters hold a pin on the root page, but searches do not, so while
319-
new searches cannot begin while root page is locked, any already-in-progress
320-
scans can continue concurrently with vacuum in corresponding subtree of
321-
posting tree. To exclude interference with readers vacuum takes exclusive
322-
locks in a depth-first scan in left-to-right order of page tuples. Leftmost
323-
page is never deleted. Thus before deleting any page we obtain exclusive
324-
lock on any left page, effectively excluding deadlock with any reader, despite
325-
taking parent lock before current and left lock after current. We take left
326-
lock not for a concurrency reasons, but rather in need to mark page dirty.
327-
In the entry tree, we never delete pages.
317+
by holding a super-exclusive lock on the posting tree root. Inserters hold a
318+
pin on the root page, but searches do not, so while new searches cannot begin
319+
while root page is locked, any already-in-progress scans can continue
320+
concurrently with vacuum. In the entry tree, we never delete pages.
328321

329322
(This is quite different from the mechanism the btree indexam uses to make
330323
page-deletions safe; it stamps the deleted pages with an XID and keeps the

src/backend/access/gin/ginvacuum.c

+69-79
Original file line numberDiff line numberDiff line change
@@ -315,124 +315,114 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
315315

316316

317317
/*
318-
* Scan through posting tree, delete empty tuples from leaf pages.
319-
* Also, this function collects empty subtrees (with all empty leafs).
320-
* For parents of these subtrees CleanUp lock is taken, then we call
321-
* ScanToDelete. This is done for every inner page, which points to
322-
* empty subtree.
318+
* Scan through posting tree leafs, delete empty tuples. Returns true if there
319+
* is at least one empty page.
323320
*/
324321
static bool
325-
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
322+
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno)
326323
{
327324
Buffer buffer;
328325
Page page;
329326
bool hasVoidPage = false;
330327
MemoryContext oldCxt;
331328

332-
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
333-
RBM_NORMAL, gvs->strategy);
334-
page = BufferGetPage(buffer);
329+
/* Find leftmost leaf page of posting tree and lock it in exclusive mode */
330+
while (true)
331+
{
332+
PostingItem *pitem;
335333

336-
ginTraverseLock(buffer, false);
334+
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
335+
RBM_NORMAL, gvs->strategy);
336+
LockBuffer(buffer, GIN_SHARE);
337+
page = BufferGetPage(buffer);
337338

338-
Assert(GinPageIsData(page));
339+
Assert(GinPageIsData(page));
339340

340-
if (GinPageIsLeaf(page))
341+
if (GinPageIsLeaf(page))
342+
{
343+
LockBuffer(buffer, GIN_UNLOCK);
344+
LockBuffer(buffer, GIN_EXCLUSIVE);
345+
break;
346+
}
347+
348+
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
349+
350+
pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
351+
blkno = PostingItemGetBlockNumber(pitem);
352+
Assert(blkno != InvalidBlockNumber);
353+
354+
UnlockReleaseBuffer(buffer);
355+
}
356+
357+
/* Iterate all posting tree leaves using rightlinks and vacuum them */
358+
while (true)
341359
{
342360
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
343361
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
344362
MemoryContextSwitchTo(oldCxt);
345363
MemoryContextReset(gvs->tmpCxt);
346364

347-
/* if root is a leaf page, we don't desire further processing */
348365
if (GinDataLeafPageIsEmpty(page))
349366
hasVoidPage = true;
350367

351-
UnlockReleaseBuffer(buffer);
352-
353-
return hasVoidPage;
354-
}
355-
else
356-
{
357-
OffsetNumber i;
358-
bool hasEmptyChild = false;
359-
bool hasNonEmptyChild = false;
360-
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
361-
BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
362-
363-
/*
364-
* Read all children BlockNumbers. Not sure it is safe if there are
365-
* many concurrent vacuums.
366-
*/
367-
368-
for (i = FirstOffsetNumber; i <= maxoff; i++)
369-
{
370-
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
371-
372-
children[i] = PostingItemGetBlockNumber(pitem);
373-
}
368+
blkno = GinPageGetOpaque(page)->rightlink;
374369

375370
UnlockReleaseBuffer(buffer);
376371

377-
for (i = FirstOffsetNumber; i <= maxoff; i++)
378-
{
379-
if (ginVacuumPostingTreeLeaves(gvs, children[i], false))
380-
hasEmptyChild = true;
381-
else
382-
hasNonEmptyChild = true;
383-
}
372+
if (blkno == InvalidBlockNumber)
373+
break;
384374

385-
pfree(children);
375+
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
376+
RBM_NORMAL, gvs->strategy);
377+
LockBuffer(buffer, GIN_EXCLUSIVE);
378+
page = BufferGetPage(buffer);
379+
}
386380

387-
vacuum_delay_point();
381+
return hasVoidPage;
382+
}
388383

384+
static void
385+
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
386+
{
387+
if (ginVacuumPostingTreeLeaves(gvs, rootBlkno))
388+
{
389389
/*
390-
* All subtree is empty - just return true to indicate that parent
391-
* must do a cleanup, unless we are ROOT and there is way to go upper.
390+
* There is at least one empty page. So we have to rescan the tree
391+
* deleting empty pages.
392392
*/
393+
Buffer buffer;
394+
DataPageDeleteStack root,
395+
*ptr,
396+
*tmp;
393397

394-
if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
395-
return true;
398+
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
399+
RBM_NORMAL, gvs->strategy);
396400

397-
if (hasEmptyChild)
398-
{
399-
DataPageDeleteStack root,
400-
*ptr,
401-
*tmp;
402-
403-
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
404-
RBM_NORMAL, gvs->strategy);
405-
LockBufferForCleanup(buffer);
406-
407-
memset(&root, 0, sizeof(DataPageDeleteStack));
408-
root.leftBlkno = InvalidBlockNumber;
409-
root.isRoot = true;
401+
/*
402+
* Lock posting tree root for cleanup to ensure there are no concurrent
403+
* inserts.
404+
*/
405+
LockBufferForCleanup(buffer);
410406

411-
ginScanToDelete(gvs, blkno, true, &root, InvalidOffsetNumber);
407+
memset(&root, 0, sizeof(DataPageDeleteStack));
408+
root.leftBlkno = InvalidBlockNumber;
409+
root.isRoot = true;
412410

413-
ptr = root.child;
411+
ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
414412

415-
while (ptr)
416-
{
417-
tmp = ptr->child;
418-
pfree(ptr);
419-
ptr = tmp;
420-
}
413+
ptr = root.child;
421414

422-
UnlockReleaseBuffer(buffer);
415+
while (ptr)
416+
{
417+
tmp = ptr->child;
418+
pfree(ptr);
419+
ptr = tmp;
423420
}
424421

425-
/* Here we have deleted all empty subtrees */
426-
return false;
422+
UnlockReleaseBuffer(buffer);
427423
}
428424
}
429425

430-
static void
431-
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
432-
{
433-
ginVacuumPostingTreeLeaves(gvs, rootBlkno, true);
434-
}
435-
436426
/*
437427
* returns modified page or NULL if page isn't modified.
438428
* Function works with original page until first change is occurred,

0 commit comments

Comments
 (0)