Skip to content

Commit 218f515

Browse files
committed
Reduce page locking in GIN vacuum
GIN vacuum during cleaning posting tree can lock this whole tree for a long time with by holding LockBufferForCleanup() on root. Patch changes it with two ways: first, cleanup lock will be taken only if there is an empty page (which should be deleted) and, second, it tries to lock only subtree, not the whole posting tree. Author: Andrey Borodin with minor editorization by me Reviewed-by: Jeff Davis, me https://commitfest.postgresql.org/13/896/
1 parent 7356101 commit 218f515

File tree

4 files changed

+145
-110
lines changed

4 files changed

+145
-110
lines changed

src/backend/access/gin/README

+11-4
Original file line numberDiff line numberDiff line change
@@ -314,10 +314,17 @@ deleted.
314314
The previous paragraph's reasoning only applies to searches, and only to
315315
posting trees. To protect from inserters following a downlink to a deleted
316316
page, vacuum simply locks out all concurrent insertions to the posting tree,
317-
by holding a super-exclusive lock on the posting tree root. Inserters hold a
318-
pin on the root page, but searches do not, so while new searches cannot begin
319-
while root page is locked, any already-in-progress scans can continue
320-
concurrently with vacuum. In the entry tree, we never delete pages.
317+
by holding a super-exclusive lock on the parent page of subtree with deletable
318+
pages. Inserters hold a pin on the root page, but searches do not, so while
319+
new searches cannot begin while root page is locked, any already-in-progress
320+
scans can continue concurrently with vacuum in corresponding subtree of
321+
posting tree. To exclude interference with readers vacuum takes exclusive
322+
locks in a depth-first scan in left-to-right order of page tuples. Leftmost
323+
page is never deleted. Thus before deleting any page we obtain exclusive
324+
lock on any left page, effectively excluding deadlock with any reader, despite
325+
taking parent lock before current and left lock after current. We take left
326+
lock not for a concurrency reasons, but rather in need to mark page dirty.
327+
In the entry tree, we never delete pages.
321328

322329
(This is quite different from the mechanism the btree indexam uses to make
323330
page-deletions safe; it stamps the deleted pages with an XID and keeps the

src/backend/access/gin/ginbtree.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
3131
/*
3232
* Lock buffer by needed method for search.
3333
*/
34-
static int
34+
int
3535
ginTraverseLock(Buffer buffer, bool searchMode)
3636
{
3737
Page page;

src/backend/access/gin/ginvacuum.c

+131-105
Original file line numberDiff line numberDiff line change
@@ -109,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer)
109109
PageSetLSN(page, recptr);
110110
}
111111

112-
static bool
113-
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer)
114-
{
115-
Buffer buffer;
116-
Page page;
117-
bool hasVoidPage = FALSE;
118-
MemoryContext oldCxt;
119-
120-
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
121-
RBM_NORMAL, gvs->strategy);
122-
page = BufferGetPage(buffer);
123-
124-
/*
125-
* We should be sure that we don't concurrent with inserts, insert process
126-
* never release root page until end (but it can unlock it and lock
127-
* again). New scan can't start but previously started ones work
128-
* concurrently.
129-
*/
130-
if (isRoot)
131-
LockBufferForCleanup(buffer);
132-
else
133-
LockBuffer(buffer, GIN_EXCLUSIVE);
134-
135-
Assert(GinPageIsData(page));
136112

137-
if (GinPageIsLeaf(page))
138-
{
139-
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
140-
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
141-
MemoryContextSwitchTo(oldCxt);
142-
MemoryContextReset(gvs->tmpCxt);
143-
144-
/* if root is a leaf page, we don't desire further processing */
145-
if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
146-
hasVoidPage = TRUE;
147-
}
148-
else
149-
{
150-
OffsetNumber i;
151-
bool isChildHasVoid = FALSE;
152-
153-
for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
154-
{
155-
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
156-
157-
if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
158-
isChildHasVoid = TRUE;
159-
}
160-
161-
if (isChildHasVoid)
162-
hasVoidPage = TRUE;
163-
}
113+
typedef struct DataPageDeleteStack
114+
{
115+
struct DataPageDeleteStack *child;
116+
struct DataPageDeleteStack *parent;
164117

165-
/*
166-
* if we have root and there are empty pages in tree, then we don't
167-
* release lock to go further processing and guarantee that tree is unused
168-
*/
169-
if (!(isRoot && hasVoidPage))
170-
{
171-
UnlockReleaseBuffer(buffer);
172-
}
173-
else
174-
{
175-
Assert(rootBuffer);
176-
*rootBuffer = buffer;
177-
}
118+
BlockNumber blkno; /* current block number */
119+
BlockNumber leftBlkno; /* rightest non-deleted page on left */
120+
bool isRoot;
121+
} DataPageDeleteStack;
178122

179-
return hasVoidPage;
180-
}
181123

182124
/*
183125
* Delete a posting tree page.
@@ -194,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
194136
BlockNumber rightlink;
195137

196138
/*
197-
* Lock the pages in the same order as an insertion would, to avoid
198-
* deadlocks: left, then right, then parent.
139+
* This function MUST be called only if someone of parent pages hold
140+
* exclusive cleanup lock. This guarantees that no insertions currently
141+
* happen in this subtree. Caller also acquire Exclusive lock on deletable
142+
* page and is acquiring and releasing exclusive lock on left page before.
143+
* Left page was locked and released. Then parent and this page are locked.
144+
* We acquire left page lock here only to mark page dirty after changing
145+
* right pointer.
199146
*/
200147
lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
201148
RBM_NORMAL, gvs->strategy);
@@ -205,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
205152
RBM_NORMAL, gvs->strategy);
206153

207154
LockBuffer(lBuffer, GIN_EXCLUSIVE);
208-
LockBuffer(dBuffer, GIN_EXCLUSIVE);
209-
if (!isParentRoot) /* parent is already locked by
210-
* LockBufferForCleanup() */
211-
LockBuffer(pBuffer, GIN_EXCLUSIVE);
212155

213156
START_CRIT_SECTION();
214157

@@ -272,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
272215
PageSetLSN(BufferGetPage(lBuffer), recptr);
273216
}
274217

275-
if (!isParentRoot)
276-
LockBuffer(pBuffer, GIN_UNLOCK);
277218
ReleaseBuffer(pBuffer);
278219
UnlockReleaseBuffer(lBuffer);
279-
UnlockReleaseBuffer(dBuffer);
220+
ReleaseBuffer(dBuffer);
280221

281222
END_CRIT_SECTION();
282223

283224
gvs->result->pages_deleted++;
284225
}
285226

286-
typedef struct DataPageDeleteStack
287-
{
288-
struct DataPageDeleteStack *child;
289-
struct DataPageDeleteStack *parent;
290-
291-
BlockNumber blkno; /* current block number */
292-
BlockNumber leftBlkno; /* rightest non-deleted page on left */
293-
bool isRoot;
294-
} DataPageDeleteStack;
295227

296228
/*
297229
* scans posting tree and deletes empty pages
@@ -325,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
325257

326258
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
327259
RBM_NORMAL, gvs->strategy);
260+
261+
if(!isRoot)
262+
LockBuffer(buffer, GIN_EXCLUSIVE);
263+
328264
page = BufferGetPage(buffer);
329265

330266
Assert(GinPageIsData(page));
@@ -359,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
359295
}
360296
}
361297

298+
if(!isRoot)
299+
LockBuffer(buffer, GIN_UNLOCK);
300+
362301
ReleaseBuffer(buffer);
363302

364303
if (!meDelete)
@@ -367,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
367306
return meDelete;
368307
}
369308

370-
static void
371-
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
309+
310+
/*
311+
* Scan through posting tree, delete empty tuples from leaf pages.
312+
* Also, this function collects empty subtrees (with all empty leafs).
313+
* For parents of these subtrees CleanUp lock is taken, then we call
314+
* ScanToDelete. This is done for every inner page, which points to
315+
* empty subtree.
316+
*/
317+
static bool
318+
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
372319
{
373-
Buffer rootBuffer = InvalidBuffer;
374-
DataPageDeleteStack root,
375-
*ptr,
376-
*tmp;
320+
Buffer buffer;
321+
Page page;
322+
bool hasVoidPage = FALSE;
323+
MemoryContext oldCxt;
324+
325+
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
326+
RBM_NORMAL, gvs->strategy);
327+
page = BufferGetPage(buffer);
328+
329+
ginTraverseLock(buffer,false);
377330

378-
if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE)
331+
Assert(GinPageIsData(page));
332+
333+
if (GinPageIsLeaf(page))
379334
{
380-
Assert(rootBuffer == InvalidBuffer);
381-
return;
335+
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
336+
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
337+
MemoryContextSwitchTo(oldCxt);
338+
MemoryContextReset(gvs->tmpCxt);
339+
340+
/* if root is a leaf page, we don't desire further processing */
341+
if (GinDataLeafPageIsEmpty(page))
342+
hasVoidPage = TRUE;
343+
344+
UnlockReleaseBuffer(buffer);
345+
346+
return hasVoidPage;
382347
}
348+
else
349+
{
350+
OffsetNumber i;
351+
bool hasEmptyChild = FALSE;
352+
bool hasNonEmptyChild = FALSE;
353+
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
354+
BlockNumber* children = palloc(sizeof(BlockNumber) * (maxoff + 1));
355+
356+
/*
357+
* Read all children BlockNumbers.
358+
* Not sure it is safe if there are many concurrent vacuums.
359+
*/
383360

384-
memset(&root, 0, sizeof(DataPageDeleteStack));
385-
root.leftBlkno = InvalidBlockNumber;
386-
root.isRoot = TRUE;
361+
for (i = FirstOffsetNumber; i <= maxoff; i++)
362+
{
363+
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
387364

388-
vacuum_delay_point();
365+
children[i] = PostingItemGetBlockNumber(pitem);
366+
}
389367

390-
ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);
368+
UnlockReleaseBuffer(buffer);
391369

392-
ptr = root.child;
393-
while (ptr)
394-
{
395-
tmp = ptr->child;
396-
pfree(ptr);
397-
ptr = tmp;
370+
for (i = FirstOffsetNumber; i <= maxoff; i++)
371+
{
372+
if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
373+
hasEmptyChild = TRUE;
374+
else
375+
hasNonEmptyChild = TRUE;
376+
}
377+
378+
pfree(children);
379+
380+
vacuum_delay_point();
381+
382+
/*
383+
* All subtree is empty - just return TRUE to indicate that parent must
384+
* do a cleanup. Unless we are ROOT an there is way to go upper.
385+
*/
386+
387+
if(hasEmptyChild && !hasNonEmptyChild && !isRoot)
388+
return TRUE;
389+
390+
if(hasEmptyChild)
391+
{
392+
DataPageDeleteStack root,
393+
*ptr,
394+
*tmp;
395+
396+
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
397+
RBM_NORMAL, gvs->strategy);
398+
LockBufferForCleanup(buffer);
399+
400+
memset(&root, 0, sizeof(DataPageDeleteStack));
401+
root.leftBlkno = InvalidBlockNumber;
402+
root.isRoot = TRUE;
403+
404+
ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
405+
406+
ptr = root.child;
407+
408+
while (ptr)
409+
{
410+
tmp = ptr->child;
411+
pfree(ptr);
412+
ptr = tmp;
413+
}
414+
415+
UnlockReleaseBuffer(buffer);
416+
}
417+
418+
/* Here we have deleted all empty subtrees */
419+
return FALSE;
398420
}
421+
}
399422

400-
UnlockReleaseBuffer(rootBuffer);
423+
static void
424+
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
425+
{
426+
ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
401427
}
402428

403429
/*

src/include/access/gin_private.h

+2
Original file line numberDiff line numberDiff line change
@@ -471,4 +471,6 @@ ginCompareItemPointers(ItemPointer a, ItemPointer b)
471471
return -1;
472472
}
473473

474+
extern int ginTraverseLock(Buffer buffer, bool searchMode);
475+
474476
#endif /* GIN_PRIVATE_H */

0 commit comments

Comments
 (0)