PostgreSQL Source Code git master
bufmgr.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/read_stream.h"
62#include "storage/smgr.h"
63#include "storage/standby.h"
64#include "utils/memdebug.h"
65#include "utils/ps_status.h"
66#include "utils/rel.h"
67#include "utils/resowner.h"
68#include "utils/timestamp.h"
69
70
71/* Note: these two macros only work on shared buffers, not local ones! */
72#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74
75/* Note: this macro only works on local buffers, not shared ones! */
76#define LocalBufHdrGetBlock(bufHdr) \
77 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78
79/* Bits in SyncOneBuffer's return value */
80#define BUF_WRITTEN 0x01
81#define BUF_REUSABLE 0x02
82
83#define RELS_BSEARCH_THRESHOLD 20
84
85/*
86 * This is the size (in the number of blocks) above which we scan the
87 * entire buffer pool to remove the buffers for all the pages of relation
88 * being dropped. For the relations with size below this threshold, we find
89 * the buffers by doing lookups in BufMapping table.
90 */
91#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
92
94{
98
99/* 64 bytes, about the size of a cache line on common systems */
100#define REFCOUNT_ARRAY_ENTRIES 8
101
102/*
103 * Status of buffers to checkpoint for a particular tablespace, used
104 * internally in BufferSync.
105 */
106typedef struct CkptTsStatus
107{
108 /* oid of the tablespace */
110
111 /*
112 * Checkpoint progress for this tablespace. To make progress comparable
113 * between tablespaces the progress is, for each tablespace, measured as a
114 * number between 0 and the total number of to-be-checkpointed pages. Each
115 * page checkpointed in this tablespace increments this space's progress
116 * by progress_slice.
117 */
120
121 /* number of to-be checkpointed pages in this tablespace */
123 /* already processed pages in this tablespace */
125
126 /* current offset in CkptBufferIds for this tablespace */
127 int index;
129
130/*
131 * Type for array used to sort SMgrRelations
132 *
133 * FlushRelationsAllBuffers shares the same comparator function with
134 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
135 * compatible.
136 */
137typedef struct SMgrSortArray
138{
139 RelFileLocator rlocator; /* This must be the first member */
142
143/* GUC variables */
147bool track_io_timing = false;
148
149/*
150 * How many buffers PrefetchBuffer callers should try to stay ahead of their
151 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
152 * for buffers not belonging to tablespaces that have their
153 * effective_io_concurrency parameter set.
154 */
156
157/*
158 * Like effective_io_concurrency, but used by maintenance code paths that might
159 * benefit from a higher setting because they work on behalf of many sessions.
160 * Overridden by the tablespace setting of the same name.
161 */
163
164/*
165 * Limit on how many blocks should be handled in single I/O operations.
166 * StartReadBuffers() callers should respect it, as should other operations
167 * that call smgr APIs directly. It is computed as the minimum of underlying
168 * GUCs io_combine_limit_guc and io_max_combine_limit.
169 */
173
174/*
175 * GUC variables about triggering kernel writeback for buffers written; OS
176 * dependent defaults are set via the GUC mechanism.
177 */
181
182/* local state for LockBufferForCleanup */
184
185/*
186 * Backend-Private refcount management:
187 *
188 * Each buffer also has a private refcount that keeps track of the number of
189 * times the buffer is pinned in the current process. This is so that the
190 * shared refcount needs to be modified only once if a buffer is pinned more
191 * than once by an individual backend. It's also used to check that no buffers
192 * are still pinned at the end of transactions and when exiting.
193 *
194 *
195 * To avoid - as we used to - requiring an array with NBuffers entries to keep
196 * track of local buffers, we use a small sequentially searched array
197 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
198 * keep track of backend local pins.
199 *
200 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
201 * refcounts are kept track of in the array; after that, new array entries
202 * displace old ones into the hash table. That way a frequently used entry
203 * can't get "stuck" in the hashtable while infrequent ones clog the array.
204 *
205 * Note that in most scenarios the number of pinned buffers will not exceed
206 * REFCOUNT_ARRAY_ENTRIES.
207 *
208 *
209 * To enter a buffer into the refcount tracking mechanism first reserve a free
210 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
211 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
212 * memory allocations in NewPrivateRefCountEntry() which can be important
213 * because in some scenarios it's called with a spinlock held...
214 */
220
222
223static void ReservePrivateRefCountEntry(void);
228
229/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
230static void ResOwnerReleaseBufferIO(Datum res);
231static char *ResOwnerPrintBufferIO(Datum res);
232static void ResOwnerReleaseBufferPin(Datum res);
233static char *ResOwnerPrintBufferPin(Datum res);
234
236{
237 .name = "buffer io",
238 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239 .release_priority = RELEASE_PRIO_BUFFER_IOS,
240 .ReleaseResource = ResOwnerReleaseBufferIO,
241 .DebugPrint = ResOwnerPrintBufferIO
242};
243
245{
246 .name = "buffer pin",
247 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
248 .release_priority = RELEASE_PRIO_BUFFER_PINS,
249 .ReleaseResource = ResOwnerReleaseBufferPin,
250 .DebugPrint = ResOwnerPrintBufferPin
251};
252
253/*
254 * Ensure that the PrivateRefCountArray has sufficient space to store one more
255 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
256 * a new entry - but it's perfectly fine to not use a reserved entry.
257 */
258static void
260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
266 * First search for a free entry the array, that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
281 return;
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
301
302 /* Better be used, otherwise we shouldn't get here. */
304
305 /* enter victim array entry into hashtable */
309 &found);
310 Assert(!found);
312
313 /* clear the now free array slot */
316
318 }
319}
320
321/*
322 * Fill a previously reserved refcount entry.
323 */
326{
328
329 /* only allowed to be called when a reservation has been made */
331
332 /* use up the reserved entry */
335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}
342
343/*
344 * Return the PrivateRefCount entry for the passed buffer.
345 *
346 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
347 * do_move is true, and the entry resides in the hashtable the entry is
348 * optimized for frequent access by moving it to the array.
349 */
352{
354 int i;
355
358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
379 return NULL;
380
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
395
396 /* Ensure there's a free array slot */
398
399 /* Use up the reserved slot */
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
411 Assert(found);
414
415 return free;
416 }
417}
418
419/*
420 * Returns how many times the passed buffer is pinned by this backend.
421 *
422 * Only works for shared memory buffers!
423 */
424static inline int32
426{
428
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
442
443/*
444 * Release resources used to track the reference count of a buffer which we no
445 * longer have pinned and don't want to pin again immediately.
446 */
447static void
449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
470 Assert(found);
473 }
474}
475
476/*
477 * BufferIsPinned
478 * True iff the buffer is pinned (also checks for valid buffer number).
479 *
480 * NOTE: what we check here is that *this* backend holds a pin on
481 * the buffer. We do not care whether some other backend does.
482 */
483#define BufferIsPinned(bufnum) \
484( \
485 !BufferIsValid(bufnum) ? \
486 false \
487 : \
488 BufferIsLocal(bufnum) ? \
489 (LocalRefCount[-(bufnum) - 1] > 0) \
490 : \
491 (GetPrivateRefCount(bufnum) > 0) \
492)
493
494
496 SMgrRelation smgr, char smgr_persistence,
497 ForkNumber forkNum, BlockNumber blockNum,
500 ForkNumber fork,
501 BufferAccessStrategy strategy,
502 uint32 flags,
503 uint32 extend_by,
504 BlockNumber extend_upto,
505 Buffer *buffers,
506 uint32 *extended_by);
508 ForkNumber fork,
509 BufferAccessStrategy strategy,
510 uint32 flags,
511 uint32 extend_by,
512 BlockNumber extend_upto,
513 Buffer *buffers,
514 uint32 *extended_by);
515static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
516static void PinBuffer_Locked(BufferDesc *buf);
517static void UnpinBuffer(BufferDesc *buf);
518static void UnpinBufferNoOwner(BufferDesc *buf);
519static void BufferSync(int flags);
521static int SyncOneBuffer(int buf_id, bool skip_recently_used,
522 WritebackContext *wb_context);
523static void WaitIO(BufferDesc *buf);
524static void AbortBufferIO(Buffer buffer);
525static void shared_buffer_write_error_callback(void *arg);
526static void local_buffer_write_error_callback(void *arg);
527static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
528 char relpersistence,
529 ForkNumber forkNum,
530 BlockNumber blockNum,
531 BufferAccessStrategy strategy,
532 bool *foundPtr, IOContext io_context);
533static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
534static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
535static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
536static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
537 IOObject io_object, IOContext io_context);
538static void FindAndDropRelationBuffers(RelFileLocator rlocator,
539 ForkNumber forkNum,
540 BlockNumber nForkBlock,
541 BlockNumber firstDelBlock);
543 RelFileLocator dstlocator,
544 ForkNumber forkNum, bool permanent);
545static void AtProcExit_Buffers(int code, Datum arg);
546static void CheckForBufferLeaks(void);
547#ifdef USE_ASSERT_CHECKING
548static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
549 void *unused_context);
550#endif
551static int rlocator_comparator(const void *p1, const void *p2);
552static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
553static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
554static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
555
556
557/*
558 * Implementation of PrefetchBuffer() for shared buffers.
559 */
562 ForkNumber forkNum,
563 BlockNumber blockNum)
564{
565 PrefetchBufferResult result = {InvalidBuffer, false};
566 BufferTag newTag; /* identity of requested block */
567 uint32 newHash; /* hash value for newTag */
568 LWLock *newPartitionLock; /* buffer partition lock for it */
569 int buf_id;
570
571 Assert(BlockNumberIsValid(blockNum));
572
573 /* create a tag so we can lookup the buffer */
574 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
575 forkNum, blockNum);
576
577 /* determine its hash code and partition lock ID */
578 newHash = BufTableHashCode(&newTag);
579 newPartitionLock = BufMappingPartitionLock(newHash);
580
581 /* see if the block is in the buffer pool already */
582 LWLockAcquire(newPartitionLock, LW_SHARED);
583 buf_id = BufTableLookup(&newTag, newHash);
584 LWLockRelease(newPartitionLock);
585
586 /* If not in buffers, initiate prefetch */
587 if (buf_id < 0)
588 {
589#ifdef USE_PREFETCH
590 /*
591 * Try to initiate an asynchronous read. This returns false in
592 * recovery if the relation file doesn't exist.
593 */
594 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
595 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
596 {
597 result.initiated_io = true;
598 }
599#endif /* USE_PREFETCH */
600 }
601 else
602 {
603 /*
604 * Report the buffer it was in at that time. The caller may be able
605 * to avoid a buffer table lookup, but it's not pinned and it must be
606 * rechecked!
607 */
608 result.recent_buffer = buf_id + 1;
609 }
610
611 /*
612 * If the block *is* in buffers, we do nothing. This is not really ideal:
613 * the block might be just about to be evicted, which would be stupid
614 * since we know we are going to need it soon. But the only easy answer
615 * is to bump the usage_count, which does not seem like a great solution:
616 * when the caller does ultimately touch the block, usage_count would get
617 * bumped again, resulting in too much favoritism for blocks that are
618 * involved in a prefetch sequence. A real fix would involve some
619 * additional per-buffer state, and it's not clear that there's enough of
620 * a problem to justify that.
621 */
622
623 return result;
624}
625
626/*
627 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
628 *
629 * This is named by analogy to ReadBuffer but doesn't actually allocate a
630 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
631 * block will not be delayed by the I/O. Prefetching is optional.
632 *
633 * There are three possible outcomes:
634 *
635 * 1. If the block is already cached, the result includes a valid buffer that
636 * could be used by the caller to avoid the need for a later buffer lookup, but
637 * it's not pinned, so the caller must recheck it.
638 *
639 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
640 * true. Currently there is no way to know if the data was already cached by
641 * the kernel and therefore didn't really initiate I/O, and no way to know when
642 * the I/O completes other than using synchronous ReadBuffer().
643 *
644 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
645 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
646 * lack of a kernel facility), direct I/O is enabled, or the underlying
647 * relation file wasn't found and we are in recovery. (If the relation file
648 * wasn't found and we are not in recovery, an error is raised).
649 */
652{
653 Assert(RelationIsValid(reln));
654 Assert(BlockNumberIsValid(blockNum));
655
656 if (RelationUsesLocalBuffers(reln))
657 {
658 /* see comments in ReadBufferExtended */
659 if (RELATION_IS_OTHER_TEMP(reln))
661 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
662 errmsg("cannot access temporary tables of other sessions")));
663
664 /* pass it off to localbuf.c */
665 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
666 }
667 else
668 {
669 /* pass it to the shared buffer version */
670 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
671 }
672}
673
674/*
675 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
676 *
677 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
678 * successful. Return true if the buffer is valid and still has the expected
679 * tag. In that case, the buffer is pinned and the usage count is bumped.
680 */
681bool
683 Buffer recent_buffer)
684{
685 BufferDesc *bufHdr;
686 BufferTag tag;
687 uint32 buf_state;
688 bool have_private_ref;
689
690 Assert(BufferIsValid(recent_buffer));
691
694 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
695
696 if (BufferIsLocal(recent_buffer))
697 {
698 int b = -recent_buffer - 1;
699
700 bufHdr = GetLocalBufferDescriptor(b);
701 buf_state = pg_atomic_read_u32(&bufHdr->state);
702
703 /* Is it still valid and holding the right tag? */
704 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
705 {
706 PinLocalBuffer(bufHdr, true);
707
709
710 return true;
711 }
712 }
713 else
714 {
715 bufHdr = GetBufferDescriptor(recent_buffer - 1);
716 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
717
718 /*
719 * Do we already have this buffer pinned with a private reference? If
720 * so, it must be valid and it is safe to check the tag without
721 * locking. If not, we have to lock the header first and then check.
722 */
723 if (have_private_ref)
724 buf_state = pg_atomic_read_u32(&bufHdr->state);
725 else
726 buf_state = LockBufHdr(bufHdr);
727
728 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
729 {
730 /*
731 * It's now safe to pin the buffer. We can't pin first and ask
732 * questions later, because it might confuse code paths like
733 * InvalidateBuffer() if we pinned a random non-matching buffer.
734 */
735 if (have_private_ref)
736 PinBuffer(bufHdr, NULL); /* bump pin count */
737 else
738 PinBuffer_Locked(bufHdr); /* pin for first time */
739
741
742 return true;
743 }
744
745 /* If we locked the header above, now unlock. */
746 if (!have_private_ref)
747 UnlockBufHdr(bufHdr, buf_state);
748 }
749
750 return false;
751}
752
753/*
754 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
755 * fork with RBM_NORMAL mode and default strategy.
756 */
757Buffer
759{
760 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
761}
762
763/*
764 * ReadBufferExtended -- returns a buffer containing the requested
765 * block of the requested relation. If the blknum
766 * requested is P_NEW, extend the relation file and
767 * allocate a new block. (Caller is responsible for
768 * ensuring that only one backend tries to extend a
769 * relation at the same time!)
770 *
771 * Returns: the buffer number for the buffer containing
772 * the block read. The returned buffer has been pinned.
773 * Does not return on error --- elog's instead.
774 *
775 * Assume when this function is called, that reln has been opened already.
776 *
777 * In RBM_NORMAL mode, the page is read from disk, and the page header is
778 * validated. An error is thrown if the page header is not valid. (But
779 * note that an all-zero page is considered "valid"; see
780 * PageIsVerified().)
781 *
782 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
783 * valid, the page is zeroed instead of throwing an error. This is intended
784 * for non-critical data, where the caller is prepared to repair errors.
785 *
786 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
787 * filled with zeros instead of reading it from disk. Useful when the caller
788 * is going to fill the page from scratch, since this saves I/O and avoids
789 * unnecessary failure if the page-on-disk has corrupt page headers.
790 * The page is returned locked to ensure that the caller has a chance to
791 * initialize the page before it's made visible to others.
792 * Caution: do not use this mode to read a page that is beyond the relation's
793 * current physical EOF; that is likely to cause problems in md.c when
794 * the page is modified and written out. P_NEW is OK, though.
795 *
796 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
797 * a cleanup-strength lock on the page.
798 *
799 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
800 *
801 * If strategy is not NULL, a nondefault buffer access strategy is used.
802 * See buffer/README for details.
803 */
804inline Buffer
807{
808 Buffer buf;
809
810 /*
811 * Reject attempts to read non-local temporary relations; we would be
812 * likely to get wrong data since we have no visibility into the owning
813 * session's local buffers.
814 */
815 if (RELATION_IS_OTHER_TEMP(reln))
817 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
818 errmsg("cannot access temporary tables of other sessions")));
819
820 /*
821 * Read the buffer, and update pgstat counters to reflect a cache hit or
822 * miss.
823 */
824 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
825 forkNum, blockNum, mode, strategy);
826
827 return buf;
828}
829
830
831/*
832 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
833 * a relcache entry for the relation.
834 *
835 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
836 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
837 * cannot be used for temporary relations (and making that work might be
838 * difficult, unless we only want to read temporary relations for our own
839 * ProcNumber).
840 */
841Buffer
844 BufferAccessStrategy strategy, bool permanent)
845{
846 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
847
848 return ReadBuffer_common(NULL, smgr,
849 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
850 forkNum, blockNum,
851 mode, strategy);
852}
853
854/*
855 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
856 */
857Buffer
859 ForkNumber forkNum,
860 BufferAccessStrategy strategy,
861 uint32 flags)
862{
863 Buffer buf;
864 uint32 extend_by = 1;
865
866 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
867 &buf, &extend_by);
868
869 return buf;
870}
871
872/*
873 * Extend relation by multiple blocks.
874 *
875 * Tries to extend the relation by extend_by blocks. Depending on the
876 * availability of resources the relation may end up being extended by a
877 * smaller number of pages (unless an error is thrown, always by at least one
878 * page). *extended_by is updated to the number of pages the relation has been
879 * extended to.
880 *
881 * buffers needs to be an array that is at least extend_by long. Upon
882 * completion, the first extend_by array elements will point to a pinned
883 * buffer.
884 *
885 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
886 * locked. This is useful for callers that want a buffer that is guaranteed to
887 * be empty.
888 */
891 ForkNumber fork,
892 BufferAccessStrategy strategy,
893 uint32 flags,
894 uint32 extend_by,
895 Buffer *buffers,
896 uint32 *extended_by)
897{
898 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
899 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
900 Assert(extend_by > 0);
901
902 if (bmr.smgr == NULL)
903 {
904 bmr.smgr = RelationGetSmgr(bmr.rel);
905 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
906 }
907
908 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
909 extend_by, InvalidBlockNumber,
910 buffers, extended_by);
911}
912
913/*
914 * Extend the relation so it is at least extend_to blocks large, return buffer
915 * (extend_to - 1).
916 *
917 * This is useful for callers that want to write a specific page, regardless
918 * of the current size of the relation (e.g. useful for visibilitymap and for
919 * crash recovery).
920 */
921Buffer
923 ForkNumber fork,
924 BufferAccessStrategy strategy,
925 uint32 flags,
926 BlockNumber extend_to,
928{
930 uint32 extended_by = 0;
932 Buffer buffers[64];
933
934 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
935 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
936 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
937
938 if (bmr.smgr == NULL)
939 {
940 bmr.smgr = RelationGetSmgr(bmr.rel);
941 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
942 }
943
944 /*
945 * If desired, create the file if it doesn't exist. If
946 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
947 * an smgrexists call.
948 */
949 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
950 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
952 !smgrexists(bmr.smgr, fork))
953 {
955
956 /* recheck, fork might have been created concurrently */
957 if (!smgrexists(bmr.smgr, fork))
958 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
959
961 }
962
963 /*
964 * If requested, invalidate size cache, so that smgrnblocks asks the
965 * kernel.
966 */
967 if (flags & EB_CLEAR_SIZE_CACHE)
969
970 /*
971 * Estimate how many pages we'll need to extend by. This avoids acquiring
972 * unnecessarily many victim buffers.
973 */
974 current_size = smgrnblocks(bmr.smgr, fork);
975
976 /*
977 * Since no-one else can be looking at the page contents yet, there is no
978 * difference between an exclusive lock and a cleanup-strength lock. Note
979 * that we pass the original mode to ReadBuffer_common() below, when
980 * falling back to reading the buffer to a concurrent relation extension.
981 */
983 flags |= EB_LOCK_TARGET;
984
985 while (current_size < extend_to)
986 {
987 uint32 num_pages = lengthof(buffers);
988 BlockNumber first_block;
989
990 if ((uint64) current_size + num_pages > extend_to)
991 num_pages = extend_to - current_size;
992
993 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
994 num_pages, extend_to,
995 buffers, &extended_by);
996
997 current_size = first_block + extended_by;
998 Assert(num_pages != 0 || current_size >= extend_to);
999
1000 for (uint32 i = 0; i < extended_by; i++)
1001 {
1002 if (first_block + i != extend_to - 1)
1003 ReleaseBuffer(buffers[i]);
1004 else
1005 buffer = buffers[i];
1006 }
1007 }
1008
1009 /*
1010 * It's possible that another backend concurrently extended the relation.
1011 * In that case read the buffer.
1012 *
1013 * XXX: Should we control this via a flag?
1014 */
1015 if (buffer == InvalidBuffer)
1016 {
1017 Assert(extended_by == 0);
1019 fork, extend_to - 1, mode, strategy);
1020 }
1021
1022 return buffer;
1023}
1024
1025/*
1026 * Lock and optionally zero a buffer, as part of the implementation of
1027 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1028 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1029 */
1030static void
1032{
1033 BufferDesc *bufHdr;
1034 bool need_to_zero;
1035 bool isLocalBuf = BufferIsLocal(buffer);
1036
1038
1039 if (already_valid)
1040 {
1041 /*
1042 * If the caller already knew the buffer was valid, we can skip some
1043 * header interaction. The caller just wants to lock the buffer.
1044 */
1045 need_to_zero = false;
1046 }
1047 else if (isLocalBuf)
1048 {
1049 /* Simple case for non-shared buffers. */
1050 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1051 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1052 }
1053 else
1054 {
1055 /*
1056 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1057 * concurrently. Even though we aren't doing I/O, that ensures that
1058 * we don't zero a page that someone else has pinned. An exclusive
1059 * content lock wouldn't be enough, because readers are allowed to
1060 * drop the content lock after determining that a tuple is visible
1061 * (see buffer access rules in README).
1062 */
1063 bufHdr = GetBufferDescriptor(buffer - 1);
1064 need_to_zero = StartBufferIO(bufHdr, true, false);
1065 }
1066
1067 if (need_to_zero)
1068 {
1069 memset(BufferGetPage(buffer), 0, BLCKSZ);
1070
1071 /*
1072 * Grab the buffer content lock before marking the page as valid, to
1073 * make sure that no other backend sees the zeroed page before the
1074 * caller has had a chance to initialize it.
1075 *
1076 * Since no-one else can be looking at the page contents yet, there is
1077 * no difference between an exclusive lock and a cleanup-strength
1078 * lock. (Note that we cannot use LockBuffer() or
1079 * LockBufferForCleanup() here, because they assert that the buffer is
1080 * already valid.)
1081 */
1082 if (!isLocalBuf)
1084
1085 /* Set BM_VALID, terminate IO, and wake up any waiters */
1086 if (isLocalBuf)
1087 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1088 else
1089 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1090 }
1091 else if (!isLocalBuf)
1092 {
1093 /*
1094 * The buffer is valid, so we can't zero it. The caller still expects
1095 * the page to be locked on return.
1096 */
1097 if (mode == RBM_ZERO_AND_LOCK)
1099 else
1101 }
1102}
1103
1104/*
1105 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1106 * already present, or false if more work is required to either read it in or
1107 * zero it.
1108 */
1111 SMgrRelation smgr,
1112 char persistence,
1113 ForkNumber forkNum,
1114 BlockNumber blockNum,
1115 BufferAccessStrategy strategy,
1116 bool *foundPtr)
1117{
1118 BufferDesc *bufHdr;
1119 IOContext io_context;
1120 IOObject io_object;
1121
1122 Assert(blockNum != P_NEW);
1123
1124 /* Persistence should be set before */
1125 Assert((persistence == RELPERSISTENCE_TEMP ||
1126 persistence == RELPERSISTENCE_PERMANENT ||
1127 persistence == RELPERSISTENCE_UNLOGGED));
1128
1129 if (persistence == RELPERSISTENCE_TEMP)
1130 {
1131 io_context = IOCONTEXT_NORMAL;
1132 io_object = IOOBJECT_TEMP_RELATION;
1133 }
1134 else
1135 {
1136 io_context = IOContextForStrategy(strategy);
1137 io_object = IOOBJECT_RELATION;
1138 }
1139
1140 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1144 smgr->smgr_rlocator.backend);
1145
1146 if (persistence == RELPERSISTENCE_TEMP)
1147 {
1148 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1149 if (*foundPtr)
1151 }
1152 else
1153 {
1154 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1155 strategy, foundPtr, io_context);
1156 if (*foundPtr)
1158 }
1159 if (rel)
1160 {
1161 /*
1162 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1163 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1164 * zeroed instead), the per-relation stats always count them.
1165 */
1167 if (*foundPtr)
1169 }
1170 if (*foundPtr)
1171 {
1172 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1173 if (VacuumCostActive)
1175
1176 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1180 smgr->smgr_rlocator.backend,
1181 true);
1182 }
1183
1184 return BufferDescriptorGetBuffer(bufHdr);
1185}
1186
1187/*
1188 * ReadBuffer_common -- common logic for all ReadBuffer variants
1189 *
1190 * smgr is required, rel is optional unless using P_NEW.
1191 */
1193ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1194 ForkNumber forkNum,
1196 BufferAccessStrategy strategy)
1197{
1198 ReadBuffersOperation operation;
1199 Buffer buffer;
1200 int flags;
1201 char persistence;
1202
1203 /*
1204 * Backward compatibility path, most code should use ExtendBufferedRel()
1205 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1206 * scales a lot better.
1207 */
1208 if (unlikely(blockNum == P_NEW))
1209 {
1211
1212 /*
1213 * Since no-one else can be looking at the page contents yet, there is
1214 * no difference between an exclusive lock and a cleanup-strength
1215 * lock.
1216 */
1218 flags |= EB_LOCK_FIRST;
1219
1220 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1221 }
1222
1223 if (rel)
1224 persistence = rel->rd_rel->relpersistence;
1225 else
1226 persistence = smgr_persistence;
1227
1230 {
1231 bool found;
1232
1233 buffer = PinBufferForBlock(rel, smgr, persistence,
1234 forkNum, blockNum, strategy, &found);
1235 ZeroAndLockBuffer(buffer, mode, found);
1236 return buffer;
1237 }
1238
1239 /*
1240 * Signal that we are going to immediately wait. If we're immediately
1241 * waiting, there is no benefit in actually executing the IO
1242 * asynchronously, it would just add dispatch overhead.
1243 */
1245 if (mode == RBM_ZERO_ON_ERROR)
1247 operation.smgr = smgr;
1248 operation.rel = rel;
1249 operation.persistence = persistence;
1250 operation.forknum = forkNum;
1251 operation.strategy = strategy;
1252 if (StartReadBuffer(&operation,
1253 &buffer,
1254 blockNum,
1255 flags))
1256 WaitReadBuffers(&operation);
1257
1258 return buffer;
1259}
1260
1263 Buffer *buffers,
1264 BlockNumber blockNum,
1265 int *nblocks,
1266 int flags,
1267 bool allow_forwarding)
1268{
1269 int actual_nblocks = *nblocks;
1270 int maxcombine = 0;
1271 bool did_start_io;
1272
1273 Assert(*nblocks == 1 || allow_forwarding);
1274 Assert(*nblocks > 0);
1275 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1276
1277 for (int i = 0; i < actual_nblocks; ++i)
1278 {
1279 bool found;
1280
1281 if (allow_forwarding && buffers[i] != InvalidBuffer)
1282 {
1283 BufferDesc *bufHdr;
1284
1285 /*
1286 * This is a buffer that was pinned by an earlier call to
1287 * StartReadBuffers(), but couldn't be handled in one operation at
1288 * that time. The operation was split, and the caller has passed
1289 * an already pinned buffer back to us to handle the rest of the
1290 * operation. It must continue at the expected block number.
1291 */
1292 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1293
1294 /*
1295 * It might be an already valid buffer (a hit) that followed the
1296 * final contiguous block of an earlier I/O (a miss) marking the
1297 * end of it, or a buffer that some other backend has since made
1298 * valid by performing the I/O for us, in which case we can handle
1299 * it as a hit now. It is safe to check for a BM_VALID flag with
1300 * a relaxed load, because we got a fresh view of it while pinning
1301 * it in the previous call.
1302 *
1303 * On the other hand if we don't see BM_VALID yet, it must be an
1304 * I/O that was split by the previous call and we need to try to
1305 * start a new I/O from this block. We're also racing against any
1306 * other backend that might start the I/O or even manage to mark
1307 * it BM_VALID after this check, but StartBufferIO() will handle
1308 * those cases.
1309 */
1310 if (BufferIsLocal(buffers[i]))
1311 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1312 else
1313 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1315 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1316 }
1317 else
1318 {
1319 buffers[i] = PinBufferForBlock(operation->rel,
1320 operation->smgr,
1321 operation->persistence,
1322 operation->forknum,
1323 blockNum + i,
1324 operation->strategy,
1325 &found);
1326 }
1327
1328 if (found)
1329 {
1330 /*
1331 * We have a hit. If it's the first block in the requested range,
1332 * we can return it immediately and report that WaitReadBuffers()
1333 * does not need to be called. If the initial value of *nblocks
1334 * was larger, the caller will have to call again for the rest.
1335 */
1336 if (i == 0)
1337 {
1338 *nblocks = 1;
1339
1340#ifdef USE_ASSERT_CHECKING
1341
1342 /*
1343 * Initialize enough of ReadBuffersOperation to make
1344 * CheckReadBuffersOperation() work. Outside of assertions
1345 * that's not necessary when no IO is issued.
1346 */
1347 operation->buffers = buffers;
1348 operation->blocknum = blockNum;
1349 operation->nblocks = 1;
1350 operation->nblocks_done = 1;
1351 CheckReadBuffersOperation(operation, true);
1352#endif
1353 return false;
1354 }
1355
1356 /*
1357 * Otherwise we already have an I/O to perform, but this block
1358 * can't be included as it is already valid. Split the I/O here.
1359 * There may or may not be more blocks requiring I/O after this
1360 * one, we haven't checked, but they can't be contiguous with this
1361 * one in the way. We'll leave this buffer pinned, forwarding it
1362 * to the next call, avoiding the need to unpin it here and re-pin
1363 * it in the next call.
1364 */
1365 actual_nblocks = i;
1366 break;
1367 }
1368 else
1369 {
1370 /*
1371 * Check how many blocks we can cover with the same IO. The smgr
1372 * implementation might e.g. be limited due to a segment boundary.
1373 */
1374 if (i == 0 && actual_nblocks > 1)
1375 {
1376 maxcombine = smgrmaxcombine(operation->smgr,
1377 operation->forknum,
1378 blockNum);
1379 if (unlikely(maxcombine < actual_nblocks))
1380 {
1381 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1382 blockNum, actual_nblocks, maxcombine);
1383 actual_nblocks = maxcombine;
1384 }
1385 }
1386 }
1387 }
1388 *nblocks = actual_nblocks;
1389
1390 /* Populate information needed for I/O. */
1391 operation->buffers = buffers;
1392 operation->blocknum = blockNum;
1393 operation->flags = flags;
1394 operation->nblocks = actual_nblocks;
1395 operation->nblocks_done = 0;
1396 pgaio_wref_clear(&operation->io_wref);
1397
1398 /*
1399 * When using AIO, start the IO in the background. If not, issue prefetch
1400 * requests if desired by the caller.
1401 *
1402 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1403 * de-risk the introduction of AIO somewhat. It's a large architectural
1404 * change, with lots of chances for unanticipated performance effects.
1405 *
1406 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1407 * asynchronously, but without the check here we'd execute IO earlier than
1408 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1409 */
1410 if (io_method != IOMETHOD_SYNC)
1411 {
1412 /*
1413 * Try to start IO asynchronously. It's possible that no IO needs to
1414 * be started, if another backend already performed the IO.
1415 *
1416 * Note that if an IO is started, it might not cover the entire
1417 * requested range, e.g. because an intermediary block has been read
1418 * in by another backend. In that case any "trailing" buffers we
1419 * already pinned above will be "forwarded" by read_stream.c to the
1420 * next call to StartReadBuffers().
1421 *
1422 * This is signalled to the caller by decrementing *nblocks *and*
1423 * reducing operation->nblocks. The latter is done here, but not below
1424 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1425 * overall read size anymore, we need to retry until done in its
1426 * entirety or until failed.
1427 */
1428 did_start_io = AsyncReadBuffers(operation, nblocks);
1429
1430 operation->nblocks = *nblocks;
1431 }
1432 else
1433 {
1434 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1435
1436 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1437 {
1438 /*
1439 * In theory we should only do this if PinBufferForBlock() had to
1440 * allocate new buffers above. That way, if two calls to
1441 * StartReadBuffers() were made for the same blocks before
1442 * WaitReadBuffers(), only the first would issue the advice.
1443 * That'd be a better simulation of true asynchronous I/O, which
1444 * would only start the I/O once, but isn't done here for
1445 * simplicity.
1446 */
1447 smgrprefetch(operation->smgr,
1448 operation->forknum,
1449 blockNum,
1450 actual_nblocks);
1451 }
1452
1453 /*
1454 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1455 * will initiate the necessary IO.
1456 */
1457 did_start_io = true;
1458 }
1459
1460 CheckReadBuffersOperation(operation, !did_start_io);
1461
1462 return did_start_io;
1463}
1464
1465/*
1466 * Begin reading a range of blocks beginning at blockNum and extending for
1467 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1468 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1469 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1470 * and is now being continued. On return, *nblocks holds the number of blocks
1471 * accepted by this operation. If it is less than the original number then
1472 * this operation has been split, but buffer elements up to the original
1473 * requested size may hold forwarded buffers to be used for a continuing
1474 * operation. The caller must either start a new I/O beginning at the block
1475 * immediately following the blocks accepted by this call and pass those
1476 * buffers back in, or release them if it chooses not to. It shouldn't make
1477 * any other use of or assumptions about forwarded buffers.
1478 *
1479 * If false is returned, no I/O is necessary and the buffers covered by
1480 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1481 * an I/O has been started, and WaitReadBuffers() must be called with the same
1482 * operation object before the buffers covered by *nblocks on exit can be
1483 * accessed. Along with the operation object, the caller-supplied array of
1484 * buffers must remain valid until WaitReadBuffers() is called, and any
1485 * forwarded buffers must also be preserved for a continuing call unless
1486 * they are explicitly released.
1487 *
1488 * Currently the I/O is only started with optional operating system advice if
1489 * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1490 * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1491 * be initiated here.
1492 */
1493bool
1495 Buffer *buffers,
1496 BlockNumber blockNum,
1497 int *nblocks,
1498 int flags)
1499{
1500 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1501 true /* expect forwarded buffers */ );
1502}
1503
1504/*
1505 * Single block version of the StartReadBuffers(). This might save a few
1506 * instructions when called from another translation unit, because it is
1507 * specialized for nblocks == 1.
1508 *
1509 * This version does not support "forwarded" buffers: they cannot be created
1510 * by reading only one block and *buffer is ignored on entry.
1511 */
1512bool
1514 Buffer *buffer,
1515 BlockNumber blocknum,
1516 int flags)
1517{
1518 int nblocks = 1;
1519 bool result;
1520
1521 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1522 false /* single block, no forwarding */ );
1523 Assert(nblocks == 1); /* single block can't be short */
1524
1525 return result;
1526}
1527
1528/*
1529 * Perform sanity checks on the ReadBuffersOperation.
1530 */
1531static void
1533{
1534#ifdef USE_ASSERT_CHECKING
1535 Assert(operation->nblocks_done <= operation->nblocks);
1536 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1537
1538 for (int i = 0; i < operation->nblocks; i++)
1539 {
1540 Buffer buffer = operation->buffers[i];
1541 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1544
1545 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1547
1548 if (i < operation->nblocks_done)
1550 }
1551#endif
1552}
1553
1554/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1555static inline bool
1557{
1558 if (BufferIsLocal(buffer))
1560 true, nowait);
1561 else
1562 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1563}
1564
1565/*
1566 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1567 */
1568static inline bool
1570{
1571 /*
1572 * If this backend currently has staged IO, we need to submit the pending
1573 * IO before waiting for the right to issue IO, to avoid the potential for
1574 * deadlocks (and, more commonly, unnecessary delays for other backends).
1575 */
1576 if (!nowait && pgaio_have_staged())
1577 {
1579 return true;
1580
1581 /*
1582 * Unfortunately StartBufferIO() returning false doesn't allow to
1583 * distinguish between the buffer already being valid and IO already
1584 * being in progress. Since IO already being in progress is quite
1585 * rare, this approach seems fine.
1586 */
1588 }
1589
1590 return ReadBuffersCanStartIOOnce(buffer, nowait);
1591}
1592
1593/*
1594 * Helper for WaitReadBuffers() that processes the results of a readv
1595 * operation, raising an error if necessary.
1596 */
1597static void
1599{
1600 PgAioReturn *aio_ret = &operation->io_return;
1601 PgAioResultStatus rs = aio_ret->result.status;
1602 int newly_read_blocks = 0;
1603
1604 Assert(pgaio_wref_valid(&operation->io_wref));
1605 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1606
1607 /*
1608 * SMGR reports the number of blocks successfully read as the result of
1609 * the IO operation. Thus we can simply add that to ->nblocks_done.
1610 */
1611
1612 if (likely(rs != PGAIO_RS_ERROR))
1613 newly_read_blocks = aio_ret->result.result;
1614
1615 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1616 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1617 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1618 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1619 {
1620 /*
1621 * We'll retry, so we just emit a debug message to the server log (or
1622 * not even that in prod scenarios).
1623 */
1624 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1625 elog(DEBUG3, "partial read, will retry");
1626 }
1627
1628 Assert(newly_read_blocks > 0);
1629 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1630
1631 operation->nblocks_done += newly_read_blocks;
1632
1633 Assert(operation->nblocks_done <= operation->nblocks);
1634}
1635
1636void
1638{
1639 PgAioReturn *aio_ret = &operation->io_return;
1640 IOContext io_context;
1641 IOObject io_object;
1642
1643 if (operation->persistence == RELPERSISTENCE_TEMP)
1644 {
1645 io_context = IOCONTEXT_NORMAL;
1646 io_object = IOOBJECT_TEMP_RELATION;
1647 }
1648 else
1649 {
1650 io_context = IOContextForStrategy(operation->strategy);
1651 io_object = IOOBJECT_RELATION;
1652 }
1653
1654 /*
1655 * If we get here without an IO operation having been issued, the
1656 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1657 * caller should not have called WaitReadBuffers().
1658 *
1659 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1660 * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
1661 * of the retry logic below, no extra code is required.
1662 *
1663 * This path is expected to eventually go away.
1664 */
1665 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1666 elog(ERROR, "waiting for read operation that didn't read");
1667
1668 /*
1669 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1670 * done. We may need multiple retries, not just because we could get
1671 * multiple partial reads, but also because some of the remaining
1672 * to-be-read buffers may have been read in by other backends, limiting
1673 * the IO size.
1674 */
1675 while (true)
1676 {
1677 int ignored_nblocks_progress;
1678
1679 CheckReadBuffersOperation(operation, false);
1680
1681 /*
1682 * If there is an IO associated with the operation, we may need to
1683 * wait for it.
1684 */
1685 if (pgaio_wref_valid(&operation->io_wref))
1686 {
1687 /*
1688 * Track the time spent waiting for the IO to complete. As
1689 * tracking a wait even if we don't actually need to wait
1690 *
1691 * a) is not cheap, due to the timestamping overhead
1692 *
1693 * b) reports some time as waiting, even if we never waited
1694 *
1695 * we first check if we already know the IO is complete.
1696 */
1697 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1698 !pgaio_wref_check_done(&operation->io_wref))
1699 {
1701
1702 pgaio_wref_wait(&operation->io_wref);
1703
1704 /*
1705 * The IO operation itself was already counted earlier, in
1706 * AsyncReadBuffers(), this just accounts for the wait time.
1707 */
1708 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1709 io_start, 0, 0);
1710 }
1711 else
1712 {
1713 Assert(pgaio_wref_check_done(&operation->io_wref));
1714 }
1715
1716 /*
1717 * We now are sure the IO completed. Check the results. This
1718 * includes reporting on errors if there were any.
1719 */
1720 ProcessReadBuffersResult(operation);
1721 }
1722
1723 /*
1724 * Most of the time, the one IO we already started, will read in
1725 * everything. But we need to deal with partial reads and buffers not
1726 * needing IO anymore.
1727 */
1728 if (operation->nblocks_done == operation->nblocks)
1729 break;
1730
1732
1733 /*
1734 * This may only complete the IO partially, either because some
1735 * buffers were already valid, or because of a partial read.
1736 *
1737 * NB: In contrast to after the AsyncReadBuffers() call in
1738 * StartReadBuffers(), we do *not* reduce
1739 * ReadBuffersOperation->nblocks here, callers expect the full
1740 * operation to be completed at this point (as more operations may
1741 * have been queued).
1742 */
1743 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1744 }
1745
1746 CheckReadBuffersOperation(operation, true);
1747
1748 /* NB: READ_DONE tracepoint was already executed in completion callback */
1749}
1750
1751/*
1752 * Initiate IO for the ReadBuffersOperation
1753 *
1754 * This function only starts a single IO at a time. The size of the IO may be
1755 * limited to below the to-be-read blocks, if one of the buffers has
1756 * concurrently been read in. If the first to-be-read buffer is already valid,
1757 * no IO will be issued.
1758 *
1759 * To support retries after partial reads, the first operation->nblocks_done
1760 * buffers are skipped.
1761 *
1762 * On return *nblocks_progress is updated to reflect the number of buffers
1763 * affected by the call. If the first buffer is valid, *nblocks_progress is
1764 * set to 1 and operation->nblocks_done is incremented.
1765 *
1766 * Returns true if IO was initiated, false if no IO was necessary.
1767 */
1768static bool
1769AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1770{
1771 Buffer *buffers = &operation->buffers[0];
1772 int flags = operation->flags;
1773 BlockNumber blocknum = operation->blocknum;
1774 ForkNumber forknum = operation->forknum;
1775 char persistence = operation->persistence;
1776 int16 nblocks_done = operation->nblocks_done;
1777 Buffer *io_buffers = &operation->buffers[nblocks_done];
1778 int io_buffers_len = 0;
1779 PgAioHandle *ioh;
1780 uint32 ioh_flags = 0;
1781 void *io_pages[MAX_IO_COMBINE_LIMIT];
1782 IOContext io_context;
1783 IOObject io_object;
1784 bool did_start_io;
1785
1786 /*
1787 * When this IO is executed synchronously, either because the caller will
1788 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1789 * the AIO subsystem needs to know.
1790 */
1791 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1792 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1793
1794 if (persistence == RELPERSISTENCE_TEMP)
1795 {
1796 io_context = IOCONTEXT_NORMAL;
1797 io_object = IOOBJECT_TEMP_RELATION;
1798 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1799 }
1800 else
1801 {
1802 io_context = IOContextForStrategy(operation->strategy);
1803 io_object = IOOBJECT_RELATION;
1804 }
1805
1806 /*
1807 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1808 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1809 * set globally, but on a per-session basis. The completion callback,
1810 * which may be run in other processes, e.g. in IO workers, may have a
1811 * different value of the zero_damaged_pages GUC.
1812 *
1813 * XXX: We probably should eventually use a different flag for
1814 * zero_damaged_pages, so we can report different log levels / error codes
1815 * for zero_damaged_pages and ZERO_ON_ERROR.
1816 */
1819
1820 /*
1821 * For the same reason as with zero_damaged_pages we need to use this
1822 * backend's ignore_checksum_failure value.
1823 */
1826
1827
1828 /*
1829 * To be allowed to report stats in the local completion callback we need
1830 * to prepare to report stats now. This ensures we can safely report the
1831 * checksum failure even in a critical section.
1832 */
1834
1835 /*
1836 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1837 * might block, which we don't want after setting IO_IN_PROGRESS.
1838 *
1839 * If we need to wait for IO before we can get a handle, submit
1840 * already-staged IO first, so that other backends don't need to wait.
1841 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1842 * wait for already submitted IO, which doesn't require additional locks,
1843 * but it could still cause undesirable waits.
1844 *
1845 * A secondary benefit is that this would allow us to measure the time in
1846 * pgaio_io_acquire() without causing undue timer overhead in the common,
1847 * non-blocking, case. However, currently the pgstats infrastructure
1848 * doesn't really allow that, as it a) asserts that an operation can't
1849 * have time without operations b) doesn't have an API to report
1850 * "accumulated" time.
1851 */
1853 if (unlikely(!ioh))
1854 {
1856
1858 }
1859
1860 /*
1861 * Check if we can start IO on the first to-be-read buffer.
1862 *
1863 * If an I/O is already in progress in another backend, we want to wait
1864 * for the outcome: either done, or something went wrong and we will
1865 * retry.
1866 */
1867 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1868 {
1869 /*
1870 * Someone else has already completed this block, we're done.
1871 *
1872 * When IO is necessary, ->nblocks_done is updated in
1873 * ProcessReadBuffersResult(), but that is not called if no IO is
1874 * necessary. Thus update here.
1875 */
1876 operation->nblocks_done += 1;
1877 *nblocks_progress = 1;
1878
1879 pgaio_io_release(ioh);
1880 pgaio_wref_clear(&operation->io_wref);
1881 did_start_io = false;
1882
1883 /*
1884 * Report and track this as a 'hit' for this backend, even though it
1885 * must have started out as a miss in PinBufferForBlock(). The other
1886 * backend will track this as a 'read'.
1887 */
1888 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1889 operation->smgr->smgr_rlocator.locator.spcOid,
1890 operation->smgr->smgr_rlocator.locator.dbOid,
1891 operation->smgr->smgr_rlocator.locator.relNumber,
1892 operation->smgr->smgr_rlocator.backend,
1893 true);
1894
1895 if (persistence == RELPERSISTENCE_TEMP)
1897 else
1899
1900 if (operation->rel)
1901 pgstat_count_buffer_hit(operation->rel);
1902
1903 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1904
1905 if (VacuumCostActive)
1907 }
1908 else
1909 {
1910 instr_time io_start;
1911
1912 /* We found a buffer that we need to read in. */
1913 Assert(io_buffers[0] == buffers[nblocks_done]);
1914 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1915 io_buffers_len = 1;
1916
1917 /*
1918 * How many neighboring-on-disk blocks can we scatter-read into other
1919 * buffers at the same time? In this case we don't wait if we see an
1920 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1921 * head block, so we should get on with that I/O as soon as possible.
1922 */
1923 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1924 {
1925 if (!ReadBuffersCanStartIO(buffers[i], true))
1926 break;
1927 /* Must be consecutive block numbers. */
1928 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1929 BufferGetBlockNumber(buffers[i]) - 1);
1930 Assert(io_buffers[io_buffers_len] == buffers[i]);
1931
1932 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1933 }
1934
1935 /* get a reference to wait for in WaitReadBuffers() */
1936 pgaio_io_get_wref(ioh, &operation->io_wref);
1937
1938 /* provide the list of buffers to the completion callbacks */
1939 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1940
1942 persistence == RELPERSISTENCE_TEMP ?
1945 flags);
1946
1947 pgaio_io_set_flag(ioh, ioh_flags);
1948
1949 /* ---
1950 * Even though we're trying to issue IO asynchronously, track the time
1951 * in smgrstartreadv():
1952 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1953 * immediately
1954 * - the io method might not support the IO (e.g. worker IO for a temp
1955 * table)
1956 * ---
1957 */
1959 smgrstartreadv(ioh, operation->smgr, forknum,
1960 blocknum + nblocks_done,
1961 io_pages, io_buffers_len);
1962 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1963 io_start, 1, io_buffers_len * BLCKSZ);
1964
1965 if (persistence == RELPERSISTENCE_TEMP)
1966 pgBufferUsage.local_blks_read += io_buffers_len;
1967 else
1968 pgBufferUsage.shared_blks_read += io_buffers_len;
1969
1970 /*
1971 * Track vacuum cost when issuing IO, not after waiting for it.
1972 * Otherwise we could end up issuing a lot of IO in a short timespan,
1973 * despite a low cost limit.
1974 */
1975 if (VacuumCostActive)
1976 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1977
1978 *nblocks_progress = io_buffers_len;
1979 did_start_io = true;
1980 }
1981
1982 return did_start_io;
1983}
1984
1985/*
1986 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1987 * buffer. If no buffer exists already, selects a replacement victim and
1988 * evicts the old page, but does NOT read in new page.
1989 *
1990 * "strategy" can be a buffer replacement strategy object, or NULL for
1991 * the default strategy. The selected buffer's usage_count is advanced when
1992 * using the default strategy, but otherwise possibly not (see PinBuffer).
1993 *
1994 * The returned buffer is pinned and is already marked as holding the
1995 * desired page. If it already did have the desired page, *foundPtr is
1996 * set true. Otherwise, *foundPtr is set false.
1997 *
1998 * io_context is passed as an output parameter to avoid calling
1999 * IOContextForStrategy() when there is a shared buffers hit and no IO
2000 * statistics need be captured.
2001 *
2002 * No locks are held either at entry or exit.
2003 */
2005BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2006 BlockNumber blockNum,
2007 BufferAccessStrategy strategy,
2008 bool *foundPtr, IOContext io_context)
2009{
2010 BufferTag newTag; /* identity of requested block */
2011 uint32 newHash; /* hash value for newTag */
2012 LWLock *newPartitionLock; /* buffer partition lock for it */
2013 int existing_buf_id;
2014 Buffer victim_buffer;
2015 BufferDesc *victim_buf_hdr;
2016 uint32 victim_buf_state;
2017
2018 /* Make sure we will have room to remember the buffer pin */
2021
2022 /* create a tag so we can lookup the buffer */
2023 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2024
2025 /* determine its hash code and partition lock ID */
2026 newHash = BufTableHashCode(&newTag);
2027 newPartitionLock = BufMappingPartitionLock(newHash);
2028
2029 /* see if the block is in the buffer pool already */
2030 LWLockAcquire(newPartitionLock, LW_SHARED);
2031 existing_buf_id = BufTableLookup(&newTag, newHash);
2032 if (existing_buf_id >= 0)
2033 {
2034 BufferDesc *buf;
2035 bool valid;
2036
2037 /*
2038 * Found it. Now, pin the buffer so no one can steal it from the
2039 * buffer pool, and check to see if the correct data has been loaded
2040 * into the buffer.
2041 */
2042 buf = GetBufferDescriptor(existing_buf_id);
2043
2044 valid = PinBuffer(buf, strategy);
2045
2046 /* Can release the mapping lock as soon as we've pinned it */
2047 LWLockRelease(newPartitionLock);
2048
2049 *foundPtr = true;
2050
2051 if (!valid)
2052 {
2053 /*
2054 * We can only get here if (a) someone else is still reading in
2055 * the page, (b) a previous read attempt failed, or (c) someone
2056 * called StartReadBuffers() but not yet WaitReadBuffers().
2057 */
2058 *foundPtr = false;
2059 }
2060
2061 return buf;
2062 }
2063
2064 /*
2065 * Didn't find it in the buffer pool. We'll have to initialize a new
2066 * buffer. Remember to unlock the mapping lock while doing the work.
2067 */
2068 LWLockRelease(newPartitionLock);
2069
2070 /*
2071 * Acquire a victim buffer. Somebody else might try to do the same, we
2072 * don't hold any conflicting locks. If so we'll have to undo our work
2073 * later.
2074 */
2075 victim_buffer = GetVictimBuffer(strategy, io_context);
2076 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2077
2078 /*
2079 * Try to make a hashtable entry for the buffer under its new tag. If
2080 * somebody else inserted another buffer for the tag, we'll release the
2081 * victim buffer we acquired and use the already inserted one.
2082 */
2083 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2084 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2085 if (existing_buf_id >= 0)
2086 {
2087 BufferDesc *existing_buf_hdr;
2088 bool valid;
2089
2090 /*
2091 * Got a collision. Someone has already done what we were about to do.
2092 * We'll just handle this as if it were found in the buffer pool in
2093 * the first place. First, give up the buffer we were planning to
2094 * use.
2095 *
2096 * We could do this after releasing the partition lock, but then we'd
2097 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2098 * before acquiring the lock, for the rare case of such a collision.
2099 */
2100 UnpinBuffer(victim_buf_hdr);
2101
2102 /*
2103 * The victim buffer we acquired previously is clean and unused, let
2104 * it be found again quickly
2105 */
2106 StrategyFreeBuffer(victim_buf_hdr);
2107
2108 /* remaining code should match code at top of routine */
2109
2110 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2111
2112 valid = PinBuffer(existing_buf_hdr, strategy);
2113
2114 /* Can release the mapping lock as soon as we've pinned it */
2115 LWLockRelease(newPartitionLock);
2116
2117 *foundPtr = true;
2118
2119 if (!valid)
2120 {
2121 /*
2122 * We can only get here if (a) someone else is still reading in
2123 * the page, (b) a previous read attempt failed, or (c) someone
2124 * called StartReadBuffers() but not yet WaitReadBuffers().
2125 */
2126 *foundPtr = false;
2127 }
2128
2129 return existing_buf_hdr;
2130 }
2131
2132 /*
2133 * Need to lock the buffer header too in order to change its tag.
2134 */
2135 victim_buf_state = LockBufHdr(victim_buf_hdr);
2136
2137 /* some sanity checks while we hold the buffer header lock */
2138 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2139 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2140
2141 victim_buf_hdr->tag = newTag;
2142
2143 /*
2144 * Make sure BM_PERMANENT is set for buffers that must be written at every
2145 * checkpoint. Unlogged buffers only need to be written at shutdown
2146 * checkpoints, except for their "init" forks, which need to be treated
2147 * just like permanent relations.
2148 */
2149 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2150 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2151 victim_buf_state |= BM_PERMANENT;
2152
2153 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2154
2155 LWLockRelease(newPartitionLock);
2156
2157 /*
2158 * Buffer contents are currently invalid.
2159 */
2160 *foundPtr = false;
2161
2162 return victim_buf_hdr;
2163}
2164
2165/*
2166 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
2167 * freelist.
2168 *
2169 * The buffer header spinlock must be held at entry. We drop it before
2170 * returning. (This is sane because the caller must have locked the
2171 * buffer in order to be sure it should be dropped.)
2172 *
2173 * This is used only in contexts such as dropping a relation. We assume
2174 * that no other backend could possibly be interested in using the page,
2175 * so the only reason the buffer might be pinned is if someone else is
2176 * trying to write it out. We have to let them finish before we can
2177 * reclaim the buffer.
2178 *
2179 * The buffer could get reclaimed by someone else while we are waiting
2180 * to acquire the necessary locks; if so, don't mess it up.
2181 */
2182static void
2184{
2185 BufferTag oldTag;
2186 uint32 oldHash; /* hash value for oldTag */
2187 LWLock *oldPartitionLock; /* buffer partition lock for it */
2188 uint32 oldFlags;
2189 uint32 buf_state;
2190
2191 /* Save the original buffer tag before dropping the spinlock */
2192 oldTag = buf->tag;
2193
2194 buf_state = pg_atomic_read_u32(&buf->state);
2195 Assert(buf_state & BM_LOCKED);
2196 UnlockBufHdr(buf, buf_state);
2197
2198 /*
2199 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2200 * worth storing the hashcode in BufferDesc so we need not recompute it
2201 * here? Probably not.
2202 */
2203 oldHash = BufTableHashCode(&oldTag);
2204 oldPartitionLock = BufMappingPartitionLock(oldHash);
2205
2206retry:
2207
2208 /*
2209 * Acquire exclusive mapping lock in preparation for changing the buffer's
2210 * association.
2211 */
2212 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2213
2214 /* Re-lock the buffer header */
2215 buf_state = LockBufHdr(buf);
2216
2217 /* If it's changed while we were waiting for lock, do nothing */
2218 if (!BufferTagsEqual(&buf->tag, &oldTag))
2219 {
2220 UnlockBufHdr(buf, buf_state);
2221 LWLockRelease(oldPartitionLock);
2222 return;
2223 }
2224
2225 /*
2226 * We assume the reason for it to be pinned is that either we were
2227 * asynchronously reading the page in before erroring out or someone else
2228 * is flushing the page out. Wait for the IO to finish. (This could be
2229 * an infinite loop if the refcount is messed up... it would be nice to
2230 * time out after awhile, but there seems no way to be sure how many loops
2231 * may be needed. Note that if the other guy has pinned the buffer but
2232 * not yet done StartBufferIO, WaitIO will fall through and we'll
2233 * effectively be busy-looping here.)
2234 */
2235 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2236 {
2237 UnlockBufHdr(buf, buf_state);
2238 LWLockRelease(oldPartitionLock);
2239 /* safety check: should definitely not be our *own* pin */
2241 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2242 WaitIO(buf);
2243 goto retry;
2244 }
2245
2246 /*
2247 * Clear out the buffer's tag and flags. We must do this to ensure that
2248 * linear scans of the buffer array don't think the buffer is valid.
2249 */
2250 oldFlags = buf_state & BUF_FLAG_MASK;
2251 ClearBufferTag(&buf->tag);
2252 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2253 UnlockBufHdr(buf, buf_state);
2254
2255 /*
2256 * Remove the buffer from the lookup hashtable, if it was in there.
2257 */
2258 if (oldFlags & BM_TAG_VALID)
2259 BufTableDelete(&oldTag, oldHash);
2260
2261 /*
2262 * Done with mapping lock.
2263 */
2264 LWLockRelease(oldPartitionLock);
2265
2266 /*
2267 * Insert the buffer at the head of the list of free buffers.
2268 */
2270}
2271
2272/*
2273 * Helper routine for GetVictimBuffer()
2274 *
2275 * Needs to be called on a buffer with a valid tag, pinned, but without the
2276 * buffer header spinlock held.
2277 *
2278 * Returns true if the buffer can be reused, in which case the buffer is only
2279 * pinned by this backend and marked as invalid, false otherwise.
2280 */
2281static bool
2283{
2284 uint32 buf_state;
2285 uint32 hash;
2286 LWLock *partition_lock;
2287 BufferTag tag;
2288
2290
2291 /* have buffer pinned, so it's safe to read tag without lock */
2292 tag = buf_hdr->tag;
2293
2294 hash = BufTableHashCode(&tag);
2295 partition_lock = BufMappingPartitionLock(hash);
2296
2297 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2298
2299 /* lock the buffer header */
2300 buf_state = LockBufHdr(buf_hdr);
2301
2302 /*
2303 * We have the buffer pinned nobody else should have been able to unset
2304 * this concurrently.
2305 */
2306 Assert(buf_state & BM_TAG_VALID);
2307 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2308 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2309
2310 /*
2311 * If somebody else pinned the buffer since, or even worse, dirtied it,
2312 * give up on this buffer: It's clearly in use.
2313 */
2314 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2315 {
2316 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2317
2318 UnlockBufHdr(buf_hdr, buf_state);
2319 LWLockRelease(partition_lock);
2320
2321 return false;
2322 }
2323
2324 /*
2325 * Clear out the buffer's tag and flags and usagecount. This is not
2326 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2327 * doing anything with the buffer. But currently it's beneficial, as the
2328 * cheaper pre-check for several linear scans of shared buffers use the
2329 * tag (see e.g. FlushDatabaseBuffers()).
2330 */
2331 ClearBufferTag(&buf_hdr->tag);
2332 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2333 UnlockBufHdr(buf_hdr, buf_state);
2334
2335 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2336
2337 /* finally delete buffer from the buffer mapping table */
2338 BufTableDelete(&tag, hash);
2339
2340 LWLockRelease(partition_lock);
2341
2342 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2343 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2345
2346 return true;
2347}
2348
2349static Buffer
2351{
2352 BufferDesc *buf_hdr;
2353 Buffer buf;
2354 uint32 buf_state;
2355 bool from_ring;
2356
2357 /*
2358 * Ensure, while the spinlock's not yet held, that there's a free refcount
2359 * entry, and a resource owner slot for the pin.
2360 */
2363
2364 /* we return here if a prospective victim buffer gets used concurrently */
2365again:
2366
2367 /*
2368 * Select a victim buffer. The buffer is returned with its header
2369 * spinlock still held!
2370 */
2371 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2372 buf = BufferDescriptorGetBuffer(buf_hdr);
2373
2374 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2375
2376 /* Pin the buffer and then release the buffer spinlock */
2377 PinBuffer_Locked(buf_hdr);
2378
2379 /*
2380 * We shouldn't have any other pins for this buffer.
2381 */
2383
2384 /*
2385 * If the buffer was dirty, try to write it out. There is a race
2386 * condition here, in that someone might dirty it after we released the
2387 * buffer header lock above, or even while we are writing it out (since
2388 * our share-lock won't prevent hint-bit updates). We will recheck the
2389 * dirty bit after re-locking the buffer header.
2390 */
2391 if (buf_state & BM_DIRTY)
2392 {
2393 LWLock *content_lock;
2394
2395 Assert(buf_state & BM_TAG_VALID);
2396 Assert(buf_state & BM_VALID);
2397
2398 /*
2399 * We need a share-lock on the buffer contents to write it out (else
2400 * we might write invalid data, eg because someone else is compacting
2401 * the page contents while we write). We must use a conditional lock
2402 * acquisition here to avoid deadlock. Even though the buffer was not
2403 * pinned (and therefore surely not locked) when StrategyGetBuffer
2404 * returned it, someone else could have pinned and exclusive-locked it
2405 * by the time we get here. If we try to get the lock unconditionally,
2406 * we'd block waiting for them; if they later block waiting for us,
2407 * deadlock ensues. (This has been observed to happen when two
2408 * backends are both trying to split btree index pages, and the second
2409 * one just happens to be trying to split the page the first one got
2410 * from StrategyGetBuffer.)
2411 */
2412 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2413 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2414 {
2415 /*
2416 * Someone else has locked the buffer, so give it up and loop back
2417 * to get another one.
2418 */
2419 UnpinBuffer(buf_hdr);
2420 goto again;
2421 }
2422
2423 /*
2424 * If using a nondefault strategy, and writing the buffer would
2425 * require a WAL flush, let the strategy decide whether to go ahead
2426 * and write/reuse the buffer or to choose another victim. We need a
2427 * lock to inspect the page LSN, so this can't be done inside
2428 * StrategyGetBuffer.
2429 */
2430 if (strategy != NULL)
2431 {
2432 XLogRecPtr lsn;
2433
2434 /* Read the LSN while holding buffer header lock */
2435 buf_state = LockBufHdr(buf_hdr);
2436 lsn = BufferGetLSN(buf_hdr);
2437 UnlockBufHdr(buf_hdr, buf_state);
2438
2439 if (XLogNeedsFlush(lsn)
2440 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2441 {
2442 LWLockRelease(content_lock);
2443 UnpinBuffer(buf_hdr);
2444 goto again;
2445 }
2446 }
2447
2448 /* OK, do the I/O */
2449 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2450 LWLockRelease(content_lock);
2451
2453 &buf_hdr->tag);
2454 }
2455
2456
2457 if (buf_state & BM_VALID)
2458 {
2459 /*
2460 * When a BufferAccessStrategy is in use, blocks evicted from shared
2461 * buffers are counted as IOOP_EVICT in the corresponding context
2462 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2463 * strategy in two cases: 1) while initially claiming buffers for the
2464 * strategy ring 2) to replace an existing strategy ring buffer
2465 * because it is pinned or in use and cannot be reused.
2466 *
2467 * Blocks evicted from buffers already in the strategy ring are
2468 * counted as IOOP_REUSE in the corresponding strategy context.
2469 *
2470 * At this point, we can accurately count evictions and reuses,
2471 * because we have successfully claimed the valid buffer. Previously,
2472 * we may have been forced to release the buffer due to concurrent
2473 * pinners or erroring out.
2474 */
2476 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2477 }
2478
2479 /*
2480 * If the buffer has an entry in the buffer mapping table, delete it. This
2481 * can fail because another backend could have pinned or dirtied the
2482 * buffer.
2483 */
2484 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2485 {
2486 UnpinBuffer(buf_hdr);
2487 goto again;
2488 }
2489
2490 /* a final set of sanity checks */
2491#ifdef USE_ASSERT_CHECKING
2492 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2493
2494 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2495 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2496
2498#endif
2499
2500 return buf;
2501}
2502
2503/*
2504 * Return the maximum number of buffers that a backend should try to pin once,
2505 * to avoid exceeding its fair share. This is the highest value that
2506 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2507 * system with a very small buffer pool relative to max_connections.
2508 */
2509uint32
2511{
2512 return MaxProportionalPins;
2513}
2514
2515/*
2516 * Return the maximum number of additional buffers that this backend should
2517 * pin if it wants to stay under the per-backend limit, considering the number
2518 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2519 * return by this function can be zero.
2520 */
2521uint32
2523{
2524 uint32 estimated_pins_held;
2525
2526 /*
2527 * We get the number of "overflowed" pins for free, but don't know the
2528 * number of pins in PrivateRefCountArray. The cost of calculating that
2529 * exactly doesn't seem worth it, so just assume the max.
2530 */
2531 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2532
2533 /* Is this backend already holding more than its fair share? */
2534 if (estimated_pins_held > MaxProportionalPins)
2535 return 0;
2536
2537 return MaxProportionalPins - estimated_pins_held;
2538}
2539
2540/*
2541 * Limit the number of pins a batch operation may additionally acquire, to
2542 * avoid running out of pinnable buffers.
2543 *
2544 * One additional pin is always allowed, on the assumption that the operation
2545 * requires at least one to make progress.
2546 */
2547void
2549{
2550 uint32 limit;
2551
2552 if (*additional_pins <= 1)
2553 return;
2554
2555 limit = GetAdditionalPinLimit();
2556 limit = Max(limit, 1);
2557 if (limit < *additional_pins)
2558 *additional_pins = limit;
2559}
2560
2561/*
2562 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2563 * avoid duplicating the tracing and relpersistence related logic.
2564 */
2565static BlockNumber
2567 ForkNumber fork,
2568 BufferAccessStrategy strategy,
2569 uint32 flags,
2570 uint32 extend_by,
2571 BlockNumber extend_upto,
2572 Buffer *buffers,
2573 uint32 *extended_by)
2574{
2575 BlockNumber first_block;
2576
2577 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2582 extend_by);
2583
2584 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2585 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2586 extend_by, extend_upto,
2587 buffers, &extend_by);
2588 else
2589 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2590 extend_by, extend_upto,
2591 buffers, &extend_by);
2592 *extended_by = extend_by;
2593
2594 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2599 *extended_by,
2600 first_block);
2601
2602 return first_block;
2603}
2604
2605/*
2606 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2607 * shared buffers.
2608 */
2609static BlockNumber
2611 ForkNumber fork,
2612 BufferAccessStrategy strategy,
2613 uint32 flags,
2614 uint32 extend_by,
2615 BlockNumber extend_upto,
2616 Buffer *buffers,
2617 uint32 *extended_by)
2618{
2619 BlockNumber first_block;
2620 IOContext io_context = IOContextForStrategy(strategy);
2621 instr_time io_start;
2622
2623 LimitAdditionalPins(&extend_by);
2624
2625 /*
2626 * Acquire victim buffers for extension without holding extension lock.
2627 * Writing out victim buffers is the most expensive part of extending the
2628 * relation, particularly when doing so requires WAL flushes. Zeroing out
2629 * the buffers is also quite expensive, so do that before holding the
2630 * extension lock as well.
2631 *
2632 * These pages are pinned by us and not valid. While we hold the pin they
2633 * can't be acquired as victim buffers by another backend.
2634 */
2635 for (uint32 i = 0; i < extend_by; i++)
2636 {
2637 Block buf_block;
2638
2639 buffers[i] = GetVictimBuffer(strategy, io_context);
2640 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2641
2642 /* new buffers are zero-filled */
2643 MemSet(buf_block, 0, BLCKSZ);
2644 }
2645
2646 /*
2647 * Lock relation against concurrent extensions, unless requested not to.
2648 *
2649 * We use the same extension lock for all forks. That's unnecessarily
2650 * restrictive, but currently extensions for forks don't happen often
2651 * enough to make it worth locking more granularly.
2652 *
2653 * Note that another backend might have extended the relation by the time
2654 * we get the lock.
2655 */
2656 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2658
2659 /*
2660 * If requested, invalidate size cache, so that smgrnblocks asks the
2661 * kernel.
2662 */
2663 if (flags & EB_CLEAR_SIZE_CACHE)
2665
2666 first_block = smgrnblocks(bmr.smgr, fork);
2667
2668 /*
2669 * Now that we have the accurate relation size, check if the caller wants
2670 * us to extend to only up to a specific size. If there were concurrent
2671 * extensions, we might have acquired too many buffers and need to release
2672 * them.
2673 */
2674 if (extend_upto != InvalidBlockNumber)
2675 {
2676 uint32 orig_extend_by = extend_by;
2677
2678 if (first_block > extend_upto)
2679 extend_by = 0;
2680 else if ((uint64) first_block + extend_by > extend_upto)
2681 extend_by = extend_upto - first_block;
2682
2683 for (uint32 i = extend_by; i < orig_extend_by; i++)
2684 {
2685 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2686
2687 /*
2688 * The victim buffer we acquired previously is clean and unused,
2689 * let it be found again quickly
2690 */
2691 StrategyFreeBuffer(buf_hdr);
2692 UnpinBuffer(buf_hdr);
2693 }
2694
2695 if (extend_by == 0)
2696 {
2697 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2699 *extended_by = extend_by;
2700 return first_block;
2701 }
2702 }
2703
2704 /* Fail if relation is already at maximum possible length */
2705 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2706 ereport(ERROR,
2707 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2708 errmsg("cannot extend relation %s beyond %u blocks",
2709 relpath(bmr.smgr->smgr_rlocator, fork).str,
2710 MaxBlockNumber)));
2711
2712 /*
2713 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2714 *
2715 * This needs to happen before we extend the relation, because as soon as
2716 * we do, other backends can start to read in those pages.
2717 */
2718 for (uint32 i = 0; i < extend_by; i++)
2719 {
2720 Buffer victim_buf = buffers[i];
2721 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2722 BufferTag tag;
2723 uint32 hash;
2724 LWLock *partition_lock;
2725 int existing_id;
2726
2727 /* in case we need to pin an existing buffer below */
2730
2731 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2732 hash = BufTableHashCode(&tag);
2733 partition_lock = BufMappingPartitionLock(hash);
2734
2735 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2736
2737 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2738
2739 /*
2740 * We get here only in the corner case where we are trying to extend
2741 * the relation but we found a pre-existing buffer. This can happen
2742 * because a prior attempt at extending the relation failed, and
2743 * because mdread doesn't complain about reads beyond EOF (when
2744 * zero_damaged_pages is ON) and so a previous attempt to read a block
2745 * beyond EOF could have left a "valid" zero-filled buffer.
2746 * Unfortunately, we have also seen this case occurring because of
2747 * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2748 * that doesn't account for a recent write. In that situation, the
2749 * pre-existing buffer would contain valid data that we don't want to
2750 * overwrite. Since the legitimate cases should always have left a
2751 * zero-filled buffer, complain if not PageIsNew.
2752 */
2753 if (existing_id >= 0)
2754 {
2755 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2756 Block buf_block;
2757 bool valid;
2758
2759 /*
2760 * Pin the existing buffer before releasing the partition lock,
2761 * preventing it from being evicted.
2762 */
2763 valid = PinBuffer(existing_hdr, strategy);
2764
2765 LWLockRelease(partition_lock);
2766
2767 /*
2768 * The victim buffer we acquired previously is clean and unused,
2769 * let it be found again quickly
2770 */
2771 StrategyFreeBuffer(victim_buf_hdr);
2772 UnpinBuffer(victim_buf_hdr);
2773
2774 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2775 buf_block = BufHdrGetBlock(existing_hdr);
2776
2777 if (valid && !PageIsNew((Page) buf_block))
2778 ereport(ERROR,
2779 (errmsg("unexpected data beyond EOF in block %u of relation %s",
2780 existing_hdr->tag.blockNum,
2781 relpath(bmr.smgr->smgr_rlocator, fork).str),
2782 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2783
2784 /*
2785 * We *must* do smgr[zero]extend before succeeding, else the page
2786 * will not be reserved by the kernel, and the next P_NEW call
2787 * will decide to return the same page. Clear the BM_VALID bit,
2788 * do StartBufferIO() and proceed.
2789 *
2790 * Loop to handle the very small possibility that someone re-sets
2791 * BM_VALID between our clearing it and StartBufferIO inspecting
2792 * it.
2793 */
2794 do
2795 {
2796 uint32 buf_state = LockBufHdr(existing_hdr);
2797
2798 buf_state &= ~BM_VALID;
2799 UnlockBufHdr(existing_hdr, buf_state);
2800 } while (!StartBufferIO(existing_hdr, true, false));
2801 }
2802 else
2803 {
2804 uint32 buf_state;
2805
2806 buf_state = LockBufHdr(victim_buf_hdr);
2807
2808 /* some sanity checks while we hold the buffer header lock */
2809 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2810 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2811
2812 victim_buf_hdr->tag = tag;
2813
2814 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2815 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2816 buf_state |= BM_PERMANENT;
2817
2818 UnlockBufHdr(victim_buf_hdr, buf_state);
2819
2820 LWLockRelease(partition_lock);
2821
2822 /* XXX: could combine the locked operations in it with the above */
2823 StartBufferIO(victim_buf_hdr, true, false);
2824 }
2825 }
2826
2828
2829 /*
2830 * Note: if smgrzeroextend fails, we will end up with buffers that are
2831 * allocated but not marked BM_VALID. The next relation extension will
2832 * still select the same block number (because the relation didn't get any
2833 * longer on disk) and so future attempts to extend the relation will find
2834 * the same buffers (if they have not been recycled) but come right back
2835 * here to try smgrzeroextend again.
2836 *
2837 * We don't need to set checksum for all-zero pages.
2838 */
2839 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2840
2841 /*
2842 * Release the file-extension lock; it's now OK for someone else to extend
2843 * the relation some more.
2844 *
2845 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2846 * take noticeable time.
2847 */
2848 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2850
2852 io_start, 1, extend_by * BLCKSZ);
2853
2854 /* Set BM_VALID, terminate IO, and wake up any waiters */
2855 for (uint32 i = 0; i < extend_by; i++)
2856 {
2857 Buffer buf = buffers[i];
2858 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2859 bool lock = false;
2860
2861 if (flags & EB_LOCK_FIRST && i == 0)
2862 lock = true;
2863 else if (flags & EB_LOCK_TARGET)
2864 {
2865 Assert(extend_upto != InvalidBlockNumber);
2866 if (first_block + i + 1 == extend_upto)
2867 lock = true;
2868 }
2869
2870 if (lock)
2872
2873 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2874 }
2875
2877
2878 *extended_by = extend_by;
2879
2880 return first_block;
2881}
2882
2883/*
2884 * BufferIsExclusiveLocked
2885 *
2886 * Checks if buffer is exclusive-locked.
2887 *
2888 * Buffer must be pinned.
2889 */
2890bool
2892{
2893 BufferDesc *bufHdr;
2894
2896
2897 if (BufferIsLocal(buffer))
2898 {
2899 /* Content locks are not maintained for local buffers. */
2900 return true;
2901 }
2902 else
2903 {
2904 bufHdr = GetBufferDescriptor(buffer - 1);
2906 LW_EXCLUSIVE);
2907 }
2908}
2909
2910/*
2911 * BufferIsDirty
2912 *
2913 * Checks if buffer is already dirty.
2914 *
2915 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2916 * the result may be stale before it's returned.)
2917 */
2918bool
2920{
2921 BufferDesc *bufHdr;
2922
2924
2925 if (BufferIsLocal(buffer))
2926 {
2927 int bufid = -buffer - 1;
2928
2929 bufHdr = GetLocalBufferDescriptor(bufid);
2930 /* Content locks are not maintained for local buffers. */
2931 }
2932 else
2933 {
2934 bufHdr = GetBufferDescriptor(buffer - 1);
2936 LW_EXCLUSIVE));
2937 }
2938
2939 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2940}
2941
2942/*
2943 * MarkBufferDirty
2944 *
2945 * Marks buffer contents as dirty (actual write happens later).
2946 *
2947 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2948 * exclusive lock, then somebody could be in process of writing the buffer,
2949 * leading to risk of bad data written to disk.)
2950 */
2951void
2953{
2954 BufferDesc *bufHdr;
2955 uint32 buf_state;
2956 uint32 old_buf_state;
2957
2958 if (!BufferIsValid(buffer))
2959 elog(ERROR, "bad buffer ID: %d", buffer);
2960
2961 if (BufferIsLocal(buffer))
2962 {
2964 return;
2965 }
2966
2967 bufHdr = GetBufferDescriptor(buffer - 1);
2968
2971 LW_EXCLUSIVE));
2972
2973 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2974 for (;;)
2975 {
2976 if (old_buf_state & BM_LOCKED)
2977 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2978
2979 buf_state = old_buf_state;
2980
2981 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2982 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2983
2984 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2985 buf_state))
2986 break;
2987 }
2988
2989 /*
2990 * If the buffer was not dirty already, do vacuum accounting.
2991 */
2992 if (!(old_buf_state & BM_DIRTY))
2993 {
2995 if (VacuumCostActive)
2997 }
2998}
2999
3000/*
3001 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3002 *
3003 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3004 * compared to calling the two routines separately. Now it's mainly just
3005 * a convenience function. However, if the passed buffer is valid and
3006 * already contains the desired block, we just return it as-is; and that
3007 * does save considerable work compared to a full release and reacquire.
3008 *
3009 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3010 * buffer actually needs to be released. This case is the same as ReadBuffer,
3011 * but can save some tests in the caller.
3012 */
3013Buffer
3015 Relation relation,
3016 BlockNumber blockNum)
3017{
3018 ForkNumber forkNum = MAIN_FORKNUM;
3019 BufferDesc *bufHdr;
3020
3021 if (BufferIsValid(buffer))
3022 {
3024 if (BufferIsLocal(buffer))
3025 {
3026 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3027 if (bufHdr->tag.blockNum == blockNum &&
3028 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3029 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3030 return buffer;
3032 }
3033 else
3034 {
3035 bufHdr = GetBufferDescriptor(buffer - 1);
3036 /* we have pin, so it's ok to examine tag without spinlock */
3037 if (bufHdr->tag.blockNum == blockNum &&
3038 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3039 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3040 return buffer;
3041 UnpinBuffer(bufHdr);
3042 }
3043 }
3044
3045 return ReadBuffer(relation, blockNum);
3046}
3047
3048/*
3049 * PinBuffer -- make buffer unavailable for replacement.
3050 *
3051 * For the default access strategy, the buffer's usage_count is incremented
3052 * when we first pin it; for other strategies we just make sure the usage_count
3053 * isn't zero. (The idea of the latter is that we don't want synchronized
3054 * heap scans to inflate the count, but we need it to not be zero to discourage
3055 * other backends from stealing buffers from our ring. As long as we cycle
3056 * through the ring faster than the global clock-sweep cycles, buffers in
3057 * our ring won't be chosen as victims for replacement by other backends.)
3058 *
3059 * This should be applied only to shared buffers, never local ones.
3060 *
3061 * Since buffers are pinned/unpinned very frequently, pin buffers without
3062 * taking the buffer header lock; instead update the state variable in loop of
3063 * CAS operations. Hopefully it's just a single CAS.
3064 *
3065 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3066 * must have been done already.
3067 *
3068 * Returns true if buffer is BM_VALID, else false. This provision allows
3069 * some callers to avoid an extra spinlock cycle.
3070 */
3071static bool
3073{
3075 bool result;
3077
3080
3081 ref = GetPrivateRefCountEntry(b, true);
3082
3083 if (ref == NULL)
3084 {
3085 uint32 buf_state;
3086 uint32 old_buf_state;
3087
3089
3090 old_buf_state = pg_atomic_read_u32(&buf->state);
3091 for (;;)
3092 {
3093 if (old_buf_state & BM_LOCKED)
3094 old_buf_state = WaitBufHdrUnlocked(buf);
3095
3096 buf_state = old_buf_state;
3097
3098 /* increase refcount */
3099 buf_state += BUF_REFCOUNT_ONE;
3100
3101 if (strategy == NULL)
3102 {
3103 /* Default case: increase usagecount unless already max. */
3105 buf_state += BUF_USAGECOUNT_ONE;
3106 }
3107 else
3108 {
3109 /*
3110 * Ring buffers shouldn't evict others from pool. Thus we
3111 * don't make usagecount more than 1.
3112 */
3113 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3114 buf_state += BUF_USAGECOUNT_ONE;
3115 }
3116
3117 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3118 buf_state))
3119 {
3120 result = (buf_state & BM_VALID) != 0;
3121
3122 /*
3123 * Assume that we acquired a buffer pin for the purposes of
3124 * Valgrind buffer client checks (even in !result case) to
3125 * keep things simple. Buffers that are unsafe to access are
3126 * not generally guaranteed to be marked undefined or
3127 * non-accessible in any case.
3128 */
3130 break;
3131 }
3132 }
3133 }
3134 else
3135 {
3136 /*
3137 * If we previously pinned the buffer, it is likely to be valid, but
3138 * it may not be if StartReadBuffers() was called and
3139 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3140 * the flags without locking. This is racy, but it's OK to return
3141 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3142 * it'll see that it's now valid.
3143 *
3144 * Note: We deliberately avoid a Valgrind client request here.
3145 * Individual access methods can optionally superimpose buffer page
3146 * client requests on top of our client requests to enforce that
3147 * buffers are only accessed while locked (and pinned). It's possible
3148 * that the buffer page is legitimately non-accessible here. We
3149 * cannot meddle with that.
3150 */
3151 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3152 }
3153
3154 ref->refcount++;
3155 Assert(ref->refcount > 0);
3157 return result;
3158}
3159
3160/*
3161 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3162 * The spinlock is released before return.
3163 *
3164 * As this function is called with the spinlock held, the caller has to
3165 * previously call ReservePrivateRefCountEntry() and
3166 * ResourceOwnerEnlarge(CurrentResourceOwner);
3167 *
3168 * Currently, no callers of this function want to modify the buffer's
3169 * usage_count at all, so there's no need for a strategy parameter.
3170 * Also we don't bother with a BM_VALID test (the caller could check that for
3171 * itself).
3172 *
3173 * Also all callers only ever use this function when it's known that the
3174 * buffer can't have a preexisting pin by this backend. That allows us to skip
3175 * searching the private refcount array & hash, which is a boon, because the
3176 * spinlock is still held.
3177 *
3178 * Note: use of this routine is frequently mandatory, not just an optimization
3179 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3180 * its state can change under us.
3181 */
3182static void
3184{
3185 Buffer b;
3187 uint32 buf_state;
3188
3189 /*
3190 * As explained, We don't expect any preexisting pins. That allows us to
3191 * manipulate the PrivateRefCount after releasing the spinlock
3192 */
3194
3195 /*
3196 * Buffer can't have a preexisting pin, so mark its page as defined to
3197 * Valgrind (this is similar to the PinBuffer() case where the backend
3198 * doesn't already have a buffer pin)
3199 */
3201
3202 /*
3203 * Since we hold the buffer spinlock, we can update the buffer state and
3204 * release the lock in one operation.
3205 */
3206 buf_state = pg_atomic_read_u32(&buf->state);
3207 Assert(buf_state & BM_LOCKED);
3208 buf_state += BUF_REFCOUNT_ONE;
3209 UnlockBufHdr(buf, buf_state);
3210
3212
3214 ref->refcount++;
3215
3217}
3218
3219/*
3220 * Support for waking up another backend that is waiting for the cleanup lock
3221 * to be released using BM_PIN_COUNT_WAITER.
3222 *
3223 * See LockBufferForCleanup().
3224 *
3225 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3226 * not just reducing the backend-local pincount for the buffer).
3227 */
3228static void
3230{
3231 /*
3232 * Acquire the buffer header lock, re-check that there's a waiter. Another
3233 * backend could have unpinned this buffer, and already woken up the
3234 * waiter.
3235 *
3236 * There's no danger of the buffer being replaced after we unpinned it
3237 * above, as it's pinned by the waiter. The waiter removes
3238 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3239 * backend waking it up.
3240 */
3241 uint32 buf_state = LockBufHdr(buf);
3242
3243 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3244 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3245 {
3246 /* we just released the last pin other than the waiter's */
3247 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3248
3249 buf_state &= ~BM_PIN_COUNT_WAITER;
3250 UnlockBufHdr(buf, buf_state);
3251 ProcSendSignal(wait_backend_pgprocno);
3252 }
3253 else
3254 UnlockBufHdr(buf, buf_state);
3255}
3256
3257/*
3258 * UnpinBuffer -- make buffer available for replacement.
3259 *
3260 * This should be applied only to shared buffers, never local ones. This
3261 * always adjusts CurrentResourceOwner.
3262 */
3263static void
3265{
3267
3270}
3271
3272static void
3274{
3277
3279
3280 /* not moving as we're likely deleting it soon anyway */
3281 ref = GetPrivateRefCountEntry(b, false);
3282 Assert(ref != NULL);
3283 Assert(ref->refcount > 0);
3284 ref->refcount--;
3285 if (ref->refcount == 0)
3286 {
3287 uint32 buf_state;
3288 uint32 old_buf_state;
3289
3290 /*
3291 * Mark buffer non-accessible to Valgrind.
3292 *
3293 * Note that the buffer may have already been marked non-accessible
3294 * within access method code that enforces that buffers are only
3295 * accessed while a buffer lock is held.
3296 */
3298
3299 /* I'd better not still hold the buffer content lock */
3301
3302 /*
3303 * Decrement the shared reference count.
3304 *
3305 * Since buffer spinlock holder can update status using just write,
3306 * it's not safe to use atomic decrement here; thus use a CAS loop.
3307 */
3308 old_buf_state = pg_atomic_read_u32(&buf->state);
3309 for (;;)
3310 {
3311 if (old_buf_state & BM_LOCKED)
3312 old_buf_state = WaitBufHdrUnlocked(buf);
3313
3314 buf_state = old_buf_state;
3315
3316 buf_state -= BUF_REFCOUNT_ONE;
3317
3318 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3319 buf_state))
3320 break;
3321 }
3322
3323 /* Support LockBufferForCleanup() */
3324 if (buf_state & BM_PIN_COUNT_WAITER)
3326
3328 }
3329}
3330
3331#define ST_SORT sort_checkpoint_bufferids
3332#define ST_ELEMENT_TYPE CkptSortItem
3333#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3334#define ST_SCOPE static
3335#define ST_DEFINE
3336#include "lib/sort_template.h"
3337
3338/*
3339 * BufferSync -- Write out all dirty buffers in the pool.
3340 *
3341 * This is called at checkpoint time to write out all dirty shared buffers.
3342 * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
3343 * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3344 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
3345 * unlogged buffers, which are otherwise skipped. The remaining flags
3346 * currently have no effect here.
3347 */
3348static void
3349BufferSync(int flags)
3350{
3351 uint32 buf_state;
3352 int buf_id;
3353 int num_to_scan;
3354 int num_spaces;
3355 int num_processed;
3356 int num_written;
3357 CkptTsStatus *per_ts_stat = NULL;
3358 Oid last_tsid;
3359 binaryheap *ts_heap;
3360 int i;
3361 int mask = BM_DIRTY;
3362 WritebackContext wb_context;
3363
3364 /*
3365 * Unless this is a shutdown checkpoint or we have been explicitly told,
3366 * we write only permanent, dirty buffers. But at shutdown or end of
3367 * recovery, we write all dirty buffers.
3368 */
3371 mask |= BM_PERMANENT;
3372
3373 /*
3374 * Loop over all buffers, and mark the ones that need to be written with
3375 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3376 * can estimate how much work needs to be done.
3377 *
3378 * This allows us to write only those pages that were dirty when the
3379 * checkpoint began, and not those that get dirtied while it proceeds.
3380 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3381 * later in this function, or by normal backends or the bgwriter cleaning
3382 * scan, the flag is cleared. Any buffer dirtied after this point won't
3383 * have the flag set.
3384 *
3385 * Note that if we fail to write some buffer, we may leave buffers with
3386 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3387 * certainly need to be written for the next checkpoint attempt, too.
3388 */
3389 num_to_scan = 0;
3390 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3391 {
3392 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3393
3394 /*
3395 * Header spinlock is enough to examine BM_DIRTY, see comment in
3396 * SyncOneBuffer.
3397 */
3398 buf_state = LockBufHdr(bufHdr);
3399
3400 if ((buf_state & mask) == mask)
3401 {
3402 CkptSortItem *item;
3403
3404 buf_state |= BM_CHECKPOINT_NEEDED;
3405
3406 item = &CkptBufferIds[num_to_scan++];
3407 item->buf_id = buf_id;
3408 item->tsId = bufHdr->tag.spcOid;
3409 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3410 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3411 item->blockNum = bufHdr->tag.blockNum;
3412 }
3413
3414 UnlockBufHdr(bufHdr, buf_state);
3415
3416 /* Check for barrier events in case NBuffers is large. */
3419 }
3420
3421 if (num_to_scan == 0)
3422 return; /* nothing to do */
3423
3425
3426 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3427
3428 /*
3429 * Sort buffers that need to be written to reduce the likelihood of random
3430 * IO. The sorting is also important for the implementation of balancing
3431 * writes between tablespaces. Without balancing writes we'd potentially
3432 * end up writing to the tablespaces one-by-one; possibly overloading the
3433 * underlying system.
3434 */
3435 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3436
3437 num_spaces = 0;
3438
3439 /*
3440 * Allocate progress status for each tablespace with buffers that need to
3441 * be flushed. This requires the to-be-flushed array to be sorted.
3442 */
3443 last_tsid = InvalidOid;
3444 for (i = 0; i < num_to_scan; i++)
3445 {
3446 CkptTsStatus *s;
3447 Oid cur_tsid;
3448
3449 cur_tsid = CkptBufferIds[i].tsId;
3450
3451 /*
3452 * Grow array of per-tablespace status structs, every time a new
3453 * tablespace is found.
3454 */
3455 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3456 {
3457 Size sz;
3458
3459 num_spaces++;
3460
3461 /*
3462 * Not worth adding grow-by-power-of-2 logic here - even with a
3463 * few hundred tablespaces this should be fine.
3464 */
3465 sz = sizeof(CkptTsStatus) * num_spaces;
3466
3467 if (per_ts_stat == NULL)
3468 per_ts_stat = (CkptTsStatus *) palloc(sz);
3469 else
3470 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3471
3472 s = &per_ts_stat[num_spaces - 1];
3473 memset(s, 0, sizeof(*s));
3474 s->tsId = cur_tsid;
3475
3476 /*
3477 * The first buffer in this tablespace. As CkptBufferIds is sorted
3478 * by tablespace all (s->num_to_scan) buffers in this tablespace
3479 * will follow afterwards.
3480 */
3481 s->index = i;
3482
3483 /*
3484 * progress_slice will be determined once we know how many buffers
3485 * are in each tablespace, i.e. after this loop.
3486 */
3487
3488 last_tsid = cur_tsid;
3489 }
3490 else
3491 {
3492 s = &per_ts_stat[num_spaces - 1];
3493 }
3494
3495 s->num_to_scan++;
3496
3497 /* Check for barrier events. */
3500 }
3501
3502 Assert(num_spaces > 0);
3503
3504 /*
3505 * Build a min-heap over the write-progress in the individual tablespaces,
3506 * and compute how large a portion of the total progress a single
3507 * processed buffer is.
3508 */
3509 ts_heap = binaryheap_allocate(num_spaces,
3511 NULL);
3512
3513 for (i = 0; i < num_spaces; i++)
3514 {
3515 CkptTsStatus *ts_stat = &per_ts_stat[i];
3516
3517 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3518
3519 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3520 }
3521
3522 binaryheap_build(ts_heap);
3523
3524 /*
3525 * Iterate through to-be-checkpointed buffers and write the ones (still)
3526 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3527 * tablespaces; otherwise the sorting would lead to only one tablespace
3528 * receiving writes at a time, making inefficient use of the hardware.
3529 */
3530 num_processed = 0;
3531 num_written = 0;
3532 while (!binaryheap_empty(ts_heap))
3533 {
3534 BufferDesc *bufHdr = NULL;
3535 CkptTsStatus *ts_stat = (CkptTsStatus *)
3537
3538 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3539 Assert(buf_id != -1);
3540
3541 bufHdr = GetBufferDescriptor(buf_id);
3542
3543 num_processed++;
3544
3545 /*
3546 * We don't need to acquire the lock here, because we're only looking
3547 * at a single bit. It's possible that someone else writes the buffer
3548 * and clears the flag right after we check, but that doesn't matter
3549 * since SyncOneBuffer will then do nothing. However, there is a
3550 * further race condition: it's conceivable that between the time we
3551 * examine the bit here and the time SyncOneBuffer acquires the lock,
3552 * someone else not only wrote the buffer but replaced it with another
3553 * page and dirtied it. In that improbable case, SyncOneBuffer will
3554 * write the buffer though we didn't need to. It doesn't seem worth
3555 * guarding against this, though.
3556 */
3558 {
3559 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3560 {
3561 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3563 num_written++;
3564 }
3565 }
3566
3567 /*
3568 * Measure progress independent of actually having to flush the buffer
3569 * - otherwise writing become unbalanced.
3570 */
3571 ts_stat->progress += ts_stat->progress_slice;
3572 ts_stat->num_scanned++;
3573 ts_stat->index++;
3574
3575 /* Have all the buffers from the tablespace been processed? */
3576 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3577 {
3578 binaryheap_remove_first(ts_heap);
3579 }
3580 else
3581 {
3582 /* update heap with the new progress */
3583 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3584 }
3585
3586 /*
3587 * Sleep to throttle our I/O rate.
3588 *
3589 * (This will check for barrier events even if it doesn't sleep.)
3590 */
3591 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3592 }
3593
3594 /*
3595 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3596 * IOContext will always be IOCONTEXT_NORMAL.
3597 */
3599
3600 pfree(per_ts_stat);
3601 per_ts_stat = NULL;
3602 binaryheap_free(ts_heap);
3603
3604 /*
3605 * Update checkpoint statistics. As noted above, this doesn't include
3606 * buffers written by other backends or bgwriter scan.
3607 */
3608 CheckpointStats.ckpt_bufs_written += num_written;
3609
3610 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3611}
3612
3613/*
3614 * BgBufferSync -- Write out some dirty buffers in the pool.
3615 *
3616 * This is called periodically by the background writer process.
3617 *
3618 * Returns true if it's appropriate for the bgwriter process to go into
3619 * low-power hibernation mode. (This happens if the strategy clock sweep
3620 * has been "lapped" and no buffer allocations have occurred recently,
3621 * or if the bgwriter has been effectively disabled by setting
3622 * bgwriter_lru_maxpages to 0.)
3623 */
3624bool
3626{
3627 /* info obtained from freelist.c */
3628 int strategy_buf_id;
3629 uint32 strategy_passes;
3630 uint32 recent_alloc;
3631
3632 /*
3633 * Information saved between calls so we can determine the strategy
3634 * point's advance rate and avoid scanning already-cleaned buffers.
3635 */
3636 static bool saved_info_valid = false;
3637 static int prev_strategy_buf_id;
3638 static uint32 prev_strategy_passes;
3639 static int next_to_clean;
3640 static uint32 next_passes;
3641
3642 /* Moving averages of allocation rate and clean-buffer density */
3643 static float smoothed_alloc = 0;
3644 static float smoothed_density = 10.0;
3645
3646 /* Potentially these could be tunables, but for now, not */
3647 float smoothing_samples = 16;
3648 float scan_whole_pool_milliseconds = 120000.0;
3649
3650 /* Used to compute how far we scan ahead */
3651 long strategy_delta;
3652 int bufs_to_lap;
3653 int bufs_ahead;
3654 float scans_per_alloc;
3655 int reusable_buffers_est;
3656 int upcoming_alloc_est;
3657 int min_scan_buffers;
3658
3659 /* Variables for the scanning loop proper */
3660 int num_to_scan;
3661 int num_written;
3662 int reusable_buffers;
3663
3664 /* Variables for final smoothed_density update */
3665 long new_strategy_delta;
3666 uint32 new_recent_alloc;
3667
3668 /*
3669 * Find out where the freelist clock sweep currently is, and how many
3670 * buffer allocations have happened since our last call.
3671 */
3672 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3673
3674 /* Report buffer alloc counts to pgstat */
3675 PendingBgWriterStats.buf_alloc += recent_alloc;
3676
3677 /*
3678 * If we're not running the LRU scan, just stop after doing the stats
3679 * stuff. We mark the saved state invalid so that we can recover sanely
3680 * if LRU scan is turned back on later.
3681 */
3682 if (bgwriter_lru_maxpages <= 0)
3683 {
3684 saved_info_valid = false;
3685 return true;
3686 }
3687
3688 /*
3689 * Compute strategy_delta = how many buffers have been scanned by the
3690 * clock sweep since last time. If first time through, assume none. Then
3691 * see if we are still ahead of the clock sweep, and if so, how many
3692 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3693 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3694 * behavior when the passes counts wrap around.
3695 */
3696 if (saved_info_valid)
3697 {
3698 int32 passes_delta = strategy_passes - prev_strategy_passes;
3699
3700 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3701 strategy_delta += (long) passes_delta * NBuffers;
3702
3703 Assert(strategy_delta >= 0);
3704
3705 if ((int32) (next_passes - strategy_passes) > 0)
3706 {
3707 /* we're one pass ahead of the strategy point */
3708 bufs_to_lap = strategy_buf_id - next_to_clean;
3709#ifdef BGW_DEBUG
3710 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3711 next_passes, next_to_clean,
3712 strategy_passes, strategy_buf_id,
3713 strategy_delta, bufs_to_lap);
3714#endif
3715 }
3716 else if (next_passes == strategy_passes &&
3717 next_to_clean >= strategy_buf_id)
3718 {
3719 /* on same pass, but ahead or at least not behind */
3720 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3721#ifdef BGW_DEBUG
3722 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3723 next_passes, next_to_clean,
3724 strategy_passes, strategy_buf_id,
3725 strategy_delta, bufs_to_lap);
3726#endif
3727 }
3728 else
3729 {
3730 /*
3731 * We're behind, so skip forward to the strategy point and start
3732 * cleaning from there.
3733 */
3734#ifdef BGW_DEBUG
3735 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3736 next_passes, next_to_clean,
3737 strategy_passes, strategy_buf_id,
3738 strategy_delta);
3739#endif
3740 next_to_clean = strategy_buf_id;
3741 next_passes = strategy_passes;
3742 bufs_to_lap = NBuffers;
3743 }
3744 }
3745 else
3746 {
3747 /*
3748 * Initializing at startup or after LRU scanning had been off. Always
3749 * start at the strategy point.
3750 */
3751#ifdef BGW_DEBUG
3752 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3753 strategy_passes, strategy_buf_id);
3754#endif
3755 strategy_delta = 0;
3756 next_to_clean = strategy_buf_id;
3757 next_passes = strategy_passes;
3758 bufs_to_lap = NBuffers;
3759 }
3760
3761 /* Update saved info for next time */
3762 prev_strategy_buf_id = strategy_buf_id;
3763 prev_strategy_passes = strategy_passes;
3764 saved_info_valid = true;
3765
3766 /*
3767 * Compute how many buffers had to be scanned for each new allocation, ie,
3768 * 1/density of reusable buffers, and track a moving average of that.
3769 *
3770 * If the strategy point didn't move, we don't update the density estimate
3771 */
3772 if (strategy_delta > 0 && recent_alloc > 0)
3773 {
3774 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3775 smoothed_density += (scans_per_alloc - smoothed_density) /
3776 smoothing_samples;
3777 }
3778
3779 /*
3780 * Estimate how many reusable buffers there are between the current
3781 * strategy point and where we've scanned ahead to, based on the smoothed
3782 * density estimate.
3783 */
3784 bufs_ahead = NBuffers - bufs_to_lap;
3785 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3786
3787 /*
3788 * Track a moving average of recent buffer allocations. Here, rather than
3789 * a true average we want a fast-attack, slow-decline behavior: we
3790 * immediately follow any increase.
3791 */
3792 if (smoothed_alloc <= (float) recent_alloc)
3793 smoothed_alloc = recent_alloc;
3794 else
3795 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3796 smoothing_samples;
3797
3798 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3799 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3800
3801 /*
3802 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3803 * eventually underflow to zero, and the underflows produce annoying
3804 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3805 * zero, there's no point in tracking smaller and smaller values of
3806 * smoothed_alloc, so just reset it to exactly zero to avoid this
3807 * syndrome. It will pop back up as soon as recent_alloc increases.
3808 */
3809 if (upcoming_alloc_est == 0)
3810 smoothed_alloc = 0;
3811
3812 /*
3813 * Even in cases where there's been little or no buffer allocation
3814 * activity, we want to make a small amount of progress through the buffer
3815 * cache so that as many reusable buffers as possible are clean after an
3816 * idle period.
3817 *
3818 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3819 * the BGW will be called during the scan_whole_pool time; slice the
3820 * buffer pool into that many sections.
3821 */
3822 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3823
3824 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3825 {
3826#ifdef BGW_DEBUG
3827 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3828 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3829#endif
3830 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3831 }
3832
3833 /*
3834 * Now write out dirty reusable buffers, working forward from the
3835 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3836 * enough buffers to match our estimate of the next cycle's allocation
3837 * requirements, or hit the bgwriter_lru_maxpages limit.
3838 */
3839
3840 num_to_scan = bufs_to_lap;
3841 num_written = 0;
3842 reusable_buffers = reusable_buffers_est;
3843
3844 /* Execute the LRU scan */
3845 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3846 {
3847 int sync_state = SyncOneBuffer(next_to_clean, true,
3848 wb_context);
3849
3850 if (++next_to_clean >= NBuffers)
3851 {
3852 next_to_clean = 0;
3853 next_passes++;
3854 }
3855 num_to_scan--;
3856
3857 if (sync_state & BUF_WRITTEN)
3858 {
3859 reusable_buffers++;
3860 if (++num_written >= bgwriter_lru_maxpages)
3861 {
3863 break;
3864 }
3865 }
3866 else if (sync_state & BUF_REUSABLE)
3867 reusable_buffers++;
3868 }
3869
3871
3872#ifdef BGW_DEBUG
3873 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3874 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3875 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3876 bufs_to_lap - num_to_scan,
3877 num_written,
3878 reusable_buffers - reusable_buffers_est);
3879#endif
3880
3881 /*
3882 * Consider the above scan as being like a new allocation scan.
3883 * Characterize its density and update the smoothed one based on it. This
3884 * effectively halves the moving average period in cases where both the
3885 * strategy and the background writer are doing some useful scanning,
3886 * which is helpful because a long memory isn't as desirable on the
3887 * density estimates.
3888 */
3889 new_strategy_delta = bufs_to_lap - num_to_scan;
3890 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3891 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3892 {
3893 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3894 smoothed_density += (scans_per_alloc - smoothed_density) /
3895 smoothing_samples;
3896
3897#ifdef BGW_DEBUG
3898 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3899 new_recent_alloc, new_strategy_delta,
3900 scans_per_alloc, smoothed_density);
3901#endif
3902 }
3903
3904 /* Return true if OK to hibernate */
3905 return (bufs_to_lap == 0 && recent_alloc == 0);
3906}
3907
3908/*
3909 * SyncOneBuffer -- process a single buffer during syncing.
3910 *
3911 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3912 * buffers marked recently used, as these are not replacement candidates.
3913 *
3914 * Returns a bitmask containing the following flag bits:
3915 * BUF_WRITTEN: we wrote the buffer.
3916 * BUF_REUSABLE: buffer is available for replacement, ie, it has
3917 * pin count 0 and usage count 0.
3918 *
3919 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3920 * after locking it, but we don't care all that much.)
3921 */
3922static int
3923SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3924{
3925 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3926 int result = 0;
3927 uint32 buf_state;
3928 BufferTag tag;
3929
3930 /* Make sure we can handle the pin */
3933
3934 /*
3935 * Check whether buffer needs writing.
3936 *
3937 * We can make this check without taking the buffer content lock so long
3938 * as we mark pages dirty in access methods *before* logging changes with
3939 * XLogInsert(): if someone marks the buffer dirty just after our check we
3940 * don't worry because our checkpoint.redo points before log record for
3941 * upcoming changes and so we are not required to write such dirty buffer.
3942 */
3943 buf_state = LockBufHdr(bufHdr);
3944
3945 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3946 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3947 {
3948 result |= BUF_REUSABLE;
3949 }
3950 else if (skip_recently_used)
3951 {
3952 /* Caller told us not to write recently-used buffers */
3953 UnlockBufHdr(bufHdr, buf_state);
3954 return result;
3955 }
3956
3957 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3958 {
3959 /* It's clean, so nothing to do */
3960 UnlockBufHdr(bufHdr, buf_state);
3961 return result;
3962 }
3963
3964 /*
3965 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3966 * buffer is clean by the time we've locked it.)
3967 */
3968 PinBuffer_Locked(bufHdr);
3970
3972
3974
3975 tag = bufHdr->tag;
3976
3977 UnpinBuffer(bufHdr);
3978
3979 /*
3980 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3981 * IOContext will always be IOCONTEXT_NORMAL.
3982 */
3984
3985 return result | BUF_WRITTEN;
3986}
3987
3988/*
3989 * AtEOXact_Buffers - clean up at end of transaction.
3990 *
3991 * As of PostgreSQL 8.0, buffer pins should get released by the
3992 * ResourceOwner mechanism. This routine is just a debugging
3993 * cross-check that no pins remain.
3994 */
3995void
3996AtEOXact_Buffers(bool isCommit)
3997{
3999
4000 AtEOXact_LocalBuffers(isCommit);
4001
4003}
4004
4005/*
4006 * Initialize access to shared buffer pool
4007 *
4008 * This is called during backend startup (whether standalone or under the
4009 * postmaster). It sets up for this backend's access to the already-existing
4010 * buffer pool.
4011 */
4012void
4014{
4015 HASHCTL hash_ctl;
4016
4017 /*
4018 * An advisory limit on the number of pins each backend should hold, based
4019 * on shared_buffers and the maximum number of connections possible.
4020 * That's very pessimistic, but outside toy-sized shared_buffers it should
4021 * allow plenty of pins. LimitAdditionalPins() and
4022 * GetAdditionalPinLimit() can be used to check the remaining balance.
4023 */
4025
4026 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4027
4028 hash_ctl.keysize = sizeof(int32);
4029 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4030
4031 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4033
4034 /*
4035 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4036 * the corresponding phase of backend shutdown.
4037 */
4038 Assert(MyProc != NULL);
4040}
4041
4042/*
4043 * During backend exit, ensure that we released all shared-buffer locks and
4044 * assert that we have no remaining pins.
4045 */
4046static void
4048{
4049 UnlockBuffers();
4050
4052
4053 /* localbuf.c needs a chance too */
4055}
4056
4057/*
4058 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4059 *
4060 * As of PostgreSQL 8.0, buffer pins should get released by the
4061 * ResourceOwner mechanism. This routine is just a debugging
4062 * cross-check that no pins remain.
4063 */
4064static void
4066{
4067#ifdef USE_ASSERT_CHECKING
4068 int RefCountErrors = 0;
4070 int i;
4071 char *s;
4072
4073 /* check the array */
4074 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4075 {
4076 res = &PrivateRefCountArray[i];
4077
4078 if (res->buffer != InvalidBuffer)
4079 {
4081 elog(WARNING, "buffer refcount leak: %s", s);
4082 pfree(s);
4083
4084 RefCountErrors++;
4085 }
4086 }
4087
4088 /* if necessary search the hash */
4090 {
4091 HASH_SEQ_STATUS hstat;
4092
4094 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4095 {
4097 elog(WARNING, "buffer refcount leak: %s", s);
4098 pfree(s);
4099 RefCountErrors++;
4100 }
4101 }
4102
4103 Assert(RefCountErrors == 0);
4104#endif
4105}
4106
4107#ifdef USE_ASSERT_CHECKING
4108/*
4109 * Check for exclusive-locked catalog buffers. This is the core of
4110 * AssertCouldGetRelation().
4111 *
4112 * A backend would self-deadlock on LWLocks if the catalog scan read the
4113 * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4114 * catalogs used in relcache, because a catcache search on any catalog may
4115 * build that catalog's relcache entry. We don't have an inventory of
4116 * catalogs relcache uses, so just check buffers of most catalogs.
4117 *
4118 * It's better to minimize waits while holding an exclusive buffer lock, so it
4119 * would be nice to broaden this check not to be catalog-specific. However,
4120 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4121 * read tables. That is deadlock-free as long as there's no loop in the
4122 * dependency graph: modifying table A may cause an opclass to read table B,
4123 * but it must not cause a read of table A.
4124 */
4125void
4126AssertBufferLocksPermitCatalogRead(void)
4127{
4128 ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4129}
4130
4131static void
4132AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4133 void *unused_context)
4134{
4135 BufferDesc *bufHdr;
4136 BufferTag tag;
4137 Oid relid;
4138
4139 if (mode != LW_EXCLUSIVE)
4140 return;
4141
4142 if (!((BufferDescPadded *) lock > BufferDescriptors &&
4144 return; /* not a buffer lock */
4145
4146 bufHdr = (BufferDesc *)
4147 ((char *) lock - offsetof(BufferDesc, content_lock));
4148 tag = bufHdr->tag;
4149
4150 /*
4151 * This relNumber==relid assumption holds until a catalog experiences
4152 * VACUUM FULL or similar. After a command like that, relNumber will be
4153 * in the normal (non-catalog) range, and we lose the ability to detect
4154 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4155 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4156 * held lock.
4157 */
4158 relid = tag.relNumber;
4159
4160 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4161 return;
4162
4164}
4165#endif
4166
4167
4168/*
4169 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4170 */
4171char *
4173{
4174 BufferDesc *buf;
4175 int32 loccount;
4176 char *result;
4177 ProcNumber backend;
4178 uint32 buf_state;
4179
4181 if (BufferIsLocal(buffer))
4182 {
4184 loccount = LocalRefCount[-buffer - 1];
4185 backend = MyProcNumber;
4186 }
4187 else
4188 {
4190 loccount = GetPrivateRefCount(buffer);
4191 backend = INVALID_PROC_NUMBER;
4192 }
4193
4194 /* theoretically we should lock the bufhdr here */
4195 buf_state = pg_atomic_read_u32(&buf->state);
4196
4197 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4198 buffer,
4200 BufTagGetForkNum(&buf->tag)).str,
4201 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4202 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4203 return result;
4204}
4205
4206/*
4207 * CheckPointBuffers
4208 *
4209 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4210 *
4211 * Note: temporary relations do not participate in checkpoints, so they don't
4212 * need to be flushed.
4213 */
4214void
4216{
4217 BufferSync(flags);
4218}
4219
4220/*
4221 * BufferGetBlockNumber
4222 * Returns the block number associated with a buffer.
4223 *
4224 * Note:
4225 * Assumes that the buffer is valid and pinned, else the
4226 * value may be obsolete immediately...
4227 */
4230{
4231 BufferDesc *bufHdr;
4232
4234
4235 if (BufferIsLocal(buffer))
4236 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4237 else
4238 bufHdr = GetBufferDescriptor(buffer - 1);
4239
4240 /* pinned, so OK to read tag without spinlock */
4241 return bufHdr->tag.blockNum;
4242}
4243
4244/*
4245 * BufferGetTag
4246 * Returns the relfilelocator, fork number and block number associated with
4247 * a buffer.
4248 */
4249void
4251 BlockNumber *blknum)
4252{
4253 BufferDesc *bufHdr;
4254
4255 /* Do the same checks as BufferGetBlockNumber. */
4257
4258 if (BufferIsLocal(buffer))
4259 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4260 else
4261 bufHdr = GetBufferDescriptor(buffer - 1);
4262
4263 /* pinned, so OK to read tag without spinlock */
4264 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4265 *forknum = BufTagGetForkNum(&bufHdr->tag);
4266 *blknum = bufHdr->tag.blockNum;
4267}
4268
4269/*
4270 * FlushBuffer
4271 * Physically write out a shared buffer.
4272 *
4273 * NOTE: this actually just passes the buffer contents to the kernel; the
4274 * real write to disk won't happen until the kernel feels like it. This
4275 * is okay from our point of view since we can redo the changes from WAL.
4276 * However, we will need to force the changes to disk via fsync before
4277 * we can checkpoint WAL.
4278 *
4279 * The caller must hold a pin on the buffer and have share-locked the
4280 * buffer contents. (Note: a share-lock does not prevent updates of
4281 * hint bits in the buffer, so the page could change while the write
4282 * is in progress, but we assume that that will not invalidate the data
4283 * written.)
4284 *
4285 * If the caller has an smgr reference for the buffer's relation, pass it
4286 * as the second parameter. If not, pass NULL.
4287 */
4288static void
4290 IOContext io_context)
4291{
4292 XLogRecPtr recptr;
4293 ErrorContextCallback errcallback;
4294 instr_time io_start;
4295 Block bufBlock;
4296 char *bufToWrite;
4297 uint32 buf_state;
4298
4299 /*
4300 * Try to start an I/O operation. If StartBufferIO returns false, then
4301 * someone else flushed the buffer before we could, so we need not do
4302 * anything.
4303 */
4304 if (!StartBufferIO(buf, false, false))
4305 return;
4306
4307 /* Setup error traceback support for ereport() */
4309 errcallback.arg = buf;
4310 errcallback.previous = error_context_stack;
4311 error_context_stack = &errcallback;
4312
4313 /* Find smgr relation for buffer */
4314 if (reln == NULL)
4316
4317 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4318 buf->tag.blockNum,
4322
4323 buf_state = LockBufHdr(buf);
4324
4325 /*
4326 * Run PageGetLSN while holding header lock, since we don't have the
4327 * buffer locked exclusively in all cases.
4328 */
4329 recptr = BufferGetLSN(buf);
4330
4331 /* To check if block content changes while flushing. - vadim 01/17/97 */
4332 buf_state &= ~BM_JUST_DIRTIED;
4333 UnlockBufHdr(buf, buf_state);
4334
4335 /*
4336 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4337 * rule that log updates must hit disk before any of the data-file changes
4338 * they describe do.
4339 *
4340 * However, this rule does not apply to unlogged relations, which will be
4341 * lost after a crash anyway. Most unlogged relation pages do not bear
4342 * LSNs since we never emit WAL records for them, and therefore flushing
4343 * up through the buffer LSN would be useless, but harmless. However,
4344 * GiST indexes use LSNs internally to track page-splits, and therefore
4345 * unlogged GiST pages bear "fake" LSNs generated by
4346 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4347 * LSN counter could advance past the WAL insertion point; and if it did
4348 * happen, attempting to flush WAL through that location would fail, with
4349 * disastrous system-wide consequences. To make sure that can't happen,
4350 * skip the flush if the buffer isn't permanent.
4351 */
4352 if (buf_state & BM_PERMANENT)
4353 XLogFlush(recptr);
4354
4355 /*
4356 * Now it's safe to write the buffer to disk. Note that no one else should
4357 * have been able to write it, while we were busy with log flushing,
4358 * because we got the exclusive right to perform I/O by setting the
4359 * BM_IO_IN_PROGRESS bit.
4360 */
4361 bufBlock = BufHdrGetBlock(buf);
4362
4363 /*
4364 * Update page checksum if desired. Since we have only shared lock on the
4365 * buffer, other processes might be updating hint bits in it, so we must
4366 * copy the page to private storage if we do checksumming.
4367 */
4368 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4369
4371
4372 /*
4373 * bufToWrite is either the shared buffer or a copy, as appropriate.
4374 */
4375 smgrwrite(reln,
4376 BufTagGetForkNum(&buf->tag),
4377 buf->tag.blockNum,
4378 bufToWrite,
4379 false);
4380
4381 /*
4382 * When a strategy is in use, only flushes of dirty buffers already in the
4383 * strategy ring are counted as strategy writes (IOCONTEXT
4384 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4385 * statistics tracking.
4386 *
4387 * If a shared buffer initially added to the ring must be flushed before
4388 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4389 *
4390 * If a shared buffer which was added to the ring later because the
4391 * current strategy buffer is pinned or in use or because all strategy
4392 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4393 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4394 * (from_ring will be false).
4395 *
4396 * When a strategy is not in use, the write can only be a "regular" write
4397 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4398 */
4400 IOOP_WRITE, io_start, 1, BLCKSZ);
4401
4403
4404 /*
4405 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4406 * end the BM_IO_IN_PROGRESS state.
4407 */
4408 TerminateBufferIO(buf, true, 0, true, false);
4409
4410 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4411 buf->tag.blockNum,
4415
4416 /* Pop the error context stack */
4417 error_context_stack = errcallback.previous;
4418}
4419
4420/*
4421 * RelationGetNumberOfBlocksInFork
4422 * Determines the current number of pages in the specified relation fork.
4423 *
4424 * Note that the accuracy of the result will depend on the details of the
4425 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4426 * it might not be.
4427 */
4430{
4431 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4432 {
4433 /*
4434 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4435 * tableam returns the size in bytes - but for the purpose of this
4436 * routine, we want the number of blocks. Therefore divide, rounding
4437 * up.
4438 */
4439 uint64 szbytes;
4440
4441 szbytes = table_relation_size(relation, forkNum);
4442
4443 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4444 }
4445 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4446 {
4447 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4448 }
4449 else
4450 Assert(false);
4451
4452 return 0; /* keep compiler quiet */
4453}
4454
4455/*
4456 * BufferIsPermanent
4457 * Determines whether a buffer will potentially still be around after
4458 * a crash. Caller must hold a buffer pin.
4459 */
4460bool
4462{
4463 BufferDesc *bufHdr;
4464
4465 /* Local buffers are used only for temp relations. */
4466 if (BufferIsLocal(buffer))
4467 return false;
4468
4469 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4472
4473 /*
4474 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4475 * need not bother with the buffer header spinlock. Even if someone else
4476 * changes the buffer header state while we're doing this, the state is
4477 * changed atomically, so we'll read the old value or the new value, but
4478 * not random garbage.
4479 */
4480 bufHdr = GetBufferDescriptor(buffer - 1);
4481 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4482}
4483
4484/*
4485 * BufferGetLSNAtomic
4486 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4487 * This is necessary for some callers who may not have an exclusive lock
4488 * on the buffer.
4489 */
4492{
4493 char *page = BufferGetPage(buffer);
4494 BufferDesc *bufHdr;
4495 XLogRecPtr lsn;
4496 uint32 buf_state;
4497
4498 /*
4499 * If we don't need locking for correctness, fastpath out.
4500 */
4502 return PageGetLSN(page);
4503
4504 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4507
4508 bufHdr = GetBufferDescriptor(buffer - 1);
4509 buf_state = LockBufHdr(bufHdr);
4510 lsn = PageGetLSN(page);
4511 UnlockBufHdr(bufHdr, buf_state);
4512
4513 return lsn;
4514}
4515
4516/* ---------------------------------------------------------------------
4517 * DropRelationBuffers
4518 *
4519 * This function removes from the buffer pool all the pages of the
4520 * specified relation forks that have block numbers >= firstDelBlock.
4521 * (In particular, with firstDelBlock = 0, all pages are removed.)
4522 * Dirty pages are simply dropped, without bothering to write them
4523 * out first. Therefore, this is NOT rollback-able, and so should be
4524 * used only with extreme caution!
4525 *
4526 * Currently, this is called only from smgr.c when the underlying file
4527 * is about to be deleted or truncated (firstDelBlock is needed for
4528 * the truncation case). The data in the affected pages would therefore
4529 * be deleted momentarily anyway, and there is no point in writing it.
4530 * It is the responsibility of higher-level code to ensure that the
4531 * deletion or truncation does not lose any data that could be needed
4532 * later. It is also the responsibility of higher-level code to ensure
4533 * that no other process could be trying to load more pages of the
4534 * relation into buffers.
4535 * --------------------------------------------------------------------
4536 */
4537void
4539 int nforks, BlockNumber *firstDelBlock)
4540{
4541 int i;
4542 int j;
4543 RelFileLocatorBackend rlocator;
4544 BlockNumber nForkBlock[MAX_FORKNUM];
4545 uint64 nBlocksToInvalidate = 0;
4546
4547 rlocator = smgr_reln->smgr_rlocator;
4548
4549 /* If it's a local relation, it's localbuf.c's problem. */
4550 if (RelFileLocatorBackendIsTemp(rlocator))
4551 {
4552 if (rlocator.backend == MyProcNumber)
4553 {
4554 for (j = 0; j < nforks; j++)
4555 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4556 firstDelBlock[j]);
4557 }
4558 return;
4559 }
4560
4561 /*
4562 * To remove all the pages of the specified relation forks from the buffer
4563 * pool, we need to scan the entire buffer pool but we can optimize it by
4564 * finding the buffers from BufMapping table provided we know the exact
4565 * size of each fork of the relation. The exact size is required to ensure
4566 * that we don't leave any buffer for the relation being dropped as
4567 * otherwise the background writer or checkpointer can lead to a PANIC
4568 * error while flushing buffers corresponding to files that don't exist.
4569 *
4570 * To know the exact size, we rely on the size cached for each fork by us
4571 * during recovery which limits the optimization to recovery and on
4572 * standbys but we can easily extend it once we have shared cache for
4573 * relation size.
4574 *
4575 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4576 * and the future writes keeps the cached value up-to-date. See
4577 * smgrextend. It is possible that the value of the first lseek is smaller
4578 * than the actual number of existing blocks in the file due to buggy
4579 * Linux kernels that might not have accounted for the recent write. But
4580 * that should be fine because there must not be any buffers after that
4581 * file size.
4582 */
4583 for (i = 0; i < nforks; i++)
4584 {
4585 /* Get the number of blocks for a relation's fork */
4586 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4587
4588 if (nForkBlock[i] == InvalidBlockNumber)
4589 {
4590 nBlocksToInvalidate = InvalidBlockNumber;
4591 break;
4592 }
4593
4594 /* calculate the number of blocks to be invalidated */
4595 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4596 }
4597
4598 /*
4599 * We apply the optimization iff the total number of blocks to invalidate
4600 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4601 */
4602 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4603 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4604 {
4605 for (j = 0; j < nforks; j++)
4606 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4607 nForkBlock[j], firstDelBlock[j]);
4608 return;
4609 }
4610
4611 for (i = 0; i < NBuffers; i++)
4612 {
4613 BufferDesc *bufHdr = GetBufferDescriptor(i);
4614 uint32 buf_state;
4615
4616 /*
4617 * We can make this a tad faster by prechecking the buffer tag before
4618 * we attempt to lock the buffer; this saves a lot of lock
4619 * acquisitions in typical cases. It should be safe because the
4620 * caller must have AccessExclusiveLock on the relation, or some other
4621 * reason to be certain that no one is loading new pages of the rel
4622 * into the buffer pool. (Otherwise we might well miss such pages
4623 * entirely.) Therefore, while the tag might be changing while we
4624 * look at it, it can't be changing *to* a value we care about, only
4625 * *away* from such a value. So false negatives are impossible, and
4626 * false positives are safe because we'll recheck after getting the
4627 * buffer lock.
4628 *
4629 * We could check forkNum and blockNum as well as the rlocator, but
4630 * the incremental win from doing so seems small.
4631 */
4632 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4633 continue;
4634
4635 buf_state = LockBufHdr(bufHdr);
4636
4637 for (j = 0; j < nforks; j++)
4638 {
4639 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4640 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4641 bufHdr->tag.blockNum >= firstDelBlock[j])
4642 {
4643 InvalidateBuffer(bufHdr); /* releases spinlock */
4644 break;
4645 }
4646 }
4647 if (j >= nforks)
4648 UnlockBufHdr(bufHdr, buf_state);
4649 }
4650}
4651
4652/* ---------------------------------------------------------------------
4653 * DropRelationsAllBuffers
4654 *
4655 * This function removes from the buffer pool all the pages of all
4656 * forks of the specified relations. It's equivalent to calling
4657 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4658 * --------------------------------------------------------------------
4659 */
4660void
4661DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4662{
4663 int i;
4664 int n = 0;
4665 SMgrRelation *rels;
4666 BlockNumber (*block)[MAX_FORKNUM + 1];
4667 uint64 nBlocksToInvalidate = 0;
4668 RelFileLocator *locators;
4669 bool cached = true;
4670 bool use_bsearch;
4671
4672 if (nlocators == 0)
4673 return;
4674
4675 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4676
4677 /* If it's a local relation, it's localbuf.c's problem. */
4678 for (i = 0; i < nlocators; i++)
4679 {
4680 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4681 {
4682 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4683 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4684 }
4685 else
4686 rels[n++] = smgr_reln[i];
4687 }
4688
4689 /*
4690 * If there are no non-local relations, then we're done. Release the
4691 * memory and return.
4692 */
4693 if (n == 0)
4694 {
4695 pfree(rels);
4696 return;
4697 }
4698
4699 /*
4700 * This is used to remember the number of blocks for all the relations
4701 * forks.
4702 */
4703 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4704 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4705
4706 /*
4707 * We can avoid scanning the entire buffer pool if we know the exact size
4708 * of each of the given relation forks. See DropRelationBuffers.
4709 */
4710 for (i = 0; i < n && cached; i++)
4711 {
4712 for (int j = 0; j <= MAX_FORKNUM; j++)
4713 {
4714 /* Get the number of blocks for a relation's fork. */
4715 block[i][j] = smgrnblocks_cached(rels[i], j);
4716
4717 /* We need to only consider the relation forks that exists. */
4718 if (block[i][j] == InvalidBlockNumber)
4719 {
4720 if (!smgrexists(rels[i], j))
4721 continue;
4722 cached = false;
4723 break;
4724 }
4725
4726 /* calculate the total number of blocks to be invalidated */
4727 nBlocksToInvalidate += block[i][j];
4728 }
4729 }
4730
4731 /*
4732 * We apply the optimization iff the total number of blocks to invalidate
4733 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4734 */
4735 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4736 {
4737 for (i = 0; i < n; i++)
4738 {
4739 for (int j = 0; j <= MAX_FORKNUM; j++)
4740 {
4741 /* ignore relation forks that doesn't exist */
4742 if (!BlockNumberIsValid(block[i][j]))
4743 continue;
4744
4745 /* drop all the buffers for a particular relation fork */
4746 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4747 j, block[i][j], 0);
4748 }
4749 }
4750
4751 pfree(block);
4752 pfree(rels);
4753 return;
4754 }
4755
4756 pfree(block);
4757 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4758 for (i = 0; i < n; i++)
4759 locators[i] = rels[i]->smgr_rlocator.locator;
4760
4761 /*
4762 * For low number of relations to drop just use a simple walk through, to
4763 * save the bsearch overhead. The threshold to use is rather a guess than
4764 * an exactly determined value, as it depends on many factors (CPU and RAM
4765 * speeds, amount of shared buffers etc.).
4766 */
4767 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4768
4769 /* sort the list of rlocators if necessary */
4770 if (use_bsearch)
4771 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4772
4773 for (i = 0; i < NBuffers; i++)
4774 {
4775 RelFileLocator *rlocator = NULL;
4776 BufferDesc *bufHdr = GetBufferDescriptor(i);
4777 uint32 buf_state;
4778
4779 /*
4780 * As in DropRelationBuffers, an unlocked precheck should be safe and
4781 * saves some cycles.
4782 */
4783
4784 if (!use_bsearch)
4785 {
4786 int j;
4787
4788 for (j = 0; j < n; j++)
4789 {
4790 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4791 {
4792 rlocator = &locators[j];
4793 break;
4794 }
4795 }
4796 }
4797 else
4798 {
4799 RelFileLocator locator;
4800
4801 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4802 rlocator = bsearch(&locator,
4803 locators, n, sizeof(RelFileLocator),
4805 }
4806
4807 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4808 if (rlocator == NULL)
4809 continue;
4810
4811 buf_state = LockBufHdr(bufHdr);
4812 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4813 InvalidateBuffer(bufHdr); /* releases spinlock */
4814 else
4815 UnlockBufHdr(bufHdr, buf_state);
4816 }
4817
4818 pfree(locators);
4819 pfree(rels);
4820}
4821
4822/* ---------------------------------------------------------------------
4823 * FindAndDropRelationBuffers
4824 *
4825 * This function performs look up in BufMapping table and removes from the
4826 * buffer pool all the pages of the specified relation fork that has block
4827 * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4828 * pages are removed.)
4829 * --------------------------------------------------------------------
4830 */
4831static void
4833 BlockNumber nForkBlock,
4834 BlockNumber firstDelBlock)
4835{
4836 BlockNumber curBlock;
4837
4838 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4839 {
4840 uint32 bufHash; /* hash value for tag */
4841 BufferTag bufTag; /* identity of requested block */
4842 LWLock *bufPartitionLock; /* buffer partition lock for it */
4843 int buf_id;
4844 BufferDesc *bufHdr;
4845 uint32 buf_state;
4846
4847 /* create a tag so we can lookup the buffer */
4848 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4849
4850 /* determine its hash code and partition lock ID */
4851 bufHash = BufTableHashCode(&bufTag);
4852 bufPartitionLock = BufMappingPartitionLock(bufHash);
4853
4854 /* Check that it is in the buffer pool. If not, do nothing. */
4855 LWLockAcquire(bufPartitionLock, LW_SHARED);
4856 buf_id = BufTableLookup(&bufTag, bufHash);
4857 LWLockRelease(bufPartitionLock);
4858
4859 if (buf_id < 0)
4860 continue;
4861
4862 bufHdr = GetBufferDescriptor(buf_id);
4863
4864 /*
4865 * We need to lock the buffer header and recheck if the buffer is
4866 * still associated with the same block because the buffer could be
4867 * evicted by some other backend loading blocks for a different
4868 * relation after we release lock on the BufMapping table.
4869 */
4870 buf_state = LockBufHdr(bufHdr);
4871
4872 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4873 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4874 bufHdr->tag.blockNum >= firstDelBlock)
4875 InvalidateBuffer(bufHdr); /* releases spinlock */
4876 else
4877 UnlockBufHdr(bufHdr, buf_state);
4878 }
4879}
4880
4881/* ---------------------------------------------------------------------
4882 * DropDatabaseBuffers
4883 *
4884 * This function removes all the buffers in the buffer cache for a
4885 * particular database. Dirty pages are simply dropped, without
4886 * bothering to write them out first. This is used when we destroy a
4887 * database, to avoid trying to flush data to disk when the directory
4888 * tree no longer exists. Implementation is pretty similar to
4889 * DropRelationBuffers() which is for destroying just one relation.
4890 * --------------------------------------------------------------------
4891 */
4892void
4894{
4895 int i;
4896
4897 /*
4898 * We needn't consider local buffers, since by assumption the target
4899 * database isn't our own.
4900 */
4901
4902 for (i = 0; i < NBuffers; i++)
4903 {
4904 BufferDesc *bufHdr = GetBufferDescriptor(i);
4905 uint32 buf_state;
4906
4907 /*
4908 * As in DropRelationBuffers, an unlocked precheck should be safe and
4909 * saves some cycles.
4910 */
4911 if (bufHdr->tag.dbOid != dbid)
4912 continue;
4913
4914 buf_state = LockBufHdr(bufHdr);
4915 if (bufHdr->tag.dbOid == dbid)
4916 InvalidateBuffer(bufHdr); /* releases spinlock */
4917 else
4918 UnlockBufHdr(bufHdr, buf_state);
4919 }
4920}
4921
4922/* ---------------------------------------------------------------------
4923 * FlushRelationBuffers
4924 *
4925 * This function writes all dirty pages of a relation out to disk
4926 * (or more accurately, out to kernel disk buffers), ensuring that the
4927 * kernel has an up-to-date view of the relation.
4928 *
4929 * Generally, the caller should be holding AccessExclusiveLock on the
4930 * target relation to ensure that no other backend is busy dirtying
4931 * more blocks of the relation; the effects can't be expected to last
4932 * after the lock is released.
4933 *
4934 * XXX currently it sequentially searches the buffer pool, should be
4935 * changed to more clever ways of searching. This routine is not
4936 * used in any performance-critical code paths, so it's not worth
4937 * adding additional overhead to normal paths to make it go faster.
4938 * --------------------------------------------------------------------
4939 */
4940void
4942{
4943 int i;
4944 BufferDesc *bufHdr;
4945 SMgrRelation srel = RelationGetSmgr(rel);
4946
4947 if (RelationUsesLocalBuffers(rel))
4948 {
4949 for (i = 0; i < NLocBuffer; i++)
4950 {
4951 uint32 buf_state;
4952
4953 bufHdr = GetLocalBufferDescriptor(i);
4954 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4955 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4956 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4957 {
4958 ErrorContextCallback errcallback;
4959
4960 /* Setup error traceback support for ereport() */
4962 errcallback.arg = bufHdr;
4963 errcallback.previous = error_context_stack;
4964 error_context_stack = &errcallback;
4965
4966 /* Make sure we can handle the pin */
4969
4970 /*
4971 * Pin/unpin mostly to make valgrind work, but it also seems
4972 * like the right thing to do.
4973 */
4974 PinLocalBuffer(bufHdr, false);
4975
4976
4977 FlushLocalBuffer(bufHdr, srel);
4978
4980
4981 /* Pop the error context stack */
4982 error_context_stack = errcallback.previous;
4983 }
4984 }
4985
4986 return;
4987 }
4988
4989 for (i = 0; i < NBuffers; i++)
4990 {
4991 uint32 buf_state;
4992
4993 bufHdr = GetBufferDescriptor(i);
4994
4995 /*
4996 * As in DropRelationBuffers, an unlocked precheck should be safe and
4997 * saves some cycles.
4998 */
4999 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5000 continue;
5001
5002 /* Make sure we can handle the pin */
5005
5006 buf_state = LockBufHdr(bufHdr);
5007 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5008 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5009 {
5010 PinBuffer_Locked(bufHdr);
5014 UnpinBuffer(bufHdr);
5015 }
5016 else
5017 UnlockBufHdr(bufHdr, buf_state);
5018 }
5019}
5020
5021/* ---------------------------------------------------------------------
5022 * FlushRelationsAllBuffers
5023 *
5024 * This function flushes out of the buffer pool all the pages of all
5025 * forks of the specified smgr relations. It's equivalent to calling
5026 * FlushRelationBuffers once per relation. The relations are assumed not
5027 * to use local buffers.
5028 * --------------------------------------------------------------------
5029 */
5030void
5032{
5033 int i;
5034 SMgrSortArray *srels;
5035 bool use_bsearch;
5036
5037 if (nrels == 0)
5038 return;
5039
5040 /* fill-in array for qsort */
5041 srels = palloc(sizeof(SMgrSortArray) * nrels);
5042
5043 for (i = 0; i < nrels; i++)
5044 {
5045 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5046
5047 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5048 srels[i].srel = smgrs[i];
5049 }
5050
5051 /*
5052 * Save the bsearch overhead for low number of relations to sync. See
5053 * DropRelationsAllBuffers for details.
5054 */
5055 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5056
5057 /* sort the list of SMgrRelations if necessary */
5058 if (use_bsearch)
5059 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5060
5061 for (i = 0; i < NBuffers; i++)
5062 {
5063 SMgrSortArray *srelent = NULL;
5064 BufferDesc *bufHdr = GetBufferDescriptor(i);
5065 uint32 buf_state;
5066
5067 /*
5068 * As in DropRelationBuffers, an unlocked precheck should be safe and
5069 * saves some cycles.
5070 */
5071
5072 if (!use_bsearch)
5073 {
5074 int j;
5075
5076 for (j = 0; j < nrels; j++)
5077 {
5078 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5079 {
5080 srelent = &srels[j];
5081 break;
5082 }
5083 }
5084 }
5085 else
5086 {
5087 RelFileLocator rlocator;
5088
5089 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5090 srelent = bsearch(&rlocator,
5091 srels, nrels, sizeof(SMgrSortArray),
5093 }
5094
5095 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5096 if (srelent == NULL)
5097 continue;
5098
5099 /* Make sure we can handle the pin */
5102
5103 buf_state = LockBufHdr(bufHdr);
5104 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5105 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5106 {
5107 PinBuffer_Locked(bufHdr);
5111 UnpinBuffer(bufHdr);
5112 }
5113 else
5114 UnlockBufHdr(bufHdr, buf_state);
5115 }
5116
5117 pfree(srels);
5118}
5119
5120/* ---------------------------------------------------------------------
5121 * RelationCopyStorageUsingBuffer
5122 *
5123 * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5124 * of using smgrread and smgrextend this will copy using bufmgr APIs.
5125 *
5126 * Refer comments atop CreateAndCopyRelationData() for details about
5127 * 'permanent' parameter.
5128 * --------------------------------------------------------------------
5129 */
5130static void
5132 RelFileLocator dstlocator,
5133 ForkNumber forkNum, bool permanent)
5134{
5135 Buffer srcBuf;
5136 Buffer dstBuf;
5137 Page srcPage;
5138 Page dstPage;
5139 bool use_wal;
5140 BlockNumber nblocks;
5141 BlockNumber blkno;
5143 BufferAccessStrategy bstrategy_src;
5144 BufferAccessStrategy bstrategy_dst;
5146 ReadStream *src_stream;
5147 SMgrRelation src_smgr;
5148
5149 /*
5150 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5151 * can skip it when copying any fork of an unlogged relation other than
5152 * the init fork.
5153 */
5154 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5155
5156 /* Get number of blocks in the source relation. */
5157 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5158 forkNum);
5159
5160 /* Nothing to copy; just return. */
5161 if (nblocks == 0)
5162 return;
5163
5164 /*
5165 * Bulk extend the destination relation of the same size as the source
5166 * relation before starting to copy block by block.
5167 */
5168 memset(buf.data, 0, BLCKSZ);
5169 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5170 buf.data, true);
5171
5172 /* This is a bulk operation, so use buffer access strategies. */
5173 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5174 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5175
5176 /* Initialize streaming read */
5177 p.current_blocknum = 0;
5178 p.last_exclusive = nblocks;
5179 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5180
5181 /*
5182 * It is safe to use batchmode as block_range_read_stream_cb takes no
5183 * locks.
5184 */
5187 bstrategy_src,
5188 src_smgr,
5189 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5190 forkNum,
5192 &p,
5193 0);
5194
5195 /* Iterate over each block of the source relation file. */
5196 for (blkno = 0; blkno < nblocks; blkno++)
5197 {
5199
5200 /* Read block from source relation. */
5201 srcBuf = read_stream_next_buffer(src_stream, NULL);
5203 srcPage = BufferGetPage(srcBuf);
5204
5205 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5206 BufferGetBlockNumber(srcBuf),
5207 RBM_ZERO_AND_LOCK, bstrategy_dst,
5208 permanent);
5209 dstPage = BufferGetPage(dstBuf);
5210
5212
5213 /* Copy page data from the source to the destination. */
5214 memcpy(dstPage, srcPage, BLCKSZ);
5215 MarkBufferDirty(dstBuf);
5216
5217 /* WAL-log the copied page. */
5218 if (use_wal)
5219 log_newpage_buffer(dstBuf, true);
5220
5222
5223 UnlockReleaseBuffer(dstBuf);
5224 UnlockReleaseBuffer(srcBuf);
5225 }
5226 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5227 read_stream_end(src_stream);
5228
5229 FreeAccessStrategy(bstrategy_src);
5230 FreeAccessStrategy(bstrategy_dst);
5231}
5232
5233/* ---------------------------------------------------------------------
5234 * CreateAndCopyRelationData
5235 *
5236 * Create destination relation storage and copy all forks from the
5237 * source relation to the destination.
5238 *
5239 * Pass permanent as true for permanent relations and false for
5240 * unlogged relations. Currently this API is not supported for
5241 * temporary relations.
5242 * --------------------------------------------------------------------
5243 */
5244void
5246 RelFileLocator dst_rlocator, bool permanent)
5247{
5248 char relpersistence;
5249 SMgrRelation src_rel;
5250 SMgrRelation dst_rel;
5251
5252 /* Set the relpersistence. */
5253 relpersistence = permanent ?
5254 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5255
5256 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5257 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5258
5259 /*
5260 * Create and copy all forks of the relation. During create database we
5261 * have a separate cleanup mechanism which deletes complete database
5262 * directory. Therefore, each individual relation doesn't need to be
5263 * registered for cleanup.
5264 */
5265 RelationCreateStorage(dst_rlocator, relpersistence, false);
5266
5267 /* copy main fork. */
5268 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5269 permanent);
5270
5271 /* copy those extra forks that exist */
5272 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5273 forkNum <= MAX_FORKNUM; forkNum++)
5274 {
5275 if (smgrexists(src_rel, forkNum))
5276 {
5277 smgrcreate(dst_rel, forkNum, false);
5278
5279 /*
5280 * WAL log creation if the relation is persistent, or this is the
5281 * init fork of an unlogged relation.
5282 */
5283 if (permanent || forkNum == INIT_FORKNUM)
5284 log_smgrcreate(&dst_rlocator, forkNum);
5285
5286 /* Copy a fork's data, block by block. */
5287 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5288 permanent);
5289 }
5290 }
5291}
5292
5293/* ---------------------------------------------------------------------
5294 * FlushDatabaseBuffers
5295 *
5296 * This function writes all dirty pages of a database out to disk
5297 * (or more accurately, out to kernel disk buffers), ensuring that the
5298 * kernel has an up-to-date view of the database.
5299 *
5300 * Generally, the caller should be holding an appropriate lock to ensure
5301 * no other backend is active in the target database; otherwise more
5302 * pages could get dirtied.
5303 *
5304 * Note we don't worry about flushing any pages of temporary relations.
5305 * It's assumed these wouldn't be interesting.
5306 * --------------------------------------------------------------------
5307 */
5308void
5310{
5311 int i;
5312 BufferDesc *bufHdr;
5313
5314 for (i = 0; i < NBuffers; i++)
5315 {
5316 uint32 buf_state;
5317
5318 bufHdr = GetBufferDescriptor(i);
5319
5320 /*
5321 * As in DropRelationBuffers, an unlocked precheck should be safe and
5322 * saves some cycles.
5323 */
5324 if (bufHdr->tag.dbOid != dbid)
5325 continue;
5326
5327 /* Make sure we can handle the pin */
5330
5331 buf_state = LockBufHdr(bufHdr);
5332 if (bufHdr->tag.dbOid == dbid &&
5333 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5334 {
5335 PinBuffer_Locked(bufHdr);
5339 UnpinBuffer(bufHdr);
5340 }
5341 else
5342 UnlockBufHdr(bufHdr, buf_state);
5343 }
5344}
5345
5346/*
5347 * Flush a previously, shared or exclusively, locked and pinned buffer to the
5348 * OS.
5349 */
5350void
5352{
5353 BufferDesc *bufHdr;
5354
5355 /* currently not needed, but no fundamental reason not to support */
5357
5359
5360 bufHdr = GetBufferDescriptor(buffer - 1);
5361
5363
5365}
5366
5367/*
5368 * ReleaseBuffer -- release the pin on a buffer
5369 */
5370void
5372{
5373 if (!BufferIsValid(buffer))
5374 elog(ERROR, "bad buffer ID: %d", buffer);
5375
5376 if (BufferIsLocal(buffer))
5378 else
5380}
5381
5382/*
5383 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5384 *
5385 * This is just a shorthand for a common combination.
5386 */
5387void
5389{
5392}
5393
5394/*
5395 * IncrBufferRefCount
5396 * Increment the pin count on a buffer that we have *already* pinned
5397 * at least once.
5398 *
5399 * This function cannot be used on a buffer we do not have pinned,
5400 * because it doesn't change the shared buffer state.
5401 */
5402void
5404{
5407 if (BufferIsLocal(buffer))
5408 LocalRefCount[-buffer - 1]++;
5409 else
5410 {
5412
5413 ref = GetPrivateRefCountEntry(buffer, true);
5414 Assert(ref != NULL);
5415 ref->refcount++;
5416 }
5418}
5419
5420/*
5421 * MarkBufferDirtyHint
5422 *
5423 * Mark a buffer dirty for non-critical changes.
5424 *
5425 * This is essentially the same as MarkBufferDirty, except:
5426 *
5427 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5428 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5429 * 2. The caller might have only share-lock instead of exclusive-lock on the
5430 * buffer's content lock.
5431 * 3. This function does not guarantee that the buffer is always marked dirty
5432 * (due to a race condition), so it cannot be used for important changes.
5433 */
5434void
5436{
5437 BufferDesc *bufHdr;
5438 Page page = BufferGetPage(buffer);
5439
5440 if (!BufferIsValid(buffer))
5441 elog(ERROR, "bad buffer ID: %d", buffer);
5442
5443 if (BufferIsLocal(buffer))
5444 {
5446 return;
5447 }
5448
5449 bufHdr = GetBufferDescriptor(buffer - 1);
5450
5452 /* here, either share or exclusive lock is OK */
5454
5455 /*
5456 * This routine might get called many times on the same page, if we are
5457 * making the first scan after commit of an xact that added/deleted many
5458 * tuples. So, be as quick as we can if the buffer is already dirty. We
5459 * do this by not acquiring spinlock if it looks like the status bits are
5460 * already set. Since we make this test unlocked, there's a chance we
5461 * might fail to notice that the flags have just been cleared, and failed
5462 * to reset them, due to memory-ordering issues. But since this function
5463 * is only intended to be used in cases where failing to write out the
5464 * data would be harmless anyway, it doesn't really matter.
5465 */
5466 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5468 {
5470 bool dirtied = false;
5471 bool delayChkptFlags = false;
5472 uint32 buf_state;
5473
5474 /*
5475 * If we need to protect hint bit updates from torn writes, WAL-log a
5476 * full page image of the page. This full page image is only necessary
5477 * if the hint bit update is the first change to the page since the
5478 * last checkpoint.
5479 *
5480 * We don't check full_page_writes here because that logic is included
5481 * when we call XLogInsert() since the value changes dynamically.
5482 */
5483 if (XLogHintBitIsNeeded() &&
5485 {
5486 /*
5487 * If we must not write WAL, due to a relfilelocator-specific
5488 * condition or being in recovery, don't dirty the page. We can
5489 * set the hint, just not dirty the page as a result so the hint
5490 * is lost when we evict the page or shutdown.
5491 *
5492 * See src/backend/storage/page/README for longer discussion.
5493 */
5494 if (RecoveryInProgress() ||
5496 return;
5497
5498 /*
5499 * If the block is already dirty because we either made a change
5500 * or set a hint already, then we don't need to write a full page
5501 * image. Note that aggressive cleaning of blocks dirtied by hint
5502 * bit setting would increase the call rate. Bulk setting of hint
5503 * bits would reduce the call rate...
5504 *
5505 * We must issue the WAL record before we mark the buffer dirty.
5506 * Otherwise we might write the page before we write the WAL. That
5507 * causes a race condition, since a checkpoint might occur between
5508 * writing the WAL record and marking the buffer dirty. We solve
5509 * that with a kluge, but one that is already in use during
5510 * transaction commit to prevent race conditions. Basically, we
5511 * simply prevent the checkpoint WAL record from being written
5512 * until we have marked the buffer dirty. We don't start the
5513 * checkpoint flush until we have marked dirty, so our checkpoint
5514 * must flush the change to disk successfully or the checkpoint
5515 * never gets written, so crash recovery will fix.
5516 *
5517 * It's possible we may enter here without an xid, so it is
5518 * essential that CreateCheckPoint waits for virtual transactions
5519 * rather than full transactionids.
5520 */
5523 delayChkptFlags = true;
5524 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5525 }
5526
5527 buf_state = LockBufHdr(bufHdr);
5528
5529 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5530
5531 if (!(buf_state & BM_DIRTY))
5532 {
5533 dirtied = true; /* Means "will be dirtied by this action" */
5534
5535 /*
5536 * Set the page LSN if we wrote a backup block. We aren't supposed
5537 * to set this when only holding a share lock but as long as we
5538 * serialise it somehow we're OK. We choose to set LSN while
5539 * holding the buffer header lock, which causes any reader of an
5540 * LSN who holds only a share lock to also obtain a buffer header
5541 * lock before using PageGetLSN(), which is enforced in
5542 * BufferGetLSNAtomic().
5543 *
5544 * If checksums are enabled, you might think we should reset the
5545 * checksum here. That will happen when the page is written
5546 * sometime later in this checkpoint cycle.
5547 */
5548 if (!XLogRecPtrIsInvalid(lsn))
5549 PageSetLSN(page, lsn);
5550 }
5551
5552 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5553 UnlockBufHdr(bufHdr, buf_state);
5554
5555 if (delayChkptFlags)
5556 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5557
5558 if (dirtied)
5559 {
5561 if (VacuumCostActive)
5563 }
5564 }
5565}
5566
5567/*
5568 * Release buffer content locks for shared buffers.
5569 *
5570 * Used to clean up after errors.
5571 *
5572 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5573 * of releasing buffer content locks per se; the only thing we need to deal
5574 * with here is clearing any PIN_COUNT request that was in progress.
5575 */
5576void
5578{
5580
5581 if (buf)
5582 {
5583 uint32 buf_state;
5584
5585 buf_state = LockBufHdr(buf);
5586
5587 /*
5588 * Don't complain if flag bit not set; it could have been reset but we
5589 * got a cancel/die interrupt before getting the signal.
5590 */
5591 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5592 buf->wait_backend_pgprocno == MyProcNumber)
5593 buf_state &= ~BM_PIN_COUNT_WAITER;
5594
5595 UnlockBufHdr(buf, buf_state);
5596
5597 PinCountWaitBuf = NULL;
5598 }
5599}
5600
5601/*
5602 * Acquire or release the content_lock for the buffer.
5603 */
5604void
5606{
5607 BufferDesc *buf;
5608
5610 if (BufferIsLocal(buffer))
5611 return; /* local buffers need no lock */
5612
5614
5615 if (mode == BUFFER_LOCK_UNLOCK)
5617 else if (mode == BUFFER_LOCK_SHARE)
5619 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5621 else
5622 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5623}
5624
5625/*
5626 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5627 *
5628 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5629 */
5630bool
5632{
5633 BufferDesc *buf;
5634
5636 if (BufferIsLocal(buffer))
5637 return true; /* act as though we got it */
5638
5640
5642 LW_EXCLUSIVE);
5643}
5644
5645/*
5646 * Verify that this backend is pinning the buffer exactly once.
5647 *
5648 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5649 * holds a pin on the buffer. We do not care whether some other backend does.
5650 */
5651void
5653{
5654 if (BufferIsLocal(buffer))
5655 {
5656 if (LocalRefCount[-buffer - 1] != 1)
5657 elog(ERROR, "incorrect local pin count: %d",
5658 LocalRefCount[-buffer - 1]);
5659 }
5660 else
5661 {
5662 if (GetPrivateRefCount(buffer) != 1)
5663 elog(ERROR, "incorrect local pin count: %d",
5665 }
5666}
5667
5668/*
5669 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5670 *
5671 * Items may be deleted from a disk page only when the caller (a) holds an
5672 * exclusive lock on the buffer and (b) has observed that no other backend
5673 * holds a pin on the buffer. If there is a pin, then the other backend
5674 * might have a pointer into the buffer (for example, a heapscan reference
5675 * to an item --- see README for more details). It's OK if a pin is added
5676 * after the cleanup starts, however; the newly-arrived backend will be
5677 * unable to look at the page until we release the exclusive lock.
5678 *
5679 * To implement this protocol, a would-be deleter must pin the buffer and
5680 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5681 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5682 * it has successfully observed pin count = 1.
5683 */
5684void
5686{
5687 BufferDesc *bufHdr;
5688 TimestampTz waitStart = 0;
5689 bool waiting = false;
5690 bool logged_recovery_conflict = false;
5691
5693 Assert(PinCountWaitBuf == NULL);
5694
5696
5697 /*
5698 * We do not yet need to be worried about in-progress AIOs holding a pin,
5699 * as we, so far, only support doing reads via AIO and this function can
5700 * only be called once the buffer is valid (i.e. no read can be in
5701 * flight).
5702 */
5703
5704 /* Nobody else to wait for */
5705 if (BufferIsLocal(buffer))
5706 return;
5707
5708 bufHdr = GetBufferDescriptor(buffer - 1);
5709
5710 for (;;)
5711 {
5712 uint32 buf_state;
5713
5714 /* Try to acquire lock */
5716 buf_state = LockBufHdr(bufHdr);
5717
5718 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5719 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5720 {
5721 /* Successfully acquired exclusive lock with pincount 1 */
5722 UnlockBufHdr(bufHdr, buf_state);
5723
5724 /*
5725 * Emit the log message if recovery conflict on buffer pin was
5726 * resolved but the startup process waited longer than
5727 * deadlock_timeout for it.
5728 */
5729 if (logged_recovery_conflict)
5731 waitStart, GetCurrentTimestamp(),
5732 NULL, false);
5733
5734 if (waiting)
5735 {
5736 /* reset ps display to remove the suffix if we added one */
5738 waiting = false;
5739 }
5740 return;
5741 }
5742 /* Failed, so mark myself as waiting for pincount 1 */
5743 if (buf_state & BM_PIN_COUNT_WAITER)
5744 {
5745 UnlockBufHdr(bufHdr, buf_state);
5747 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5748 }
5750 PinCountWaitBuf = bufHdr;
5751 buf_state |= BM_PIN_COUNT_WAITER;
5752 UnlockBufHdr(bufHdr, buf_state);
5754
5755 /* Wait to be signaled by UnpinBuffer() */
5756 if (InHotStandby)
5757 {
5758 if (!waiting)
5759 {
5760 /* adjust the process title to indicate that it's waiting */
5761 set_ps_display_suffix("waiting");
5762 waiting = true;
5763 }
5764
5765 /*
5766 * Emit the log message if the startup process is waiting longer
5767 * than deadlock_timeout for recovery conflict on buffer pin.
5768 *
5769 * Skip this if first time through because the startup process has
5770 * not started waiting yet in this case. So, the wait start
5771 * timestamp is set after this logic.
5772 */
5773 if (waitStart != 0 && !logged_recovery_conflict)
5774 {
5776
5777 if (TimestampDifferenceExceeds(waitStart, now,
5779 {
5781 waitStart, now, NULL, true);
5782 logged_recovery_conflict = true;
5783 }
5784 }
5785
5786 /*
5787 * Set the wait start timestamp if logging is enabled and first
5788 * time through.
5789 */
5790 if (log_recovery_conflict_waits && waitStart == 0)
5791 waitStart = GetCurrentTimestamp();
5792
5793 /* Publish the bufid that Startup process waits on */
5795 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5797 /* Reset the published bufid */
5799 }
5800 else
5801 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5802
5803 /*
5804 * Remove flag marking us as waiter. Normally this will not be set
5805 * anymore, but ProcWaitForSignal() can return for other signals as
5806 * well. We take care to only reset the flag if we're the waiter, as
5807 * theoretically another backend could have started waiting. That's
5808 * impossible with the current usages due to table level locking, but
5809 * better be safe.
5810 */
5811 buf_state = LockBufHdr(bufHdr);
5812 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5814 buf_state &= ~BM_PIN_COUNT_WAITER;
5815 UnlockBufHdr(bufHdr, buf_state);
5816
5817 PinCountWaitBuf = NULL;
5818 /* Loop back and try again */
5819 }
5820}
5821
5822/*
5823 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5824 * requests cancellation of all pin holders that are blocking it.
5825 */
5826bool
5828{
5829 int bufid = GetStartupBufferPinWaitBufId();
5830
5831 /*
5832 * If we get woken slowly then it's possible that the Startup process was
5833 * already woken by other backends before we got here. Also possible that
5834 * we get here by multiple interrupts or interrupts at inappropriate
5835 * times, so make sure we do nothing if the bufid is not set.
5836 */
5837 if (bufid < 0)
5838 return false;
5839
5840 if (GetPrivateRefCount(bufid + 1) > 0)
5841 return true;
5842
5843 return false;
5844}
5845
5846/*
5847 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5848 *
5849 * We won't loop, but just check once to see if the pin count is OK. If
5850 * not, return false with no lock held.
5851 */
5852bool
5854{
5855 BufferDesc *bufHdr;
5856 uint32 buf_state,
5857 refcount;
5858
5860
5861 /* see AIO related comment in LockBufferForCleanup() */
5862
5863 if (BufferIsLocal(buffer))
5864 {
5866 /* There should be exactly one pin */
5867 Assert(refcount > 0);
5868 if (refcount != 1)
5869 return false;
5870 /* Nobody else to wait for */
5871 return true;
5872 }
5873
5874 /* There should be exactly one local pin */
5877 if (refcount != 1)
5878 return false;
5879
5880 /* Try to acquire lock */
5882 return false;
5883
5884 bufHdr = GetBufferDescriptor(buffer - 1);
5885 buf_state = LockBufHdr(bufHdr);
5886 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5887
5888 Assert(refcount > 0);
5889 if (refcount == 1)
5890 {
5891 /* Successfully acquired exclusive lock with pincount 1 */
5892 UnlockBufHdr(bufHdr, buf_state);
5893 return true;
5894 }
5895
5896 /* Failed, so release the lock */
5897 UnlockBufHdr(bufHdr, buf_state);
5899 return false;
5900}
5901
5902/*
5903 * IsBufferCleanupOK - as above, but we already have the lock
5904 *
5905 * Check whether it's OK to perform cleanup on a buffer we've already
5906 * locked. If we observe that the pin count is 1, our exclusive lock
5907 * happens to be a cleanup lock, and we can proceed with anything that
5908 * would have been allowable had we sought a cleanup lock originally.
5909 */
5910bool
5912{
5913 BufferDesc *bufHdr;
5914 uint32 buf_state;
5915
5917
5918 /* see AIO related comment in LockBufferForCleanup() */
5919
5920 if (BufferIsLocal(buffer))
5921 {
5922 /* There should be exactly one pin */
5923 if (LocalRefCount[-buffer - 1] != 1)
5924 return false;
5925 /* Nobody else to wait for */
5926 return true;
5927 }
5928
5929 /* There should be exactly one local pin */
5930 if (GetPrivateRefCount(buffer) != 1)
5931 return false;
5932
5933 bufHdr = GetBufferDescriptor(buffer - 1);
5934
5935 /* caller must hold exclusive lock on buffer */
5937 LW_EXCLUSIVE));
5938
5939 buf_state = LockBufHdr(bufHdr);
5940
5941 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5942 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5943 {
5944 /* pincount is OK. */
5945 UnlockBufHdr(bufHdr, buf_state);
5946 return true;
5947 }
5948
5949 UnlockBufHdr(bufHdr, buf_state);
5950 return false;
5951}
5952
5953
5954/*
5955 * Functions for buffer I/O handling
5956 *
5957 * Also note that these are used only for shared buffers, not local ones.
5958 */
5959
5960/*
5961 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5962 */
5963static void
5965{
5967
5969 for (;;)
5970 {
5971 uint32 buf_state;
5972 PgAioWaitRef iow;
5973
5974 /*
5975 * It may not be necessary to acquire the spinlock to check the flag
5976 * here, but since this test is essential for correctness, we'd better
5977 * play it safe.
5978 */
5979 buf_state = LockBufHdr(buf);
5980
5981 /*
5982 * Copy the wait reference while holding the spinlock. This protects
5983 * against a concurrent TerminateBufferIO() in another backend from
5984 * clearing the wref while it's being read.
5985 */
5986 iow = buf->io_wref;
5987 UnlockBufHdr(buf, buf_state);
5988
5989 /* no IO in progress, we don't need to wait */
5990 if (!(buf_state & BM_IO_IN_PROGRESS))
5991 break;
5992
5993 /*
5994 * The buffer has asynchronous IO in progress, wait for it to
5995 * complete.
5996 */
5997 if (pgaio_wref_valid(&iow))
5998 {
5999 pgaio_wref_wait(&iow);
6000
6001 /*
6002 * The AIO subsystem internally uses condition variables and thus
6003 * might remove this backend from the BufferDesc's CV. While that
6004 * wouldn't cause a correctness issue (the first CV sleep just
6005 * immediately returns if not already registered), it seems worth
6006 * avoiding unnecessary loop iterations, given that we take care
6007 * to do so at the start of the function.
6008 */
6010 continue;
6011 }
6012
6013 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6014 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6015 }
6017}
6018
6019/*
6020 * StartBufferIO: begin I/O on this buffer
6021 * (Assumptions)
6022 * My process is executing no IO on this buffer
6023 * The buffer is Pinned
6024 *
6025 * In some scenarios multiple backends could attempt the same I/O operation
6026 * concurrently. If someone else has already started I/O on this buffer then
6027 * we will wait for completion of the IO using WaitIO().
6028 *
6029 * Input operations are only attempted on buffers that are not BM_VALID,
6030 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6031 * so we can always tell if the work is already done.
6032 *
6033 * Returns true if we successfully marked the buffer as I/O busy,
6034 * false if someone else already did the work.
6035 *
6036 * If nowait is true, then we don't wait for an I/O to be finished by another
6037 * backend. In that case, false indicates either that the I/O was already
6038 * finished, or is still in progress. This is useful for callers that want to
6039 * find out if they can perform the I/O as part of a larger operation, without
6040 * waiting for the answer or distinguishing the reasons why not.
6041 */
6042bool
6043StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6044{
6045 uint32 buf_state;
6046
6048
6049 for (;;)
6050 {
6051 buf_state = LockBufHdr(buf);
6052
6053 if (!(buf_state & BM_IO_IN_PROGRESS))
6054 break;
6055 UnlockBufHdr(buf, buf_state);
6056 if (nowait)
6057 return false;
6058 WaitIO(buf);
6059 }
6060
6061 /* Once we get here, there is definitely no I/O active on this buffer */
6062
6063 /* Check if someone else already did the I/O */
6064 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6065 {
6066 UnlockBufHdr(buf, buf_state);
6067 return false;
6068 }
6069
6070 buf_state |= BM_IO_IN_PROGRESS;
6071 UnlockBufHdr(buf, buf_state);
6072
6075
6076 return true;
6077}
6078
6079/*
6080 * TerminateBufferIO: release a buffer we were doing I/O on
6081 * (Assumptions)
6082 * My process is executing IO for the buffer
6083 * BM_IO_IN_PROGRESS bit is set for the buffer
6084 * The buffer is Pinned
6085 *
6086 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6087 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6088 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6089 * marking the buffer clean if it was re-dirtied while we were writing.
6090 *
6091 * set_flag_bits gets ORed into the buffer's flags. It must include
6092 * BM_IO_ERROR in a failure case. For successful completion it could
6093 * be 0, or BM_VALID if we just finished reading in the page.
6094 *
6095 * If forget_owner is true, we release the buffer I/O from the current
6096 * resource owner. (forget_owner=false is used when the resource owner itself
6097 * is being released)
6098 */
6099void
6100TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6101 bool forget_owner, bool release_aio)
6102{
6103 uint32 buf_state;
6104
6105 buf_state = LockBufHdr(buf);
6106
6107 Assert(buf_state & BM_IO_IN_PROGRESS);
6108 buf_state &= ~BM_IO_IN_PROGRESS;
6109
6110 /* Clear earlier errors, if this IO failed, it'll be marked again */
6111 buf_state &= ~BM_IO_ERROR;
6112
6113 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6114 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6115
6116 if (release_aio)
6117 {
6118 /* release ownership by the AIO subsystem */
6119 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6120 buf_state -= BUF_REFCOUNT_ONE;
6121 pgaio_wref_clear(&buf->io_wref);
6122 }
6123
6124 buf_state |= set_flag_bits;
6125 UnlockBufHdr(buf, buf_state);
6126
6127 if (forget_owner)
6130
6132
6133 /*
6134 * Support LockBufferForCleanup()
6135 *
6136 * We may have just released the last pin other than the waiter's. In most
6137 * cases, this backend holds another pin on the buffer. But, if, for
6138 * example, this backend is completing an IO issued by another backend, it
6139 * may be time to wake the waiter.
6140 */
6141 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6143}
6144
6145/*
6146 * AbortBufferIO: Clean up active buffer I/O after an error.
6147 *
6148 * All LWLocks we might have held have been released,
6149 * but we haven't yet released buffer pins, so the buffer is still pinned.
6150 *
6151 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6152 * possible the error condition wasn't related to the I/O.
6153 *
6154 * Note: this does not remove the buffer I/O from the resource owner.
6155 * That's correct when we're releasing the whole resource owner, but
6156 * beware if you use this in other contexts.
6157 */
6158static void
6160{
6161 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6162 uint32 buf_state;
6163
6164 buf_state = LockBufHdr(buf_hdr);
6165 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6166
6167 if (!(buf_state & BM_VALID))
6168 {
6169 Assert(!(buf_state & BM_DIRTY));
6170 UnlockBufHdr(buf_hdr, buf_state);
6171 }
6172 else
6173 {
6174 Assert(buf_state & BM_DIRTY);
6175 UnlockBufHdr(buf_hdr, buf_state);
6176
6177 /* Issue notice if this is not the first failure... */
6178 if (buf_state & BM_IO_ERROR)
6179 {
6180 /* Buffer is pinned, so we can read tag without spinlock */
6182 (errcode(ERRCODE_IO_ERROR),
6183 errmsg("could not write block %u of %s",
6184 buf_hdr->tag.blockNum,
6186 BufTagGetForkNum(&buf_hdr->tag)).str),
6187 errdetail("Multiple failures --- write error might be permanent.")));
6188 }
6189 }
6190
6191 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6192}
6193
6194/*
6195 * Error context callback for errors occurring during shared buffer writes.
6196 */
6197static void
6199{
6200 BufferDesc *bufHdr = (BufferDesc *) arg;
6201
6202 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6203 if (bufHdr != NULL)
6204 errcontext("writing block %u of relation %s",
6205 bufHdr->tag.blockNum,
6207 BufTagGetForkNum(&bufHdr->tag)).str);
6208}
6209
6210/*
6211 * Error context callback for errors occurring during local buffer writes.
6212 */
6213static void
6215{
6216 BufferDesc *bufHdr = (BufferDesc *) arg;
6217
6218 if (bufHdr != NULL)
6219 errcontext("writing block %u of relation %s",
6220 bufHdr->tag.blockNum,
6223 BufTagGetForkNum(&bufHdr->tag)).str);
6224}
6225
6226/*
6227 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6228 */
6229static int
6230rlocator_comparator(const void *p1, const void *p2)
6231{
6232 RelFileLocator n1 = *(const RelFileLocator *) p1;
6233 RelFileLocator n2 = *(const RelFileLocator *) p2;
6234
6235 if (n1.relNumber < n2.relNumber)
6236 return -1;
6237 else if (n1.relNumber > n2.relNumber)
6238 return 1;
6239
6240 if (n1.dbOid < n2.dbOid)
6241 return -1;
6242 else if (n1.dbOid > n2.dbOid)
6243 return 1;
6244
6245 if (n1.spcOid < n2.spcOid)
6246 return -1;
6247 else if (n1.spcOid > n2.spcOid)
6248 return 1;
6249 else
6250 return 0;
6251}
6252
6253/*
6254 * Lock buffer header - set BM_LOCKED in buffer state.
6255 */
6256uint32
6258{
6259 SpinDelayStatus delayStatus;
6260 uint32 old_buf_state;
6261
6263
6264 init_local_spin_delay(&delayStatus);
6265
6266 while (true)
6267 {
6268 /* set BM_LOCKED flag */
6269 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6270 /* if it wasn't set before we're OK */
6271 if (!(old_buf_state & BM_LOCKED))
6272 break;
6273 perform_spin_delay(&delayStatus);
6274 }
6275 finish_spin_delay(&delayStatus);
6276 return old_buf_state | BM_LOCKED;
6277}
6278
6279/*
6280 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6281 * state at that point.
6282 *
6283 * Obviously the buffer could be locked by the time the value is returned, so
6284 * this is primarily useful in CAS style loops.
6285 */
6286static uint32
6288{
6289 SpinDelayStatus delayStatus;
6290 uint32 buf_state;
6291
6292 init_local_spin_delay(&delayStatus);
6293
6294 buf_state = pg_atomic_read_u32(&buf->state);
6295
6296 while (buf_state & BM_LOCKED)
6297 {
6298 perform_spin_delay(&delayStatus);
6299 buf_state = pg_atomic_read_u32(&buf->state);
6300 }
6301
6302 finish_spin_delay(&delayStatus);
6303
6304 return buf_state;
6305}
6306
6307/*
6308 * BufferTag comparator.
6309 */
6310static inline int
6312{
6313 int ret;
6314 RelFileLocator rlocatora;
6315 RelFileLocator rlocatorb;
6316
6317 rlocatora = BufTagGetRelFileLocator(ba);
6318 rlocatorb = BufTagGetRelFileLocator(bb);
6319
6320 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6321
6322 if (ret != 0)
6323 return ret;
6324
6325 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6326 return -1;
6327 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6328 return 1;
6329
6330 if (ba->blockNum < bb->blockNum)
6331 return -1;
6332 if (ba->blockNum > bb->blockNum)
6333 return 1;
6334
6335 return 0;
6336}
6337
6338/*
6339 * Comparator determining the writeout order in a checkpoint.
6340 *
6341 * It is important that tablespaces are compared first, the logic balancing
6342 * writes between tablespaces relies on it.
6343 */
6344static inline int
6346{
6347 /* compare tablespace */
6348 if (a->tsId < b->tsId)
6349 return -1;
6350 else if (a->tsId > b->tsId)
6351 return 1;
6352 /* compare relation */
6353 if (a->relNumber < b->relNumber)
6354 return -1;
6355 else if (a->relNumber > b->relNumber)
6356 return 1;
6357 /* compare fork */
6358 else if (a->forkNum < b->forkNum)
6359 return -1;
6360 else if (a->forkNum > b->forkNum)
6361 return 1;
6362 /* compare block number */
6363 else if (a->blockNum < b->blockNum)
6364 return -1;
6365 else if (a->blockNum > b->blockNum)
6366 return 1;
6367 /* equal page IDs are unlikely, but not impossible */
6368 return 0;
6369}
6370
6371/*
6372 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6373 * progress.
6374 */
6375static int
6377{
6379 CkptTsStatus *sb = (CkptTsStatus *) b;
6380
6381 /* we want a min-heap, so return 1 for the a < b */
6382 if (sa->progress < sb->progress)
6383 return 1;
6384 else if (sa->progress == sb->progress)
6385 return 0;
6386 else
6387 return -1;
6388}
6389
6390/*
6391 * Initialize a writeback context, discarding potential previous state.
6392 *
6393 * *max_pending is a pointer instead of an immediate value, so the coalesce
6394 * limits can easily changed by the GUC mechanism, and so calling code does
6395 * not have to check the current configuration. A value of 0 means that no
6396 * writeback control will be performed.
6397 */
6398void
6399WritebackContextInit(WritebackContext *context, int *max_pending)
6400{
6401 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6402
6403 context->max_pending = max_pending;
6404 context->nr_pending = 0;
6405}
6406
6407/*
6408 * Add buffer to list of pending writeback requests.
6409 */
6410void
6412 BufferTag *tag)
6413{
6414 PendingWriteback *pending;
6415
6416 /*
6417 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6418 * point in tracking in that case.
6419 */
6421 !enableFsync)
6422 return;
6423
6424 /*
6425 * Add buffer to the pending writeback array, unless writeback control is
6426 * disabled.
6427 */
6428 if (*wb_context->max_pending > 0)
6429 {
6431
6432 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6433
6434 pending->tag = *tag;
6435 }
6436
6437 /*
6438 * Perform pending flushes if the writeback limit is exceeded. This
6439 * includes the case where previously an item has been added, but control
6440 * is now disabled.
6441 */
6442 if (wb_context->nr_pending >= *wb_context->max_pending)
6443 IssuePendingWritebacks(wb_context, io_context);
6444}
6445
6446#define ST_SORT sort_pending_writebacks
6447#define ST_ELEMENT_TYPE PendingWriteback
6448#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6449#define ST_SCOPE static
6450#define ST_DEFINE
6451#include "lib/sort_template.h"
6452
6453/*
6454 * Issue all pending writeback requests, previously scheduled with
6455 * ScheduleBufferTagForWriteback, to the OS.
6456 *
6457 * Because this is only used to improve the OSs IO scheduling we try to never
6458 * error out - it's just a hint.
6459 */
6460void
6462{
6463 instr_time io_start;
6464 int i;
6465
6466 if (wb_context->nr_pending == 0)
6467 return;
6468
6469 /*
6470 * Executing the writes in-order can make them a lot faster, and allows to
6471 * merge writeback requests to consecutive blocks into larger writebacks.
6472 */
6473 sort_pending_writebacks(wb_context->pending_writebacks,
6474 wb_context->nr_pending);
6475
6477
6478 /*
6479 * Coalesce neighbouring writes, but nothing else. For that we iterate
6480 * through the, now sorted, array of pending flushes, and look forward to
6481 * find all neighbouring (or identical) writes.
6482 */
6483 for (i = 0; i < wb_context->nr_pending; i++)
6484 {
6487 SMgrRelation reln;
6488 int ahead;
6489 BufferTag tag;
6490 RelFileLocator currlocator;
6491 Size nblocks = 1;
6492
6493 cur = &wb_context->pending_writebacks[i];
6494 tag = cur->tag;
6495 currlocator = BufTagGetRelFileLocator(&tag);
6496
6497 /*
6498 * Peek ahead, into following writeback requests, to see if they can
6499 * be combined with the current one.
6500 */
6501 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6502 {
6503
6504 next = &wb_context->pending_writebacks[i + ahead + 1];
6505
6506 /* different file, stop */
6507 if (!RelFileLocatorEquals(currlocator,
6508 BufTagGetRelFileLocator(&next->tag)) ||
6509 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6510 break;
6511
6512 /* ok, block queued twice, skip */
6513 if (cur->tag.blockNum == next->tag.blockNum)
6514 continue;
6515
6516 /* only merge consecutive writes */
6517 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6518 break;
6519
6520 nblocks++;
6521 cur = next;
6522 }
6523
6524 i += ahead;
6525
6526 /* and finally tell the kernel to write the data to storage */
6527 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6528 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6529 }
6530
6531 /*
6532 * Assume that writeback requests are only issued for buffers containing
6533 * blocks of permanent relations.
6534 */
6536 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6537
6538 wb_context->nr_pending = 0;
6539}
6540
6541/* ResourceOwner callbacks */
6542
6543static void
6545{
6547
6549}
6550
6551static char *
6553{
6555
6556 return psprintf("lost track of buffer IO on buffer %d", buffer);
6557}
6558
6559static void
6561{
6563
6564 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6565 if (!BufferIsValid(buffer))
6566 elog(ERROR, "bad buffer ID: %d", buffer);
6567
6568 if (BufferIsLocal(buffer))
6570 else
6572}
6573
6574static char *
6576{
6578}
6579
6580/*
6581 * Helper function to evict unpinned buffer whose buffer header lock is
6582 * already acquired.
6583 */
6584static bool
6585EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6586{
6587 uint32 buf_state;
6588 bool result;
6589
6590 *buffer_flushed = false;
6591
6592 buf_state = pg_atomic_read_u32(&(desc->state));
6593 Assert(buf_state & BM_LOCKED);
6594
6595 if ((buf_state & BM_VALID) == 0)
6596 {
6597 UnlockBufHdr(desc, buf_state);
6598 return false;
6599 }
6600
6601 /* Check that it's not pinned already. */
6602 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6603 {
6604 UnlockBufHdr(desc, buf_state);
6605 return false;
6606 }
6607
6608 PinBuffer_Locked(desc); /* releases spinlock */
6609
6610 /* If it was dirty, try to clean it once. */
6611 if (buf_state & BM_DIRTY)
6612 {
6615 *buffer_flushed = true;
6617 }
6618
6619 /* This will return false if it becomes dirty or someone else pins it. */
6620 result = InvalidateVictimBuffer(desc);
6621
6622 UnpinBuffer(desc);
6623
6624 return result;
6625}
6626
6627/*
6628 * Try to evict the current block in a shared buffer.
6629 *
6630 * This function is intended for testing/development use only!
6631 *
6632 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6633 * particular block in mind, it might already have been replaced by some other
6634 * block by the time this function runs. It's also unpinned on return, so the
6635 * buffer might be occupied again by the time control is returned, potentially
6636 * even by the same block. This inherent raciness without other interlocking
6637 * makes the function unsuitable for non-testing usage.
6638 *
6639 * *buffer_flushed is set to true if the buffer was dirty and has been
6640 * flushed, false otherwise. However, *buffer_flushed=true does not
6641 * necessarily mean that we flushed the buffer, it could have been flushed by
6642 * someone else.
6643 *
6644 * Returns true if the buffer was valid and it has now been made invalid.
6645 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6646 * or if the buffer becomes dirty again while we're trying to write it out.
6647 */
6648bool
6649EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6650{
6651 BufferDesc *desc;
6652
6654
6655 /* Make sure we can pin the buffer. */
6658
6659 desc = GetBufferDescriptor(buf - 1);
6660 LockBufHdr(desc);
6661
6662 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6663}
6664
6665/*
6666 * Try to evict all the shared buffers.
6667 *
6668 * This function is intended for testing/development use only! See
6669 * EvictUnpinnedBuffer().
6670 *
6671 * The buffers_* parameters are mandatory and indicate the total count of
6672 * buffers that:
6673 * - buffers_evicted - were evicted
6674 * - buffers_flushed - were flushed
6675 * - buffers_skipped - could not be evicted
6676 */
6677void
6678EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6679 int32 *buffers_skipped)
6680{
6681 *buffers_evicted = 0;
6682 *buffers_skipped = 0;
6683 *buffers_flushed = 0;
6684
6685 for (int buf = 1; buf <= NBuffers; buf++)
6686 {
6687 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6688 uint32 buf_state;
6689 bool buffer_flushed;
6690
6691 buf_state = pg_atomic_read_u32(&desc->state);
6692 if (!(buf_state & BM_VALID))
6693 continue;
6694
6697
6698 LockBufHdr(desc);
6699
6700 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6701 (*buffers_evicted)++;
6702 else
6703 (*buffers_skipped)++;
6704
6705 if (buffer_flushed)
6706 (*buffers_flushed)++;
6707 }
6708}
6709
6710/*
6711 * Try to evict all the shared buffers containing provided relation's pages.
6712 *
6713 * This function is intended for testing/development use only! See
6714 * EvictUnpinnedBuffer().
6715 *
6716 * The caller must hold at least AccessShareLock on the relation to prevent
6717 * the relation from being dropped.
6718 *
6719 * The buffers_* parameters are mandatory and indicate the total count of
6720 * buffers that:
6721 * - buffers_evicted - were evicted
6722 * - buffers_flushed - were flushed
6723 * - buffers_skipped - could not be evicted
6724 */
6725void
6727 int32 *buffers_flushed, int32 *buffers_skipped)
6728{
6730
6731 *buffers_skipped = 0;
6732 *buffers_evicted = 0;
6733 *buffers_flushed = 0;
6734
6735 for (int buf = 1; buf <= NBuffers; buf++)
6736 {
6737 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6738 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6739 bool buffer_flushed;
6740
6741 /* An unlocked precheck should be safe and saves some cycles. */
6742 if ((buf_state & BM_VALID) == 0 ||
6744 continue;
6745
6746 /* Make sure we can pin the buffer. */
6749
6750 buf_state = LockBufHdr(desc);
6751
6752 /* recheck, could have changed without the lock */
6753 if ((buf_state & BM_VALID) == 0 ||
6755 {
6756 UnlockBufHdr(desc, buf_state);
6757 continue;
6758 }
6759
6760 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6761 (*buffers_evicted)++;
6762 else
6763 (*buffers_skipped)++;
6764
6765 if (buffer_flushed)
6766 (*buffers_flushed)++;
6767 }
6768}
6769
6770/*
6771 * Generic implementation of the AIO handle staging callback for readv/writev
6772 * on local/shared buffers.
6773 *
6774 * Each readv/writev can target multiple buffers. The buffers have already
6775 * been registered with the IO handle.
6776 *
6777 * To make the IO ready for execution ("staging"), we need to ensure that the
6778 * targeted buffers are in an appropriate state while the IO is ongoing. For
6779 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6780 * in this backend could lead to this backend's buffer pin being released as
6781 * part of error handling, which in turn could lead to the buffer being
6782 * replaced while IO is ongoing.
6783 */
6785buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6786{
6787 uint64 *io_data;
6788 uint8 handle_data_len;
6789 PgAioWaitRef io_ref;
6791
6792 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6793
6794 pgaio_io_get_wref(ioh, &io_ref);
6795
6796 /* iterate over all buffers affected by the vectored readv/writev */
6797 for (int i = 0; i < handle_data_len; i++)
6798 {
6799 Buffer buffer = (Buffer) io_data[i];
6800 BufferDesc *buf_hdr = is_temp ?
6803 uint32 buf_state;
6804
6805 /*
6806 * Check that all the buffers are actually ones that could conceivably
6807 * be done in one IO, i.e. are sequential. This is the last
6808 * buffer-aware code before IO is actually executed and confusion
6809 * about which buffers are targeted by IO can be hard to debug, making
6810 * it worth doing extra-paranoid checks.
6811 */
6812 if (i == 0)
6813 first = buf_hdr->tag;
6814 else
6815 {
6816 Assert(buf_hdr->tag.relNumber == first.relNumber);
6817 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6818 }
6819
6820 if (is_temp)
6821 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6822 else
6823 buf_state = LockBufHdr(buf_hdr);
6824
6825 /* verify the buffer is in the expected state */
6826 Assert(buf_state & BM_TAG_VALID);
6827 if (is_write)
6828 {
6829 Assert(buf_state & BM_VALID);
6830 Assert(buf_state & BM_DIRTY);
6831 }
6832 else
6833 {
6834 Assert(!(buf_state & BM_VALID));
6835 Assert(!(buf_state & BM_DIRTY));
6836 }
6837
6838 /* temp buffers don't use BM_IO_IN_PROGRESS */
6839 if (!is_temp)
6840 Assert(buf_state & BM_IO_IN_PROGRESS);
6841
6842 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6843
6844 /*
6845 * Reflect that the buffer is now owned by the AIO subsystem.
6846 *
6847 * For local buffers: This can't be done just via LocalRefCount, as
6848 * one might initially think, as this backend could error out while
6849 * AIO is still in progress, releasing all the pins by the backend
6850 * itself.
6851 *
6852 * This pin is released again in TerminateBufferIO().
6853 */
6854 buf_state += BUF_REFCOUNT_ONE;
6855 buf_hdr->io_wref = io_ref;
6856
6857 if (is_temp)
6858 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6859 else
6860 UnlockBufHdr(buf_hdr, buf_state);
6861
6862 /*
6863 * Ensure the content lock that prevents buffer modifications while
6864 * the buffer is being written out is not released early due to an
6865 * error.
6866 */
6867 if (is_write && !is_temp)
6868 {
6869 LWLock *content_lock;
6870
6871 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6872
6873 Assert(LWLockHeldByMe(content_lock));
6874
6875 /*
6876 * Lock is now owned by AIO subsystem.
6877 */
6878 LWLockDisown(content_lock);
6879 }
6880
6881 /*
6882 * Stop tracking this buffer via the resowner - the AIO system now
6883 * keeps track.
6884 */
6885 if (!is_temp)
6887 }
6888}
6889
6890/*
6891 * Decode readv errors as encoded by buffer_readv_encode_error().
6892 */
6893static inline void
6895 bool *zeroed_any,
6896 bool *ignored_any,
6897 uint8 *zeroed_or_error_count,
6898 uint8 *checkfail_count,
6899 uint8 *first_off)
6900{
6901 uint32 rem_error = result.error_data;
6902
6903 /* see static asserts in buffer_readv_encode_error */
6904#define READV_COUNT_BITS 7
6905#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6906
6907 *zeroed_any = rem_error & 1;
6908 rem_error >>= 1;
6909
6910 *ignored_any = rem_error & 1;
6911 rem_error >>= 1;
6912
6913 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6914 rem_error >>= READV_COUNT_BITS;
6915
6916 *checkfail_count = rem_error & READV_COUNT_MASK;
6917 rem_error >>= READV_COUNT_BITS;
6918
6919 *first_off = rem_error & READV_COUNT_MASK;
6920 rem_error >>= READV_COUNT_BITS;
6921}
6922
6923/*
6924 * Helper to encode errors for buffer_readv_complete()
6925 *
6926 * Errors are encoded as follows:
6927 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6928 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6929 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6930 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6931 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6932 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6933 * checksum
6934 */
6935static inline void
6937 bool is_temp,
6938 bool zeroed_any,
6939 bool ignored_any,
6940 uint8 error_count,
6941 uint8 zeroed_count,
6942 uint8 checkfail_count,
6943 uint8 first_error_off,
6944 uint8 first_zeroed_off,
6945 uint8 first_ignored_off)
6946{
6947
6948 uint8 shift = 0;
6949 uint8 zeroed_or_error_count =
6950 error_count > 0 ? error_count : zeroed_count;
6951 uint8 first_off;
6952
6954 "PG_IOV_MAX is bigger than reserved space for error data");
6956 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6957
6958 /*
6959 * We only have space to encode one offset - but luckily that's good
6960 * enough. If there is an error, the error is the interesting offset, same
6961 * with a zeroed buffer vs an ignored buffer.
6962 */
6963 if (error_count > 0)
6964 first_off = first_error_off;
6965 else if (zeroed_count > 0)
6966 first_off = first_zeroed_off;
6967 else
6968 first_off = first_ignored_off;
6969
6970 Assert(!zeroed_any || error_count == 0);
6971
6972 result->error_data = 0;
6973
6974 result->error_data |= zeroed_any << shift;
6975 shift += 1;
6976
6977 result->error_data |= ignored_any << shift;
6978 shift += 1;
6979
6980 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6981 shift += READV_COUNT_BITS;
6982
6983 result->error_data |= ((uint32) checkfail_count) << shift;
6984 shift += READV_COUNT_BITS;
6985
6986 result->error_data |= ((uint32) first_off) << shift;
6987 shift += READV_COUNT_BITS;
6988
6989 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6991
6992 if (error_count > 0)
6993 result->status = PGAIO_RS_ERROR;
6994 else
6995 result->status = PGAIO_RS_WARNING;
6996
6997 /*
6998 * The encoding is complicated enough to warrant cross-checking it against
6999 * the decode function.
7000 */
7001#ifdef USE_ASSERT_CHECKING
7002 {
7003 bool zeroed_any_2,
7004 ignored_any_2;
7005 uint8 zeroed_or_error_count_2,
7006 checkfail_count_2,
7007 first_off_2;
7008
7010 &zeroed_any_2, &ignored_any_2,
7011 &zeroed_or_error_count_2,
7012 &checkfail_count_2,
7013 &first_off_2);
7014 Assert(zeroed_any == zeroed_any_2);
7015 Assert(ignored_any == ignored_any_2);
7016 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7017 Assert(checkfail_count == checkfail_count_2);
7018 Assert(first_off == first_off_2);
7019 }
7020#endif
7021
7022#undef READV_COUNT_BITS
7023#undef READV_COUNT_MASK
7024}
7025
7026/*
7027 * Helper for AIO readv completion callbacks, supporting both shared and temp
7028 * buffers. Gets called once for each buffer in a multi-page read.
7029 */
7032 uint8 flags, bool failed, bool is_temp,
7033 bool *buffer_invalid,
7034 bool *failed_checksum,
7035 bool *ignored_checksum,
7036 bool *zeroed_buffer)
7037{
7038 BufferDesc *buf_hdr = is_temp ?
7041 BufferTag tag = buf_hdr->tag;
7042 char *bufdata = BufferGetBlock(buffer);
7043 uint32 set_flag_bits;
7044 int piv_flags;
7045
7046 /* check that the buffer is in the expected state for a read */
7047#ifdef USE_ASSERT_CHECKING
7048 {
7049 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7050
7051 Assert(buf_state & BM_TAG_VALID);
7052 Assert(!(buf_state & BM_VALID));
7053 /* temp buffers don't use BM_IO_IN_PROGRESS */
7054 if (!is_temp)
7055 Assert(buf_state & BM_IO_IN_PROGRESS);
7056 Assert(!(buf_state & BM_DIRTY));
7057 }
7058#endif
7059
7060 *buffer_invalid = false;
7061 *failed_checksum = false;
7062 *ignored_checksum = false;
7063 *zeroed_buffer = false;
7064
7065 /*
7066 * We ask PageIsVerified() to only log the message about checksum errors,
7067 * as the completion might be run in any backend (or IO workers). We will
7068 * report checksum errors in buffer_readv_report().
7069 */
7070 piv_flags = PIV_LOG_LOG;
7071
7072 /* the local zero_damaged_pages may differ from the definer's */
7074 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7075
7076 /* Check for garbage data. */
7077 if (!failed)
7078 {
7079 /*
7080 * If the buffer is not currently pinned by this backend, e.g. because
7081 * we're completing this IO after an error, the buffer data will have
7082 * been marked as inaccessible when the buffer was unpinned. The AIO
7083 * subsystem holds a pin, but that doesn't prevent the buffer from
7084 * having been marked as inaccessible. The completion might also be
7085 * executed in a different process.
7086 */
7087#ifdef USE_VALGRIND
7088 if (!BufferIsPinned(buffer))
7089 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7090#endif
7091
7092 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7093 failed_checksum))
7094 {
7095 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7096 {
7097 memset(bufdata, 0, BLCKSZ);
7098 *zeroed_buffer = true;
7099 }
7100 else
7101 {
7102 *buffer_invalid = true;
7103 /* mark buffer as having failed */
7104 failed = true;
7105 }
7106 }
7107 else if (*failed_checksum)
7108 *ignored_checksum = true;
7109
7110 /* undo what we did above */
7111#ifdef USE_VALGRIND
7112 if (!BufferIsPinned(buffer))
7113 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7114#endif
7115
7116 /*
7117 * Immediately log a message about the invalid page, but only to the
7118 * server log. The reason to do so immediately is that this may be
7119 * executed in a different backend than the one that originated the
7120 * request. The reason to do so immediately is that the originator
7121 * might not process the query result immediately (because it is busy
7122 * doing another part of query processing) or at all (e.g. if it was
7123 * cancelled or errored out due to another IO also failing). The
7124 * definer of the IO will emit an ERROR or WARNING when processing the
7125 * IO's results
7126 *
7127 * To avoid duplicating the code to emit these log messages, we reuse
7128 * buffer_readv_report().
7129 */
7130 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7131 {
7132 PgAioResult result_one = {0};
7133
7134 buffer_readv_encode_error(&result_one, is_temp,
7135 *zeroed_buffer,
7136 *ignored_checksum,
7137 *buffer_invalid,
7138 *zeroed_buffer ? 1 : 0,
7139 *failed_checksum ? 1 : 0,
7140 buf_off, buf_off, buf_off);
7141 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7142 }
7143 }
7144
7145 /* Terminate I/O and set BM_VALID. */
7146 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7147 if (is_temp)
7148 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7149 else
7150 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7151
7152 /*
7153 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7154 * callback may not be executed in the same backend that called
7155 * BUFFER_READ_START. The alternative would be to defer calling the
7156 * tracepoint to a later point (e.g. the local completion callback for
7157 * shared buffer reads), which seems even less helpful.
7158 */
7159 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7160 tag.blockNum,
7161 tag.spcOid,
7162 tag.dbOid,
7163 tag.relNumber,
7165 false);
7166}
7167
7168/*
7169 * Perform completion handling of a single AIO read. This read may cover
7170 * multiple blocks / buffers.
7171 *
7172 * Shared between shared and local buffers, to reduce code duplication.
7173 */
7176 uint8 cb_data, bool is_temp)
7177{
7178 PgAioResult result = prior_result;
7180 uint8 first_error_off = 0;
7181 uint8 first_zeroed_off = 0;
7182 uint8 first_ignored_off = 0;
7183 uint8 error_count = 0;
7184 uint8 zeroed_count = 0;
7185 uint8 ignored_count = 0;
7186 uint8 checkfail_count = 0;
7187 uint64 *io_data;
7188 uint8 handle_data_len;
7189
7190 if (is_temp)
7191 {
7192 Assert(td->smgr.is_temp);
7194 }
7195 else
7196 Assert(!td->smgr.is_temp);
7197
7198 /*
7199 * Iterate over all the buffers affected by this IO and call the
7200 * per-buffer completion function for each buffer.
7201 */
7202 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7203 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7204 {
7205 Buffer buf = io_data[buf_off];
7206 bool failed;
7207 bool failed_verification = false;
7208 bool failed_checksum = false;
7209 bool zeroed_buffer = false;
7210 bool ignored_checksum = false;
7211
7213
7214 /*
7215 * If the entire I/O failed on a lower-level, each buffer needs to be
7216 * marked as failed. In case of a partial read, the first few buffers
7217 * may be ok.
7218 */
7219 failed =
7220 prior_result.status == PGAIO_RS_ERROR
7221 || prior_result.result <= buf_off;
7222
7223 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7224 &failed_verification,
7225 &failed_checksum,
7226 &ignored_checksum,
7227 &zeroed_buffer);
7228
7229 /*
7230 * Track information about the number of different kinds of error
7231 * conditions across all pages, as there can be multiple pages failing
7232 * verification as part of one IO.
7233 */
7234 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7235 first_error_off = buf_off;
7236 if (zeroed_buffer && zeroed_count++ == 0)
7237 first_zeroed_off = buf_off;
7238 if (ignored_checksum && ignored_count++ == 0)
7239 first_ignored_off = buf_off;
7240 if (failed_checksum)
7241 checkfail_count++;
7242 }
7243
7244 /*
7245 * If the smgr read succeeded [partially] and page verification failed for
7246 * some of the pages, adjust the IO's result state appropriately.
7247 */
7248 if (prior_result.status != PGAIO_RS_ERROR &&
7249 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7250 {
7251 buffer_readv_encode_error(&result, is_temp,
7252 zeroed_count > 0, ignored_count > 0,
7253 error_count, zeroed_count, checkfail_count,
7254 first_error_off, first_zeroed_off,
7255 first_ignored_off);
7256 pgaio_result_report(result, td, DEBUG1);
7257 }
7258
7259 /*
7260 * For shared relations this reporting is done in
7261 * shared_buffer_readv_complete_local().
7262 */
7263 if (is_temp && checkfail_count > 0)
7265 checkfail_count);
7266
7267 return result;
7268}
7269
7270/*
7271 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7272 * aio_local_buffer_readv_cb.
7273 *
7274 * The error is encoded / decoded in buffer_readv_encode_error() /
7275 * buffer_readv_decode_error().
7276 */
7277static void
7279 int elevel)
7280{
7281 int nblocks = td->smgr.nblocks;
7282 BlockNumber first = td->smgr.blockNum;
7283 BlockNumber last = first + nblocks - 1;
7284 ProcNumber errProc =
7286 RelPathStr rpath =
7287 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7288 bool zeroed_any,
7289 ignored_any;
7290 uint8 zeroed_or_error_count,
7291 checkfail_count,
7292 first_off;
7293 uint8 affected_count;
7294 const char *msg_one,
7295 *msg_mult,
7296 *det_mult,
7297 *hint_mult;
7298
7299 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7300 &zeroed_or_error_count,
7301 &checkfail_count,
7302 &first_off);
7303
7304 /*
7305 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7306 * special case, it's too irregular to be emitted the same way as the
7307 * other cases.
7308 */
7309 if (zeroed_any && ignored_any)
7310 {
7311 Assert(zeroed_any && ignored_any);
7312 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7313 Assert(result.status != PGAIO_RS_ERROR);
7314 affected_count = zeroed_or_error_count;
7315
7316 ereport(elevel,
7318 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s",
7319 affected_count, checkfail_count, first, last, rpath.str),
7320 affected_count > 1 ?
7321 errdetail("Block %u held first zeroed page.",
7322 first + first_off) : 0,
7323 errhint("See server log for details about the other %u invalid block(s).",
7324 affected_count + checkfail_count - 1));
7325 return;
7326 }
7327
7328 /*
7329 * The other messages are highly repetitive. To avoid duplicating a long
7330 * and complicated ereport(), gather the translated format strings
7331 * separately and then do one common ereport.
7332 */
7333 if (result.status == PGAIO_RS_ERROR)
7334 {
7335 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7336 affected_count = zeroed_or_error_count;
7337 msg_one = _("invalid page in block %u of relation %s");
7338 msg_mult = _("%u invalid pages among blocks %u..%u of relation %s");
7339 det_mult = _("Block %u held first invalid page.");
7340 hint_mult = _("See server log for the other %u invalid block(s).");
7341 }
7342 else if (zeroed_any && !ignored_any)
7343 {
7344 affected_count = zeroed_or_error_count;
7345 msg_one = _("invalid page in block %u of relation %s; zeroing out page");
7346 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s");
7347 det_mult = _("Block %u held first zeroed page.");
7348 hint_mult = _("See server log for the other %u zeroed block(s).");
7349 }
7350 else if (!zeroed_any && ignored_any)
7351 {
7352 affected_count = checkfail_count;
7353 msg_one = _("ignoring checksum failure in block %u of relation %s");
7354 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s");
7355 det_mult = _("Block %u held first ignored page.");
7356 hint_mult = _("See server log for the other %u ignored block(s).");
7357 }
7358 else
7360
7361 ereport(elevel,
7363 affected_count == 1 ?
7364 errmsg_internal(msg_one, first + first_off, rpath.str) :
7365 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7366 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7367 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7368}
7369
7370static void
7372{
7373 buffer_stage_common(ioh, false, false);
7374}
7375
7376static PgAioResult
7378 uint8 cb_data)
7379{
7380 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7381}
7382
7383/*
7384 * We need a backend-local completion callback for shared buffers, to be able
7385 * to report checksum errors correctly. Unfortunately that can only safely
7386 * happen if the reporting backend has previously called
7387 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7388 * the backend that started the IO. Hence this callback.
7389 */
7390static PgAioResult
7392 uint8 cb_data)
7393{
7394 bool zeroed_any,
7395 ignored_any;
7396 uint8 zeroed_or_error_count,
7397 checkfail_count,
7398 first_off;
7399
7400 if (prior_result.status == PGAIO_RS_OK)
7401 return prior_result;
7402
7403 buffer_readv_decode_error(prior_result,
7404 &zeroed_any,
7405 &ignored_any,
7406 &zeroed_or_error_count,
7407 &checkfail_count,
7408 &first_off);
7409
7410 if (checkfail_count)
7411 {
7413
7415 checkfail_count);
7416 }
7417
7418 return prior_result;
7419}
7420
7421static void
7423{
7424 buffer_stage_common(ioh, false, true);
7425}
7426
7427static PgAioResult
7429 uint8 cb_data)
7430{
7431 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7432}
7433
7434/* readv callback is passed READ_BUFFERS_* flags as callback data */
7437 .complete_shared = shared_buffer_readv_complete,
7438 /* need a local callback to report checksum failures */
7439 .complete_local = shared_buffer_readv_complete_local,
7440 .report = buffer_readv_report,
7441};
7442
7443/* readv callback is passed READ_BUFFERS_* flags as callback data */
7446
7447 /*
7448 * Note that this, in contrast to the shared_buffers case, uses
7449 * complete_local, as only the issuing backend has access to the required
7450 * datastructures. This is important in case the IO completion may be
7451 * consumed incidentally by another backend.
7452 */
7453 .complete_local = local_buffer_readv_complete,
7454 .report = buffer_readv_report,
7455};
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:875
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:159
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:868
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:340
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:304
bool pgaio_have_staged(void)
Definition: aio.c:1006
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:909
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:329
void pgaio_submit_staged(void)
Definition: aio.c:1022
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:895
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:228
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:185
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:349
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:410
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:295
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:239
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:147
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5652
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:5031
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5403
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4893
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6345
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7175
bool BufferIsExclusiveLocked(Buffer buffer)
Definition: bufmgr.c:2891
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:244
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4229
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1569
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4538
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3014
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7391
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1262
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1532
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:651
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4289
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6544
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7428
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1494
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6678
int io_max_combine_limit
Definition: bufmgr.c:172
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3072
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:235
bool zero_damaged_pages
Definition: bufmgr.c:144
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3183
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6726
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:7031
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6287
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6311
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5911
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6552
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:858
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3996
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6159
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7435
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:890
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1193
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1598
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1031
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:2005
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4065
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1556
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5245
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4661
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6230
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:922
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7444
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2282
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4047
int io_combine_limit_guc
Definition: bufmgr.c:171
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6376
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4250
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6785
#define BUF_REUSABLE
Definition: bufmgr.c:81
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6214
static void BufferSync(int flags)
Definition: bufmgr.c:3349
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1769
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7422
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4172
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6575
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4215
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2919
static uint32 MaxProportionalPins
Definition: bufmgr.c:221
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2610
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3625
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3229
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4461
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7371
void UnlockBuffers(void)
Definition: bufmgr.c:5577
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:561
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7377
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2350
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5631
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4429
int bgwriter_flush_after
Definition: bufmgr.c:179
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5371
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4832
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4491
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5827
int checkpoint_flush_after
Definition: bufmgr.c:178
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5388
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1110
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6100
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3273
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6198
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6411
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1637
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6399
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2952
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6585
int backend_flush_after
Definition: bufmgr.c:180
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2548
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7278
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2566
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5685
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5605
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5435
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4941
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6461
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6649
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:842
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:682
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
int maintenance_io_concurrency
Definition: bufmgr.c:162
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3264
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5309
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2183
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5131
int effective_io_concurrency
Definition: bufmgr.c:155
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6043
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1513
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6257
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6560
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6894
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:170
void InitBufferManagerAccess(void)
Definition: bufmgr.c:4013
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6936
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3923
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2522
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:758
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5853
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
uint32 GetPinLimit(void)
Definition: bufmgr.c:2510
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5964
#define BUF_WRITTEN
Definition: bufmgr.c:80
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5351
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define P_NEW
Definition: bufmgr.h:191
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:112
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:167
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:384
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:114
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:161
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:116
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:162
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:118
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:108
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define likely(x)
Definition: c.h:346
uint8_t uint8
Definition: c.h:500
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
#define Max(x, y)
Definition: c.h:969
double float8
Definition: c.h:601
#define pg_attribute_always_inline
Definition: c.h:270
int16_t int16
Definition: c.h:497
int32_t int32
Definition: c.h:498
uint64_t uint64
Definition: c.h:503
#define pg_unreachable()
Definition: c.h:332
#define unlikely(x)
Definition: c.h:347
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
#define MemSet(start, val, len)
Definition: c.h:991
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:909
size_t Size
Definition: c.h:576
bool IsCatalogRelationOid(Oid relid)
Definition: catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition: catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:773
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:956
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1421
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1386
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1231
int errdetail(const char *fmt,...)
Definition: elog.c:1204
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1340
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:197
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:723
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:800
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:840
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:143
bool enableFsync
Definition: globals.c:130
ProcNumber MyProcNumber
Definition: globals.c:91
int VacuumCostPageMiss
Definition: globals.c:153
bool VacuumCostActive
Definition: globals.c:159
int VacuumCostBalance
Definition: globals.c:158
int MaxBackends
Definition: globals.c:147
int VacuumCostPageDirty
Definition: globals.c:154
int VacuumCostPageHit
Definition: globals.c:152
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:48
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:832
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:993
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:663
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1004
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:796
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:693
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
int NLocBuffer
Definition: localbuf.c:44
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:839
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1985
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1891
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2029
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1353
void ForEachLWLockHeldByMe(void(*callback)(LWLock *, LWLockMode, void *), void *context)
Definition: lwlock.c:1970
LWLockMode
Definition: lwlock.h:113
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2172
void pfree(void *pointer)
Definition: mcxt.c:2152
void * palloc(Size size)
Definition: mcxt.c:1945
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
static PgChecksumMode mode
Definition: pg_checksums.c:55
static int64 current_size
Definition: pg_checksums.c:63
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:41
static char * buf
Definition: pg_test_fsync.c:72
IOObject
Definition: pgstat.h:273
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:275
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:704
IOContext
Definition: pgstat.h:282
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_EXTEND
Definition: pgstat.h:311
@ IOOP_READ
Definition: pgstat.h:312
@ IOOP_WRITEBACK
Definition: pgstat.h:308
@ IOOP_HIT
Definition: pgstat.h:306
@ IOOP_EVICT
Definition: pgstat.h:304
@ IOOP_REUSE
Definition: pgstat.h:307
@ IOOP_WRITE
Definition: pgstat.h:313
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:709
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
#define NUM_AUXILIARY_PROCS
Definition: proc.h:455
#define DELAY_CHKPT_START
Definition: proc.h:128
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:48
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:423
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:371
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:740
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:578
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:648
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:669
#define RelationIsValid(relation)
Definition: rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:452
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:751
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1987
PGPROC * MyProc
Definition: proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:767
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:755
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1975
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
struct SMgrRelationData * smgr
Definition: bufmgr.h:104
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
struct ErrorContextCallback * previous
Definition: elog.h:296
void(* callback)(void *arg)
Definition: elog.h:297
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:220
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:249
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:239
PgStat_Counter maxwritten_clean
Definition: pgstat.h:240
PgStat_Counter buf_alloc
Definition: pgstat.h:241
PgStat_Counter buffers_written
Definition: pgstat.h:263
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:127
PgAioWaitRef io_wref
Definition: bufmgr.h:140
Buffer * buffers
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:128
BlockNumber blocknum
Definition: bufmgr.h:136
PgAioReturn io_return
Definition: bufmgr.h:141
struct SMgrRelationData * smgr
Definition: bufmgr.h:125
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1828
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
BlockNumber nblocks
Definition: aio_types.h:67
struct PgAioTargetData::@124 smgr
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:170
bool RecoveryInProgress(void)
Definition: xlog.c:6522
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3254
CheckpointStatsData CheckpointStats
Definition: xlog.c:209
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2923
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
#define InHotStandby
Definition: xlogutils.h:60