Skip to content

Commit f92c854

Browse files
committed
Make pg_stat_io count IOs as bytes instead of blocks for some operations
Currently, in the pg_stat_io view, I/Os are counted as blocks of size BLCKSZ. There are two limitations with this design:

* The actual number of I/O requests sent to the kernel is lower, because I/O requests may be merged before being sent. Additionally, it gives the impression that all I/Os are done in block-sized units, which obscures the benefits of merging I/O requests.
* Some patches are in progress to extend pg_stat_io to track operations that may not be tied to the block size. For example, WAL read I/Os are done in variable sizes, so it is not possible to show them correctly in the pg_stat_io view, and we want to keep all this data in a single system view rather than spread it across multiple relations, to ease monitoring.

WaitReadBuffers() can now be tracked as a single read operation worth N blocks. The same applies to ExtendBufferedRelShared() and ExtendBufferedRelLocal() for extensions.

Three columns (read_bytes, write_bytes and extend_bytes) are added to pg_stat_io for the byte calculations of reads, writes and extensions. op_bytes, which was always hardcoded to BLCKSZ, is removed. IO backend statistics are updated to reflect these changes.

Bump catalog version.

Author: Nazir Bilal Yavuz
Reviewed-by: Bertrand Drouvot, Melanie Plageman
Discussion: https://postgr.es/m/CAN55FZ0oqxBaaHAEsj=xFqkzE3n5P=3RA1V_igXwL-RV7QRzyw@mail.gmail.com
1 parent b4a07f5 commit f92c854

File tree

12 files changed

+164
-71
lines changed

12 files changed

+164
-71
lines changed

doc/src/sgml/monitoring.sgml

+32-19
Original file line numberDiff line numberDiff line change
@@ -2692,8 +2692,18 @@ description | Waiting for a newly initialized WAL file to reach durable storage
26922692
<structfield>reads</structfield> <type>bigint</type>
26932693
</para>
26942694
<para>
2695-
Number of read operations, each of the size specified in
2696-
<varname>op_bytes</varname>.
2695+
Number of read operations.
2696+
</para>
2697+
</entry>
2698+
</row>
2699+
2700+
<row>
2701+
<entry role="catalog_table_entry">
2702+
<para role="column_definition">
2703+
<structfield>read_bytes</structfield> <type>numeric</type>
2704+
</para>
2705+
<para>
2706+
The total size of read operations in bytes.
26972707
</para>
26982708
</entry>
26992709
</row>
@@ -2716,8 +2726,18 @@ description | Waiting for a newly initialized WAL file to reach durable storage
27162726
<structfield>writes</structfield> <type>bigint</type>
27172727
</para>
27182728
<para>
2719-
Number of write operations, each of the size specified in
2720-
<varname>op_bytes</varname>.
2729+
Number of write operations.
2730+
</para>
2731+
</entry>
2732+
</row>
2733+
2734+
<row>
2735+
<entry role="catalog_table_entry">
2736+
<para role="column_definition">
2737+
<structfield>write_bytes</structfield> <type>numeric</type>
2738+
</para>
2739+
<para>
2740+
The total size of write operations in bytes.
27212741
</para>
27222742
</entry>
27232743
</row>
@@ -2740,8 +2760,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage
27402760
<structfield>writebacks</structfield> <type>bigint</type>
27412761
</para>
27422762
<para>
2743-
Number of units of size <varname>op_bytes</varname> which the process
2744-
requested the kernel write out to permanent storage.
2763+
Number of units of size <symbol>BLCKSZ</symbol> (typically 8kB) which
2764+
the process requested the kernel write out to permanent storage.
27452765
</para>
27462766
</entry>
27472767
</row>
@@ -2766,37 +2786,30 @@ description | Waiting for a newly initialized WAL file to reach durable storage
27662786
<structfield>extends</structfield> <type>bigint</type>
27672787
</para>
27682788
<para>
2769-
Number of relation extend operations, each of the size specified in
2770-
<varname>op_bytes</varname>.
2789+
Number of relation extend operations.
27712790
</para>
27722791
</entry>
27732792
</row>
27742793

27752794
<row>
27762795
<entry role="catalog_table_entry">
27772796
<para role="column_definition">
2778-
<structfield>extend_time</structfield> <type>double precision</type>
2797+
<structfield>extend_bytes</structfield> <type>numeric</type>
27792798
</para>
27802799
<para>
2781-
Time spent in extend operations in milliseconds (if
2782-
<xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
2800+
The total size of relation extend operations in bytes.
27832801
</para>
27842802
</entry>
27852803
</row>
27862804

27872805
<row>
27882806
<entry role="catalog_table_entry">
27892807
<para role="column_definition">
2790-
<structfield>op_bytes</structfield> <type>bigint</type>
2791-
</para>
2792-
<para>
2793-
The number of bytes per unit of I/O read, written, or extended.
2808+
<structfield>extend_time</structfield> <type>double precision</type>
27942809
</para>
27952810
<para>
2796-
Relation data reads, writes, and extends are done in
2797-
<varname>block_size</varname> units, derived from the build-time
2798-
parameter <symbol>BLCKSZ</symbol>, which is <literal>8192</literal> by
2799-
default.
2811+
Time spent in extend operations in milliseconds (if
2812+
<xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
28002813
</para>
28012814
</entry>
28022815
</row>

src/backend/catalog/system_views.sql

+3-1
Original file line numberDiff line numberDiff line change
@@ -1156,14 +1156,16 @@ SELECT
11561156
b.object,
11571157
b.context,
11581158
b.reads,
1159+
b.read_bytes,
11591160
b.read_time,
11601161
b.writes,
1162+
b.write_bytes,
11611163
b.write_time,
11621164
b.writebacks,
11631165
b.writeback_time,
11641166
b.extends,
1167+
b.extend_bytes,
11651168
b.extend_time,
1166-
b.op_bytes,
11671169
b.hits,
11681170
b.evictions,
11691171
b.reuses,

src/backend/storage/buffer/bufmgr.c

+7-7
Original file line numberDiff line numberDiff line change
@@ -1165,7 +1165,7 @@ PinBufferForBlock(Relation rel,
11651165
}
11661166
if (*foundPtr)
11671167
{
1168-
pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1);
1168+
pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
11691169
if (VacuumCostActive)
11701170
VacuumCostBalance += VacuumCostPageHit;
11711171

@@ -1515,7 +1515,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
15151515
io_start = pgstat_prepare_io_time(track_io_timing);
15161516
smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
15171517
pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1518-
io_buffers_len);
1518+
1, io_buffers_len * BLCKSZ);
15191519

15201520
/* Verify each block we read, and terminate the I/O. */
15211521
for (int j = 0; j < io_buffers_len; ++j)
@@ -2073,7 +2073,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
20732073
* pinners or erroring out.
20742074
*/
20752075
pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2076-
from_ring ? IOOP_REUSE : IOOP_EVICT, 1);
2076+
from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
20772077
}
20782078

20792079
/*
@@ -2429,7 +2429,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
24292429
UnlockRelationForExtension(bmr.rel, ExclusiveLock);
24302430

24312431
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2432-
io_start, extend_by);
2432+
io_start, 1, extend_by * BLCKSZ);
24332433

24342434
/* Set BM_VALID, terminate IO, and wake up any waiters */
24352435
for (uint32 i = 0; i < extend_by; i++)
@@ -3891,7 +3891,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
38913891
* of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
38923892
*/
38933893
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3894-
IOOP_WRITE, io_start, 1);
3894+
IOOP_WRITE, io_start, 1, BLCKSZ);
38953895

38963896
pgBufferUsage.shared_blks_written++;
38973897

@@ -4530,7 +4530,7 @@ FlushRelationBuffers(Relation rel)
45304530

45314531
pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
45324532
IOCONTEXT_NORMAL, IOOP_WRITE,
4533-
io_start, 1);
4533+
io_start, 1, BLCKSZ);
45344534

45354535
buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
45364536
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
@@ -6037,7 +6037,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
60376037
* blocks of permanent relations.
60386038
*/
60396039
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6040-
IOOP_WRITEBACK, io_start, wb_context->nr_pending);
6040+
IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
60416041

60426042
wb_context->nr_pending = 0;
60436043
}

src/backend/storage/buffer/localbuf.c

+4-3
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ GetLocalVictimBuffer(void)
255255

256256
/* Temporary table I/O does not use Buffer Access Strategies */
257257
pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
258-
IOOP_WRITE, io_start, 1);
258+
IOOP_WRITE, io_start, 1, BLCKSZ);
259259

260260
/* Mark not-dirty now in case we error out below */
261261
buf_state &= ~BM_DIRTY;
@@ -279,7 +279,8 @@ GetLocalVictimBuffer(void)
279279
ClearBufferTag(&bufHdr->tag);
280280
buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
281281
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
282-
pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT, 1);
282+
283+
pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT, 1, 0);
283284
}
284285

285286
return BufferDescriptorGetBuffer(bufHdr);
@@ -419,7 +420,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
419420
smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
420421

421422
pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND,
422-
io_start, extend_by);
423+
io_start, 1, extend_by * BLCKSZ);
423424

424425
for (uint32 i = 0; i < extend_by; i++)
425426
{

src/backend/storage/smgr/md.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -1401,7 +1401,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
14011401
* backend fsyncs.
14021402
*/
14031403
pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1404-
IOOP_FSYNC, io_start, 1);
1404+
IOOP_FSYNC, io_start, 1, 0);
14051405
}
14061406
}
14071407

@@ -1796,7 +1796,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
17961796
FileClose(file);
17971797

17981798
pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1799-
IOOP_FSYNC, io_start, 1);
1799+
IOOP_FSYNC, io_start, 1, 0);
18001800

18011801
errno = save_errno;
18021802
return result;

src/backend/utils/activity/pgstat_backend.c

+2
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
6565

6666
bktype_shstats->counts[io_object][io_context][io_op] +=
6767
pending_io->counts[io_object][io_context][io_op];
68+
bktype_shstats->bytes[io_object][io_context][io_op] +=
69+
pending_io->bytes[io_object][io_context][io_op];
6870

6971
time = pending_io->pending_times[io_object][io_context][io_op];
7072

src/backend/utils/activity/pgstat_io.c

+17-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323
static PgStat_PendingIO PendingIOStats;
2424
static bool have_iostats = false;
2525

26+
/*
27+
* Check if an IOOp is tracked in bytes. This relies on the ordering of IOOp
28+
* defined in pgstat.h, so make sure to update this check when changing its
29+
* elements.
30+
*/
31+
#define pgstat_is_ioop_tracked_in_bytes(io_op) \
32+
((io_op) < IOOP_NUM_TYPES && (io_op) >= IOOP_EXTEND)
2633

2734
/*
2835
* Check that stats have not been counted for any combination of IOObject,
@@ -66,11 +73,13 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
6673
}
6774

6875
void
69-
pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt)
76+
pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
77+
uint32 cnt, uint64 bytes)
7078
{
7179
Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
7280
Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
7381
Assert((unsigned int) io_op < IOOP_NUM_TYPES);
82+
Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
7483
Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
7584

7685
if (pgstat_tracks_backend_bktype(MyBackendType))
@@ -79,9 +88,11 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32
7988

8089
entry_ref = pgstat_prep_backend_pending(MyProcNumber);
8190
entry_ref->pending_io.counts[io_object][io_context][io_op] += cnt;
91+
entry_ref->pending_io.bytes[io_object][io_context][io_op] += bytes;
8292
}
8393

8494
PendingIOStats.counts[io_object][io_context][io_op] += cnt;
95+
PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
8596

8697
have_iostats = true;
8798
}
@@ -114,7 +125,7 @@ pgstat_prepare_io_time(bool track_io_guc)
114125
*/
115126
void
116127
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
117-
instr_time start_time, uint32 cnt)
128+
instr_time start_time, uint32 cnt, uint64 bytes)
118129
{
119130
if (track_io_timing)
120131
{
@@ -153,7 +164,7 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
153164
}
154165
}
155166

156-
pgstat_count_io_op(io_object, io_context, io_op, cnt);
167+
pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
157168
}
158169

159170
PgStat_IO *
@@ -219,6 +230,9 @@ pgstat_io_flush_cb(bool nowait)
219230
bktype_shstats->counts[io_object][io_context][io_op] +=
220231
PendingIOStats.counts[io_object][io_context][io_op];
221232

233+
bktype_shstats->bytes[io_object][io_context][io_op] +=
234+
PendingIOStats.bytes[io_object][io_context][io_op];
235+
222236
time = PendingIOStats.pending_times[io_object][io_context][io_op];
223237

224238
bktype_shstats->times[io_object][io_context][io_op] +=

0 commit comments

Comments
 (0)