Skip to content

Commit 05d4cbf

Browse files
committed
Increase width of RelFileNumbers from 32 bits to 56 bits.
RelFileNumbers are now assigned using a separate counter, instead of being assigned from the OID counter. This counter never wraps around: if all 2^56 possible RelFileNumbers are used, an internal error occurs. As the cluster is limited to 2^64 total bytes of WAL, this limitation should not cause a problem in practice. If the counter were 64 bits wide rather than 56 bits wide, we would need to increase the width of the BufferTag, which might adversely impact buffer lookup performance. Also, this lets us use bigint for pg_class.relfilenode and other places where these values are exposed at the SQL level without worrying about overflow. This should remove the need to keep "tombstone" files around until the next checkpoint when relations are removed. We do that to keep RelFileNumbers from being recycled, but now that won't happen anyway. However, this patch doesn't actually change anything in this area; it just makes it possible for a future patch to do so. Dilip Kumar, based on an idea from Andres Freund, who also reviewed some earlier versions of the patch. Further review and some wordsmithing by me. Also reviewed at various points by Ashutosh Sharma, Vignesh C, Amul Sul, Álvaro Herrera, and Tom Lane. Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
1 parent 2f47715 commit 05d4cbf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+694
-290
lines changed

contrib/pg_buffercache/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ OBJS = \
66
pg_buffercache_pages.o
77

88
EXTENSION = pg_buffercache
9-
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
10-
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql
9+
DATA = pg_buffercache--1.0--1.1.sql pg_buffercache--1.1--1.2.sql pg_buffercache--1.2.sql \
10+
pg_buffercache--1.2--1.3.sql pg_buffercache--1.3--1.4.sql
1111
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
1212

1313
REGRESS = pg_buffercache
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql */
2+
3+
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
4+
\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.4'" to load this file. \quit
5+
6+
/* First we have to remove them from the extension */
7+
ALTER EXTENSION pg_buffercache DROP VIEW pg_buffercache;
8+
ALTER EXTENSION pg_buffercache DROP FUNCTION pg_buffercache_pages();
9+
10+
/* Then we can drop them */
11+
DROP VIEW pg_buffercache;
12+
DROP FUNCTION pg_buffercache_pages();
13+
14+
/* Now redefine */
15+
CREATE FUNCTION pg_buffercache_pages()
16+
RETURNS SETOF RECORD
17+
AS 'MODULE_PATHNAME', 'pg_buffercache_pages_v1_4'
18+
LANGUAGE C PARALLEL SAFE;
19+
20+
CREATE VIEW pg_buffercache AS
21+
SELECT P.* FROM pg_buffercache_pages() AS P
22+
(bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid,
23+
relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
24+
pinning_backends int4);
25+
26+
-- Don't want these to be available to public.
27+
REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
28+
REVOKE ALL ON pg_buffercache FROM PUBLIC;
29+
GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
30+
GRANT SELECT ON pg_buffercache TO pg_monitor;
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# pg_buffercache extension
22
comment = 'examine the shared buffer cache'
3-
default_version = '1.3'
3+
default_version = '1.4'
44
module_pathname = '$libdir/pg_buffercache'
55
relocatable = true

contrib/pg_buffercache/pg_buffercache_pages.c

+35-4
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,10 @@ typedef struct
5959
* relation node/tablespace/database/blocknum and dirty indicator.
6060
*/
6161
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
62+
PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4);
6263

63-
Datum
64-
pg_buffercache_pages(PG_FUNCTION_ARGS)
64+
static Datum
65+
pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid)
6566
{
6667
FuncCallContext *funcctx;
6768
Datum result;
@@ -103,7 +104,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
103104
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
104105
INT4OID, -1, 0);
105106
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
106-
OIDOID, -1, 0);
107+
rfn_typid, -1, 0);
107108
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
108109
OIDOID, -1, 0);
109110
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
@@ -209,7 +210,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
209210
}
210211
else
211212
{
212-
values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
213+
if (rfn_typid == INT8OID)
214+
values[1] =
215+
Int64GetDatum((int64) fctx->record[i].relfilenumber);
216+
else
217+
{
218+
Assert(rfn_typid == OIDOID);
219+
220+
if (fctx->record[i].relfilenumber > OID_MAX)
221+
ereport(ERROR,
222+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
223+
errmsg("relfilenode %llu is too large to be represented as an OID",
224+
(unsigned long long) fctx->record[i].relfilenumber),
225+
errhint("Upgrade the extension using ALTER EXTENSION pg_buffercache UPDATE"));
226+
227+
values[1] =
228+
ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber);
229+
}
230+
213231
nulls[1] = false;
214232
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
215233
nulls[2] = false;
@@ -237,3 +255,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
237255
else
238256
SRF_RETURN_DONE(funcctx);
239257
}
258+
259+
/* entry point for old extension version */
260+
Datum
261+
pg_buffercache_pages(PG_FUNCTION_ARGS)
262+
{
263+
return pg_buffercache_pages_internal(fcinfo, OIDOID);
264+
}
265+
266+
Datum
267+
pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS)
268+
{
269+
return pg_buffercache_pages_internal(fcinfo, INT8OID);
270+
}

contrib/pg_prewarm/autoprewarm.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ apw_load_buffers(void)
345345
{
346346
unsigned forknum;
347347

348-
if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
348+
if (fscanf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n", &blkinfo[i].database,
349349
&blkinfo[i].tablespace, &blkinfo[i].filenumber,
350350
&forknum, &blkinfo[i].blocknum) != 5)
351351
ereport(ERROR,
@@ -669,7 +669,7 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged)
669669
{
670670
CHECK_FOR_INTERRUPTS();
671671

672-
ret = fprintf(file, "%u,%u,%u,%u,%u\n",
672+
ret = fprintf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n",
673673
block_info_array[i].database,
674674
block_info_array[i].tablespace,
675675
block_info_array[i].filenumber,

contrib/pg_walinspect/expected/pg_walinspect.out

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1');
5454
-- ===================================================================
5555
-- Test for filtering out WAL records of a particular table
5656
-- ===================================================================
57-
SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
57+
SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
5858
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
59-
WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
59+
WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
6060
ok
6161
----
6262
t

contrib/pg_walinspect/sql/pg_walinspect.sql

+2-2
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1');
3939
-- Test for filtering out WAL records of a particular table
4040
-- ===================================================================
4141

42-
SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
42+
SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
4343

4444
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
45-
WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
45+
WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
4646

4747
-- ===================================================================
4848
-- Test for filtering out WAL records based on resource_manager and

doc/src/sgml/catalogs.sgml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1984,7 +1984,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
19841984

19851985
<row>
19861986
<entry role="catalog_table_entry"><para role="column_definition">
1987-
<structfield>relfilenode</structfield> <type>oid</type>
1987+
<structfield>relfilenode</structfield> <type>int8</type>
19881988
</para>
19891989
<para>
19901990
Name of the on-disk file of this relation; zero means this

doc/src/sgml/func.sgml

+5
Original file line numberDiff line numberDiff line change
@@ -25210,6 +25210,11 @@ SELECT collation for ('foo' COLLATE "de_DE");
2521025210
<entry><type>timestamp with time zone</type></entry>
2521125211
</row>
2521225212

25213+
<row>
25214+
<entry><structfield>next_relfilenumber</structfield></entry>
25215+
<entry><type>bigint</type></entry>
25216+
</row>
25217+
2521325218
</tbody>
2521425219
</tgroup>
2521525220
</table>

doc/src/sgml/pgbuffercache.sgml

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262

6363
<row>
6464
<entry role="catalog_table_entry"><para role="column_definition">
65-
<structfield>relfilenode</structfield> <type>oid</type>
65+
<structfield>relfilenode</structfield> <type>int8</type>
6666
(references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relfilenode</structfield>)
6767
</para>
6868
<para>

doc/src/sgml/storage.sgml

+6-5
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,12 @@ with the suffix <literal>_init</literal> (see <xref linkend="storage-init"/>).
217217

218218
<caution>
219219
<para>
220-
Note that while a table's filenode often matches its OID, this is
221-
<emphasis>not</emphasis> necessarily the case; some operations, like
222-
<command>TRUNCATE</command>, <command>REINDEX</command>, <command>CLUSTER</command> and some forms
223-
of <command>ALTER TABLE</command>, can change the filenode while preserving the OID.
224-
Avoid assuming that filenode and table OID are the same.
220+
Note that a table's filenode will normally be different than the OID. For
221+
system tables, the initial filenode will be equal to the table OID, but it will
222+
be different if the table has ever been subjected to a rewriting operation,
223+
such as <command>TRUNCATE</command>, <command>REINDEX</command>,
224+
<command>CLUSTER</command> or some forms of <command>ALTER TABLE</command>.
225+
For user tables, even the initial filenode will be different than the table OID.
225226
Also, for certain system catalogs including <structname>pg_class</structname> itself,
226227
<structname>pg_class</structname>.<structfield>relfilenode</structfield> contains zero. The
227228
actual filenode number of these catalogs is stored in a lower-level data

src/backend/access/gin/ginxlog.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
100100
BlockNumber blknum;
101101

102102
BufferGetTag(buffer, &locator, &forknum, &blknum);
103-
elog(ERROR, "failed to add item to index page in %u/%u/%u",
103+
elog(ERROR, "failed to add item to index page in %u/%u/" UINT64_FORMAT,
104104
locator.spcOid, locator.dbOid, locator.relNumber);
105105
}
106106
}

src/backend/access/rmgrdesc/gistdesc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
2626
static void
2727
out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec)
2828
{
29-
appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u",
29+
appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; blk %u; latestRemovedXid %u:%u",
3030
xlrec->locator.spcOid, xlrec->locator.dbOid,
3131
xlrec->locator.relNumber, xlrec->block,
3232
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),

src/backend/access/rmgrdesc/heapdesc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
169169
{
170170
xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec;
171171

172-
appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
172+
appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; tid %u/%u",
173173
xlrec->target_locator.spcOid,
174174
xlrec->target_locator.dbOid,
175175
xlrec->target_locator.relNumber,

src/backend/access/rmgrdesc/nbtdesc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ btree_desc(StringInfo buf, XLogReaderState *record)
100100
{
101101
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
102102

103-
appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
103+
appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; latestRemovedXid %u:%u",
104104
xlrec->locator.spcOid, xlrec->locator.dbOid,
105105
xlrec->locator.relNumber,
106106
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),

src/backend/access/rmgrdesc/seqdesc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ seq_desc(StringInfo buf, XLogReaderState *record)
2525
xl_seq_rec *xlrec = (xl_seq_rec *) rec;
2626

2727
if (info == XLOG_SEQ_LOG)
28-
appendStringInfo(buf, "rel %u/%u/%u",
28+
appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT,
2929
xlrec->locator.spcOid, xlrec->locator.dbOid,
3030
xlrec->locator.relNumber);
3131
}

src/backend/access/rmgrdesc/xlogdesc.c

+16-5
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
4545
CheckPoint *checkpoint = (CheckPoint *) rec;
4646

4747
appendStringInfo(buf, "redo %X/%X; "
48-
"tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; "
49-
"oldest xid %u in DB %u; oldest multi %u in DB %u; "
48+
"tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenumber " UINT64_FORMAT ";oid %u; "
49+
"multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; "
5050
"oldest/newest commit timestamp xid: %u/%u; "
5151
"oldest running xid %u; %s",
5252
LSN_FORMAT_ARGS(checkpoint->redo),
@@ -55,6 +55,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
5555
checkpoint->fullPageWrites ? "true" : "false",
5656
EpochFromFullTransactionId(checkpoint->nextXid),
5757
XidFromFullTransactionId(checkpoint->nextXid),
58+
checkpoint->nextRelFileNumber,
5859
checkpoint->nextOid,
5960
checkpoint->nextMulti,
6061
checkpoint->nextMultiOffset,
@@ -74,6 +75,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
7475
memcpy(&nextOid, rec, sizeof(Oid));
7576
appendStringInfo(buf, "%u", nextOid);
7677
}
78+
else if (info == XLOG_NEXT_RELFILENUMBER)
79+
{
80+
RelFileNumber nextRelFileNumber;
81+
82+
memcpy(&nextRelFileNumber, rec, sizeof(RelFileNumber));
83+
appendStringInfo(buf, UINT64_FORMAT, nextRelFileNumber);
84+
}
7785
else if (info == XLOG_RESTORE_POINT)
7886
{
7987
xl_restore_point *xlrec = (xl_restore_point *) rec;
@@ -169,6 +177,9 @@ xlog_identify(uint8 info)
169177
case XLOG_NEXTOID:
170178
id = "NEXTOID";
171179
break;
180+
case XLOG_NEXT_RELFILENUMBER:
181+
id = "NEXT_RELFILENUMBER";
182+
break;
172183
case XLOG_SWITCH:
173184
id = "SWITCH";
174185
break;
@@ -237,7 +248,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
237248
appendStringInfoChar(buf, ' ');
238249

239250
appendStringInfo(buf,
240-
"blkref #%d: rel %u/%u/%u fork %s blk %u",
251+
"blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
241252
block_id,
242253
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
243254
forkNames[forknum],
@@ -297,7 +308,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
297308
if (forknum != MAIN_FORKNUM)
298309
{
299310
appendStringInfo(buf,
300-
", blkref #%d: rel %u/%u/%u fork %s blk %u",
311+
", blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
301312
block_id,
302313
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
303314
forkNames[forknum],
@@ -306,7 +317,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
306317
else
307318
{
308319
appendStringInfo(buf,
309-
", blkref #%d: rel %u/%u/%u blk %u",
320+
", blkref #%d: rel %u/%u/" UINT64_FORMAT " blk %u",
310321
block_id,
311322
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
312323
blk);

src/backend/access/transam/README

+3-2
Original file line numberDiff line numberDiff line change
@@ -692,8 +692,9 @@ by having database restart search for files that don't have any committed
692692
entry in pg_class, but that currently isn't done because of the possibility
693693
of deleting data that is useful for forensic analysis of the crash.
694694
Orphan files are harmless --- at worst they waste a bit of disk space ---
695-
because we check for on-disk collisions when allocating new relfilenumber
696-
OIDs. So cleaning up isn't really necessary.
695+
because the relfilenumber counter is monotonically increasing. The maximum
696+
value is 2^56-1, and there is no provision for wraparound. Thus, on-disk
697+
collisions aren't possible.
697698

698699
3. Deleting a table, which requires an unlink() that could fail.
699700

0 commit comments

Comments
 (0)