Skip to content

Commit d8cd0c6

Browse files
committed
Remove the restriction that the relmap must be 512 bytes.
Instead of relying on the ability to atomically overwrite the entire relmap file in one shot, write a new one and durably rename it into place. Removing the struct padding and the calculation showing why the map is exactly 512 bytes, and change the maximum number of entries to a nearby round number. Patch by me, reviewed by Andres Freund and Dilip Kumar. Discussion: http://postgr.es/m/CA+TgmoZq5%3DLWDK7kHaUbmWXxcaTuw_QwafgG9dr-BaPym_U8WQ%40mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-ttOXLX75k_WzRo9ar=VvxFhrHi+rJxns997F+yvkm==A@mail.gmail.com
1 parent e530be2 commit d8cd0c6

File tree

4 files changed

+58
-46
lines changed

4 files changed

+58
-46
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1409,8 +1409,8 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
14091409
<entry>Waiting for a read of the relation map file.</entry>
14101410
</row>
14111411
<row>
1412-
<entry><literal>RelationMapSync</literal></entry>
1413-
<entry>Waiting for the relation map file to reach durable storage.</entry>
1412+
<entry><literal>RelationMapReplace</literal></entry>
1413+
<entry>Waiting for durable replacement of a relation map file.</entry>
14141414
</row>
14151415
<row>
14161416
<entry><literal>RelationMapWrite</literal></entry>

src/backend/utils/activity/wait_event.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -633,8 +633,8 @@ pgstat_get_wait_io(WaitEventIO w)
633633
case WAIT_EVENT_RELATION_MAP_READ:
634634
event_name = "RelationMapRead";
635635
break;
636-
case WAIT_EVENT_RELATION_MAP_SYNC:
637-
event_name = "RelationMapSync";
636+
case WAIT_EVENT_RELATION_MAP_REPLACE:
637+
event_name = "RelationMapReplace";
638638
break;
639639
case WAIT_EVENT_RELATION_MAP_WRITE:
640640
event_name = "RelationMapWrite";

src/backend/utils/cache/relmapper.c

Lines changed: 53 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -60,21 +60,26 @@
6060
/*
6161
* The map file is critical data: we have no automatic method for recovering
6262
* from loss or corruption of it. We use a CRC so that we can detect
63-
* corruption. To minimize the risk of failed updates, the map file should
64-
* be kept to no more than one standard-size disk sector (ie 512 bytes),
65-
* and we use overwrite-in-place rather than playing renaming games.
66-
* The struct layout below is designed to occupy exactly 512 bytes, which
67-
* might make filesystem updates a bit more efficient.
63+
* corruption. Since the file might be more than one standard-size disk
64+
* sector in size, we cannot rely on overwrite-in-place. Instead, we generate
65+
* a new file and rename it into place, atomically replacing the original file.
6866
*
6967
* Entries in the mappings[] array are in no particular order. We could
7068
* speed searching by insisting on OID order, but it really shouldn't be
7169
* worth the trouble given the intended size of the mapping sets.
7270
*/
7371
#define RELMAPPER_FILENAME "pg_filenode.map"
72+
#define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
7473

7574
#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
7675

77-
#define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
76+
/*
77+
* There's no need for this constant to have any particular value, and we
78+
* can raise it as necessary if we end up with more mapped relations. For
79+
* now, we just pick a round number that is modestly larger than the expected
80+
* number of mappings.
81+
*/
82+
#define MAX_MAPPINGS 64
7883

7984
typedef struct RelMapping
8085
{
@@ -88,7 +93,6 @@ typedef struct RelMapFile
8893
int32 num_mappings; /* number of valid RelMapping entries */
8994
RelMapping mappings[MAX_MAPPINGS];
9095
pg_crc32c crc; /* CRC of all above */
91-
int32 pad; /* to make the struct size be 512 exactly */
9296
} RelMapFile;
9397

9498
/*
@@ -877,6 +881,7 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
877881
{
878882
int fd;
879883
char mapfilename[MAXPGPATH];
884+
char maptempfilename[MAXPGPATH];
880885

881886
/*
882887
* Fill in the overhead fields and update CRC.
@@ -890,17 +895,47 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
890895
FIN_CRC32C(newmap->crc);
891896

892897
/*
893-
* Open the target file. We prefer to do this before entering the
894-
* critical section, so that an open() failure need not force PANIC.
898+
* Construct filenames -- a temporary file that we'll create to write the
899+
* data initially, and then the permanent name to which we will rename it.
895900
*/
896901
snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
897902
dbpath, RELMAPPER_FILENAME);
898-
fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY);
903+
snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
904+
dbpath, RELMAPPER_TEMP_FILENAME);
905+
906+
/*
907+
* Open a temporary file. If a file already exists with this name, it must
908+
* be left over from a previous crash, so we can overwrite it. Concurrent
909+
* calls to this function are not allowed.
910+
*/
911+
fd = OpenTransientFile(maptempfilename,
912+
O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
899913
if (fd < 0)
900914
ereport(ERROR,
901915
(errcode_for_file_access(),
902916
errmsg("could not open file \"%s\": %m",
903-
mapfilename)));
917+
maptempfilename)));
918+
919+
/* Write new data to the file. */
920+
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
921+
if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
922+
{
923+
/* if write didn't set errno, assume problem is no disk space */
924+
if (errno == 0)
925+
errno = ENOSPC;
926+
ereport(ERROR,
927+
(errcode_for_file_access(),
928+
errmsg("could not write file \"%s\": %m",
929+
maptempfilename)));
930+
}
931+
pgstat_report_wait_end();
932+
933+
/* And close the file. */
934+
if (CloseTransientFile(fd) != 0)
935+
ereport(ERROR,
936+
(errcode_for_file_access(),
937+
errmsg("could not close file \"%s\": %m",
938+
maptempfilename)));
904939

905940
if (write_wal)
906941
{
@@ -924,40 +959,17 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
924959
XLogFlush(lsn);
925960
}
926961

927-
errno = 0;
928-
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
929-
if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
930-
{
931-
/* if write didn't set errno, assume problem is no disk space */
932-
if (errno == 0)
933-
errno = ENOSPC;
934-
ereport(ERROR,
935-
(errcode_for_file_access(),
936-
errmsg("could not write file \"%s\": %m",
937-
mapfilename)));
938-
}
939-
pgstat_report_wait_end();
940-
941962
/*
942-
* We choose to fsync the data to disk before considering the task done.
943-
* It would be possible to relax this if it turns out to be a performance
944-
* issue, but it would complicate checkpointing --- see notes for
945-
* CheckPointRelationMap.
963+
* durable_rename() does all the hard work of making sure that we rename
964+
* the temporary file into place in a crash-safe manner.
965+
*
966+
* NB: Although we instruct durable_rename() to use ERROR, we will often
967+
* be in a critical section at this point; if so, ERROR will become PANIC.
946968
*/
947-
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
948-
if (pg_fsync(fd) != 0)
949-
ereport(data_sync_elevel(ERROR),
950-
(errcode_for_file_access(),
951-
errmsg("could not fsync file \"%s\": %m",
952-
mapfilename)));
969+
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
970+
durable_rename(maptempfilename, mapfilename, ERROR);
953971
pgstat_report_wait_end();
954972

955-
if (CloseTransientFile(fd) != 0)
956-
ereport(ERROR,
957-
(errcode_for_file_access(),
958-
errmsg("could not close file \"%s\": %m",
959-
mapfilename)));
960-
961973
/*
962974
* Now that the file is safely on disk, send sinval message to let other
963975
* backends know to re-read it. We must do this inside the critical

src/include/utils/wait_event.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ typedef enum
194194
WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
195195
WAIT_EVENT_LOGICAL_REWRITE_WRITE,
196196
WAIT_EVENT_RELATION_MAP_READ,
197-
WAIT_EVENT_RELATION_MAP_SYNC,
197+
WAIT_EVENT_RELATION_MAP_REPLACE,
198198
WAIT_EVENT_RELATION_MAP_WRITE,
199199
WAIT_EVENT_REORDER_BUFFER_READ,
200200
WAIT_EVENT_REORDER_BUFFER_WRITE,

0 commit comments

Comments
 (0)