Skip to content

Commit 866237a

Browse files
committed
Advance old-segment horizon properly after slot invalidation
When some slots are invalidated due to the max_slot_wal_keep_size limit, the old segment horizon should move forward to stay within the limit. However, in commit c655077 we forgot to call KeepLogSeg again to recompute the horizon after invalidating replication slots. In cases where other slots remained, the limits would be recomputed eventually for other reasons, but if all slots were invalidated, the limits would not move at all afterwards. Repair. Backpatch to 13 where the feature was introduced. Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reported-by: Marcin Krupowicz <mk@071.ovh> Discussion: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
1 parent 91a1651 commit 866237a

File tree

4 files changed

+74
-12
lines changed

4 files changed

+74
-12
lines changed

src/backend/access/transam/xlog.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9056,7 +9056,15 @@ CreateCheckPoint(int flags)
90569056
*/
90579057
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
90589058
KeepLogSeg(recptr, &_logSegNo);
9059-
InvalidateObsoleteReplicationSlots(_logSegNo);
9059+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9060+
{
9061+
/*
9062+
* Some slots have been invalidated; recalculate the old-segment
9063+
* horizon, starting again from RedoRecPtr.
9064+
*/
9065+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9066+
KeepLogSeg(recptr, &_logSegNo);
9067+
}
90609068
_logSegNo--;
90619069
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
90629070

@@ -9391,7 +9399,15 @@ CreateRestartPoint(int flags)
93919399
replayPtr = GetXLogReplayRecPtr(&replayTLI);
93929400
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
93939401
KeepLogSeg(endptr, &_logSegNo);
9394-
InvalidateObsoleteReplicationSlots(_logSegNo);
9402+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9403+
{
9404+
/*
9405+
* Some slots have been invalidated; recalculate the old-segment
9406+
* horizon, starting again from RedoRecPtr.
9407+
*/
9408+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9409+
KeepLogSeg(endptr, &_logSegNo);
9410+
}
93959411
_logSegNo--;
93969412

93979413
/*
@@ -9559,6 +9575,12 @@ GetWALAvailability(XLogRecPtr targetLSN)
95599575
* requirement of replication slots. For the latter criterion we do consider
95609576
* the effects of max_slot_wal_keep_size: reserve at most that much space back
95619577
* from recptr.
9578+
*
9579+
* Note about replication slots: if this function calculates a value
9580+
* that's further ahead than what slots need reserved, then affected
9581+
* slots need to be invalidated and this function invoked again.
9582+
* XXX it might be a good idea to rewrite this function so that
9583+
* invalidation is optionally done here, instead.
95629584
*/
95639585
static void
95649586
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)

src/backend/replication/slot.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,11 +1130,14 @@ ReplicationSlotReserveWal(void)
11301130
* Returns whether ReplicationSlotControlLock was released in the interim (and
11311131
* in that case we're not holding the lock at return, otherwise we are).
11321132
*
1133+
* Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
1134+
*
11331135
* This is inherently racy, because we release the LWLock
11341136
* for syscalls, so caller must restart if we return true.
11351137
*/
11361138
static bool
1137-
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
1139+
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
1140+
bool *invalidated)
11381141
{
11391142
int last_signaled_pid = 0;
11401143
bool released_lock = false;
@@ -1191,6 +1194,9 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
11911194
s->active_pid = MyProcPid;
11921195
s->data.invalidated_at = restart_lsn;
11931196
s->data.restart_lsn = InvalidXLogRecPtr;
1197+
1198+
/* Let caller know */
1199+
*invalidated = true;
11941200
}
11951201

11961202
SpinLockRelease(&s->mutex);
@@ -1279,12 +1285,15 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
12791285
* Mark any slot that points to an LSN older than the given segment
12801286
* as invalid; it requires WAL that's about to be removed.
12811287
*
1288+
* Returns true when any slot have got invalidated.
1289+
*
12821290
* NB - this runs as part of checkpoint, so avoid raising errors if possible.
12831291
*/
1284-
void
1292+
bool
12851293
InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
12861294
{
12871295
XLogRecPtr oldestLSN;
1296+
bool invalidated = false;
12881297

12891298
XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
12901299

@@ -1297,13 +1306,24 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
12971306
if (!s->in_use)
12981307
continue;
12991308

1300-
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN))
1309+
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
13011310
{
13021311
/* if the lock was released, start from scratch */
13031312
goto restart;
13041313
}
13051314
}
13061315
LWLockRelease(ReplicationSlotControlLock);
1316+
1317+
/*
1318+
* If any slots have been invalidated, recalculate the resource limits.
1319+
*/
1320+
if (invalidated)
1321+
{
1322+
ReplicationSlotsComputeRequiredXmin(false);
1323+
ReplicationSlotsComputeRequiredLSN();
1324+
}
1325+
1326+
return invalidated;
13071327
}
13081328

13091329
/*

src/include/replication/slot.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ extern void ReplicationSlotsComputeRequiredLSN(void);
209209
extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
210210
extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
211211
extern void ReplicationSlotsDropDBSlots(Oid dboid);
212-
extern void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
212+
extern bool InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
213213

214214
extern void StartupReplicationSlots(void);
215215
extern void CheckPointReplicationSlots(void);

src/test/recovery/t/019_replslot_limit.pl

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use PostgresNode;
99

1010
use File::Path qw(rmtree);
11-
use Test::More tests => 14;
11+
use Test::More tests => 15;
1212
use Time::HiRes qw(usleep);
1313

1414
$ENV{PGDATABASE} = 'postgres';
@@ -173,7 +173,12 @@
173173
# Advance WAL again, the slot loses the oldest segment.
174174
my $logstart = get_log_size($node_master);
175175
advance_wal($node_master, 7);
176-
$node_master->safe_psql('postgres', "CHECKPOINT;");
176+
177+
# This slot should be broken, wait for that to happen
178+
$node_master->poll_query_until(
179+
'postgres',
180+
qq[SELECT wal_status = 'lost' FROM pg_replication_slots
181+
WHERE slot_name = 'rep1']);
177182

178183
# WARNING should be issued
179184
ok( find_in_log(
@@ -182,13 +187,28 @@
182187
$logstart),
183188
'check that the warning is logged');
184189

185-
# This slot should be broken
186-
$result = $node_master->safe_psql('postgres',
187-
"SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size FROM pg_replication_slots WHERE slot_name = 'rep1'"
188-
);
190+
$result = $node_master->safe_psql(
191+
'postgres',
192+
qq[
193+
SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size
194+
FROM pg_replication_slots WHERE slot_name = 'rep1']);
189195
is($result, "rep1|f|t|lost|",
190196
'check that the slot became inactive and the state "lost" persists');
191197

198+
# The invalidated slot shouldn't keep the old-segment horizon back;
199+
# see bug #17103: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
200+
# Test for this by creating a new slot and comparing its restart LSN
201+
# to the oldest existing file.
202+
my $redoseg = $node_master->safe_psql('postgres',
203+
"SELECT pg_walfile_name(lsn) FROM pg_create_physical_replication_slot('s2', true)"
204+
);
205+
my $oldestseg = $node_master->safe_psql('postgres',
206+
"SELECT pg_ls_dir AS f FROM pg_ls_dir('pg_wal') WHERE pg_ls_dir ~ '^[0-9A-F]{24}\$' ORDER BY 1 LIMIT 1"
207+
);
208+
$node_master->safe_psql('postgres',
209+
qq[SELECT pg_drop_replication_slot('s2')]);
210+
is($oldestseg, $redoseg, "check that segments have been removed");
211+
192212
# The standby no longer can connect to the master
193213
$logstart = get_log_size($node_standby);
194214
$node_standby->start;

0 commit comments

Comments
 (0)