Skip to content

Commit ead9e51

Browse files
committed
Advance old-segment horizon properly after slot invalidation
When some slots are invalidated due to the max_slot_wal_keep_size limit, the old segment horizon should move forward to stay within the limit. However, in commit c655077 we forgot to call KeepLogSeg again to recompute the horizon after invalidating replication slots. In cases where other slots remained, the limits would be recomputed eventually for other reasons, but if all slots were invalidated, the limits would not move at all afterwards. Repair. Backpatch to 13 where the feature was introduced. Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reported-by: Marcin Krupowicz <mk@071.ovh> Discussion: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
1 parent 46111fb commit ead9e51

File tree

4 files changed

+74
-12
lines changed

4 files changed

+74
-12
lines changed

src/backend/access/transam/xlog.c

+24-2
Original file line numberDiff line numberDiff line change
@@ -9300,7 +9300,15 @@ CreateCheckPoint(int flags)
93009300
*/
93019301
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
93029302
KeepLogSeg(recptr, &_logSegNo);
9303-
InvalidateObsoleteReplicationSlots(_logSegNo);
9303+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9304+
{
9305+
/*
9306+
* Some slots have been invalidated; recalculate the old-segment
9307+
* horizon, starting again from RedoRecPtr.
9308+
*/
9309+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9310+
KeepLogSeg(recptr, &_logSegNo);
9311+
}
93049312
_logSegNo--;
93059313
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
93069314

@@ -9640,7 +9648,15 @@ CreateRestartPoint(int flags)
96409648
replayPtr = GetXLogReplayRecPtr(&replayTLI);
96419649
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
96429650
KeepLogSeg(endptr, &_logSegNo);
9643-
InvalidateObsoleteReplicationSlots(_logSegNo);
9651+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9652+
{
9653+
/*
9654+
* Some slots have been invalidated; recalculate the old-segment
9655+
* horizon, starting again from RedoRecPtr.
9656+
*/
9657+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9658+
KeepLogSeg(endptr, &_logSegNo);
9659+
}
96449660
_logSegNo--;
96459661

96469662
/*
@@ -9809,6 +9825,12 @@ GetWALAvailability(XLogRecPtr targetLSN)
98099825
* requirement of replication slots. For the latter criterion we do consider
98109826
* the effects of max_slot_wal_keep_size: reserve at most that much space back
98119827
* from recptr.
9828+
*
9829+
* Note about replication slots: if this function calculates a value
9830+
* that's further ahead than what slots need reserved, then affected
9831+
* slots need to be invalidated and this function invoked again.
9832+
* XXX it might be a good idea to rewrite this function so that
9833+
* invalidation is optionally done here, instead.
98129834
*/
98139835
static void
98149836
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)

src/backend/replication/slot.c

+23-3
Original file line numberDiff line numberDiff line change
@@ -1143,11 +1143,14 @@ ReplicationSlotReserveWal(void)
11431143
* Returns whether ReplicationSlotControlLock was released in the interim (and
11441144
* in that case we're not holding the lock at return, otherwise we are).
11451145
*
1146+
* Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
1147+
*
11461148
* This is inherently racy, because we release the LWLock
11471149
* for syscalls, so caller must restart if we return true.
11481150
*/
11491151
static bool
1150-
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
1152+
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
1153+
bool *invalidated)
11511154
{
11521155
int last_signaled_pid = 0;
11531156
bool released_lock = false;
@@ -1204,6 +1207,9 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
12041207
s->active_pid = MyProcPid;
12051208
s->data.invalidated_at = restart_lsn;
12061209
s->data.restart_lsn = InvalidXLogRecPtr;
1210+
1211+
/* Let caller know */
1212+
*invalidated = true;
12071213
}
12081214

12091215
SpinLockRelease(&s->mutex);
@@ -1291,12 +1297,15 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
12911297
* Mark any slot that points to an LSN older than the given segment
12921298
* as invalid; it requires WAL that's about to be removed.
12931299
*
1300+
* Returns true when any slot have got invalidated.
1301+
*
12941302
* NB - this runs as part of checkpoint, so avoid raising errors if possible.
12951303
*/
1296-
void
1304+
bool
12971305
InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
12981306
{
12991307
XLogRecPtr oldestLSN;
1308+
bool invalidated = false;
13001309

13011310
XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
13021311

@@ -1309,13 +1318,24 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
13091318
if (!s->in_use)
13101319
continue;
13111320

1312-
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN))
1321+
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
13131322
{
13141323
/* if the lock was released, start from scratch */
13151324
goto restart;
13161325
}
13171326
}
13181327
LWLockRelease(ReplicationSlotControlLock);
1328+
1329+
/*
1330+
* If any slots have been invalidated, recalculate the resource limits.
1331+
*/
1332+
if (invalidated)
1333+
{
1334+
ReplicationSlotsComputeRequiredXmin(false);
1335+
ReplicationSlotsComputeRequiredLSN();
1336+
}
1337+
1338+
return invalidated;
13191339
}
13201340

13211341
/*

src/include/replication/slot.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ extern void ReplicationSlotsComputeRequiredLSN(void);
213213
extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
214214
extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
215215
extern void ReplicationSlotsDropDBSlots(Oid dboid);
216-
extern void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
216+
extern bool InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
217217
extern ReplicationSlot *SearchNamedReplicationSlot(const char *name, bool need_lock);
218218
extern void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot);
219219
extern void ReplicationSlotDropAtPubNode(WalReceiverConn *wrconn, char *slotname, bool missing_ok);

src/test/recovery/t/019_replslot_limit.pl

+26-6
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
use PostgresNode;
1212

1313
use File::Path qw(rmtree);
14-
use Test::More tests => $TestLib::windows_os ? 14 : 18;
14+
use Test::More tests => $TestLib::windows_os ? 15 : 19;
1515
use Time::HiRes qw(usleep);
1616

1717
$ENV{PGDATABASE} = 'postgres';
@@ -176,7 +176,12 @@
176176
# Advance WAL again, the slot loses the oldest segment.
177177
my $logstart = get_log_size($node_primary);
178178
advance_wal($node_primary, 7);
179-
$node_primary->safe_psql('postgres', "CHECKPOINT;");
179+
180+
# This slot should be broken, wait for that to happen
181+
$node_primary->poll_query_until(
182+
'postgres',
183+
qq[SELECT wal_status = 'lost' FROM pg_replication_slots
184+
WHERE slot_name = 'rep1']);
180185

181186
# WARNING should be issued
182187
ok( find_in_log(
@@ -185,13 +190,28 @@
185190
$logstart),
186191
'check that the warning is logged');
187192

188-
# This slot should be broken
189-
$result = $node_primary->safe_psql('postgres',
190-
"SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size FROM pg_replication_slots WHERE slot_name = 'rep1'"
191-
);
193+
$result = $node_primary->safe_psql(
194+
'postgres',
195+
qq[
196+
SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size
197+
FROM pg_replication_slots WHERE slot_name = 'rep1']);
192198
is($result, "rep1|f|t|lost|",
193199
'check that the slot became inactive and the state "lost" persists');
194200

201+
# The invalidated slot shouldn't keep the old-segment horizon back;
202+
# see bug #17103: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
203+
# Test for this by creating a new slot and comparing its restart LSN
204+
# to the oldest existing file.
205+
my $redoseg = $node_primary->safe_psql('postgres',
206+
"SELECT pg_walfile_name(lsn) FROM pg_create_physical_replication_slot('s2', true)"
207+
);
208+
my $oldestseg = $node_primary->safe_psql('postgres',
209+
"SELECT pg_ls_dir AS f FROM pg_ls_dir('pg_wal') WHERE pg_ls_dir ~ '^[0-9A-F]{24}\$' ORDER BY 1 LIMIT 1"
210+
);
211+
$node_primary->safe_psql('postgres',
212+
qq[SELECT pg_drop_replication_slot('s2')]);
213+
is($oldestseg, $redoseg, "check that segments have been removed");
214+
195215
# The standby no longer can connect to the primary
196216
$logstart = get_log_size($node_standby);
197217
$node_standby->start;

0 commit comments

Comments
 (0)