Skip to content

Commit e5bcbb1

Browse files
committed
Advance old-segment horizon properly after slot invalidation
When some slots are invalidated due to the max_slot_wal_keep_size limit, the old segment horizon should move forward to stay within the limit. However, in commit c655077 we forgot to call KeepLogSeg again to recompute the horizon after invalidating replication slots. In cases where other slots remained, the limits would be recomputed eventually for other reasons, but if all slots were invalidated, the limits would not move at all afterwards. Repair. Backpatch to 13 where the feature was introduced. Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reported-by: Marcin Krupowicz <mk@071.ovh> Discussion: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
1 parent 525115a commit e5bcbb1

File tree

4 files changed

+74
-12
lines changed

4 files changed

+74
-12
lines changed

src/backend/access/transam/xlog.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9262,7 +9262,15 @@ CreateCheckPoint(int flags)
92629262
*/
92639263
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
92649264
KeepLogSeg(recptr, &_logSegNo);
9265-
InvalidateObsoleteReplicationSlots(_logSegNo);
9265+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9266+
{
9267+
/*
9268+
* Some slots have been invalidated; recalculate the old-segment
9269+
* horizon, starting again from RedoRecPtr.
9270+
*/
9271+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9272+
KeepLogSeg(recptr, &_logSegNo);
9273+
}
92669274
_logSegNo--;
92679275
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
92689276

@@ -9602,7 +9610,15 @@ CreateRestartPoint(int flags)
96029610
replayPtr = GetXLogReplayRecPtr(&replayTLI);
96039611
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
96049612
KeepLogSeg(endptr, &_logSegNo);
9605-
InvalidateObsoleteReplicationSlots(_logSegNo);
9613+
if (InvalidateObsoleteReplicationSlots(_logSegNo))
9614+
{
9615+
/*
9616+
* Some slots have been invalidated; recalculate the old-segment
9617+
* horizon, starting again from RedoRecPtr.
9618+
*/
9619+
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9620+
KeepLogSeg(endptr, &_logSegNo);
9621+
}
96069622
_logSegNo--;
96079623

96089624
/*
@@ -9771,6 +9787,12 @@ GetWALAvailability(XLogRecPtr targetLSN)
97719787
* requirement of replication slots. For the latter criterion we do consider
97729788
* the effects of max_slot_wal_keep_size: reserve at most that much space back
97739789
* from recptr.
9790+
*
9791+
* Note about replication slots: if this function calculates a value
9792+
* that's further ahead than what slots need reserved, then affected
9793+
* slots need to be invalidated and this function invoked again.
9794+
* XXX it might be a good idea to rewrite this function so that
9795+
* invalidation is optionally done here, instead.
97749796
*/
97759797
static void
97769798
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)

src/backend/replication/slot.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,11 +1142,14 @@ ReplicationSlotReserveWal(void)
11421142
* Returns whether ReplicationSlotControlLock was released in the interim (and
11431143
* in that case we're not holding the lock at return, otherwise we are).
11441144
*
1145+
* Sets *invalidated true if the slot was invalidated. (Untouched otherwise.)
1146+
*
11451147
* This is inherently racy, because we release the LWLock
11461148
* for syscalls, so caller must restart if we return true.
11471149
*/
11481150
static bool
1149-
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
1151+
InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
1152+
bool *invalidated)
11501153
{
11511154
int last_signaled_pid = 0;
11521155
bool released_lock = false;
@@ -1203,6 +1206,9 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
12031206
s->active_pid = MyProcPid;
12041207
s->data.invalidated_at = restart_lsn;
12051208
s->data.restart_lsn = InvalidXLogRecPtr;
1209+
1210+
/* Let caller know */
1211+
*invalidated = true;
12061212
}
12071213

12081214
SpinLockRelease(&s->mutex);
@@ -1290,12 +1296,15 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN)
12901296
* Mark any slot that points to an LSN older than the given segment
12911297
* as invalid; it requires WAL that's about to be removed.
12921298
*
1299+
* Returns true when any slot have got invalidated.
1300+
*
12931301
* NB - this runs as part of checkpoint, so avoid raising errors if possible.
12941302
*/
1295-
void
1303+
bool
12961304
InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
12971305
{
12981306
XLogRecPtr oldestLSN;
1307+
bool invalidated = false;
12991308

13001309
XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
13011310

@@ -1308,13 +1317,24 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
13081317
if (!s->in_use)
13091318
continue;
13101319

1311-
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN))
1320+
if (InvalidatePossiblyObsoleteSlot(s, oldestLSN, &invalidated))
13121321
{
13131322
/* if the lock was released, start from scratch */
13141323
goto restart;
13151324
}
13161325
}
13171326
LWLockRelease(ReplicationSlotControlLock);
1327+
1328+
/*
1329+
* If any slots have been invalidated, recalculate the resource limits.
1330+
*/
1331+
if (invalidated)
1332+
{
1333+
ReplicationSlotsComputeRequiredXmin(false);
1334+
ReplicationSlotsComputeRequiredLSN();
1335+
}
1336+
1337+
return invalidated;
13181338
}
13191339

13201340
/*

src/include/replication/slot.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ extern void ReplicationSlotsComputeRequiredLSN(void);
214214
extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
215215
extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
216216
extern void ReplicationSlotsDropDBSlots(Oid dboid);
217-
extern void InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
217+
extern bool InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno);
218218
extern ReplicationSlot *SearchNamedReplicationSlot(const char *name, bool need_lock);
219219
extern void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot);
220220
extern void ReplicationSlotDropAtPubNode(WalReceiverConn *wrconn, char *slotname, bool missing_ok);

src/test/recovery/t/019_replslot_limit.pl

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
use PostgresNode;
1212

1313
use File::Path qw(rmtree);
14-
use Test::More tests => $TestLib::windows_os ? 14 : 18;
14+
use Test::More tests => $TestLib::windows_os ? 15 : 19;
1515
use Time::HiRes qw(usleep);
1616

1717
$ENV{PGDATABASE} = 'postgres';
@@ -176,7 +176,12 @@
176176
# Advance WAL again, the slot loses the oldest segment.
177177
my $logstart = get_log_size($node_primary);
178178
advance_wal($node_primary, 7);
179-
$node_primary->safe_psql('postgres', "CHECKPOINT;");
179+
180+
# This slot should be broken, wait for that to happen
181+
$node_primary->poll_query_until(
182+
'postgres',
183+
qq[SELECT wal_status = 'lost' FROM pg_replication_slots
184+
WHERE slot_name = 'rep1']);
180185

181186
# WARNING should be issued
182187
ok( find_in_log(
@@ -185,13 +190,28 @@
185190
$logstart),
186191
'check that the warning is logged');
187192

188-
# This slot should be broken
189-
$result = $node_primary->safe_psql('postgres',
190-
"SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size FROM pg_replication_slots WHERE slot_name = 'rep1'"
191-
);
193+
$result = $node_primary->safe_psql(
194+
'postgres',
195+
qq[
196+
SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size
197+
FROM pg_replication_slots WHERE slot_name = 'rep1']);
192198
is($result, "rep1|f|t|lost|",
193199
'check that the slot became inactive and the state "lost" persists');
194200

201+
# The invalidated slot shouldn't keep the old-segment horizon back;
202+
# see bug #17103: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
203+
# Test for this by creating a new slot and comparing its restart LSN
204+
# to the oldest existing file.
205+
my $redoseg = $node_primary->safe_psql('postgres',
206+
"SELECT pg_walfile_name(lsn) FROM pg_create_physical_replication_slot('s2', true)"
207+
);
208+
my $oldestseg = $node_primary->safe_psql('postgres',
209+
"SELECT pg_ls_dir AS f FROM pg_ls_dir('pg_wal') WHERE pg_ls_dir ~ '^[0-9A-F]{24}\$' ORDER BY 1 LIMIT 1"
210+
);
211+
$node_primary->safe_psql('postgres',
212+
qq[SELECT pg_drop_replication_slot('s2')]);
213+
is($oldestseg, $redoseg, "check that segments have been removed");
214+
195215
# The standby no longer can connect to the primary
196216
$logstart = get_log_size($node_standby);
197217
$node_standby->start;

0 commit comments

Comments
 (0)