Skip to content

Commit 24043c2

Browse files
committed
Add test case for obsoleting slot with active walsender, take 2
The code to signal a running walsender when its reserved WAL size grows too large was completely uncovered before this commit; this adds coverage for that case.

This test involves sending SIGSTOP to walsender and walreceiver, then advancing enough WAL for a checkpoint to trigger, then sending SIGCONT.

There's no precedent for STOP signalling in Perl tests, and my reading of relevant manpages says it's likely to fail on Windows. Because of this, this test is always skipped on that platform.

This version fixes a couple of rarely hit race conditions in the previous attempt 0912698; most notably, both LOG string searches are loops, not just the second one; we acquire the start-of-log position before STOP-signalling; and we reference the correct process name in the test description. All per Tom Lane.

Author: Álvaro Herrera <alvherre@alvh.no-ip.org>
Discussion: https://postgr.es/m/202106102202.mjw4huiix7lo@alvherre.pgsql
1 parent 741d7f1 commit 24043c2

File tree

1 file changed

+94
-3
lines changed

1 file changed

+94
-3
lines changed

src/test/recovery/t/019_replslot_limit.pl

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
use PostgresNode;
1212

1313
use File::Path qw(rmtree);
14-
use Test::More tests => 14;
14+
use Test::More tests => $TestLib::windows_os ? 14 : 18;
1515
use Time::HiRes qw(usleep);
1616

1717
$ENV{PGDATABASE} = 'postgres';
@@ -211,8 +211,8 @@
211211
}
212212
ok($failed, 'check that replication has been broken');
213213

214-
$node_primary->stop('immediate');
215-
$node_standby->stop('immediate');
214+
$node_primary->stop;
215+
$node_standby->stop;
216216

217217
my $node_primary2 = get_new_node('primary2');
218218
$node_primary2->init(allows_streaming => 1);
@@ -253,6 +253,97 @@
253253
timeout => '60'));
254254
is($result[1], 'finished', 'check if checkpoint command is not blocked');
255255

256+
$node_primary2->stop;
257+
$node_standby->stop;
258+
259+
# The next test depends on Perl's `kill`, which apparently is not
260+
# portable to Windows. (It would be nice to use Test::More's `subtest`,
261+
# but that's not in the ancient version we require.)
262+
if ($TestLib::windows_os)
263+
{
264+
done_testing();
265+
exit;
266+
}
267+
268+
# Get a slot terminated while the walsender is active
269+
# We do this by sending SIGSTOP to the walsender and the walreceiver.
# (This part is never reached on Windows; the script has already exited.)
270+
my $node_primary3 = get_new_node('primary3');
271+
$node_primary3->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
272+
$node_primary3->append_conf(
273+
'postgresql.conf', qq(
274+
min_wal_size = 2MB
275+
max_wal_size = 2MB
276+
log_checkpoints = yes
277+
max_slot_wal_keep_size = 1MB
278+
));
279+
$node_primary3->start;
280+
$node_primary3->safe_psql('postgres',
281+
"SELECT pg_create_physical_replication_slot('rep3')");
282+
# Take backup
283+
$backup_name = 'my_backup';
284+
$node_primary3->backup($backup_name);
285+
# Create standby
286+
my $node_standby3 = get_new_node('standby_3');
287+
$node_standby3->init_from_backup($node_primary3, $backup_name,
288+
has_streaming => 1);
289+
$node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
290+
$node_standby3->start;
291+
$node_primary3->wait_for_catchup($node_standby3->name, 'replay');
292+
my $senderpid = $node_primary3->safe_psql('postgres',
293+
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
294+
like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
295+
my $receiverpid = $node_standby3->safe_psql('postgres',
296+
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
297+
like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
298+
299+
$logstart = get_log_size($node_primary3);
300+
# Freeze walsender and walreceiver. Slot will still be active, but walreceiver
301+
# won't get anything anymore.
302+
kill 'STOP', $senderpid, $receiverpid;
303+
advance_wal($node_primary3, 2);
304+
305+
my $max_attempts = 180;
306+
while ($max_attempts-- >= 0)
307+
{
308+
if (find_in_log(
309+
$node_primary3,
310+
"terminating process $senderpid to release replication slot \"rep3\"",
311+
$logstart))
312+
{
313+
ok(1, "walsender termination logged");
314+
last;
315+
}
316+
sleep 1;
317+
}
318+
319+
# Now let the walsender continue; slot should be killed now.
320+
# (Must not let walreceiver run yet; otherwise the standby could start another
321+
# one before the slot can be killed)
322+
kill 'CONT', $senderpid;
323+
$node_primary3->poll_query_until('postgres',
324+
"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep3'",
325+
"lost")
326+
or die "timed out waiting for slot to be lost";
327+
328+
$max_attempts = 180;
329+
while ($max_attempts-- >= 0)
330+
{
331+
if (find_in_log(
332+
$node_primary3,
333+
'invalidating slot "rep3" because its restart_lsn', $logstart))
334+
{
335+
ok(1, "slot invalidation logged");
336+
last;
337+
}
338+
sleep 1;
339+
}
340+
341+
# Now let the walreceiver continue, so that the node can be stopped cleanly
342+
kill 'CONT', $receiverpid;
343+
344+
$node_primary3->stop;
345+
$node_standby3->stop;
346+
256347
#####################################
257348
# Advance WAL of $node by $n segments
258349
sub advance_wal

0 commit comments

Comments
 (0)