Skip to content

Commit 867d396

Browse files
committed
Adjust pg_wal_replay_wait() procedure behavior on promoted standby
pg_wal_replay_wait() is intended to be called on standby. However, standby can be promoted to primary at any moment, even concurrently with the pg_wal_replay_wait() call. If recovery is not currently in progress that doesn't mean the wait was unsuccessful. Thus, we always need to recheck if the target LSN is replayed. Reported-by: Kevin Hale Boyes Discussion: https://postgr.es/m/CAPpHfdu5QN%2BZGACS%2B7foxmr8_nekgA2PA%2B-G3BuOUrdBLBFb6Q%40mail.gmail.com Author: Alexander Korotkov
1 parent bbf668d commit 867d396

File tree

3 files changed

+55
-11
lines changed

3 files changed

+55
-11
lines changed

doc/src/sgml/func.sgml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28969,6 +28969,15 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
2896928969
connection pooler side.
2897028970
</para>
2897128971

28972+
<para>
28973+
<function>pg_wal_replay_wait</function> should be called on a standby.
28974+
If a user calls <function>pg_wal_replay_wait</function> on a primary, it
28975+
will error out. However, if <function>pg_wal_replay_wait</function> is
28976+
called on a primary promoted from a standby and <literal>target_lsn</literal>
28977+
was already replayed, then <function>pg_wal_replay_wait</function> just
28978+
exits immediately.
28979+
</para>
28980+
2897228981
<para>
2897328982
You can use <function>pg_wal_replay_wait</function> to wait for
2897428983
the <type>pg_lsn</type> value. For example, an application could update

src/backend/commands/waitlsn.c

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,27 @@ WaitForLSNReplay(XLogRecPtr targetLSN, int64 timeout)
230230
Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends);
231231

232232
if (!RecoveryInProgress())
233+
{
234+
/*
235+
* Recovery is not in progress. Given that we detected this in the
236+
* very first check, this procedure was mistakenly called on primary.
237+
* However, it's possible that the standby was promoted concurrently with
238+
* the procedure call, while target LSN is replayed. So, we still
239+
* check the last replay LSN before reporting an error.
240+
*/
241+
if (targetLSN <= GetXLogReplayRecPtr(NULL))
242+
return;
233243
ereport(ERROR,
234244
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
235245
errmsg("recovery is not in progress"),
236246
errhint("Waiting for LSN can only be executed during recovery.")));
237-
238-
/* If target LSN is already replayed, exit immediately */
239-
if (targetLSN <= GetXLogReplayRecPtr(NULL))
240-
return;
247+
}
248+
else
249+
{
250+
/* If target LSN is already replayed, exit immediately */
251+
if (targetLSN <= GetXLogReplayRecPtr(NULL))
252+
return;
253+
}
241254

242255
if (timeout > 0)
243256
{
@@ -257,19 +270,30 @@ WaitForLSNReplay(XLogRecPtr targetLSN, int64 timeout)
257270
int rc;
258271
long delay_ms = 0;
259272

260-
/* Check if the waited LSN has been replayed */
261-
currentLSN = GetXLogReplayRecPtr(NULL);
262-
if (targetLSN <= currentLSN)
263-
break;
264-
265273
/* Recheck that recovery is still in-progress */
266274
if (!RecoveryInProgress())
275+
{
276+
/*
277+
* Recovery was ended, but recheck if target LSN was already
278+
* replayed.
279+
*/
280+
currentLSN = GetXLogReplayRecPtr(NULL);
281+
if (targetLSN <= currentLSN)
282+
return;
267283
ereport(ERROR,
268284
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
269285
errmsg("recovery is not in progress"),
270286
errdetail("Recovery ended before replaying target LSN %X/%X; last replay LSN %X/%X.",
271287
LSN_FORMAT_ARGS(targetLSN),
272288
LSN_FORMAT_ARGS(currentLSN))));
289+
}
290+
else
291+
{
292+
/* Check if the waited LSN has been replayed */
293+
currentLSN = GetXLogReplayRecPtr(NULL);
294+
if (targetLSN <= currentLSN)
295+
break;
296+
}
273297

274298
/*
275299
* If the timeout value is specified, calculate the number of

src/test/recovery/t/043_wal_replay_wait.pl

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,18 @@
126126

127127
# 5. Check that the standby promotion terminates the wait on LSN. Start
128128
# waiting for an unreachable LSN then promote. Check the log for the relevant
129-
# error message.
129+
# error message. Also, check that waiting for an already replayed LSN doesn't
130+
# cause an error even after promotion.
131+
my $lsn4 =
132+
$node_primary->safe_psql('postgres',
133+
"SELECT pg_current_wal_insert_lsn() + 10000000000");
134+
my $lsn5 =
135+
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_insert_lsn()");
130136
my $psql_session = $node_standby1->background_psql('postgres');
131137
$psql_session->query_until(
132138
qr/start/, qq[
133139
\\echo start
134-
CALL pg_wal_replay_wait('${lsn3}');
140+
CALL pg_wal_replay_wait('${lsn4}');
135141
]);
136142

137143
$log_offset = -s $node_standby1->logfile;
@@ -140,6 +146,11 @@
140146

141147
ok(1, 'got error after standby promote');
142148

149+
$node_standby1->safe_psql('postgres', "CALL pg_wal_replay_wait('${lsn5}');");
150+
151+
ok(1,
152+
'wait for already replayed LSN exits immediately even after promotion');
153+
143154
$node_standby1->stop;
144155
$node_primary->stop;
145156

0 commit comments

Comments
 (0)