Skip to content

Commit 39b0369

Browse files
committed
Log long wait time on recovery conflict when it's resolved.
This is a follow-up of the work done in commit 0650ff2. This commit extends log_recovery_conflict_waits so that a log message is produced also when recovery conflict has already been resolved after deadlock_timeout passes, i.e., when the startup process finishes waiting for recovery conflict after deadlock_timeout. This is useful in investigating how long recovery conflicts prevented the recovery from applying WAL. Author: Fujii Masao Reviewed-by: Kyotaro Horiguchi, Bertrand Drouvot Discussion: https://postgr.es/m/9a60178c-a853-1440-2cdc-c3af916cff59@amazon.com
1 parent 6ecaaf8 commit 39b0369

File tree

5 files changed

+61
-15
lines changed

5 files changed

+61
-15
lines changed

doc/src/sgml/config.sgml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6961,7 +6961,7 @@ log_line_prefix = '%m [%p] %q%u@%d/%a '
69616961
<listitem>
69626962
<para>
69636963
Controls whether a log message is produced when the startup process
6964-
is waiting longer than <varname>deadlock_timeout</varname>
6964+
waits longer than <varname>deadlock_timeout</varname>
69656965
for recovery conflicts. This is useful in determining if recovery
69666966
conflicts prevent the recovery from applying WAL.
69676967
</para>

src/backend/storage/buffer/bufmgr.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4034,6 +4034,16 @@ LockBufferForCleanup(Buffer buffer)
40344034
/* Successfully acquired exclusive lock with pincount 1 */
40354035
UnlockBufHdr(bufHdr, buf_state);
40364036

4037+
/*
4038+
* Emit the log message if recovery conflict on buffer pin was
4039+
* resolved but the startup process waited longer than
4040+
* deadlock_timeout for it.
4041+
*/
4042+
if (logged_recovery_conflict)
4043+
LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4044+
waitStart, GetCurrentTimestamp(),
4045+
NULL, false);
4046+
40374047
/* Report change to non-waiting status */
40384048
if (new_status)
40394049
{
@@ -4088,7 +4098,7 @@ LockBufferForCleanup(Buffer buffer)
40884098
DeadlockTimeout))
40894099
{
40904100
LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4091-
waitStart, now, NULL);
4101+
waitStart, now, NULL, true);
40924102
logged_recovery_conflict = true;
40934103
}
40944104
}

src/backend/storage/ipc/standby.c

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,18 +226,27 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
226226
* wait_start is the timestamp when the caller started to wait.
227227
* now is the timestamp when this function has been called.
228228
* wait_list is the list of virtual transaction ids assigned to
229-
* conflicting processes.
229+
* conflicting processes. still_waiting indicates whether
230+
* the startup process is still waiting for the recovery conflict
231+
* to be resolved or not.
230232
*/
231233
void
232234
LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
233-
TimestampTz now, VirtualTransactionId *wait_list)
235+
TimestampTz now, VirtualTransactionId *wait_list,
236+
bool still_waiting)
234237
{
235238
long secs;
236239
int usecs;
237240
long msecs;
238241
StringInfoData buf;
239242
int nprocs = 0;
240243

244+
/*
245+
* There must be no conflicting processes when the recovery conflict has
246+
* already been resolved.
247+
*/
248+
Assert(still_waiting || wait_list == NULL);
249+
241250
TimestampDifference(wait_start, now, &secs, &usecs);
242251
msecs = secs * 1000 + usecs / 1000;
243252
usecs = usecs % 1000;
@@ -275,12 +284,21 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
275284
* conflicting backends in a detail message. Note that if all the backends
276285
* in the list are not active, no detail message is logged.
277286
*/
278-
ereport(LOG,
279-
errmsg("recovery still waiting after %ld.%03d ms: %s",
280-
msecs, usecs, _(get_recovery_conflict_desc(reason))),
281-
nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
282-
"Conflicting processes: %s.",
283-
nprocs, buf.data) : 0);
287+
if (still_waiting)
288+
{
289+
ereport(LOG,
290+
errmsg("recovery still waiting after %ld.%03d ms: %s",
291+
msecs, usecs, _(get_recovery_conflict_desc(reason))),
292+
nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
293+
"Conflicting processes: %s.",
294+
nprocs, buf.data) : 0);
295+
}
296+
else
297+
{
298+
ereport(LOG,
299+
errmsg("recovery finished waiting after %ld.%03d ms: %s",
300+
msecs, usecs, _(get_recovery_conflict_desc(reason))));
301+
}
284302

285303
if (nprocs > 0)
286304
pfree(buf.data);
@@ -375,13 +393,12 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
375393

376394
/*
377395
* Emit the log message if the startup process is waiting
378-
* longer than deadlock_timeout for recovery conflict on
379-
* buffer pin.
396+
* longer than deadlock_timeout for recovery conflict.
380397
*/
381398
if (maybe_log_conflict &&
382399
TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
383400
{
384-
LogRecoveryConflict(reason, waitStart, now, waitlist);
401+
LogRecoveryConflict(reason, waitStart, now, waitlist, true);
385402
logged_recovery_conflict = true;
386403
}
387404
}
@@ -391,6 +408,14 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
391408
waitlist++;
392409
}
393410

411+
/*
412+
* Emit the log message if recovery conflict was resolved but the startup
413+
* process waited longer than deadlock_timeout for it.
414+
*/
415+
if (logged_recovery_conflict)
416+
LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
417+
NULL, false);
418+
394419
/* Reset ps display if we changed it */
395420
if (new_status)
396421
{

src/backend/storage/lmgr/proc.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1322,7 +1322,8 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
13221322
* longer than deadlock_timeout.
13231323
*/
13241324
LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
1325-
standbyWaitStart, now, cnt > 0 ? vxids : NULL);
1325+
standbyWaitStart, now,
1326+
cnt > 0 ? vxids : NULL, true);
13261327
logged_recovery_conflict = true;
13271328
}
13281329
}
@@ -1607,6 +1608,15 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
16071608
disable_timeout(DEADLOCK_TIMEOUT, false);
16081609
}
16091610

1611+
/*
1612+
* Emit the log message if recovery conflict on lock was resolved but the
1613+
* startup process waited longer than deadlock_timeout for it.
1614+
*/
1615+
if (InHotStandby && logged_recovery_conflict)
1616+
LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
1617+
standbyWaitStart, GetCurrentTimestamp(),
1618+
NULL, false);
1619+
16101620
/*
16111621
* Re-acquire the lock table's partition lock. We have to do this to hold
16121622
* off cancel/die interrupts before we can mess with lockAwaited (else we

src/include/storage/standby.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ extern void StandbyDeadLockHandler(void);
4040
extern void StandbyTimeoutHandler(void);
4141
extern void StandbyLockTimeoutHandler(void);
4242
extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
43-
TimestampTz cur_ts, VirtualTransactionId *wait_list);
43+
TimestampTz cur_ts, VirtualTransactionId *wait_list,
44+
bool still_waiting);
4445

4546
/*
4647
* Standby Rmgr (RM_STANDBY_ID)

0 commit comments

Comments
 (0)