Skip to content

Commit 09cf1d5

Browse files
committed
Improve timeout.c's handling of repeated timeout set/cancel.
A very common usage pattern is that we set a timeout that we don't expect to reach, cancel it after a little bit, and later repeat. With the original implementation of timeout.c, this results in one setitimer() call per timeout set or cancel. We can do a lot better by being lazy about changing the timeout interrupt request, namely: (1) never cancel the outstanding interrupt, even when we have no active timeout events; (2) if we need to set an interrupt, but there already is one pending at or before the required time, leave it alone. When the interrupt happens, the signal handler will reschedule it at whatever time is then needed. For example, with a one-second setting for statement_timeout, this method results in having to interact with the kernel only a little more than once a second, no matter how many statements we execute in between. The mainline code might never call setitimer() at all after the first time, while each time the signal handler fires, it sees that the then-pending request is most of a second away, and that's when it sets the next interrupt request for. Each mainline timeout-set request after that will observe that the time it wants is past the pending interrupt request time, and do nothing. This also works pretty well for cases where a few different timeout lengths are in use, as long as none of them are very short. But that describes our usage well. Idea and original patch by Thomas Munro; I fixed a race condition and improved the comments. Discussion: https://postgr.es/m/CA+hUKG+o6pbuHBJSGnud=TadsuXySWA7CCcPgCt2QE9F6_4iHQ@mail.gmail.com
1 parent 8a4f618 commit 09cf1d5

File tree

1 file changed

+100
-26
lines changed

1 file changed

+100
-26
lines changed

src/backend/utils/misc/timeout.c

Lines changed: 100 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,29 @@ static timeout_params *volatile active_timeouts[MAX_TIMEOUTS];
5353

5454
/*
5555
* Flag controlling whether the signal handler is allowed to do anything.
56-
* We leave this "false" when we're not expecting interrupts, just in case.
56+
* This is useful to avoid race conditions with the handler. Note in
57+
* particular that this lets us make changes in the data structures without
58+
* tediously disabling and re-enabling the timer signal. Most of the time,
59+
* no interrupt would happen anyway during such critical sections, but if
60+
* one does, this rule ensures it's safe. Leaving the signal enabled across
61+
* multiple operations can greatly reduce the number of kernel calls we make,
62+
* too. See comments in schedule_alarm() about that.
5763
*
58-
* Note that we don't bother to reset any pending timer interrupt when we
59-
* disable the signal handler; it's not really worth the cycles to do so,
60-
* since the probability of the interrupt actually occurring while we have
61-
* it disabled is low. See comments in schedule_alarm() about that.
64+
* We leave this "false" when we're not expecting interrupts, just in case.
6265
*/
6366
static volatile sig_atomic_t alarm_enabled = false;
6467

6568
#define disable_alarm() (alarm_enabled = false)
6669
#define enable_alarm() (alarm_enabled = true)
6770

71+
/*
72+
* State recording if and when we next expect the interrupt to fire.
73+
* Note that the signal handler will unconditionally reset signal_pending to
74+
* false, so that can change asynchronously even when alarm_enabled is false.
75+
*/
76+
static volatile sig_atomic_t signal_pending = false;
77+
static TimestampTz signal_due_at = 0; /* valid only when signal_pending */
78+
6879

6980
/*****************************************************************************
7081
* Internal helper functions
@@ -185,29 +196,50 @@ enable_timeout(TimeoutId id, TimestampTz now, TimestampTz fin_time)
185196
* Schedule alarm for the next active timeout, if any
186197
*
187198
* We assume the caller has obtained the current time, or a close-enough
188-
* approximation.
199+
* approximation. (It's okay if a tick or two has passed since "now", or
200+
* if a little more time elapses before we reach the kernel call; that will
201+
* cause us to ask for an interrupt a tick or two later than the nearest
202+
* timeout, which is no big deal. Passing a "now" value that's in the future
203+
* would be bad though.)
189204
*/
190205
static void
191206
schedule_alarm(TimestampTz now)
192207
{
193208
if (num_active_timeouts > 0)
194209
{
195210
struct itimerval timeval;
211+
TimestampTz nearest_timeout;
196212
long secs;
197213
int usecs;
198214

199215
MemSet(&timeval, 0, sizeof(struct itimerval));
200216

201-
/* Get the time remaining till the nearest pending timeout */
202-
TimestampDifference(now, active_timeouts[0]->fin_time,
203-
&secs, &usecs);
204-
205217
/*
206-
* It's possible that the difference is less than a microsecond;
207-
* ensure we don't cancel, rather than set, the interrupt.
218+
* Get the time remaining till the nearest pending timeout. If it is
219+
* negative, assume that we somehow missed an interrupt, and force
220+
* signal_pending off. This gives us a chance to recover if the
221+
* kernel drops a timeout request for some reason.
208222
*/
209-
if (secs == 0 && usecs == 0)
223+
nearest_timeout = active_timeouts[0]->fin_time;
224+
if (now > nearest_timeout)
225+
{
226+
signal_pending = false;
227+
/* force an interrupt as soon as possible */
228+
secs = 0;
210229
usecs = 1;
230+
}
231+
else
232+
{
233+
TimestampDifference(now, nearest_timeout,
234+
&secs, &usecs);
235+
236+
/*
237+
* It's possible that the difference is less than a microsecond;
238+
* ensure we don't cancel, rather than set, the interrupt.
239+
*/
240+
if (secs == 0 && usecs == 0)
241+
usecs = 1;
242+
}
211243

212244
timeval.it_value.tv_sec = secs;
213245
timeval.it_value.tv_usec = usecs;
@@ -218,7 +250,7 @@ schedule_alarm(TimestampTz now)
218250
* interrupt could occur before we can set alarm_enabled, so that the
219251
* signal handler would fail to do anything.
220252
*
221-
* Because we didn't bother to reset the timer in disable_alarm(),
253+
* Because we didn't bother to disable the timer in disable_alarm(),
222254
* it's possible that a previously-set interrupt will fire between
223255
* enable_alarm() and setitimer(). This is safe, however. There are
224256
* two possible outcomes:
@@ -244,9 +276,53 @@ schedule_alarm(TimestampTz now)
244276
*/
245277
enable_alarm();
246278

279+
/*
280+
* If there is already an interrupt pending that's at or before the
281+
* needed time, we need not do anything more. The signal handler will
282+
* do the right thing in the first case, and re-schedule the interrupt
283+
* for later in the second case. It might seem that the extra
284+
* interrupt is wasted work, but it's not terribly much work, and this
285+
* method has very significant advantages in the common use-case where
286+
* we repeatedly set a timeout that we don't expect to reach and then
287+
* cancel it. Instead of invoking setitimer() every time the timeout
288+
* is set or canceled, we perform one interrupt and a re-scheduling
289+
* setitimer() call at intervals roughly equal to the timeout delay.
290+
* For example, with statement_timeout = 1s and a throughput of
291+
* thousands of queries per second, this method requires an interrupt
292+
* and setitimer() call roughly once a second, rather than thousands
293+
* of setitimer() calls per second.
294+
*
295+
* Because of the possible passage of time between when we obtained
296+
* "now" and when we reach setitimer(), the kernel's opinion of when
297+
* to trigger the interrupt is likely to be a bit later than
298+
* signal_due_at. That's fine, for the same reasons described above.
299+
*/
300+
if (signal_pending && nearest_timeout >= signal_due_at)
301+
return;
302+
303+
/*
304+
* As with calling enable_alarm(), we must set signal_pending *before*
305+
* calling setitimer(); if we did it after, the signal handler could
306+
* trigger before we set it, leaving us with a false opinion that a
307+
* signal is still coming.
308+
*
309+
* Other race conditions involved with setting/checking signal_pending
310+
* are okay, for the reasons described above.
311+
*/
312+
signal_due_at = nearest_timeout;
313+
signal_pending = true;
314+
247315
/* Set the alarm timer */
248316
if (setitimer(ITIMER_REAL, &timeval, NULL) != 0)
317+
{
318+
/*
319+
* Clearing signal_pending here is a bit pro forma, but not
320+
* entirely so, since something in the FATAL exit path could try
321+
* to use timeout facilities.
322+
*/
323+
signal_pending = false;
249324
elog(FATAL, "could not enable SIGALRM timer: %m");
325+
}
250326
}
251327
}
252328

@@ -279,6 +355,12 @@ handle_sig_alarm(SIGNAL_ARGS)
279355
*/
280356
SetLatch(MyLatch);
281357

358+
/*
359+
* Always reset signal_pending, even if !alarm_enabled, since indeed no
360+
* signal is now pending.
361+
*/
362+
signal_pending = false;
363+
282364
/*
283365
* Fire any pending timeouts, but only if we're enabled to do so.
284366
*/
@@ -591,7 +673,7 @@ disable_timeouts(const DisableTimeoutParams *timeouts, int count)
591673
}
592674

593675
/*
594-
* Disable SIGALRM and remove all timeouts from the active list,
676+
* Disable the signal handler, remove all timeouts from the active list,
595677
* and optionally reset their timeout indicators.
596678
*/
597679
void
@@ -602,18 +684,10 @@ disable_all_timeouts(bool keep_indicators)
602684
disable_alarm();
603685

604686
/*
605-
* Only bother to reset the timer if we think it's active. We could just
606-
* let the interrupt happen anyway, but it's probably a bit cheaper to do
607-
* setitimer() than to let the useless interrupt happen.
687+
* We used to disable the timer interrupt here, but in common usage
688+
* patterns it's cheaper to leave it enabled; that may save us from having
689+
* to enable it again shortly. See comments in schedule_alarm().
608690
*/
609-
if (num_active_timeouts > 0)
610-
{
611-
struct itimerval timeval;
612-
613-
MemSet(&timeval, 0, sizeof(struct itimerval));
614-
if (setitimer(ITIMER_REAL, &timeval, NULL) != 0)
615-
elog(FATAL, "could not disable SIGALRM timer: %m");
616-
}
617691

618692
num_active_timeouts = 0;
619693

0 commit comments

Comments
 (0)