
Commit baa9be4

pauld authored and Ingo Molnar committed
sched/fair: Fix throttle_list starvation with low CFS quota
With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change from
c06f04c to put throttled entries at the head of the list, later entries
on the list will starve. Essentially, the same X processes will get
pulled off the list, given CPU time and then, when expired, get put back
on the head of the list where distribute_cfs_runtime will give runtime
to the same set of processes leaving the rest.

Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq
can decide to put the throttled entry on the tail or the head of the
list. The bit is set/cleared by the callers of distribute_cfs_runtime
while they hold cfs_bandwidth->lock.

This is easy to reproduce with a handful of CPU consumers. I use 'crash'
on the live system. In some cases you can simply look at the throttled
list and see the later entries are not changing:

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -976050
    2     ffff90b56cb2cc00  -484925
    3     ffff90b56cb2bc00  -658814
    4     ffff90b56cb2ba00  -275365
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -994147
    2     ffff90b56cb2cc00  -306051
    3     ffff90b56cb2bc00  -961321
    4     ffff90b56cb2ba00  -24490
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and
looking at the sched_info:

  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500   CPU: 16   COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },
  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500   CPU: 16   COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },

Signed-off-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: c06f04c ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Link: http://lkml.kernel.org/r/20181008143639.GA4019@pauld.bos.csb
Signed-off-by: Ingo Molnar <mingo@kernel.org>
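The starvation mechanism is easier to see outside the kernel. Below is a toy userspace model, a sketch rather than kernel code: the names (simulate, NR_RQ, NEED, QUOTA, PERIODS) and the numbers are invented for illustration, and the distribute_running race is ignored so that only the re-queueing order is modelled. It contrasts the old always-add-to-head re-queueing with add-to-tail re-queueing when the per-period quota cannot drain the whole throttled list:

  /*
   * Toy userspace model of the throttled_list behaviour described above.
   * Not kernel code: NR_RQ, NEED and QUOTA are made-up numbers, and the
   * distribute_running race is ignored; only the re-queueing order is
   * modelled.  Each period, "distribute" pops entries off the head until
   * the quota is spent; served entries then expire and are re-queued
   * either at the head (old behaviour) or at the tail (fixed behaviour).
   */
  #include <stdio.h>
  #include <string.h>

  #define NR_RQ    8	/* throttled runqueues on the list          */
  #define NEED     3	/* runtime one rq needs to be unthrottled   */
  #define QUOTA    6	/* runtime available per period (too small) */
  #define PERIODS  50

  static int list[NR_RQ];	/* rq ids; index 0 is the list head */
  static int got[NR_RQ];	/* how many grants each rq received */

  static void simulate(int add_to_tail)
  {
  	int i, p;

  	for (i = 0; i < NR_RQ; i++)
  		list[i] = i;
  	memset(got, 0, sizeof(got));

  	for (p = 0; p < PERIODS; p++) {
  		int unthrottled[NR_RQ];
  		int runtime = QUOTA, served = 0;

  		/* walk from the head until the period's runtime is gone */
  		while (runtime >= NEED && served < NR_RQ) {
  			unthrottled[served++] = list[0];
  			memmove(&list[0], &list[1], (NR_RQ - 1) * sizeof(int));
  			runtime -= NEED;
  		}

  		/* served rqs run, expire, and are throttled again */
  		for (i = served - 1; i >= 0; i--) {
  			got[unthrottled[i]]++;
  			if (add_to_tail) {
  				/* fixed behaviour: append behind the waiters */
  				list[NR_RQ - served + i] = unthrottled[i];
  			} else {
  				/* old behaviour: push back onto the head */
  				memmove(&list[1], &list[0], (NR_RQ - 1) * sizeof(int));
  				list[0] = unthrottled[i];
  			}
  		}
  	}

  	printf("%s", add_to_tail ? "tail-add:" : "head-add:");
  	for (i = 0; i < NR_RQ; i++)
  		printf(" rq%d=%d", i, got[i]);
  	printf("\n");
  }

  int main(void)
  {
  	simulate(0);	/* same two rqs get every grant; the rest starve */
  	simulate(1);	/* grants rotate around the whole list           */
  	return 0;
  }

With these numbers only two runqueues fit into each period's quota, so the head-add run reports every grant going to rq0 and rq1 while the rest stay at zero, whereas the tail-add run rotates through the list and spreads the grants roughly evenly.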
1 parent e054637 commit baa9be4

2 files changed: +21 -3 lines changed

kernel/sched/fair.c

Lines changed: 19 additions & 3 deletions
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)

kernel/sched/sched.h

Lines changed: 2 additions & 0 deletions
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool			distribute_running;
 #endif
 };
 