
Commit c469933

derkling authored and Ingo Molnar committed
sched/fair: Fix cpu_util_wake() for 'execl' type workloads
A ~10% regression has been reported for UnixBench's execl throughput
test by Aaron Lu and Ye Xiaolong:

  https://lkml.org/lkml/2018/10/30/765

That test is pretty simple, it does a "recursive" execve() syscall on the
same binary. Starting from the syscall, this sequence is possible:

   do_execve()
     do_execveat_common()
       __do_execve_file()
         sched_exec()
           select_task_rq_fair()          <==| Task already enqueued
             find_idlest_cpu()
               find_idlest_group()
                 capacity_spare_wake()    <==| Functions not called from
                   cpu_util_wake()           | the wakeup path

which means we can end up calling cpu_util_wake() not only from the
"wakeup path", as its name would suggest. Indeed, the task doing an
execve() syscall is already enqueued on the CPU we want to get the
cpu_util_wake() for.

The estimated utilization for a CPU computed in cpu_util_wake() was
written under the assumption that function can be called only from the
wakeup path. If instead the task is already enqueued, we end up with a
utilization which does not remove the current task's contribution from
the estimated utilization of the CPU.

This will wrongly assume a reduced spare capacity on the current CPU
and increase the chances to migrate the task on execve.

The regression is tracked down to:

  commit d519329 ("sched/fair: Update util_est only on util_avg updates")

because in that patch we turn on by default the UTIL_EST sched feature.
However, the real issue is introduced by:

  commit f9be3e5 ("sched/fair: Use util_est in LB and WU paths")

Let's fix this by ensuring to always discount the task estimated
utilization from the CPU's estimated utilization when the task is also
the current one.

The same benchmark of the bug report, executed on a dual socket 40 CPUs
Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz machine, reports these
"Execl Throughput" figures (higher the better):

   mainline     : 48136.5 lps
   mainline+fix : 55376.5 lps

which correspond to a 15% speedup.

Moreover, since {cpu_util,capacity_spare}_wake() are not really only
used from the wakeup path, let's remove this ambiguity by using a better
matching name: {cpu_util,capacity_spare}_without().

Since we are at that, let's also improve the existing documentation.

Reported-by: Aaron Lu <aaron.lu@intel.com>
Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Fixes: f9be3e5 ("sched/fair: Use util_est in LB and WU paths")
Link: https://lore.kernel.org/lkml/20181025093100.GB13236@e110439-lin/
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent e1ff516 commit c469933
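
For readers skimming the diff below, here is a minimal, self-contained userspace sketch of the discount logic the patch introduces: the CPU's estimated utilization must also drop *p's contribution whenever *p is still counted on that CPU, i.e. when it is queued there or is the current task. The struct names, cpu_util_without_sketch() and the example numbers are illustrative stand-ins, not the kernel's types or API; the real change to kernel/sched/fair.c is shown in the diff.

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-ins for the kernel's per-CPU and per-task signals. */
struct cpu_sig  { unsigned int util_avg; unsigned int util_est_enqueued; };
struct task_sig { unsigned int util_avg; unsigned int util_est; bool queued_or_current; };

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

/*
 * Simplified model of "CPU utilization without *p": discount the task's
 * contribution from util_avg and, as the fix does, also discount its
 * estimated utilization whenever the task is still queued on, or
 * currently running on, this CPU.
 */
static unsigned int cpu_util_without_sketch(const struct cpu_sig *cpu,
                                            const struct task_sig *p)
{
        unsigned int util = cpu->util_avg;
        unsigned int estimated = cpu->util_est_enqueued;

        util -= min_u(util, p->util_avg);

        if (p->queued_or_current)
                estimated -= min_u(estimated, p->util_est);

        return max_u(util, estimated);
}

int main(void)
{
        /* A CPU whose only contributor is the execl'ing task itself. */
        struct cpu_sig  cpu = { 300, 300 };
        struct task_sig p   = { 300, 300, true };

        /*
         * With the fix the CPU looks idle once *p is discounted: prints 0.
         * Without the queued/current discount the estimated side would stay
         * at 300, making the current CPU look busy and biasing
         * find_idlest_group() towards migrating the task on every execve().
         */
        printf("util without p: %u\n", cpu_util_without_sketch(&cpu, &p));
        return 0;
}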

File tree

1 file changed: +48 −14 lines changed


kernel/sched/fair.c

Lines changed: 48 additions & 14 deletions
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-			spare_cap = capacity_spare_wake(i, p);
+			spare_cap = capacity_spare_without(i, p);
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-	 * last_update_time.
+	 * We need task's util for capacity_spare_without, sync it up to
+	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
 		sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	cfs_rq = &cpu_rq(cpu)->cfs;
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
-	/* Discount task's blocked util from CPU's util */
+	/* Discount task's util from CPU's util */
 	util -= min_t(unsigned int, util, task_util(p));
 
 	/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * a) if *p is the only task sleeping on this CPU, then:
 	 *      cpu_util (== task_util) > util_est (== 0)
 	 *    and thus we return:
-	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *      cpu_util_without = (cpu_util - task_util) = 0
 	 *
 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 	 *    IDLE, then:
 	 *      cpu_util >= task_util
 	 *      cpu_util > util_est (== 0)
 	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *      cpu_util_without = (cpu_util - task_util) >= 0
 	 *
 	 * c) if other tasks are RUNNABLE on that CPU and
 	 *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * covered by the following code when estimated utilization is
 	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+	if (sched_feat(UTIL_EST)) {
+		unsigned int estimated =
+			READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task()                   \
+		 *       dequeue_task()                     + RaceTime
+		 *         util_est_dequeue()              /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" it's required to
+		 * properly fix the execl regression and it helps in further
+		 * reducing the chances for the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p)) {
+			estimated -= min_t(unsigned int, estimated,
+					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+		}
+		util = max(util, estimated);
+	}
 
 	/*
 	 * Utilization (estimated) can exceed the CPU capacity, thus let's
