@@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                         local = 1;
 
         /*
-         * Retry task to preferred node migration periodically, in case it
-         * case it previously failed, or the scheduler moved us.
+         * Retry to migrate task to preferred node periodically, in case it
+         * previously failed, or the scheduler moved us.
         */
         if (time_after(jiffies, p->numa_migrate_retry)) {
                 task_numa_placement(p);
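The retry above is gated by time_after(), the wraparound-safe jiffies comparison from include/linux/jiffies.h. As a hedged illustration of why that comparison keeps working when the counter wraps, here is a small standalone C sketch of the same signed-difference trick; ticks_after() and the sample values are inventions for the example, not kernel code.

#include <stdio.h>

/*
 * Wraparound-safe "a is after b" check, modelled on the kernel's
 * time_after(): the unsigned difference is cast to a signed type and
 * only its sign is inspected, so the comparison stays correct across
 * counter wraparound as long as the two values are less than half the
 * counter range apart.
 */
static int ticks_after(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        unsigned long retry_at = (unsigned long)-5; /* deadline set just before wraparound */
        unsigned long now = 3;                      /* the counter has since wrapped to 3  */

        /* A naive compare gets this wrong; the signed-difference form does not. */
        printf("naive now > retry_at : %d\n", now > retry_at);             /* prints 0 */
        printf("ticks_after(now, at) : %d\n", ticks_after(now, retry_at)); /* prints 1 */
        return 0;
}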
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-        return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                         avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-                        spare_cap = capacity_spare_wake(i, p);
+                        spare_cap = capacity_spare_without(i, p);
 
                         if (spare_cap > max_spare_cap)
                                 max_spare_cap = spare_cap;
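The renamed helper is a clamped subtraction, and find_idlest_group() simply keeps the candidate with the largest result. The standalone sketch below mirrors that selection with invented capacity_of()/cpu_util_without() figures on the kernel's 0..1024 capacity scale; it illustrates the arithmetic only and is not the scheduler code.

#include <stdio.h>

/* Invented per-CPU figures standing in for capacity_of(cpu) and
 * cpu_util_without(cpu, p), on the kernel's 0..1024 capacity scale. */
static const long capacity[]       = { 1024, 1024, 512, 512 };
static const long util_without_p[] = {  900,  300, 600, 100 };

/* Spare capacity once *p's own utilization is ignored, clamped at zero,
 * mirroring max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0). */
static long spare_capacity(int cpu)
{
        long spare = capacity[cpu] - util_without_p[cpu];

        return spare > 0 ? spare : 0;
}

int main(void)
{
        long max_spare = 0;
        int cpu, best = -1;

        for (cpu = 0; cpu < 4; cpu++) {
                long spare = spare_capacity(cpu);

                printf("cpu%d: spare=%ld\n", cpu, spare);
                if (spare > max_spare) {
                        max_spare = spare;
                        best = cpu;
                }
        }
        /* cpu1 wins (1024 - 300 = 724); cpu2 clamps to 0 (512 - 600 < 0). */
        printf("best cpu: %d\n", best);
        return 0;
}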
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                 return prev_cpu;
 
         /*
-         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-         * last_update_time.
+         * We need task's util for capacity_spare_without, sync it up to
+         * prev_cpu's last_update_time.
         */
         if (!(sd_flag & SD_BALANCE_FORK))
                 sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq;
         unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         cfs_rq = &cpu_rq(cpu)->cfs;
         util = READ_ONCE(cfs_rq->avg.util_avg);
 
-        /* Discount task's blocked util from CPU's util */
+        /* Discount task's util from CPU's util */
         util -= min_t(unsigned int, util, task_util(p));
 
         /*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         * a) if *p is the only task sleeping on this CPU, then:
         *      cpu_util (== task_util) > util_est (== 0)
         *    and thus we return:
-         *      cpu_util_wake = (cpu_util - task_util) = 0
+         *      cpu_util_without = (cpu_util - task_util) = 0
         *
         * b) if other tasks are SLEEPING on this CPU, which is now exiting
         *    IDLE, then:
         *      cpu_util >= task_util
         *      cpu_util > util_est (== 0)
         *    and thus we discount *p's blocked utilization to return:
-         *      cpu_util_wake = (cpu_util - task_util) >= 0
+         *      cpu_util_without = (cpu_util - task_util) >= 0
         *
         * c) if other tasks are RUNNABLE on that CPU and
         *      util_est > cpu_util
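The a)/b)/c) cases above are easier to follow with numbers. The short sketch below plugs invented values (0..1024 scale) into the same discount, showing case a) collapsing to zero and case b) leaving the blocked utilization of the other sleepers behind; it is a worked example, not the kernel implementation.

#include <stdio.h>

/* Saturating discount used by cpu_util_without(): never underflow. */
static unsigned int discount(unsigned int cpu_util, unsigned int task_util)
{
        return cpu_util - (task_util < cpu_util ? task_util : cpu_util);
}

int main(void)
{
        unsigned int task_util = 350;   /* *p's util_avg, invented 0..1024 value */

        /* Case a): *p is the only (sleeping) task on the CPU, util_est == 0. */
        printf("case a: %u\n", discount(350, task_util));   /* 350 - 350 = 0   */

        /* Case b): other sleeping tasks left 120 units of decaying util behind. */
        printf("case b: %u\n", discount(470, task_util));   /* 470 - 350 = 120 */
        return 0;
}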
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         * covered by the following code when estimated utilization is
         * enabled.
         */
-        if (sched_feat(UTIL_EST))
-                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+        if (sched_feat(UTIL_EST)) {
+                unsigned int estimated =
+                        READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+                /*
+                 * Despite the following checks we still have a small window
+                 * for a possible race, when an execl's select_task_rq_fair()
+                 * races with LB's detach_task():
+                 *
+                 *   detach_task()
+                 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                 *     ---------------------------------- A
+                 *     deactivate_task() \
+                 *       dequeue_task()   + RaceTime
+                 *         util_est_dequeue() /
+                 *     ---------------------------------- B
+                 *
+                 * The additional check on "current == p" it's required to
+                 * properly fix the execl regression and it helps in further
+                 * reducing the chances for the above race.
+                 */
+                if (unlikely(task_on_rq_queued(p) || current == p)) {
+                        estimated -= min_t(unsigned int, estimated,
+                                           (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+                }
+                util = max(util, estimated);
+        }
 
         /*
         * Utilization (estimated) can exceed the CPU capacity, thus let's
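To make the new UTIL_EST branch concrete, here is a hedged userspace sketch of the same clamping: the enqueued estimate is reduced by the task's own estimate only while the task is still queued (or is current), and the larger of util_avg and the corrected estimate wins. The struct, field names and sample numbers are stand-ins, and the sketch deliberately omits the UTIL_AVG_UNCHANGED masking done by the real code.

#include <stdio.h>

/* Invented stand-ins for the quantities read by cpu_util_without();
 * values use the 0..1024 utilization scale. */
struct sample {
        unsigned int util_avg;    /* cfs_rq->avg.util_avg                 */
        unsigned int util_est;    /* cfs_rq->avg.util_est.enqueued        */
        unsigned int task_util;   /* task_util(p)                         */
        unsigned int task_est;    /* _task_util_est(p)                    */
        int task_queued;          /* task_on_rq_queued(p) || current == p */
};

static unsigned int cpu_util_without_sketch(const struct sample *s)
{
        unsigned int util = s->util_avg;
        unsigned int estimated = s->util_est;

        /* Discount *p's util_avg contribution, saturating at zero. */
        util -= (s->task_util < util) ? s->task_util : util;

        /* Discount *p's estimate only while it is still in the enqueued sum. */
        if (s->task_queued)
                estimated -= (s->task_est < estimated) ? s->task_est : estimated;

        /* The remaining tasks' estimate puts a floor under the result. */
        return util > estimated ? util : estimated;
}

int main(void)
{
        /* execl-style caller: *p is current, so its estimate is subtracted too. */
        struct sample s = {
                .util_avg = 400, .util_est = 420,
                .task_util = 350, .task_est = 380,
                .task_queued = 1,
        };

        /* util_avg side: 400 - 350 = 50; util_est side: 420 - 380 = 40 -> 50. */
        printf("cpu util without p: %u\n", cpu_util_without_sketch(&s));
        return 0;
}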