@@ -5387,8 +5387,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
+	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;	/* in jiffy units */
-	unsigned long next_stats;
+	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;

 #endif /* CONFIG_NO_HZ_COMMON */
@@ -7038,6 +7039,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_DST_PINNED	0x04
 #define LBF_SOME_PINNED	0x08
 #define LBF_NOHZ_STATS	0x10
+#define LBF_NOHZ_AGAIN	0x20

 struct lb_env {
 	struct sched_domain	*sd;
@@ -7422,8 +7424,6 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }

-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7441,11 +7441,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }

+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
+	bool done = true;

 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -7475,10 +7478,14 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
+		else
+			done = false;
 	}

 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (done)
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7541,6 +7548,8 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (cfs_rq_is_decayed(cfs_rq))
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7876,18 +7885,25 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }

-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;

+	if (!rq->has_blocked_load)
+		return false;
+
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-		return;
+		return false;

 	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
-		return;
+		return true;

 	update_blocked_averages(cpu);
+
+	return rq->has_blocked_load;
+#else
+	return false;
 #endif
 }
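
For readability, here is update_nohz_stats() as it reads with the hunk above applied, assembled only from the context and "+" lines (the surrounding kernel definitions of struct rq, nohz and the jiffies helpers are assumed). A true return now means "this rq may still carry blocked load, check it again later":

static bool update_nohz_stats(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_COMMON
	unsigned int cpu = rq->cpu;

	/* Nothing left to decay on this rq, no reason to poke it again. */
	if (!rq->has_blocked_load)
		return false;

	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
		return false;

	/* Updated recently enough: blocked load may still remain. */
	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
		return true;

	update_blocked_averages(cpu);

	/* update_blocked_averages() clears the flag once everything has decayed. */
	return rq->has_blocked_load;
#else
	return false;
#endif
}

update_sg_lb_stats() below turns this return value into LBF_NOHZ_AGAIN, and update_sd_lb_stats() pushes nohz.next_blocked out by LOAD_AVG_PERIOD when that flag is set and the domain being walked spans all idle CPUs.
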
@@ -7913,8 +7929,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);

-		if (env->flags & LBF_NOHZ_STATS)
-			update_nohz_stats(rq);
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+			env->flags |= LBF_NOHZ_AGAIN;

 		/* Bias balancing toward CPUs of our domain: */
 		if (local_group)
@@ -8072,12 +8088,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		prefer_sibling = 1;

 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE) {
+	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 		env->flags |= LBF_NOHZ_STATS;
-
-		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
-			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
-	}
 #endif

 	load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8133,6 +8145,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		sg = sg->next;
 	} while (sg != env->sd->groups);

+#ifdef CONFIG_NO_HZ_COMMON
+	if ((env->flags & LBF_NOHZ_AGAIN) &&
+	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+		WRITE_ONCE(nohz.next_blocked,
+			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	}
+#endif
+
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);

@@ -9174,7 +9195,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;

-	if (time_after(now, nohz.next_stats))
+	if (READ_ONCE(nohz.has_blocked) &&
+	    time_after(now, READ_ONCE(nohz.next_blocked)))
 		flags = NOHZ_STATS_KICK;

 	if (time_before(now, nohz.next_balance))
@@ -9293,8 +9315,21 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;

+	/*
+	 * Can be set safely without rq->lock held
+	 * If a clear happens, it will have evaluated last additions because
+	 * rq->lock is held during the check and the clear
+	 */
+	rq->has_blocked_load = 1;
+
+	/*
+	 * The tick is still stopped but load could have been added in the
+	 * meantime. We set the nohz.has_blocked flag to trig a check of the
+	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+	 * of nohz.has_blocked can only happen after checking the new load
+	 */
 	if (rq->nohz_tick_stopped)
-		return;
+		goto out;

 	/* If we're a completely isolated CPU, we don't play: */
 	if (on_null_domain(rq))
@@ -9305,7 +9340,21 @@ void nohz_balance_enter_idle(int cpu)
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);

+	/*
+	 * Ensures that if nohz_idle_balance() fails to observe our
+	 * @idle_cpus_mask store, it must observe the @has_blocked
+	 * store.
+	 */
+	smp_mb__after_atomic();
+
 	set_cpu_sd_state_idle(cpu);
+
+out:
+	/*
+	 * Each time a cpu enter idle, we assume that it has blocked load and
+	 * enable the periodic update of the load of idle cpus
+	 */
+	WRITE_ONCE(nohz.has_blocked, 1);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
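
The two comment blocks added above describe a lockless publish protocol: mark the rq as possibly carrying blocked load, publish the CPU in nohz.idle_cpus_mask, then set nohz.has_blocked, with smp_mb__after_atomic() ordering the mask store before the flag store. Below is a minimal user-space model of that publish side; every name is hypothetical and C11 atomics stand in for the kernel's cpumask, WRITE_ONCE() and barrier primitives, so this is only an illustration of the ordering, not kernel code. The matching consumer side is sketched after the nohz_idle_balance() hunk further down.

/*
 * Hypothetical user-space model of the publish side. Names are made up;
 * C11 atomics stand in for cpumask_set_cpu(), WRITE_ONCE() and
 * smp_mb__after_atomic().
 */
#include <stdatomic.h>

#define MODEL_NR_CPUS 8

static _Atomic unsigned long model_idle_mask;		/* nohz.idle_cpus_mask */
static atomic_int model_has_blocked;			/* nohz.has_blocked */
static atomic_int model_cpu_blocked[MODEL_NR_CPUS];	/* rq->has_blocked_load */

static void model_enter_idle(int cpu)
{
	/* This CPU may still carry blocked load that needs decaying. */
	atomic_store_explicit(&model_cpu_blocked[cpu], 1, memory_order_relaxed);

	/* Publish the CPU in the idle mask... */
	atomic_fetch_or_explicit(&model_idle_mask, 1UL << cpu,
				 memory_order_relaxed);

	/*
	 * ...and, as the kernel comment puts it, make sure a balancer that
	 * fails to observe the mask store still observes the has_blocked
	 * store (the job of smp_mb__after_atomic() above).
	 */
	atomic_thread_fence(memory_order_seq_cst);

	atomic_store_explicit(&model_has_blocked, 1, memory_order_relaxed);
}
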
@@ -9439,7 +9488,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
-	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
 	unsigned int flags;
@@ -9458,6 +9507,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

+	/*
+	 * We assume there will be no idle load after this update and clear
+	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
+	 * set the has_blocked flag and trig another update of idle load.
+	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
+	 * setting the flag, we are sure to not clear the state and not
+	 * check the load of an idle cpu.
+	 */
+	WRITE_ONCE(nohz.has_blocked, 0);
+
+	/*
+	 * Ensures that if we miss the CPU, we must see the has_blocked
+	 * store from nohz_balance_enter_idle().
+	 */
+	smp_mb();
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
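
Continuing the model above (same hypothetical declarations), this sketches the consumer side that the hunk adds to nohz_idle_balance(): clear the global flag, issue the full barrier that pairs with smp_mb__after_atomic() in nohz_balance_enter_idle(), then walk the idle mask. Per the kernel comments, a CPU that goes idle concurrently is either seen in the walk or re-sets has_blocked after the clear. Again only a sketch of the ordering, not the kernel function.

/*
 * Consumer side of the same model (reuses the declarations above):
 * roughly what the hunk adds to nohz_idle_balance().
 */

/* Stand-in for update_blocked_averages(): assume everything fully decays. */
static void model_update_blocked_averages(int cpu)
{
	atomic_store_explicit(&model_cpu_blocked[cpu], 0, memory_order_relaxed);
}

static int model_idle_balance(void)
{
	int has_blocked_load = 0;
	unsigned long mask;
	int cpu;

	/* Optimistically assume all idle load will decay: clear the flag first. */
	atomic_store_explicit(&model_has_blocked, 0, memory_order_relaxed);

	/*
	 * Pairs with the fence in model_enter_idle(): a CPU we miss in the
	 * walk below has either made it into the mask or re-sets the flag
	 * after our clear (the job of smp_mb() in the hunk above).
	 */
	atomic_thread_fence(memory_order_seq_cst);

	mask = atomic_load_explicit(&model_idle_mask, memory_order_relaxed);
	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		if (!(mask & (1UL << cpu)))
			continue;

		/* In the kernel the update clears the per-CPU flag only when done. */
		model_update_blocked_averages(cpu);
		has_blocked_load |= atomic_load_explicit(&model_cpu_blocked[cpu],
							 memory_order_relaxed);
	}

	/* Anything left over re-arms the periodic update. */
	if (has_blocked_load)
		atomic_store_explicit(&model_has_blocked, 1, memory_order_relaxed);

	return has_blocked_load;
}
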
@@ -9467,11 +9532,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 * work being done for other CPUs. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched())
-			break;
+		if (need_resched()) {
+			has_blocked_load = true;
+			goto abort;
+		}

 		rq = cpu_rq(balance_cpu);

+		update_blocked_averages(rq->cpu);
+		has_blocked_load |= rq->has_blocked_load;
+
 		/*
 		 * If time for next balance is due,
 		 * do the balance.
@@ -9484,7 +9554,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);

-			update_blocked_averages(rq->cpu);
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
 		}
@@ -9499,7 +9568,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);

-	nohz.next_stats = next_stats;
+	WRITE_ONCE(nohz.next_blocked,
+		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+	/* There is still blocked load, enable periodic update */
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);

 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10135,6 +10210,7 @@ __init void init_sched_fair_class(void)

 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
+	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */