
Commit bb17f65

paulturner authored and Ingo Molnar committed
sched: Normalize tg load contributions against runnable time
Entities of equal weight should receive an equitable distribution of
cpu time. This is challenging in the case of a task_group's shares as
execution may be occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with
the load on each cfs_rq. This does not, however, account for the fact
that the sum of the parts may be less than one cpu and so we need to
normalize:

    load(tg) = min(runnable_avg(tg), 1) * tg->shares

Where runnable_avg is the aggregate time in which the task_group had
runnable children.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 8165e14 commit bb17f65
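
As a rough illustration of the normalization in the message above (not kernel code): a minimal, standalone C sketch of load(tg) = min(runnable_avg(tg), 1) * tg->shares, using a made-up RUNNABLE_ONE fixed-point scale and a hypothetical scaled_shares() helper.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative fixed-point scale: RUNNABLE_ONE represents "1.0 cpu". */
    #define RUNNABLE_ONE 1024u

    /*
     * Sketch of load(tg) = min(runnable_avg(tg), 1) * tg->shares, with
     * runnable_avg expressed in RUNNABLE_ONE fixed point.
     */
    static uint64_t scaled_shares(uint64_t shares, uint32_t runnable_avg)
    {
            if (runnable_avg >= RUNNABLE_ONE)  /* group keeps >= 1 cpu busy */
                    return shares;
            /* Otherwise scale shares by the fraction of a cpu actually used. */
            return shares * runnable_avg / RUNNABLE_ONE;
    }

    int main(void)
    {
            /* A group with 1024 shares that is runnable 25% of the time: 256. */
            printf("%llu\n", (unsigned long long)scaled_shares(1024, 256));
            /* A group saturating at least one cpu contributes its full shares: 1024. */
            printf("%llu\n", (unsigned long long)scaled_shares(1024, 2048));
            return 0;
    }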

3 files changed: 62 additions & 0 deletions

kernel/sched/debug.c

Lines changed: 4 additions & 0 deletions
@@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);

kernel/sched/fair.c

Lines changed: 56 additions & 0 deletions
@@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+					    struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	/* The fraction of a cpu used by this cfs_rq */
+	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * For group entities we need to compute a correction term in the case
+	 * that they are consuming <1 cpu so that we would contribute the same
+	 * load as a task of equal weight.
+	 *
+	 * Explicitly co-ordinating this measurement would be expensive, but
+	 * fortunately the sum of each cpu's contribution forms a usable
+	 * lower-bound on the true value.
+	 *
+	 * Consider the aggregate of 2 contributions. Either they are disjoint
+	 * (and the sum represents the true value) or they overlap and we are
+	 * understating by the aggregate of their overlap.
+	 *
+	 * Extending this to N cpus, for a given overlap, the maximum amount we
+	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
+	 * cpus that overlap for this interval and w_i is the interval width.
+	 *
+	 * On a small machine the first term is well-bounded, which bounds the
+	 * total error since w_i is a subset of the period. Whereas on a
+	 * larger machine, while this first term can be larger, if w_i is of
+	 * consequential size then n_i*w_i is guaranteed to quickly converge to
+	 * our upper bound of 1-cpu.
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < NICE_0_LOAD) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+					    struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
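
For concreteness, a rough standalone sketch (not kernel code) of the fixed-point arithmetic in __update_tg_runnable_avg() and the <1-cpu correction in __update_group_entity_contrib() above, assuming the typical values NICE_0_SHIFT = 10 and NICE_0_LOAD = 1024; the MY_* constants and the sample numbers are stand-ins for illustration only.

    #include <stdint.h>
    #include <stdio.h>

    /* Typical values when extra load resolution is not enabled (assumption). */
    #define MY_NICE_0_SHIFT 10
    #define MY_NICE_0_LOAD  (1UL << MY_NICE_0_SHIFT)

    int main(void)
    {
            /*
             * Fraction of a cpu used by one cfs_rq, mirroring the div_u64()
             * in __update_tg_runnable_avg(): runnable 12 units out of a
             * 48-unit period.
             */
            uint64_t runnable_avg_sum = 12, runnable_avg_period = 48;
            uint64_t contrib = (runnable_avg_sum << MY_NICE_0_SHIFT) /
                               (runnable_avg_period + 1);
            /* Prints ~250/1024, i.e. roughly a quarter of a cpu. */
            printf("per-cpu contribution ~= %llu/1024\n",
                   (unsigned long long)contrib);

            /*
             * Correction as in __update_group_entity_contrib(): a group whose
             * summed runnable_avg is below one cpu has its contribution scaled
             * by runnable_avg / NICE_0_LOAD instead of counting as fully
             * runnable.
             */
            uint64_t load_avg_contrib = 2048;   /* hypothetical unscaled value */
            uint64_t runnable_avg = contrib;    /* single-cpu group, for brevity */
            if (runnable_avg < MY_NICE_0_LOAD) {
                    load_avg_contrib *= runnable_avg;
                    load_avg_contrib >>= MY_NICE_0_SHIFT;
            }
            /* Prints 500: 2048 scaled by ~250/1024. */
            printf("corrected contribution = %llu\n",
                   (unsigned long long)load_avg_contrib);
            return 0;
    }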

kernel/sched/sched.h

Lines changed: 2 additions & 0 deletions
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif
