
Commit 970e178

Mike Galbraith authored and Ingo Molnar committed
sched: Improve scalability via 'CPU buddies', which withstand random perturbations
Traversing an entire package is not only expensive, it also leads to
tasks bouncing all over a partially idle and possibly quite large
package.  Fix that up by assigning a 'buddy' CPU to try to motivate.
Each buddy may try to motivate that one other CPU; if it's busy,
tough, it may then try its SMT sibling, but that's all this
optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs,
higher is better:

 clients     1       2       4        8      16      32      64     128
 ......................................................................
 pre        30      41     118      645    3769    6214   12233   14312
 post      299     603    1211     2418    4697    6847   11606   14557

A nice increase in performance.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent a1cd2b1 · commit 970e178

3 files changed, 46 insertions(+), 22 deletions(-)
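Before the diffs, it may help to make the changelog's "cross-wired"
concrete.  Below is a minimal stand-alone sketch, not kernel code: the
flat eight-CPU topology of SMT pairs and the xor pairing are
illustrative assumptions.  Each CPU's buddy is its SMT sibling, so
buddy relations are symmetric; a task perturbed from CPU 0 toward
CPU 1 gets pulled straight back by CPU 1's buddy pointer instead of
wandering across the package.

#include <stdio.h>

#define NR_CPUS 8	/* illustrative: four SMT pairs {0,1} {2,3} {4,5} {6,7} */

int main(void)
{
	int idle_buddy[NR_CPUS];
	int cpu;

	/* Cross-wire each SMT pair: buddies point at each other. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		idle_buddy[cpu] = cpu ^ 1;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> buddy %d\n", cpu, idle_buddy[cpu]);

	return 0;
}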

include/linux/sched.h
Lines changed: 1 addition & 0 deletions

@@ -949,6 +949,7 @@ struct sched_domain {
 	unsigned int smt_gain;
 	int flags;			/* See SD_* */
 	int level;
+	int idle_buddy;			/* cpu assigned to select_idle_sibling() */
 
 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */

kernel/sched/core.c
Lines changed: 38 additions & 1 deletion

@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
+		struct sched_domain *tmp = sd;
+		struct sched_group *sg, *prev;
+		bool right;
+
+		/*
+		 * Traverse to first CPU in group, and count hops
+		 * to cpu from there, switching direction on each
+		 * hop, never ever pointing the last CPU rightward.
+		 */
+		do {
+			id = cpumask_first(sched_domain_span(tmp));
+			prev = sg = tmp->groups;
+			right = 1;
+
+			while (cpumask_first(sched_group_cpus(sg)) != id)
+				sg = sg->next;
+
+			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+				prev = sg;
+				sg = sg->next;
+				right = !right;
+			}
+
+			/* A CPU went down, never point back to domain start. */
+			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+				right = false;
+
+			sg = right ? sg->next : prev;
+			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+		} while ((tmp = tmp->child));
+
 		id = cpumask_first(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
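The direction-flipping walk in the hunk above can be modeled in user
space.  A minimal sketch, assuming every sched_group spans exactly one
CPU, CPUs are numbered by their position in the group ring, and
n >= 2 (pick_buddy is a hypothetical name, not a kernel function):

/*
 * Model of the buddy walk: hop from the domain's first CPU toward
 * 'cpu', flipping direction each hop, so CPUs pair off 0<->1,
 * 2<->3, ...; the last CPU is never pointed back at the ring start.
 */
static int pick_buddy(int cpu, int n)
{
	int right = 1;	/* start pointing rightward, as 'right = 1' above */
	int hop;

	/* Count hops from the first CPU, switching direction each hop. */
	for (hop = 0; hop < cpu; hop++)
		right = !right;

	/* Never point the last CPU back at the ring's start. */
	if (right && cpu == n - 1)
		right = 0;

	return right ? cpu + 1 : cpu - 1;
}

For an even-sized ring this cross-wires adjacent pairs; the final
guard covers odd-sized rings, e.g. after a CPU goes offline, so the
last CPU pairs leftward instead of wrapping around to the start.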

kernel/sched/fair.c
Lines changed: 7 additions & 21 deletions

@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
 
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
 	}
-done:
+
 	return target;
 }
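Net effect on the wakeup path: instead of scanning every group, and
every CPU within each group, below the LLC domain, select_idle_sibling()
now performs at most one cpumask test and one idle_cpu() check per
domain level against the precomputed idle_buddy.  The cost is bounded
by the domain depth, which is what the changelog means by "that's all
this optimization is allowed to cost".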
