
Commit 7053ea1

Rik van Riel authored and Ingo Molnar committed
stop_machine: Fix race between stop_two_cpus() and stop_cpus()
There is a race between stop_two_cpus and the global stop_cpus. It is possible for two CPUs to get their stopper functions queued "backwards" from one another, resulting in the stopper threads getting stuck, and the system hanging. This can happen because queuing up stoppers is not synchronized.

This patch adds synchronization between stop_cpus (a rare operation), and stop_two_cpus.

Reported-and-Tested-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: http://lkml.kernel.org/r/20131101104146.03d1e043@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 37dc6b5 commit 7053ea1
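
As a concrete illustration of the ordering problem described in the commit message, the following toy model (a standalone userspace sketch, not kernel code; the two-CPU setup, queue layout, and work labels 'A'/'B' are invented purely for illustration) shows how an unsynchronized stop_cpus() can interleave with stop_two_cpus() so that the two per-CPU queues end up holding the works in opposite order, at which point each stopper waits forever for the other:

#include <stdio.h>

#define NR_CPUS 2
#define QLEN    2

/* Per-CPU FIFO of pending stopper works: 'A' = stop_two_cpus(0, 1),
 * 'B' = the global stop_cpus(). A multi-CPU work only completes once
 * every participating CPU is executing it. */
static char queue[NR_CPUS][QLEN];

static void enqueue(int cpu, char work)
{
        int i;

        for (i = 0; i < QLEN && queue[cpu][i]; i++)
                ;
        queue[cpu][i] = work;
}

int main(void)
{
        /* One possible racy interleaving with no synchronization:
         * stop_cpus() queues B on cpu0, stop_two_cpus() then queues A
         * on both CPUs, and stop_cpus() finishes by queueing B on cpu1. */
        enqueue(0, 'B');
        enqueue(0, 'A');
        enqueue(1, 'A');
        enqueue(1, 'B');

        /* Each stopper runs the first work in its queue and cannot move
         * on until all participants of that work have reached it. */
        if (queue[0][0] != queue[1][0])
                printf("deadlock: cpu0 stuck in '%c', cpu1 stuck in '%c'\n",
                       queue[0][0], queue[1][0]);
        else
                printf("stoppers rendezvous in '%c'\n", queue[0][0]);
        return 0;
}

With the patch below applied, the pairwise and global queueing paths are serialized by stop_cpus_lock, so the interleaving above cannot occur.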


kernel/stop_machine.c

Lines changed: 13 additions & 2 deletions
@@ -20,6 +20,7 @@
 #include <linux/kallsyms.h>
 #include <linux/smpboot.h>
 #include <linux/atomic.h>
+#include <linux/lglock.h>
 
 /*
  * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
 static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
 static bool stop_machine_initialized = false;
 
+/*
+ * Avoids a race between stop_two_cpus and global stop_cpus, where
+ * the stoppers could get queued up in reverse order, leading to
+ * system deadlock. Using an lglock means stop_two_cpus remains
+ * relatively cheap.
+ */
+DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+
 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
 {
         memset(done, 0, sizeof(*done));
@@ -276,6 +285,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
                 return -ENOENT;
         }
 
+        lg_local_lock(&stop_cpus_lock);
         /*
          * Queuing needs to be done by the lowest numbered CPU, to ensure
          * that works are always queued in the same order on every CPU.
@@ -284,6 +294,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
         smp_call_function_single(min(cpu1, cpu2),
                                  &irq_cpu_stop_queue_work,
                                  &call_args, 0);
+        lg_local_unlock(&stop_cpus_lock);
         preempt_enable();
 
         wait_for_completion(&done.completion);
@@ -335,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
          * preempted by a stopper which might wait for other stoppers
          * to enter @fn which can lead to deadlock.
          */
-        preempt_disable();
+        lg_global_lock(&stop_cpus_lock);
         for_each_cpu(cpu, cpumask)
                 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
-        preempt_enable();
+        lg_global_unlock(&stop_cpus_lock);
 }
 
 static int __stop_cpus(const struct cpumask *cpumask,
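
The lglock keeps the common case cheap: lg_local_lock() in stop_two_cpus() only takes the calling CPU's per-CPU lock, so independent pairwise callers do not contend, while lg_global_lock() in the rare stop_cpus() path takes every per-CPU lock and thereby excludes all concurrent pairwise queueing. A rough userspace analogue of that asymmetry (a sketch using a pthread read-write lock in place of the kernel's lglock, an assumption made purely for illustration and coarser than the real primitive):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t stop_cpus_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Pairwise path: shared acquisition, so separate "stop two CPUs"
 * callers do not serialize against each other. */
static void queue_two(int cpu1, int cpu2)
{
        pthread_rwlock_rdlock(&stop_cpus_lock);    /* ~ lg_local_lock()   */
        printf("queue pair work on cpu%d and cpu%d\n",
               cpu1 < cpu2 ? cpu1 : cpu2, cpu1 < cpu2 ? cpu2 : cpu1);
        pthread_rwlock_unlock(&stop_cpus_lock);    /* ~ lg_local_unlock() */
}

/* Global path: exclusive acquisition, so the whole sweep over the CPUs
 * is queued without any pairwise queueing interleaving with it. */
static void queue_all(int nr_cpus)
{
        int cpu;

        pthread_rwlock_wrlock(&stop_cpus_lock);    /* ~ lg_global_lock()   */
        for (cpu = 0; cpu < nr_cpus; cpu++)
                printf("queue global work on cpu%d\n", cpu);
        pthread_rwlock_unlock(&stop_cpus_lock);    /* ~ lg_global_unlock() */
}

int main(void)
{
        queue_two(0, 1);
        queue_all(4);
        return 0;
}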
