Commit 52d7e48
rcu: Narrow early boot window of illegal synchronous grace periods
The current preemptible RCU implementation goes through three phases
during bootup.  In the first phase, there is only one CPU that is running
with preemption disabled, so that a no-op is a synchronous grace period.
In the second mid-boot phase, the scheduler is running, but RCU has not
yet gotten its kthreads spawned (and, for expedited grace periods,
workqueues are not yet running).  During this time, any attempt to do
a synchronous grace period will hang the system (or complain bitterly,
depending).  In the third and final phase, RCU is fully operational and
everything works normally.

This has been OK for some time, but there have recently been some
synchronous grace periods showing up during the second mid-boot phase.
This code worked "by accident" for a while, but started failing as soon
as expedited RCU grace periods switched over to workqueues in commit
8b355e3 ("rcu: Drive expedited grace periods from workqueue").  Note
that the code was buggy even before this commit, as it was subject
to failure on real-time systems that forced all expedited grace
periods to run as normal grace periods (for example, using the
rcu_normal ksysfs parameter).  The callchain from the failure case
is as follows:

early_amd_iommu_init()
|-> acpi_put_table(ivrs_base);
|-> acpi_tb_put_table(table_desc);
|-> acpi_tb_invalidate_table(table_desc);
|-> acpi_tb_release_table(...)
|-> acpi_os_unmap_memory
|-> acpi_os_unmap_iomem
|-> acpi_os_map_cleanup
|-> synchronize_rcu_expedited

The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y,
which caused the code to try using workqueues before they were
initialized, which did not go well.

This commit therefore reworks RCU to permit synchronous grace periods
to proceed during this mid-boot phase.  This commit is therefore a fix
to a regression introduced in v4.9, and is therefore being put forward
post-merge-window in v4.10.

This commit sets a flag from the existing rcu_scheduler_starting()
function which causes all synchronous grace periods to take the
expedited path.  The expedited path now checks this flag, using the
requesting task to drive the expedited grace period forward during the
mid-boot phase.  Finally, this flag is updated by a core_initcall()
function named rcu_exp_runtime_mode(), which causes the runtime
codepaths to be used.

Note that this arrangement assumes that tasks are not sent POSIX
signals (or anything similar) from the time that the first task is
spawned through core_initcall() time.

Fixes: 8b355e3 ("rcu: Drive expedited grace periods from workqueue")
Reported-by: "Zheng, Lv" <lv.zheng@intel.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Stan Kain <stan.kain@gmail.com>
Tested-by: Ivan <waffolz@hotmail.com>
Tested-by: Emanuel Castelo <emanuel.castelo@gmail.com>
Tested-by: Bruno Pesavento <bpesavento@infinito.it>
Tested-by: Borislav Petkov <bp@suse.de>
Tested-by: Frederic Bezies <fredbezies@gmail.com>
Cc: <stable@vger.kernel.org> # 4.9.0-
1 parent f466ae6 commit 52d7e48
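
To make the three phases concrete, here is a minimal userspace C model of
the dispatch logic this commit introduces.  The RCU_SCHEDULER_* names mirror
the new constants, but synchronize_rcu_model(), drive_expedited_gp(), and
queue_gp_work() are hypothetical stand-ins, not kernel APIs; the real
dispatch is spread across synchronize_rcu(), _synchronize_rcu_expedited(),
and rcu_gp_is_expedited(), as the diffs below show.

#include <stdio.h>

/* Values mirror the new RCU_SCHEDULER_* constants. */
enum rcu_phase {
	RCU_SCHEDULER_INACTIVE,	/* One task, preemption off: GP is a no-op. */
	RCU_SCHEDULER_INIT,	/* Scheduler up; kthreads/workqueues not yet. */
	RCU_SCHEDULER_RUNNING,	/* Fully operational. */
};

static enum rcu_phase rcu_scheduler_active = RCU_SCHEDULER_INACTIVE;

static void drive_expedited_gp(void) { puts("requesting task drives the GP"); }
static void queue_gp_work(void)      { puts("workqueue drives the GP"); }

/* Model of a synchronous grace-period request in each phase. */
static void synchronize_rcu_model(void)
{
	switch (rcu_scheduler_active) {
	case RCU_SCHEDULER_INACTIVE:
		return;			/* Only one task, so nothing to wait on. */
	case RCU_SCHEDULER_INIT:
		drive_expedited_gp();	/* Direct call: no workqueues yet. */
		return;
	case RCU_SCHEDULER_RUNNING:
		queue_gp_work();	/* Normal runtime path. */
		return;
	}
}

int main(void)
{
	synchronize_rcu_model();			/* Phase 1: no-op. */
	rcu_scheduler_active = RCU_SCHEDULER_INIT;	/* rcu_scheduler_starting() */
	synchronize_rcu_model();			/* Phase 2: direct drive. */
	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;	/* rcu_exp_runtime_mode() */
	synchronize_rcu_model();			/* Phase 3: workqueue. */
	return 0;
}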

7 files changed: +104 -35 lines changed

include/linux/rcupdate.h

Lines changed: 4 additions & 0 deletions

@@ -444,6 +444,10 @@ bool __rcu_is_watching(void);
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
+#define RCU_SCHEDULER_INACTIVE	0
+#define RCU_SCHEDULER_INIT	1
+#define RCU_SCHEDULER_RUNNING	2
+
 /*
  * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
  * initialization and destruction of rcu_head on the stack. rcu_head structures
kernel/rcu/rcu.h

Lines changed: 1 addition & 0 deletions

@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
 #define TPS(x)  tracepoint_string(x)
 
 void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
 
 /*
  * This function really isn't for public consumption, but RCU is special in

kernel/rcu/tiny_plugin.h

Lines changed: 7 additions & 2 deletions

@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
 /*
  * During boot, we forgive RCU lockdep issues.  After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously.  Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
  */
 void __init rcu_scheduler_starting(void)
 {
 	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
+	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

kernel/rcu/tree.c

Lines changed: 20 additions & 13 deletions

@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 int sysctl_panic_on_rcu_stall __read_mostly;
 
 /*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned.  So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before
+ * the first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
  * optimize synchronize_rcu() to a simple barrier().  When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods.  This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods.  This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking.  Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
  */
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);

@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void)
 early_initcall(rcu_spawn_gp_kthread);
 
 /*
- * This function is invoked towards the end of the scheduler's initialization
- * process.  Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system).  After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections.  This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process.  Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops).  After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
  */
 void rcu_scheduler_starting(void)
 {
 	WARN_ON(num_online_cpus() != 1);
 	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
+	rcu_test_sync_prims();
+	rcu_scheduler_active = RCU_SCHEDULER_INIT;
+	rcu_test_sync_prims();
 }
 
 /*
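
Note the pattern in rcu_scheduler_starting(): rcu_test_sync_prims() runs both
before and after the mode flip, so the synchronous primitives are exercised
once under the outgoing mode and once under the incoming mode.  A minimal
standalone sketch of that pattern (enter_phase() and test_sync_prims() are
hypothetical names, not kernel APIs):

#include <stdio.h>

static int scheduler_phase;	/* Models rcu_scheduler_active. */

/* Hypothetical stand-in for rcu_test_sync_prims(). */
static void test_sync_prims(const char *when)
{
	printf("exercising synchronous primitives (%s mode)\n", when);
}

/* The pattern shared by rcu_scheduler_starting() and rcu_exp_runtime_mode(). */
static void enter_phase(int next)
{
	test_sync_prims("outgoing");	/* Last exercise of the old codepaths. */
	scheduler_phase = next;
	test_sync_prims("incoming");	/* First exercise of the new codepaths. */
}

int main(void)
{
	enter_phase(1);	/* RCU_SCHEDULER_INIT, as in rcu_scheduler_starting(). */
	enter_phase(2);	/* RCU_SCHEDULER_RUNNING, as in rcu_exp_runtime_mode(). */
	return 0;
}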

kernel/rcu/tree_exp.h

Lines changed: 41 additions & 11 deletions

@@ -531,19 +531,29 @@ struct rcu_exp_work {
 	struct work_struct rew_work;
 };
 
+/*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+				  smp_call_func_t func, unsigned long s)
+{
+	/* Initialize the rcu_node tree in preparation for the wait. */
+	sync_rcu_exp_select_cpus(rsp, func);
+
+	/* Wait and clean up, including waking everyone. */
+	rcu_exp_wait_wake(rsp, s);
+}
+
 /*
  * Work-queue handler to drive an expedited grace period forward.
  */
 static void wait_rcu_exp_gp(struct work_struct *wp)
 {
 	struct rcu_exp_work *rewp;
 
-	/* Initialize the rcu_node tree in preparation for the wait. */
 	rewp = container_of(wp, struct rcu_exp_work, rew_work);
-	sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
-	/* Wait and clean up, including waking everyone. */
-	rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+	rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
 }
 
 /*

@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-	/* Marshall arguments and schedule the expedited grace period. */
-	rew.rew_func = func;
-	rew.rew_rsp = rsp;
-	rew.rew_s = s;
-	INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-	schedule_work(&rew.rew_work);
+	/* Ensure that load happens before action based on it. */
+	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+		/* Direct call during scheduler init and early_initcalls(). */
+		rcu_exp_sel_wait_wake(rsp, func, s);
+	} else {
+		/* Marshall arguments & schedule the expedited grace period. */
+		rew.rew_func = func;
+		rew.rew_rsp = rsp;
+		rew.rew_s = s;
+		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+		schedule_work(&rew.rew_work);
+	}
 
 	/* Wait for expedited grace period to complete. */
 	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());

@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void)
 {
 	struct rcu_state *rsp = rcu_state_p;
 
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+		return;
 	_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void)
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+	rcu_test_sync_prims();
+	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+	rcu_test_sync_prims();
+	return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
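
The _synchronize_rcu_expedited() hunk above is the heart of the fix:
mid-boot, the requesting task calls rcu_exp_sel_wait_wake() directly instead
of handing an on-stack work item to a workqueue that has no workers yet.
The following pthread-based userspace sketch (all names hypothetical; build
with -lpthread) models why the old path hung and how the direct call avoids
it: the requester always waits for completion, so if nothing ever runs the
handler, the wait blocks forever.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct exp_work {
	pthread_mutex_t lock;
	pthread_cond_t  done_cv;
	bool            done;
};

static bool worker_pool_online;	/* Models "workqueues are running". */

/* The grace-period drive, shared by both paths (cf. rcu_exp_sel_wait_wake). */
static void drive_gp(struct exp_work *w)
{
	/* ... select CPUs and wait for quiescent states ... */
	pthread_mutex_lock(&w->lock);
	w->done = true;
	pthread_cond_signal(&w->done_cv);
	pthread_mutex_unlock(&w->lock);
}

static void *worker(void *arg)	/* Models the workqueue handler. */
{
	drive_gp(arg);
	return NULL;
}

static void synchronize_expedited_model(void)
{
	struct exp_work w = { .done = false };
	pthread_t tid;

	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.done_cv, NULL);

	if (!worker_pool_online) {
		drive_gp(&w);	/* Mid-boot: requesting task drives the GP. */
	} else {
		pthread_create(&tid, NULL, worker, &w);	/* schedule_work() */
		pthread_detach(tid);
	}

	/* Wait for completion; without the mid-boot branch, and with no
	 * worker ever running the item, this wait would block forever. */
	pthread_mutex_lock(&w.lock);
	while (!w.done)
		pthread_cond_wait(&w.done_cv, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_cond_destroy(&w.done_cv);
	pthread_mutex_destroy(&w.lock);
}

int main(void)
{
	synchronize_expedited_model();	/* Direct path completes. */
	worker_pool_online = true;
	synchronize_expedited_model();	/* Worker path completes too. */
	puts("both paths completed");
	return 0;
}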

kernel/rcu/tree_plugin.h

Lines changed: 1 addition & 1 deletion

@@ -670,7 +670,7 @@ void synchronize_rcu(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
-	if (!rcu_scheduler_active)
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return;
 	if (rcu_gp_is_expedited())
 		synchronize_rcu_expedited();

kernel/rcu/update.c

Lines changed: 30 additions & 8 deletions

@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
  * Should expedited grace-period primitives always fall back to their
  * non-expedited counterparts?  Intended for use within RCU.  Note
  * that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins.  (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
  */
 bool rcu_gp_is_normal(void)
 {
-	return READ_ONCE(rcu_normal);
+	return READ_ONCE(rcu_normal) &&
+	       rcu_scheduler_active != RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
 
@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
 /*
  * Should normal grace-period primitives be expedited?  Intended for
  * use within RCU.  Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting.  So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting.  So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
  */
 bool rcu_gp_is_expedited(void)
 {
-	return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+	return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+	       rcu_scheduler_active == RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
 
@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
 
 int notrace debug_lockdep_rcu_enabled(void)
 {
-	return rcu_scheduler_active && debug_locks &&
+	return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
 	       current->lockdep_recursion == 0;
 }
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);

@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
 void synchronize_rcu_tasks(void)
 {
 	/* Complain if the scheduler has not started.  */
-	RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
 			 "synchronize_rcu_tasks called too soon");
 
 	/* Wait for the grace period. */

@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
 
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
+/*
+ * Test each non-SRCU synchronous grace-period wait API.  This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+	if (!IS_ENABLED(CONFIG_PROVE_RCU))
+		return;
+	synchronize_rcu();
+	synchronize_rcu_bh();
+	synchronize_sched();
+	synchronize_rcu_expedited();
+	synchronize_rcu_bh_expedited();
+	synchronize_sched_expedited();
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /*

@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
 		early_boot_test_call_rcu_bh();
 	if (rcu_self_test_sched)
 		early_boot_test_call_rcu_sched();
+	rcu_test_sync_prims();
 }
 
 static int rcu_verify_early_boot_tests(void)
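
In effect, rcu_gp_is_normal() and rcu_gp_is_expedited() now treat the
RCU_SCHEDULER_INIT window specially: rcu_normal still wins at runtime, but
during mid-boot everything is expedited, since only the requesting task can
drive a grace period then.  A small self-checking model of that precedence
(gp_is_normal() and gp_is_expedited() are hypothetical stand-ins for the
kernel functions):

#include <assert.h>
#include <stdbool.h>

enum { RCU_SCHEDULER_INACTIVE, RCU_SCHEDULER_INIT, RCU_SCHEDULER_RUNNING };

/* Models rcu_gp_is_normal(): rcu_normal loses only in the mid-boot window. */
static bool gp_is_normal(bool rcu_normal, int phase)
{
	return rcu_normal && phase != RCU_SCHEDULER_INIT;
}

/* Models rcu_gp_is_expedited(): mid-boot forces the expedited path. */
static bool gp_is_expedited(bool rcu_expedited, int expedite_nesting, int phase)
{
	return rcu_expedited || expedite_nesting > 0 ||
	       phase == RCU_SCHEDULER_INIT;
}

int main(void)
{
	/* At runtime, rcu_normal wins even if rcu_expedited is also set. */
	assert(gp_is_normal(true, RCU_SCHEDULER_RUNNING));

	/* During the mid-boot window, everything is expedited. */
	assert(!gp_is_normal(true, RCU_SCHEDULER_INIT));
	assert(gp_is_expedited(false, 0, RCU_SCHEDULER_INIT));
	return 0;
}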
