Commit cefef3a

Authored and committed by Ingo Molnar
Merge branch 'sched/core' into timers/nohz, to avoid conflicts in upcoming patches
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2 parents 92d21ac + 748c720

10 files changed: +418 additions, -190 deletions

arch/x86/kernel/kvm.c

Lines changed: 0 additions & 2 deletions
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
         if (!has_steal_clock)
                 return;
 
-        memset(st, 0, sizeof(*st));
-
         wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
         pr_info("kvm-stealtime: cpu %d, msr %llx\n",
                 cpu, (unsigned long long) slow_virt_to_phys(st));

include/linux/sched.h

Lines changed: 6 additions & 2 deletions
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING             256
 #define TASK_PARKED             512
 #define TASK_NOLOAD             1024
-#define TASK_STATE_MAX          2048
+#define TASK_NEW                2048
+#define TASK_STATE_MAX          4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t)
                 __put_task_struct(t);
 }
 
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
                          cputime_t *utime, cputime_t *stime);
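
The two hunks above move together because of the build-time assertion at the end of the first hunk: the state-character string must supply exactly one letter per state bit, i.e. ilog2(TASK_STATE_MAX) + 1 characters. A minimal standalone sketch of the same arithmetic (illustrative C, not part of this commit):

/* Mirror of the constants after this commit, for illustration only. */
#define TASK_NEW                2048
#define TASK_STATE_MAX          4096
#define TASK_STATE_TO_CHAR_STR  "RSDTtXZxKWPNn"

/* ilog2(4096) == 12, so 12 + 1 == 13 letters are required; the string is
 * exactly 13 characters long, the new trailing 'n' standing for TASK_NEW. */
_Static_assert(sizeof(TASK_STATE_TO_CHAR_STR) - 1 == 13,
               "one letter per task state bit");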

kernel/exit.c

Lines changed: 76 additions & 0 deletions
@@ -210,6 +210,82 @@ void release_task(struct task_struct *p)
                 goto repeat;
 }
 
+/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+        struct sighand_struct *sighand;
+        struct task_struct *task;
+
+        /*
+         * We need to verify that release_task() was not called and thus
+         * delayed_put_task_struct() can't run and drop the last reference
+         * before rcu_read_unlock(). We check task->sighand != NULL,
+         * but we can read the already freed and reused memory.
+         */
+retry:
+        task = rcu_dereference(*ptask);
+        if (!task)
+                return NULL;
+
+        probe_kernel_address(&task->sighand, sighand);
+
+        /*
+         * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+         * was already freed we can not miss the preceding update of this
+         * pointer.
+         */
+        smp_rmb();
+        if (unlikely(task != READ_ONCE(*ptask)))
+                goto retry;
+
+        /*
+         * We've re-checked that "task == *ptask", now we have two different
+         * cases:
+         *
+         * 1. This is actually the same task/task_struct. In this case
+         *    sighand != NULL tells us it is still alive.
+         *
+         * 2. This is another task which got the same memory for task_struct.
+         *    We can't know this of course, and we can not trust
+         *    sighand != NULL.
+         *
+         *    In this case we actually return a random value, but this is
+         *    correct.
+         *
+         *    If we return NULL - we can pretend that we actually noticed that
+         *    *ptask was updated when the previous task has exited. Or pretend
+         *    that probe_slab_address(&sighand) reads NULL.
+         *
+         *    If we return the new task (because sighand is not NULL for any
+         *    reason) - this is fine too. This (new) task can't go away before
+         *    another gp pass.
+         *
+         *    And note: We could even eliminate the false positive if re-read
+         *    task->sighand once again to avoid the falsely NULL. But this case
+         *    is very unlikely so we don't care.
+         */
+        if (!sighand)
+                return NULL;
+
+        return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+        struct task_struct *task;
+
+        rcu_read_lock();
+        task = task_rcu_dereference(ptask);
+        if (task)
+                get_task_struct(task);
+        rcu_read_unlock();
+
+        return task;
+}
+
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52. Orphaned process groups are not to be affected
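
For reference, a caller of the new helper might look like the sketch below (hypothetical code, not part of this commit; inspect_owner() and its owner_ptr parameter are invented for illustration). The point is the pairing: try_get_task_struct() returns either NULL or a task whose reference count it has already taken, so the caller only needs a matching put_task_struct().

#include <linux/printk.h>
#include <linux/sched.h>

/* Illustrative only: safely inspect a task published via an RCU-managed
 * pointer (e.g. a lock or mm owner field). */
static void inspect_owner(struct task_struct **owner_ptr)
{
        struct task_struct *task;

        task = try_get_task_struct(owner_ptr);  /* NULL if it already exited */
        if (!task)
                return;

        pr_info("owner: %s/%d\n", task->comm, task_pid_nr(task));

        put_task_struct(task);                  /* drop the reference we took */
}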

kernel/sched/core.c

Lines changed: 81 additions & 33 deletions
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
         __sched_fork(clone_flags, p);
         /*
-         * We mark the process as running here. This guarantees that
+         * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-        p->state = TASK_RUNNING;
+        p->state = TASK_NEW;
 
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_class = &fair_sched_class;
         }
 
-        if (p->sched_class->task_fork)
-                p->sched_class->task_fork(p);
+        init_entity_runnable_average(&p->se);
 
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-        set_task_cpu(p, cpu);
+        /*
+         * We're setting the cpu for the first time, we don't migrate,
+         * so use __set_task_cpu().
+         */
+        __set_task_cpu(p, cpu);
+        if (p->sched_class->task_fork)
+                p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #ifdef CONFIG_SCHED_INFO
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
         struct rq_flags rf;
         struct rq *rq;
 
-        /* Initialize new task's runnable average */
-        init_entity_runnable_average(&p->se);
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
          * - cpus_allowed can change in the fork path
          * - any previously selected cpu might disappear through hotplug
+         *
+         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+         * as we're not fully set-up yet.
          */
-        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+        __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
         rq = __task_rq_lock(p, &rf);
         post_init_entity_util_avg(&p->se);
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
                 pr_cont("\n");
         }
 #endif
+        if (panic_on_warn)
+                panic("scheduling while atomic\n");
+
         dump_stack();
         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
@@ -4752,7 +4762,8 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  *
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
  */
 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
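
The Return: fix above documents behaviour that is visible from user space: the raw system call returns the number of bytes it copied into the mask, while the glibc wrapper maps that to 0. A small illustrative userspace program (not part of this commit; assumes glibc's syscall(2) wrapper):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        cpu_set_t mask;
        /* pid 0 means "the calling thread"; the raw syscall returns the
         * number of bytes copied into &mask, unlike glibc's 0 on success. */
        long ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), &mask);

        printf("raw sched_getaffinity() returned %ld\n", ret);
        return ret < 0;
}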
@@ -7231,7 +7242,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
         struct rq *rq = cpu_rq(cpu);
 
         rq->calc_load_update = calc_load_update;
-        account_reset_rq(rq);
         update_max_interval();
 }
 
@@ -7711,6 +7721,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
         INIT_LIST_HEAD(&tg->children);
         list_add_rcu(&tg->siblings, &parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
+
+        online_fair_sched_group(tg);
 }
 
 /* rcu callback to free various structures associated with a task group */
@@ -7739,27 +7751,9 @@ void sched_offline_group(struct task_group *tg)
         spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
 {
         struct task_group *tg;
-        int queued, running;
-        struct rq_flags rf;
-        struct rq *rq;
-
-        rq = task_rq_lock(tsk, &rf);
-
-        running = task_current(rq, tsk);
-        queued = task_on_rq_queued(tsk);
-
-        if (queued)
-                dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
-        if (unlikely(running))
-                put_prev_task(rq, tsk);
 
         /*
          * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7772,11 +7766,37 @@ void sched_move_task(struct task_struct *tsk)
         tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-        if (tsk->sched_class->task_move_group)
-                tsk->sched_class->task_move_group(tsk);
+        if (tsk->sched_class->task_change_group)
+                tsk->sched_class->task_change_group(tsk, type);
         else
 #endif
                 set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+        int queued, running;
+        struct rq_flags rf;
+        struct rq *rq;
+
+        rq = task_rq_lock(tsk, &rf);
+
+        running = task_current(rq, tsk);
+        queued = task_on_rq_queued(tsk);
+
+        if (queued)
+                dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+        if (unlikely(running))
+                put_prev_task(rq, tsk);
+
+        sched_change_group(tsk, TASK_MOVE_GROUP);
 
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@ -8204,15 +8224,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
         sched_free_group(tg);
 }
 
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
 static void cpu_cgroup_fork(struct task_struct *task)
 {
-        sched_move_task(task);
+        struct rq_flags rf;
+        struct rq *rq;
+
+        rq = task_rq_lock(task, &rf);
+
+        sched_change_group(task, TASK_SET_GROUP);
+
+        task_rq_unlock(rq, task, &rf);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+        int ret = 0;
 
         cgroup_taskset_for_each(task, css, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8223,8 +8255,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
 #endif
+                /*
+                 * Serialize against wake_up_new_task() such that if its
+                 * running, we're sure to observe its full state.
+                 */
+                raw_spin_lock_irq(&task->pi_lock);
+                /*
+                 * Avoid calling sched_move_task() before wake_up_new_task()
+                 * has happened. This would lead to problems with PELT, due to
+                 * move wanting to detach+attach while we're not attached yet.
+                 */
+                if (task->state == TASK_NEW)
+                        ret = -EINVAL;
+                raw_spin_unlock_irq(&task->pi_lock);
+
+                if (ret)
+                        break;
         }
-        return 0;
+        return ret;
 }
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
