Skip to content

Commit 2e91fa7

Browse files
committed
cgroup: keep zombies associated with their original cgroups
cgroup_exit() is called when a task exits and disassociates the exiting task from its cgroups and half-attach it to the root cgroup. This is unnecessary and undesirable. No controller actually needs an exiting task to be disassociated with non-root cgroups. Both cpu and perf_event controllers update the association to the root cgroup from their exit callbacks just to keep consistent with the cgroup core behavior. Also, this disassociation makes it difficult to track resources held by zombies or determine where the zombies came from. Currently, pids controller is completely broken as it uncharges on exit and zombies always escape the resource restriction. With cgroup association being reset on exit, fixing it is pretty painful. There's no reason to reset cgroup membership on exit. The zombie can be removed from its css_set so that it doesn't show up on "cgroup.procs" and thus can't be migrated or interfere with cgroup removal. It can still pin and point to the css_set so that its cgroup membership is maintained. This patch makes cgroup core keep zombies associated with their cgroups at the time of exit. * Previous patches decoupled populated_cnt tracking from css_set lifetime, so a dying task can be simply unlinked from its css_set while pinning and pointing to the css_set. This keeps css_set association from task side alive while hiding it from "cgroup.procs" and populated_cnt tracking. The css_set reference is dropped when the task_struct is freed. * ->exit() callback no longer needs the css arguments as the associated css never changes once PF_EXITING is set. Removed. * cpu and perf_events controllers no longer need ->exit() callbacks. There's no reason to explicitly switch away on exit. The final schedule out is enough. The callbacks are removed. * On traditional hierarchies, nothing changes. "/proc/PID/cgroup" still reports "/" for all zombies. On the default hierarchy, "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged to at the time of exit. If the cgroup gets removed before the task is reaped, " (deleted)" is appended. v2: Build brekage due to missing dummy cgroup_free() when !CONFIG_CGROUP fixed. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
1 parent f0d9a5f commit 2e91fa7

File tree

8 files changed

+44
-56
lines changed

8 files changed

+44
-56
lines changed

Documentation/cgroups/unified-hierarchy.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,10 @@ supported and the interface files "release_agent" and
374374

375375
- The "cgroup.clone_children" file is removed.
376376

377+
- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
378+
to before exiting. If the cgroup is removed before the zombie is
379+
reaped, " (deleted)" is appeneded to the path.
380+
377381

378382
5-3. Controller File Conventions
379383

include/linux/cgroup-defs.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -435,9 +435,7 @@ struct cgroup_subsys {
435435
int (*can_fork)(struct task_struct *task, void **priv_p);
436436
void (*cancel_fork)(struct task_struct *task, void *priv);
437437
void (*fork)(struct task_struct *task, void *priv);
438-
void (*exit)(struct cgroup_subsys_state *css,
439-
struct cgroup_subsys_state *old_css,
440-
struct task_struct *task);
438+
void (*exit)(struct task_struct *task);
441439
void (*bind)(struct cgroup_subsys_state *root_css);
442440

443441
int early_init;

include/linux/cgroup.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
102102
extern void cgroup_post_fork(struct task_struct *p,
103103
void *old_ss_priv[CGROUP_CANFORK_COUNT]);
104104
void cgroup_exit(struct task_struct *p);
105+
void cgroup_free(struct task_struct *p);
105106

106107
int cgroup_init_early(void);
107108
int cgroup_init(void);
@@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
547548
static inline void cgroup_post_fork(struct task_struct *p,
548549
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
549550
static inline void cgroup_exit(struct task_struct *p) {}
551+
static inline void cgroup_free(struct task_struct *p) {}
550552

551553
static inline int cgroup_init_early(void) { return 0; }
552554
static inline int cgroup_init(void) { return 0; }

kernel/cgroup.c

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
53795379
seq_printf(m, "%sname=%s", count ? "," : "",
53805380
root->name);
53815381
seq_putc(m, ':');
5382+
53825383
cgrp = task_cgroup_from_root(tsk, root);
5383-
path = cgroup_path(cgrp, buf, PATH_MAX);
5384-
if (!path) {
5385-
retval = -ENAMETOOLONG;
5386-
goto out_unlock;
5384+
5385+
/*
5386+
* On traditional hierarchies, all zombie tasks show up as
5387+
* belonging to the root cgroup. On the default hierarchy,
5388+
* while a zombie doesn't show up in "cgroup.procs" and
5389+
* thus can't be migrated, its /proc/PID/cgroup keeps
5390+
* reporting the cgroup it belonged to before exiting. If
5391+
* the cgroup is removed before the zombie is reaped,
5392+
* " (deleted)" is appended to the cgroup path.
5393+
*/
5394+
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5395+
path = cgroup_path(cgrp, buf, PATH_MAX);
5396+
if (!path) {
5397+
retval = -ENAMETOOLONG;
5398+
goto out_unlock;
5399+
}
5400+
} else {
5401+
path = "/";
53875402
}
5403+
53885404
seq_puts(m, path);
5389-
seq_putc(m, '\n');
5405+
5406+
if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5407+
seq_puts(m, " (deleted)\n");
5408+
else
5409+
seq_putc(m, '\n');
53905410
}
53915411

53925412
retval = 0;
@@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk)
55935613
{
55945614
struct cgroup_subsys *ss;
55955615
struct css_set *cset;
5596-
bool put_cset = false;
55975616
int i;
55985617

55995618
/*
@@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk)
56065625
spin_lock_bh(&css_set_lock);
56075626
css_set_move_task(tsk, cset, NULL, false);
56085627
spin_unlock_bh(&css_set_lock);
5609-
put_cset = true;
5628+
} else {
5629+
get_css_set(cset);
56105630
}
56115631

5612-
/* Reassign the task to the init_css_set. */
5613-
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5614-
56155632
/* see cgroup_post_fork() for details */
5616-
for_each_subsys_which(ss, i, &have_exit_callback) {
5617-
struct cgroup_subsys_state *old_css = cset->subsys[i];
5618-
struct cgroup_subsys_state *css = task_css(tsk, i);
5633+
for_each_subsys_which(ss, i, &have_exit_callback)
5634+
ss->exit(tsk);
5635+
}
56195636

5620-
ss->exit(css, old_css, tsk);
5621-
}
5637+
void cgroup_free(struct task_struct *task)
5638+
{
5639+
struct css_set *cset = task_css_set(task);
56225640

5623-
if (put_cset)
5624-
put_css_set(cset);
5641+
put_css_set(cset);
56255642
}
56265643

56275644
static void check_for_release(struct cgroup *cgrp)

kernel/cgroup_pids.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
266266
css_put(old_css);
267267
}
268268

269-
static void pids_exit(struct cgroup_subsys_state *css,
270-
struct cgroup_subsys_state *old_css,
271-
struct task_struct *task)
269+
static void pids_exit(struct task_struct *task)
272270
{
273-
struct pids_cgroup *pids = css_pids(old_css);
271+
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
274272

275273
pids_uncharge(pids, 1);
276274
}

kernel/events/core.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
92939293
task_function_call(task, __perf_cgroup_move, task);
92949294
}
92959295

9296-
static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9297-
struct cgroup_subsys_state *old_css,
9298-
struct task_struct *task)
9299-
{
9300-
/*
9301-
* cgroup_exit() is called in the copy_process() failure path.
9302-
* Ignore this case since the task hasn't ran yet, this avoids
9303-
* trying to poke a half freed task state from generic code.
9304-
*/
9305-
if (!(task->flags & PF_EXITING))
9306-
return;
9307-
9308-
task_function_call(task, __perf_cgroup_move, task);
9309-
}
9310-
93119296
struct cgroup_subsys perf_event_cgrp_subsys = {
93129297
.css_alloc = perf_cgroup_css_alloc,
93139298
.css_free = perf_cgroup_css_free,
9314-
.exit = perf_cgroup_exit,
93159299
.attach = perf_cgroup_attach,
93169300
};
93179301
#endif /* CONFIG_CGROUP_PERF */

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
251251
WARN_ON(atomic_read(&tsk->usage));
252252
WARN_ON(tsk == current);
253253

254+
cgroup_free(tsk);
254255
task_numa_free(tsk);
255256
security_task_free(tsk);
256257
exit_creds(tsk);

kernel/sched/core.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
81638163
sched_move_task(task);
81648164
}
81658165

8166-
static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8167-
struct cgroup_subsys_state *old_css,
8168-
struct task_struct *task)
8169-
{
8170-
/*
8171-
* cgroup_exit() is called in the copy_process() failure path.
8172-
* Ignore this case since the task hasn't ran yet, this avoids
8173-
* trying to poke a half freed task state from generic code.
8174-
*/
8175-
if (!(task->flags & PF_EXITING))
8176-
return;
8177-
8178-
sched_move_task(task);
8179-
}
8180-
81818166
#ifdef CONFIG_FAIR_GROUP_SCHED
81828167
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
81838168
struct cftype *cftype, u64 shareval)
@@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
85098494
.fork = cpu_cgroup_fork,
85108495
.can_attach = cpu_cgroup_can_attach,
85118496
.attach = cpu_cgroup_attach,
8512-
.exit = cpu_cgroup_exit,
85138497
.legacy_cftypes = cpu_files,
85148498
.early_init = 1,
85158499
};

0 commit comments

Comments
 (0)