Skip to content

Commit a79a908

Browse files
adityakali authored and htejun committed
cgroup: introduce cgroup namespaces
Introduce the ability to create a new cgroup namespace. The newly created cgroup namespace remembers the cgroup of the process at the point of creation of the cgroup namespace (referred to as cgroupns-root). The main purpose of a cgroup namespace is to virtualize the contents of the /proc/self/cgroup file. Processes inside a cgroup namespace are only able to see paths relative to their namespace root (unless they are moved outside of their cgroupns-root, at which point they will see a relative path from their cgroupns-root). For a correctly set up container this enables container tools (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized containers without leaking the system-level cgroup hierarchy to the task. This patch only implements the 'unshare' part of the cgroupns. Signed-off-by: Aditya Kali <adityakali@google.com> Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com> Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 5e2bec7 commit a79a908

File tree

8 files changed

+250
-10
lines changed

8 files changed

+250
-10
lines changed

fs/proc/namespaces.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
2828
&userns_operations,
2929
#endif
3030
&mntns_operations,
31+
#ifdef CONFIG_CGROUPS
32+
&cgroupns_operations,
33+
#endif
3134
};
3235

3336
static const char *proc_ns_get_link(struct dentry *dentry,

include/linux/cgroup.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
#include <linux/seq_file.h>
1818
#include <linux/kernfs.h>
1919
#include <linux/jump_label.h>
20+
#include <linux/nsproxy.h>
21+
#include <linux/types.h>
22+
#include <linux/ns_common.h>
23+
#include <linux/nsproxy.h>
24+
#include <linux/user_namespace.h>
2025

2126
#include <linux/cgroup-defs.h>
2227

@@ -611,4 +616,48 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
611616

612617
#endif /* CONFIG_CGROUP_DATA */
613618

619+
struct cgroup_namespace {
620+
atomic_t count;
621+
struct ns_common ns;
622+
struct user_namespace *user_ns;
623+
struct css_set *root_cset;
624+
};
625+
626+
extern struct cgroup_namespace init_cgroup_ns;
627+
628+
#ifdef CONFIG_CGROUPS
629+
630+
void free_cgroup_ns(struct cgroup_namespace *ns);
631+
632+
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
633+
struct user_namespace *user_ns,
634+
struct cgroup_namespace *old_ns);
635+
636+
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
637+
struct cgroup_namespace *ns);
638+
639+
#else /* !CONFIG_CGROUPS */
640+
641+
/* Without CONFIG_CGROUPS there is no namespace state to release. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns)
{
}
642+
/*
 * Without CONFIG_CGROUPS no new cgroup namespace can be created: the
 * caller is handed @old_ns back unchanged, whatever @flags requests.
 */
static inline struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
					struct user_namespace *user_ns,
					struct cgroup_namespace *old_ns)
{
	return old_ns;
}
648+
649+
#endif /* !CONFIG_CGROUPS */
650+
651+
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
652+
{
653+
if (ns)
654+
atomic_inc(&ns->count);
655+
}
656+
657+
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
658+
{
659+
if (ns && atomic_dec_and_test(&ns->count))
660+
free_cgroup_ns(ns);
661+
}
662+
614663
#endif /* _LINUX_CGROUP_H */

include/linux/nsproxy.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ struct mnt_namespace;
88
struct uts_namespace;
99
struct ipc_namespace;
1010
struct pid_namespace;
11+
struct cgroup_namespace;
1112
struct fs_struct;
1213

1314
/*
@@ -33,6 +34,7 @@ struct nsproxy {
3334
struct mnt_namespace *mnt_ns;
3435
struct pid_namespace *pid_ns_for_children;
3536
struct net *net_ns;
37+
struct cgroup_namespace *cgroup_ns;
3638
};
3739
extern struct nsproxy init_nsproxy;
3840

include/linux/proc_ns.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
struct pid_namespace;
1010
struct nsproxy;
1111
struct path;
12+
struct task_struct;
13+
struct inode;
1214

1315
struct proc_ns_operations {
1416
const char *name;
@@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations;
2426
extern const struct proc_ns_operations pidns_operations;
2527
extern const struct proc_ns_operations userns_operations;
2628
extern const struct proc_ns_operations mntns_operations;
29+
extern const struct proc_ns_operations cgroupns_operations;
2730

2831
/*
2932
* We always define these enumerators
@@ -34,6 +37,7 @@ enum {
3437
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
3538
PROC_USER_INIT_INO = 0xEFFFFFFDU,
3639
PROC_PID_INIT_INO = 0xEFFFFFFCU,
40+
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
3741
};
3842

3943
#ifdef CONFIG_PROC_FS

kernel/cgroup.c

Lines changed: 170 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@
5959
#include <linux/delay.h>
6060
#include <linux/atomic.h>
6161
#include <linux/cpuset.h>
62+
#include <linux/proc_ns.h>
63+
#include <linux/nsproxy.h>
64+
#include <linux/proc_ns.h>
6265
#include <net/sock.h>
6366

6467
/*
@@ -212,6 +215,15 @@ static unsigned long have_fork_callback __read_mostly;
212215
static unsigned long have_exit_callback __read_mostly;
213216
static unsigned long have_free_callback __read_mostly;
214217

218+
/* cgroup namespace for init task */
219+
struct cgroup_namespace init_cgroup_ns = {
220+
.count = { .counter = 2, },
221+
.user_ns = &init_user_ns,
222+
.ns.ops = &cgroupns_operations,
223+
.ns.inum = PROC_CGROUP_INIT_INO,
224+
.root_cset = &init_css_set,
225+
};
226+
215227
/* Ditto for the can_fork callback. */
216228
static unsigned long have_canfork_callback __read_mostly;
217229

@@ -2177,6 +2189,35 @@ static struct file_system_type cgroup2_fs_type = {
21772189
.kill_sb = cgroup_kill_sb,
21782190
};
21792191

2192+
/*
 * Format the path of @cgrp into @buf, relative to the cgroup that the
 * root css_set of @ns has in @cgrp's hierarchy.
 *
 * Returns @buf on success, or NULL if the path could not be generated or
 * would not fit in @buflen bytes.
 *
 * NOTE(review): every caller in this file invokes this with css_set_lock
 * held - presumably required by cset_cgroup_from_root(); confirm.
 */
static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
				   struct cgroup_namespace *ns)
{
	struct cgroup *ns_root;
	int len;

	ns_root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
	len = kernfs_path_from_node(cgrp->kn, ns_root->kn, buf, buflen);

	return (len < 0 || len >= buflen) ? NULL : buf;
}
}
2203+
2204+
/**
 * cgroup_path_ns - path of a cgroup as seen from a cgroup namespace
 * @cgrp: cgroup whose path is wanted
 * @buf: buffer the path is written to
 * @buflen: size of @buf in bytes
 * @ns: cgroup namespace the path is made relative to
 *
 * Takes cgroup_mutex and css_set_lock around cgroup_path_ns_locked().
 *
 * Returns @buf on success, NULL on failure.
 */
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		     struct cgroup_namespace *ns)
{
	char *path;

	mutex_lock(&cgroup_mutex);
	spin_lock_bh(&css_set_lock);
	path = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
	spin_unlock_bh(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return path;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
2220+
21802221
/**
21812222
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
21822223
* @task: target task
@@ -2204,7 +2245,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
22042245

22052246
if (root) {
22062247
cgrp = task_cgroup_from_root(task, root);
2207-
path = cgroup_path(cgrp, buf, buflen);
2248+
path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
22082249
} else {
22092250
/* if no hierarchy exists, everyone is in "/" */
22102251
if (strlcpy(buf, "/", buflen) < buflen)
@@ -5297,6 +5338,8 @@ int __init cgroup_init(void)
52975338
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
52985339
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
52995340

5341+
get_user_ns(init_cgroup_ns.user_ns);
5342+
53005343
mutex_lock(&cgroup_mutex);
53015344

53025345
/* Add init_css_set to the hash table */
@@ -5438,7 +5481,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
54385481
* " (deleted)" is appended to the cgroup path.
54395482
*/
54405483
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5441-
path = cgroup_path(cgrp, buf, PATH_MAX);
5484+
path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5485+
current->nsproxy->cgroup_ns);
54425486
if (!path) {
54435487
retval = -ENAMETOOLONG;
54445488
goto out_unlock;
@@ -5720,7 +5764,9 @@ static void cgroup_release_agent(struct work_struct *work)
57205764
if (!pathbuf || !agentbuf)
57215765
goto out;
57225766

5723-
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5767+
spin_lock_bh(&css_set_lock);
5768+
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
5769+
spin_unlock_bh(&css_set_lock);
57245770
if (!path)
57255771
goto out;
57265772

@@ -5931,6 +5977,127 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
59315977

59325978
#endif /* CONFIG_SOCK_CGROUP_DATA */
59335979

5980+
/* cgroup namespaces */
5981+
5982+
static struct cgroup_namespace *alloc_cgroup_ns(void)
5983+
{
5984+
struct cgroup_namespace *new_ns;
5985+
int ret;
5986+
5987+
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
5988+
if (!new_ns)
5989+
return ERR_PTR(-ENOMEM);
5990+
ret = ns_alloc_inum(&new_ns->ns);
5991+
if (ret) {
5992+
kfree(new_ns);
5993+
return ERR_PTR(ret);
5994+
}
5995+
atomic_set(&new_ns->count, 1);
5996+
new_ns->ns.ops = &cgroupns_operations;
5997+
return new_ns;
5998+
}
5999+
6000+
void free_cgroup_ns(struct cgroup_namespace *ns)
6001+
{
6002+
put_css_set(ns->root_cset);
6003+
put_user_ns(ns->user_ns);
6004+
ns_free_inum(&ns->ns);
6005+
kfree(ns);
6006+
}
6007+
EXPORT_SYMBOL(free_cgroup_ns);
6008+
6009+
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6010+
struct user_namespace *user_ns,
6011+
struct cgroup_namespace *old_ns)
6012+
{
6013+
struct cgroup_namespace *new_ns = NULL;
6014+
struct css_set *cset = NULL;
6015+
int err;
6016+
6017+
BUG_ON(!old_ns);
6018+
6019+
if (!(flags & CLONE_NEWCGROUP)) {
6020+
get_cgroup_ns(old_ns);
6021+
return old_ns;
6022+
}
6023+
6024+
/* Allow only sysadmin to create cgroup namespace. */
6025+
err = -EPERM;
6026+
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6027+
goto err_out;
6028+
6029+
mutex_lock(&cgroup_mutex);
6030+
spin_lock_bh(&css_set_lock);
6031+
6032+
cset = task_css_set(current);
6033+
get_css_set(cset);
6034+
6035+
spin_unlock_bh(&css_set_lock);
6036+
mutex_unlock(&cgroup_mutex);
6037+
6038+
err = -ENOMEM;
6039+
new_ns = alloc_cgroup_ns();
6040+
if (!new_ns)
6041+
goto err_out;
6042+
6043+
new_ns->user_ns = get_user_ns(user_ns);
6044+
new_ns->root_cset = cset;
6045+
6046+
return new_ns;
6047+
6048+
err_out:
6049+
if (cset)
6050+
put_css_set(cset);
6051+
kfree(new_ns);
6052+
return ERR_PTR(err);
6053+
}
6054+
6055+
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
6056+
{
6057+
return container_of(ns, struct cgroup_namespace, ns);
6058+
}
6059+
6060+
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
6061+
{
6062+
pr_info("setns not supported for cgroup namespace");
6063+
return -EINVAL;
6064+
}
6065+
6066+
static struct ns_common *cgroupns_get(struct task_struct *task)
6067+
{
6068+
struct cgroup_namespace *ns = NULL;
6069+
struct nsproxy *nsproxy;
6070+
6071+
task_lock(task);
6072+
nsproxy = task->nsproxy;
6073+
if (nsproxy) {
6074+
ns = nsproxy->cgroup_ns;
6075+
get_cgroup_ns(ns);
6076+
}
6077+
task_unlock(task);
6078+
6079+
return ns ? &ns->ns : NULL;
6080+
}
6081+
6082+
/* ->put() hook: drop a reference on the namespace that embeds @ns. */
static void cgroupns_put(struct ns_common *ns)
{
	struct cgroup_namespace *cgns = to_cg_ns(ns);

	put_cgroup_ns(cgns);
}
6086+
6087+
const struct proc_ns_operations cgroupns_operations = {
6088+
.name = "cgroup",
6089+
.type = CLONE_NEWCGROUP,
6090+
.get = cgroupns_get,
6091+
.put = cgroupns_put,
6092+
.install = cgroupns_install,
6093+
};
6094+
6095+
/*
 * Initcall for cgroup namespaces.  There is currently nothing to set up,
 * so it only reports success.
 */
static __init int cgroup_namespaces_init(void)
{
	return 0;
}
subsys_initcall(cgroup_namespaces_init);
6100+
59346101
#ifdef CONFIG_CGROUP_DEBUG
59356102
static struct cgroup_subsys_state *
59366103
debug_css_alloc(struct cgroup_subsys_state *parent_css)

kernel/cpuset.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
27142714
goto out;
27152715

27162716
retval = -ENAMETOOLONG;
2717-
rcu_read_lock();
2718-
css = task_css(tsk, cpuset_cgrp_id);
2719-
p = cgroup_path(css->cgroup, buf, PATH_MAX);
2720-
rcu_read_unlock();
2717+
css = task_get_css(tsk, cpuset_cgrp_id);
2718+
p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
2719+
current->nsproxy->cgroup_ns);
2720+
css_put(css);
27212721
if (!p)
27222722
goto out_free;
27232723
seq_puts(m, p);

kernel/fork.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1884,7 +1884,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
18841884
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
18851885
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
18861886
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1887-
CLONE_NEWUSER|CLONE_NEWPID))
1887+
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
18881888
return -EINVAL;
18891889
/*
18901890
* Not implemented, but pretend it works if there is nothing

0 commit comments

Comments
 (0)