From 38f38657444d15e1a8574eae80ed3de9f501737a Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:28 -0400 Subject: xattr: extract simple_xattr code from tmpfs Extract in-memory xattr APIs from tmpfs. Will be used by cgroup. $ size vmlinux.o text data bss dec hex filename 4658782 880729 5195032 10734543 a3cbcf vmlinux.o $ size vmlinux.o text data bss dec hex filename 4658957 880729 5195032 10734718 a3cc7e vmlinux.o v7: - checkpatch warnings fixed - Implement the changes requested by Hugh Dickins: - make simple_xattrs_init and simple_xattrs_free inline - get rid of locking and list reinitialization in simple_xattrs_free, they're not needed v6: - no changes v5: - no changes v4: - move simple_xattrs_free() to fs/xattr.c v3: - in kmem_xattrs_free(), reinitialize the list - use simple_xattr_* prefix - introduce simple_xattr_add() to prevent direct list usage Original-patch-by: Li Zefan Cc: Li Zefan Cc: Hillf Danton Cc: Lennart Poettering Acked-by: Hugh Dickins Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- include/linux/shmem_fs.h | 3 ++- include/linux/xattr.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bef2cf00b3be..30aa0dc60d75 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -5,6 +5,7 @@ #include #include #include +#include /* inode in-kernel data */ @@ -18,7 +19,7 @@ struct shmem_inode_info { }; struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ - struct list_head xattr_list; /* list of shmem_xattr */ + struct simple_xattrs xattrs; /* list of xattrs */ struct inode vfs_inode; }; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index e5d122031542..2ace7a60316d 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -59,7 +59,9 @@ #ifdef __KERNEL__ +#include #include +#include struct inode; struct dentry; @@ -96,6 +98,52 @@ ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, const char *value, size_t size, gfp_t flags); + +struct simple_xattrs { + struct list_head head; + spinlock_t lock; +}; + +struct simple_xattr { + struct list_head list; + char *name; + size_t size; + char value[0]; +}; + +/* + * initialize the simple_xattrs structure + */ +static inline void simple_xattrs_init(struct simple_xattrs *xattrs) +{ + INIT_LIST_HEAD(&xattrs->head); + spin_lock_init(&xattrs->lock); +} + +/* + * free all the xattrs + */ +static inline void simple_xattrs_free(struct simple_xattrs *xattrs) +{ + struct simple_xattr *xattr, *node; + + list_for_each_entry_safe(xattr, node, &xattrs->head, list) { + kfree(xattr->name); + kfree(xattr); + } +} + +struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); +int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, + void *buffer, size_t size); +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags); +int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name); +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, + size_t size); +void simple_xattr_list_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); + #endif /* __KERNEL__ */ #endif /* _LINUX_XATTR_H */ -- cgit v1.2.3 From 03b1cde6b22f625ae832b939bc7379ec1466aec5 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 23 Aug 2012 16:53:30 -0400 Subject: cgroup: add xattr support This is one of the items in the plumber's wish list. For use cases: >> What would the use case be for this? > > Attaching meta information to services, in an easily discoverable > way. For example, in systemd we create one cgroup for each service, and > could then store data like the main pid of the specific service as an > xattr on the cgroup itself. That way we'd have almost all service state > in the cgroupfs, which would make it possible to terminate systemd and > later restart it without losing any state information. But there's more: > for example, some very peculiar services cannot be terminated on > shutdown (i.e. fakeraid DM stuff) and it would be really nice if the > services in question could just mark that on their cgroup, by setting an > xattr. On the more desktopy side of things there are other > possibilities: for example there are plans defining what an application > is along the lines of a cgroup (i.e. an app being a collection of > processes). With xattrs one could then attach an icon or human readable > program name on the cgroup. > > The key idea is that this would allow attaching runtime meta information > to cgroups and everything they model (services, apps, vms), that doesn't > need any complex userspace infrastructure, has good access control > (i.e. because the file system enforces that anyway, and there's the > "trusted." xattr namespace), notifications (inotify), and can easily be > shared among applications. > > Lennart v7: - no changes v6: - remove user xattr namespace, only allow trusted and security v5: - check for capabilities before setting/removing xattrs v4: - no changes v3: - instead of config option, use mount option to enable xattr support Original-patch-by: Li Zefan Cc: Li Zefan Cc: Tejun Heo Cc: Hugh Dickins Cc: Hillf Danton Cc: Lennart Poettering Signed-off-by: Li Zefan Signed-off-by: Aristeu Rozanski Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 13 +++++-- kernel/cgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 103 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c90eaa803440..145901f5ef99 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -216,6 +217,9 @@ struct cgroup { /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; + + /* directory xattrs */ + struct simple_xattrs xattrs; }; /* @@ -309,6 +313,9 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; + /* file xattrs */ + struct simple_xattrs xattrs; + int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -394,7 +401,7 @@ struct cftype { */ struct cftype_set { struct list_head node; /* chained at subsys->cftsets */ - const struct cftype *cfts; + struct cftype *cfts; }; struct cgroup_scanner { @@ -406,8 +413,8 @@ struct cgroup_scanner { void *data; }; -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_is_removed(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 875a7130647c..508b4a97ab19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -276,7 +276,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) /* bits in struct cgroupfs_root flags field */ enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_XATTR, /* supports extended attributes */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -913,15 +914,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); + simple_xattrs_free(&cgrp->xattrs); + kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; + struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); kfree(cfe); + simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -1140,6 +1145,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); + if (test_bit(ROOT_XATTR, &root->flags)) + seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); if (clone_children(&root->top_cgroup)) @@ -1208,6 +1215,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->clone_children = true; continue; } + if (!strcmp(token, "xattr")) { + set_bit(ROOT_XATTR, &opts->flags); + continue; + } if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1425,6 +1436,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); + simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1769,6 +1781,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + simple_xattrs_free(&cgrp->xattrs); + kill_litter_super(sb); cgroup_drop_root(root); } @@ -2575,6 +2589,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } +static struct simple_xattrs *__d_xattrs(struct dentry *dentry) +{ + if (S_ISDIR(dentry->d_inode->i_mode)) + return &__d_cgrp(dentry)->xattrs; + else + return &__d_cft(dentry)->xattrs; +} + +static inline int xattr_enabled(struct dentry *dentry) +{ + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + return test_bit(ROOT_XATTR, &root->flags); +} + +static bool is_valid_xattr(const char *name) +{ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + return true; + return false; +} + +static int cgroup_setxattr(struct dentry *dentry, const char *name, + const void *val, size_t size, int flags) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); +} + +static int cgroup_removexattr(struct dentry *dentry, const char *name) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_remove(__d_xattrs(dentry), name); +} + +static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_get(__d_xattrs(dentry), name, buf, size); +} + +static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + return simple_xattr_list(__d_xattrs(dentry), buf, size); +} + static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, @@ -2583,11 +2655,22 @@ static const struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; +static const struct inode_operations cgroup_file_inode_operations = { + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, +}; + static const struct inode_operations cgroup_dir_inode_operations = { .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, }; static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2635,6 +2718,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; + inode->i_op = &cgroup_file_inode_operations; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ @@ -2695,7 +2779,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype *cft) + struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,6 +2789,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + simple_xattrs_init(&cft->xattrs); + /* does @cft->flags tell us to skip creation on @cgrp? */ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) return 0; @@ -2745,9 +2831,9 @@ out: } static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype cfts[], bool is_add) + struct cftype cfts[], bool is_add) { - const struct cftype *cft; + struct cftype *cft; int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2781,7 +2867,7 @@ static void cgroup_cfts_prepare(void) } static void cgroup_cfts_commit(struct cgroup_subsys *ss, - const struct cftype *cfts, bool is_add) + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) { LIST_HEAD(pending); @@ -2832,7 +2918,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -2862,7 +2948,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered with @ss. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; -- cgit v1.2.3 From be45c900fdc2c66baad5a7703fb8136991d88aeb Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 13 Sep 2012 09:50:55 +0200 Subject: cgroup: Remove CGROUP_BUILTIN_SUBSYS_COUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CGROUP_BUILTIN_SUBSYS_COUNT is used as start index or stop index when looping over the subsys array looking either at the builtin or the module subsystems. Since all the builtin subsystems have an id which is lower then CGROUP_BUILTIN_SUBSYS_COUNT we know that any module will have an id larger than CGROUP_BUILTIN_SUBSYS_COUNT. In short the ids are sorted. We are about to change id assignment to happen only at compile time later in this series. That means we can't rely on the above trick since all ids will always be defined at compile time. Furthermore, ordering the builtin subsystems and the module subsystems is not really necessary. So we need a different way to know which subsystem is a builtin or a module one. We can use the subsys[]->module pointer for this. Any place where we need to know if a subsys is module we just check for the pointer. If it is NULL then the subsystem is a builtin one. With this we are able to drop the CGROUP_BUILTIN_SUBSYS_COUNT enum. Though we need to introduce a temporary placeholder so that we don't get a compilation error when only CONFIG_CGROUP is selected and no single controller. An empty enum definition is not valid. Later in this series we are able to remove the placeholder again. And with this change we get a fix for this: kernel/cgroup.c: In function ‘cgroup_load_subsys’: kernel/cgroup.c:4326:38: warning: array subscript is below array bounds [-Warray-bounds] when CONFIG_CGROUP=y and no built in controller was enabled. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 68 +++++++++++++++++++++++++++++--------------------- 2 files changed, 41 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 145901f5ef99..1916cdb071dc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -48,7 +48,7 @@ extern const struct file_operations proc_cgroup_operations; #define SUBSYS(_x) _x ## _subsys_id, enum cgroup_subsys_id { #include - CGROUP_BUILTIN_SUBSYS_COUNT + __CGROUP_TEMPORARY_PLACEHOLDER }; #undef SUBSYS /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ced292d720b9..1b18090269ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -88,7 +88,7 @@ static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by * cgroup_mutex. */ @@ -1321,7 +1321,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * take duplicate reference counts on a subsystem that's already used, * but rebind_subsystems handles this case. */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; if (!(bit & opts->subsys_mask)) @@ -1337,7 +1337,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * raced with a module_delete call, and to the user this is * essentially a "subsystem doesn't exist" case. */ - for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + for (i--; i >= 0; i--) { /* drop refcounts only on the ones we took */ unsigned long bit = 1UL << i; @@ -1354,7 +1354,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static void drop_parsed_module_refcounts(unsigned long subsys_mask) { int i; - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; if (!(bit & subsys_mask)) @@ -4442,8 +4442,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * since cgroup_init_subsys will have already taken care of it. */ if (ss->module == NULL) { - /* a few sanity checks */ - BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + /* a sanity check */ BUG_ON(subsys[ss->subsys_id] != ss); return 0; } @@ -4457,7 +4456,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) */ mutex_lock(&cgroup_mutex); /* find the first empty slot in the array */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (subsys[i] == NULL) break; } @@ -4560,7 +4559,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); /* deassign the subsys_id */ - BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ @@ -4623,10 +4621,13 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; + BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->create); @@ -4659,9 +4660,12 @@ int __init cgroup_init(void) if (err) return err; - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) @@ -4856,13 +4860,16 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - /* - * forkexit callbacks are only supported for builtin - * subsystems, and the builtin section of the subsys array is - * immutable, so we don't need to lock the subsys array here. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* + * forkexit callbacks are only supported for + * builtin subsystems. + */ + if (!ss || ss->module) + continue; + if (ss->fork) ss->fork(child); } @@ -4967,12 +4974,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* modular subsystems can't use callbacks */ + if (!ss || ss->module) + continue; + if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5158,13 +5166,17 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about module - * subsystems, so we don't worry about them. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* + * cgroup_disable, being at boot time, can't + * know about module subsystems, so we don't + * worry about them. + */ + if (!ss || ss->module) + continue; + if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" -- cgit v1.2.3 From 5fc0b02544b3b9bd3db5a8156b5f3e7350f8e797 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:05 +0200 Subject: cgroup: Wrap subsystem selection macro Before we are able to define all subsystem ids at compile time we need a more fine grained control what gets defined when we include cgroup_subsys.h. For example we define the enums for the subsystems or to declare for struct cgroup_subsys (builtin subsystem) by including cgroup_subsys.h and defining SUBSYS accordingly. Currently, the decision if a subsys is used is defined inside the header by testing if CONFIG_*=y is true. By moving this test outside of cgroup_subsys.h we are able to control it on the include level. This is done by introducing IS_SUBSYS_ENABLED which then is defined according the task, e.g. is CONFIG_*=y or CONFIG_*=m. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- include/linux/cgroup.h | 4 ++++ include/linux/cgroup_subsys.h | 24 ++++++++++++------------ kernel/cgroup.c | 1 + 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1916cdb071dc..a5ab5651441b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -46,10 +46,12 @@ extern const struct file_operations proc_cgroup_operations; /* Define the enumeration of all builtin cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) enum cgroup_subsys_id { #include __CGROUP_TEMPORARY_PLACEHOLDER }; +#undef IS_SUBSYS_ENABLED #undef SUBSYS /* * This define indicates the maximum number of subsystems that can be loaded @@ -528,7 +530,9 @@ struct cgroup_subsys { }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include +#undef IS_SUBSYS_ENABLED #undef SUBSYS static inline struct cgroup_subsys_state *cgroup_subsys_state( diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index dfae957398c3..f204a7a9cf38 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -7,73 +7,73 @@ /* */ -#ifdef CONFIG_CPUSETS +#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS) SUBSYS(cpuset) #endif /* */ -#ifdef CONFIG_CGROUP_DEBUG +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG) SUBSYS(debug) #endif /* */ -#ifdef CONFIG_CGROUP_SCHED +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED) SUBSYS(cpu_cgroup) #endif /* */ -#ifdef CONFIG_CGROUP_CPUACCT +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT) SUBSYS(cpuacct) #endif /* */ -#ifdef CONFIG_MEMCG +#if IS_SUBSYS_ENABLED(CONFIG_MEMCG) SUBSYS(mem_cgroup) #endif /* */ -#ifdef CONFIG_CGROUP_DEVICE +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE) SUBSYS(devices) #endif /* */ -#ifdef CONFIG_CGROUP_FREEZER +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER) SUBSYS(freezer) #endif /* */ -#ifdef CONFIG_NET_CLS_CGROUP +#if IS_SUBSYS_ENABLED(CONFIG_NET_CLS_CGROUP) SUBSYS(net_cls) #endif /* */ -#ifdef CONFIG_BLK_CGROUP +#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(blkio) #endif /* */ -#ifdef CONFIG_CGROUP_PERF +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf) #endif /* */ -#ifdef CONFIG_NETPRIO_CGROUP +#if IS_SUBSYS_ENABLED(CONFIG_NETPRIO_CGROUP) SUBSYS(net_prio) #endif /* */ -#ifdef CONFIG_CGROUP_HUGETLB +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1b18090269ad..95e9f3fdb729 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -93,6 +93,7 @@ static DEFINE_MUTEX(cgroup_root_mutex); * cgroup_mutex. */ #define SUBSYS(_x) &_x ## _subsys, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include }; -- cgit v1.2.3 From 8a8e04df4747661daaee77e98e102d99c9e09b98 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:07 +0200 Subject: cgroup: Assign subsystem IDs during compile time WARNING: With this change it is impossible to load external built controllers anymore. In case where CONFIG_NETPRIO_CGROUP=m and CONFIG_NET_CLS_CGROUP=m is set, corresponding subsys_id should also be a constant. Up to now, net_prio_subsys_id and net_cls_subsys_id would be of the type int and the value would be assigned during runtime. By switching the macro definition IS_SUBSYS_ENABLED from IS_BUILTIN to IS_ENABLED, all *_subsys_id will have constant value. That means we need to remove all the code which assumes a value can be assigned to net_prio_subsys_id and net_cls_subsys_id. A close look is necessary on the RCU part which was introduces by following patch: commit f845172531fb7410c7fb7780b1a6e51ee6df7d52 Author: Herbert Xu Mon May 24 09:12:34 2010 Committer: David S. Miller Mon May 24 09:12:34 2010 cls_cgroup: Store classid in struct sock Tis code was added to init_cgroup_cls() /* We can't use rcu_assign_pointer because this is an int. */ smp_wmb(); net_cls_subsys_id = net_cls_subsys.subsys_id; respectively to exit_cgroup_cls() net_cls_subsys_id = -1; synchronize_rcu(); and in module version of task_cls_classid() rcu_read_lock(); id = rcu_dereference(net_cls_subsys_id); if (id >= 0) classid = container_of(task_subsys_state(p, id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); Without an explicit explaination why the RCU part is needed. (The rcu_deference was fixed by exchanging it to rcu_derefence_index_check() in a later commit, but that is a minor detail.) So here is my pondering why it was introduced and why it safe to remove it now. Note that this code was copied over to net_prio the reasoning holds for that subsystem too. The idea behind the RCU use for net_cls_subsys_id is to make sure we get a valid pointer back from task_subsys_state(). task_subsys_state() is just blindly accessing the subsys array and returning the pointer. Obviously, passing in -1 as id into task_subsys_state() returns an invalid value (out of lower bound). So this code makes sure that only after module is loaded and the subsystem registered, the id is assigned. Before unregistering the module all old readers must have left the critical section. This is done by assigning -1 to the id and issuing a synchronized_rcu(). Any new readers wont call task_subsys_state() anymore and therefore it is safe to unregister the subsystem. The new code relies on the same trick, but it looks at the subsys pointer return by task_subsys_state() (remember the id is constant and therefore we allways have a valid index into the subsys array). No precautions need to be taken during module loading module. Eventually, all CPUs will get a valid pointer back from task_subsys_state() because rebind_subsystem() which is called after the module init() function will assigned subsys[net_cls_subsys_id] the newly loaded module subsystem pointer. When the subsystem is about to be removed, rebind_subsystem() will called before the module exit() function. In this case, rebind_subsys() will assign subsys[net_cls_subsys_id] a NULL pointer and then it calls synchronize_rcu(). All old readers have left by then the critical section. Any new reader wont access the subsystem anymore. At this point we are safe to unregister the subsystem. No synchronize_rcu() call is needed. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: "David S. Miller" Cc: "Paul E. McKenney" Cc: Andrew Morton Cc: Eric Dumazet Cc: Gao feng Cc: Glauber Costa Cc: Herbert Xu Cc: Jamal Hadi Salim Cc: John Fastabend Cc: Kamezawa Hiroyuki Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- include/linux/cgroup.h | 2 +- include/net/cls_cgroup.h | 12 ++++-------- include/net/netprio_cgroup.h | 18 +++++------------- kernel/cgroup.c | 22 +++------------------- net/core/netprio_cgroup.c | 11 ----------- net/core/sock.c | 11 ----------- net/sched/cls_cgroup.c | 13 ------------- 7 files changed, 13 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a5ab5651441b..018f819405c8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -46,7 +46,7 @@ extern const struct file_operations proc_cgroup_operations; /* Define the enumeration of all builtin cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) +#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) enum cgroup_subsys_id { #include __CGROUP_TEMPORARY_PLACEHOLDER diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 9bd5db9e10ba..b6a6eeb3905f 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -42,22 +42,18 @@ static inline u32 task_cls_classid(struct task_struct *p) return classid; } #elif IS_MODULE(CONFIG_NET_CLS_CGROUP) - -extern int net_cls_subsys_id; - static inline u32 task_cls_classid(struct task_struct *p) { - int id; + struct cgroup_subsys_state *css; u32 classid = 0; if (in_interrupt()) return 0; rcu_read_lock(); - id = rcu_dereference_index_check(net_cls_subsys_id, - rcu_read_lock_held()); - if (id >= 0) - classid = container_of(task_subsys_state(p, id), + css = task_subsys_state(p, net_cls_subsys_id); + if (css) + classid = container_of(css, struct cgroup_cls_state, css)->classid; rcu_read_unlock(); diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index b202de882489..2760f4f4ae9b 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -30,10 +30,6 @@ struct cgroup_netprio_state { u32 prioidx; }; -#ifndef CONFIG_NETPRIO_CGROUP -extern int net_prio_subsys_id; -#endif - extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task); #if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) @@ -55,18 +51,14 @@ static inline u32 task_netprioidx(struct task_struct *p) static inline u32 task_netprioidx(struct task_struct *p) { - struct cgroup_netprio_state *state; - int subsys_id; + struct cgroup_subsys_state *css; u32 idx = 0; rcu_read_lock(); - subsys_id = rcu_dereference_index_check(net_prio_subsys_id, - rcu_read_lock_held()); - if (subsys_id >= 0) { - state = container_of(task_subsys_state(p, subsys_id), - struct cgroup_netprio_state, css); - idx = state->prioidx; - } + css = task_subsys_state(p, net_prio_subsys_id); + if (css) + idx = container_of(css, + struct cgroup_netprio_state, css)->prioidx; rcu_read_unlock(); return idx; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2bfc78f531b6..485cc1487ea2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4451,24 +4451,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* init base cftset */ cgroup_init_cftsets(ss); - /* - * need to register a subsys id before anything else - for example, - * init_cgroup_css needs it. - */ mutex_lock(&cgroup_mutex); - /* find the first empty slot in the array */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (subsys[i] == NULL) - break; - } - if (i == CGROUP_SUBSYS_COUNT) { - /* maximum number of subsystems already registered! */ - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - /* assign ourselves the subsys_id */ - ss->subsys_id = i; - subsys[i] = ss; + subsys[ss->subsys_id] = ss; /* * no ss->create seems to need anything important in the ss struct, so @@ -4477,7 +4461,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } @@ -4493,7 +4477,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (ret) { dummytop->subsys[ss->subsys_id] = NULL; ss->destroy(dummytop); - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return ret; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index c75e3f9d060f..6bc460c38e4f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -326,9 +326,7 @@ struct cgroup_subsys net_prio_subsys = { .create = cgrp_create, .destroy = cgrp_destroy, .attach = net_prio_attach, -#ifdef CONFIG_NETPRIO_CGROUP .subsys_id = net_prio_subsys_id, -#endif .base_cftypes = ss_files, .module = THIS_MODULE }; @@ -366,10 +364,6 @@ static int __init init_cgroup_netprio(void) ret = cgroup_load_subsys(&net_prio_subsys); if (ret) goto out; -#ifndef CONFIG_NETPRIO_CGROUP - smp_wmb(); - net_prio_subsys_id = net_prio_subsys.subsys_id; -#endif register_netdevice_notifier(&netprio_device_notifier); @@ -386,11 +380,6 @@ static void __exit exit_cgroup_netprio(void) cgroup_unload_subsys(&net_prio_subsys); -#ifndef CONFIG_NETPRIO_CGROUP - net_prio_subsys_id = -1; - synchronize_rcu(); -#endif - rtnl_lock(); for_each_netdev(&init_net, dev) { old = rtnl_dereference(dev->priomap); diff --git a/net/core/sock.c b/net/core/sock.c index ca3eaee66056..47b4ac048e88 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -326,17 +326,6 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(__sk_backlog_rcv); -#if defined(CONFIG_CGROUPS) -#if !defined(CONFIG_NET_CLS_CGROUP) -int net_cls_subsys_id = -1; -EXPORT_SYMBOL_GPL(net_cls_subsys_id); -#endif -#if !defined(CONFIG_NETPRIO_CGROUP) -int net_prio_subsys_id = -1; -EXPORT_SYMBOL_GPL(net_prio_subsys_id); -#endif -#endif - static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { struct timeval tv; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7743ea8d1d38..67cf90d962f4 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -77,9 +77,7 @@ struct cgroup_subsys net_cls_subsys = { .name = "net_cls", .create = cgrp_create, .destroy = cgrp_destroy, -#ifdef CONFIG_NET_CLS_CGROUP .subsys_id = net_cls_subsys_id, -#endif .base_cftypes = ss_files, .module = THIS_MODULE, }; @@ -283,12 +281,6 @@ static int __init init_cgroup_cls(void) if (ret) goto out; -#ifndef CONFIG_NET_CLS_CGROUP - /* We can't use rcu_assign_pointer because this is an int. */ - smp_wmb(); - net_cls_subsys_id = net_cls_subsys.subsys_id; -#endif - ret = register_tcf_proto_ops(&cls_cgroup_ops); if (ret) cgroup_unload_subsys(&net_cls_subsys); @@ -301,11 +293,6 @@ static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); -#ifndef CONFIG_NET_CLS_CGROUP - net_cls_subsys_id = -1; - synchronize_rcu(); -#endif - cgroup_unload_subsys(&net_cls_subsys); } -- cgit v1.2.3 From a6f00298b2ceaf50b4ab00e6ee3eb0206ac72fac Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 12 Sep 2012 16:12:08 +0200 Subject: cgroup: Define CGROUP_SUBSYS_COUNT according the configuration Since we know exactly how many subsystems exists at compile time we are able to define CGROUP_SUBSYS_COUNT correctly. CGROUP_SUBSYS_COUNT will be at max 12 (all controllers enabled). Depending on the architecture we safe either 32 - 12 pointers (80 bytes) or 64 - 12 pointers (416 bytes) per cgroup. With this change we can also remove the temporary placeholder to avoid compilation errors. Signed-off-by: Daniel Wagner Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman Cc: Gao feng Cc: Jamal Hadi Salim Cc: John Fastabend Cc: netdev@vger.kernel.org Cc: cgroups@vger.kernel.org --- include/linux/cgroup.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 018f819405c8..df354ae079c1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -49,16 +49,10 @@ extern const struct file_operations proc_cgroup_operations; #define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) enum cgroup_subsys_id { #include - __CGROUP_TEMPORARY_PLACEHOLDER + CGROUP_SUBSYS_COUNT, }; #undef IS_SUBSYS_ENABLED #undef SUBSYS -/* - * This define indicates the maximum number of subsystems that can be loaded - * at once. We limit to this many since cgroupfs_root has subsys_bits to keep - * track of all of them. - */ -#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { -- cgit v1.2.3