Task Control Groups: shared cgroup subsystem group arrays
1 /*
2 * kernel/cgroup.c
4 * Generic process-grouping system.
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
27 #include <linux/cgroup.h>
28 #include <linux/errno.h>
29 #include <linux/fs.h>
30 #include <linux/kernel.h>
31 #include <linux/list.h>
32 #include <linux/mm.h>
33 #include <linux/mutex.h>
34 #include <linux/mount.h>
35 #include <linux/pagemap.h>
36 #include <linux/proc_fs.h>
37 #include <linux/rcupdate.h>
38 #include <linux/sched.h>
39 #include <linux/backing-dev.h>
40 #include <linux/seq_file.h>
41 #include <linux/slab.h>
42 #include <linux/magic.h>
43 #include <linux/spinlock.h>
44 #include <linux/string.h>
45 #include <linux/sort.h>
46 #include <asm/atomic.h>
48 /* Generate an array of cgroup subsystem pointers */
49 #define SUBSYS(_x) &_x ## _subsys,
51 static struct cgroup_subsys *subsys[] = {
52 #include <linux/cgroup_subsys.h>
56 * A cgroupfs_root represents the root of a cgroup hierarchy,
57 * and may be associated with a superblock to form an active
58 * hierarchy
60 struct cgroupfs_root {
61 struct super_block *sb;
64 * The bitmask of subsystems intended to be attached to this
65 * hierarchy
67 unsigned long subsys_bits;
69 /* The bitmask of subsystems currently attached to this hierarchy */
70 unsigned long actual_subsys_bits;
72 /* A list running through the attached subsystems */
73 struct list_head subsys_list;
75 /* The root cgroup for this hierarchy */
76 struct cgroup top_cgroup;
78 /* Tracks how many cgroups are currently defined in hierarchy.*/
79 int number_of_cgroups;
81 /* A list running through the mounted hierarchies */
82 struct list_head root_list;
84 /* Hierarchy-specific flags */
85 unsigned long flags;
90 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
91 * subsystems that are otherwise unattached - it never has more than a
92 * single cgroup, and all tasks are part of that cgroup.
94 static struct cgroupfs_root rootnode;
96 /* The list of hierarchy roots */
98 static LIST_HEAD(roots);
99 static int root_count;
101 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
102 #define dummytop (&rootnode.top_cgroup)
104 /* This flag indicates whether tasks in the fork and exit paths should
105 * take callback_mutex and check for fork/exit handlers to call. This
106 * avoids us having to do extra work in the fork/exit path if none of the
107 * subsystems need to be called.
109 static int need_forkexit_callback;
111 /* bits in struct cgroup flags field */
112 enum {
113 CONT_REMOVED,
116 /* convenient tests for these bits */
117 inline int cgroup_is_removed(const struct cgroup *cont)
119 return test_bit(CONT_REMOVED, &cont->flags);
122 /* bits in struct cgroupfs_root flags field */
123 enum {
124 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
128 * for_each_subsys() allows you to iterate on each subsystem attached to
129 * an active hierarchy
131 #define for_each_subsys(_root, _ss) \
132 list_for_each_entry(_ss, &_root->subsys_list, sibling)
134 /* for_each_root() allows you to iterate across the active hierarchies */
135 #define for_each_root(_root) \
136 list_for_each_entry(_root, &roots, root_list)
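/*
 * Illustrative only: typical use of the iteration helpers above, assuming
 * cgroup_mutex is held and 'root' points at an active hierarchy; the printk
 * stands in for real per-subsystem work.
 *
 *	struct cgroup_subsys *ss;
 *	for_each_subsys(root, ss)
 *		printk(KERN_DEBUG "attached: %s\n", ss->name);
 */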
138 /* Link structure for associating css_set objects with cgroups */
139 struct cg_cgroup_link {
141 * List running through cg_cgroup_links associated with a
142 * cgroup, anchored on cgroup->css_sets
144 struct list_head cont_link_list;
146 * List running through cg_cgroup_links pointing at a
147 * single css_set object, anchored on css_set->cg_links
149 struct list_head cg_link_list;
150 struct css_set *cg;
153 /* The default css_set - used by init and its children prior to any
154 * hierarchies being mounted. It contains a pointer to the root state
155 * for each subsystem. Also used to anchor the list of css_sets. Not
156 * reference-counted, to improve performance when child cgroups
157 * haven't been created.
160 static struct css_set init_css_set;
161 static struct cg_cgroup_link init_css_set_link;
163 /* css_set_lock protects the list of css_set objects, and the
164 * chain of tasks off each css_set. Nests outside task->alloc_lock
165 * due to cgroup_iter_start() */
166 static DEFINE_RWLOCK(css_set_lock);
167 static int css_set_count;
169 /* We don't maintain the lists running through each css_set to its
170 * task until after the first call to cgroup_iter_start(). This
171 * reduces the fork()/exit() overhead for people who have cgroups
172 * compiled into their kernel but not actually in use */
173 static int use_task_css_set_links;
175 /* When we create or destroy a css_set, the operation simply
176 * takes/releases a reference count on all the cgroups referenced
177 * by subsystems in this css_set. This can end up multiple-counting
178 * some cgroups, but that's OK - the ref-count is just a
179 * busy/not-busy indicator; ensuring that we only count each cgroup
180 * once would require taking a global lock to ensure that no
181 * subsystems moved between hierarchies while we were doing so.
183 * Possible TODO: decide at boot time based on the number of
184 * registered subsystems and the number of CPUs or NUMA nodes whether
185 * it's better for performance to ref-count every subsystem, or to
186 * take a global lock and only add one ref count to each hierarchy.
190 * unlink a css_set from the list and free it
192 static void release_css_set(struct kref *k)
194 struct css_set *cg = container_of(k, struct css_set, ref);
195 int i;
197 write_lock(&css_set_lock);
198 list_del(&cg->list);
199 css_set_count--;
200 while (!list_empty(&cg->cg_links)) {
201 struct cg_cgroup_link *link;
202 link = list_entry(cg->cg_links.next,
203 struct cg_cgroup_link, cg_link_list);
204 list_del(&link->cg_link_list);
205 list_del(&link->cont_link_list);
206 kfree(link);
208 write_unlock(&css_set_lock);
209 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
210 atomic_dec(&cg->subsys[i]->cgroup->count);
211 kfree(cg);
215 * refcounted get/put for css_set objects
217 static inline void get_css_set(struct css_set *cg)
219 kref_get(&cg->ref);
222 static inline void put_css_set(struct css_set *cg)
224 kref_put(&cg->ref, release_css_set);
228 * find_existing_css_set() is a helper for
229 * find_css_set(), and checks to see whether an existing
230 * css_set is suitable. This currently walks a linked-list for
231 * simplicity; a later patch will use a hash table for better
232 * performance
234 * oldcg: the cgroup group that we're using before the cgroup
235 * transition
237 * cont: the cgroup that we're moving into
239 * template: location in which to build the desired set of subsystem
240 * state objects for the new cgroup group
243 static struct css_set *find_existing_css_set(
244 struct css_set *oldcg,
245 struct cgroup *cont,
246 struct cgroup_subsys_state *template[])
248 int i;
249 struct cgroupfs_root *root = cont->root;
250 struct list_head *l = &init_css_set.list;
252 /* Built the set of subsystem state objects that we want to
253 * see in the new css_set */
254 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
255 if (root->subsys_bits & (1ull << i)) {
256 /* Subsystem is in this hierarchy. So we want
257 * the subsystem state from the new
258 * cgroup */
259 template[i] = cont->subsys[i];
260 } else {
261 /* Subsystem is not in this hierarchy, so we
262 * don't want to change the subsystem state */
263 template[i] = oldcg->subsys[i];
267 /* Look through existing cgroup groups to find one to reuse */
268 do {
269 struct css_set *cg =
270 list_entry(l, struct css_set, list);
272 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
273 /* All subsystems matched */
274 return cg;
276 /* Try the next cgroup group */
277 l = l->next;
278 } while (l != &init_css_set.list);
280 /* No existing cgroup group matched */
281 return NULL;
285 * allocate_cg_links() allocates "count" cg_cgroup_link structures
286 * and chains them on tmp through their cont_link_list fields. Returns 0 on
287 * success or a negative error
290 static int allocate_cg_links(int count, struct list_head *tmp)
292 struct cg_cgroup_link *link;
293 int i;
294 INIT_LIST_HEAD(tmp);
295 for (i = 0; i < count; i++) {
296 link = kmalloc(sizeof(*link), GFP_KERNEL);
297 if (!link) {
298 while (!list_empty(tmp)) {
299 link = list_entry(tmp->next,
300 struct cg_cgroup_link,
301 cont_link_list);
302 list_del(&link->cont_link_list);
303 kfree(link);
305 return -ENOMEM;
307 list_add(&link->cont_link_list, tmp);
309 return 0;
312 static void free_cg_links(struct list_head *tmp)
314 while (!list_empty(tmp)) {
315 struct cg_cgroup_link *link;
316 link = list_entry(tmp->next,
317 struct cg_cgroup_link,
318 cont_link_list);
319 list_del(&link->cont_link_list);
320 kfree(link);
325 * find_css_set() takes an existing cgroup group and a
326 * cgroup object, and returns a css_set object that's
327 * equivalent to the old group, but with the given cgroup
328 * substituted into the appropriate hierarchy. Must be called with
329 * cgroup_mutex held
332 static struct css_set *find_css_set(
333 struct css_set *oldcg, struct cgroup *cont)
335 struct css_set *res;
336 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
337 int i;
339 struct list_head tmp_cg_links;
340 struct cg_cgroup_link *link;
342 /* First see if we already have a cgroup group that matches
343 * the desired set */
344 write_lock(&css_set_lock);
345 res = find_existing_css_set(oldcg, cont, template);
346 if (res)
347 get_css_set(res);
348 write_unlock(&css_set_lock);
350 if (res)
351 return res;
353 res = kmalloc(sizeof(*res), GFP_KERNEL);
354 if (!res)
355 return NULL;
357 /* Allocate all the cg_cgroup_link objects that we'll need */
358 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
359 kfree(res);
360 return NULL;
363 kref_init(&res->ref);
364 INIT_LIST_HEAD(&res->cg_links);
365 INIT_LIST_HEAD(&res->tasks);
367 /* Copy the set of subsystem state objects generated in
368 * find_existing_css_set() */
369 memcpy(res->subsys, template, sizeof(res->subsys));
371 write_lock(&css_set_lock);
372 /* Add reference counts and links from the new css_set. */
373 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
374 struct cgroup *cont = res->subsys[i]->cgroup;
375 struct cgroup_subsys *ss = subsys[i];
376 atomic_inc(&cont->count);
378 * We want to add a link once per cgroup, so we
379 * only do it for the first subsystem in each
380 * hierarchy
382 if (ss->root->subsys_list.next == &ss->sibling) {
383 BUG_ON(list_empty(&tmp_cg_links));
384 link = list_entry(tmp_cg_links.next,
385 struct cg_cgroup_link,
386 cont_link_list);
387 list_del(&link->cont_link_list);
388 list_add(&link->cont_link_list, &cont->css_sets);
389 link->cg = res;
390 list_add(&link->cg_link_list, &res->cg_links);
393 if (list_empty(&rootnode.subsys_list)) {
394 link = list_entry(tmp_cg_links.next,
395 struct cg_cgroup_link,
396 cont_link_list);
397 list_del(&link->cont_link_list);
398 list_add(&link->cont_link_list, &dummytop->css_sets);
399 link->cg = res;
400 list_add(&link->cg_link_list, &res->cg_links);
403 BUG_ON(!list_empty(&tmp_cg_links));
405 /* Link this cgroup group into the list */
406 list_add(&res->list, &init_css_set.list);
407 css_set_count++;
408 INIT_LIST_HEAD(&res->tasks);
409 write_unlock(&css_set_lock);
411 return res;
415 * There is one global cgroup mutex. We also require taking
416 * task_lock() when dereferencing a task's cgroup subsys pointers.
417 * See "The task_lock() exception", at the end of this comment.
419 * A task must hold cgroup_mutex to modify cgroups.
421 * Any task can increment and decrement the count field without lock.
422 * So in general, code holding cgroup_mutex can't rely on the count
423 * field not changing. However, if the count goes to zero, then only
424 * attach_task() can increment it again. Because a count of zero
425 * means that no tasks are currently attached, therefore there is no
426 * way a task attached to that cgroup can fork (the other way to
427 * increment the count). So code holding cgroup_mutex can safely
428 * assume that if the count is zero, it will stay zero. Similarly, if
429 * a task holds cgroup_mutex on a cgroup with zero count, it
430 * knows that the cgroup won't be removed, as cgroup_rmdir()
431 * needs that mutex.
433 * The cgroup_common_file_write handler for operations that modify
434 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
435 * single threading all such cgroup modifications across the system.
437 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
438 * (usually) take cgroup_mutex. These are the two most performance
439 * critical pieces of code here. The exception occurs on cgroup_exit(),
440 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
441 * is taken, and if the cgroup count is zero, a usermode call is made
442 * to /sbin/cgroup_release_agent with the name of the cgroup (path
443 * relative to the root of cgroup file system) as the argument.
445 * A cgroup can only be deleted if both its 'count' of using tasks
446 * is zero, and its list of 'children' cgroups is empty. Since all
447 * tasks in the system use _some_ cgroup, and since there is always at
448 * least one task in the system (init, pid == 1), therefore, top_cgroup
449 * always has either children cgroups and/or using tasks. So we don't
450 * need a special hack to ensure that top_cgroup cannot be deleted.
452 * The task_lock() exception
454 * The need for this exception arises from the action of
455 * attach_task(), which overwrites one task's cgroup pointer with
456 * another. It does so using cgroup_mutex; however, there are
457 * several performance critical places that need to reference
458 * task->cgroup without the expense of grabbing a system global
459 * mutex. Therefore except as noted below, when dereferencing or, as
460 * in attach_task(), modifying a task's cgroup pointer we use
461 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
462 * the task_struct routinely used for such matters.
464 * P.S. One more locking exception. RCU is used to guard the
465 * update of a task's cgroup pointer by attach_task()
468 static DEFINE_MUTEX(cgroup_mutex);
471 * cgroup_lock - lock out any changes to cgroup structures
475 void cgroup_lock(void)
477 mutex_lock(&cgroup_mutex);
481 * cgroup_unlock - release lock on cgroup changes
483 * Undo the lock taken in a previous cgroup_lock() call.
486 void cgroup_unlock(void)
488 mutex_unlock(&cgroup_mutex);
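/*
 * Sketch of the calling pattern intended for the two helpers above, from
 * code that needs the cgroup hierarchy to stay stable (illustrative; the
 * work inside the critical section is a placeholder, not part of this file):
 *
 *	cgroup_lock();
 *	... examine or modify cgroup / subsystem state ...
 *	cgroup_unlock();
 */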
492 * A couple of forward declarations required, due to cyclic reference loop:
493 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
494 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
495 * -> cgroup_mkdir.
498 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
499 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
500 static int cgroup_populate_dir(struct cgroup *cont);
501 static struct inode_operations cgroup_dir_inode_operations;
502 static struct file_operations proc_cgroupstats_operations;
504 static struct backing_dev_info cgroup_backing_dev_info = {
505 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
508 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
510 struct inode *inode = new_inode(sb);
512 if (inode) {
513 inode->i_mode = mode;
514 inode->i_uid = current->fsuid;
515 inode->i_gid = current->fsgid;
516 inode->i_blocks = 0;
517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
518 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
520 return inode;
523 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
525 /* is dentry a directory ? if so, kfree() associated cgroup */
526 if (S_ISDIR(inode->i_mode)) {
527 struct cgroup *cont = dentry->d_fsdata;
528 BUG_ON(!(cgroup_is_removed(cont)));
529 kfree(cont);
531 iput(inode);
534 static void remove_dir(struct dentry *d)
536 struct dentry *parent = dget(d->d_parent);
538 d_delete(d);
539 simple_rmdir(parent->d_inode, d);
540 dput(parent);
543 static void cgroup_clear_directory(struct dentry *dentry)
545 struct list_head *node;
547 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
548 spin_lock(&dcache_lock);
549 node = dentry->d_subdirs.next;
550 while (node != &dentry->d_subdirs) {
551 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
552 list_del_init(node);
553 if (d->d_inode) {
554 /* This should never be called on a cgroup
555 * directory with child cgroups */
556 BUG_ON(d->d_inode->i_mode & S_IFDIR);
557 d = dget_locked(d);
558 spin_unlock(&dcache_lock);
559 d_delete(d);
560 simple_unlink(dentry->d_inode, d);
561 dput(d);
562 spin_lock(&dcache_lock);
564 node = dentry->d_subdirs.next;
566 spin_unlock(&dcache_lock);
570 * NOTE : the dentry must have been dget()'ed
572 static void cgroup_d_remove_dir(struct dentry *dentry)
574 cgroup_clear_directory(dentry);
576 spin_lock(&dcache_lock);
577 list_del_init(&dentry->d_u.d_child);
578 spin_unlock(&dcache_lock);
579 remove_dir(dentry);
582 static int rebind_subsystems(struct cgroupfs_root *root,
583 unsigned long final_bits)
585 unsigned long added_bits, removed_bits;
586 struct cgroup *cont = &root->top_cgroup;
587 int i;
589 removed_bits = root->actual_subsys_bits & ~final_bits;
590 added_bits = final_bits & ~root->actual_subsys_bits;
591 /* Check that any added subsystems are currently free */
592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
593 unsigned long long bit = 1ull << i;
594 struct cgroup_subsys *ss = subsys[i];
595 if (!(bit & added_bits))
596 continue;
597 if (ss->root != &rootnode) {
598 /* Subsystem isn't free */
599 return -EBUSY;
603 /* Currently we don't handle adding/removing subsystems when
604 * any child cgroups exist. This is theoretically supportable
605 * but involves complex error handling, so it's being left until
606 * later */
607 if (!list_empty(&cont->children))
608 return -EBUSY;
610 /* Process each subsystem */
611 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
612 struct cgroup_subsys *ss = subsys[i];
613 unsigned long bit = 1UL << i;
614 if (bit & added_bits) {
615 /* We're binding this subsystem to this hierarchy */
616 BUG_ON(cont->subsys[i]);
617 BUG_ON(!dummytop->subsys[i]);
618 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
619 cont->subsys[i] = dummytop->subsys[i];
620 cont->subsys[i]->cgroup = cont;
621 list_add(&ss->sibling, &root->subsys_list);
622 rcu_assign_pointer(ss->root, root);
623 if (ss->bind)
624 ss->bind(ss, cont);
626 } else if (bit & removed_bits) {
627 /* We're removing this subsystem */
628 BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
629 BUG_ON(cont->subsys[i]->cgroup != cont);
630 if (ss->bind)
631 ss->bind(ss, dummytop);
632 dummytop->subsys[i]->cgroup = dummytop;
633 cont->subsys[i] = NULL;
634 rcu_assign_pointer(subsys[i]->root, &rootnode);
635 list_del(&ss->sibling);
636 } else if (bit & final_bits) {
637 /* Subsystem state should already exist */
638 BUG_ON(!cont->subsys[i]);
639 } else {
640 /* Subsystem state shouldn't exist */
641 BUG_ON(cont->subsys[i]);
644 root->subsys_bits = root->actual_subsys_bits = final_bits;
645 synchronize_rcu();
647 return 0;
650 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
652 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
653 struct cgroup_subsys *ss;
655 mutex_lock(&cgroup_mutex);
656 for_each_subsys(root, ss)
657 seq_printf(seq, ",%s", ss->name);
658 if (test_bit(ROOT_NOPREFIX, &root->flags))
659 seq_puts(seq, ",noprefix");
660 mutex_unlock(&cgroup_mutex);
661 return 0;
664 struct cgroup_sb_opts {
665 unsigned long subsys_bits;
666 unsigned long flags;
669 /* Convert a hierarchy specifier into a bitmask of subsystems and
670 * flags. */
671 static int parse_cgroupfs_options(char *data,
672 struct cgroup_sb_opts *opts)
674 char *token, *o = data ?: "all";
676 opts->subsys_bits = 0;
677 opts->flags = 0;
679 while ((token = strsep(&o, ",")) != NULL) {
680 if (!*token)
681 return -EINVAL;
682 if (!strcmp(token, "all")) {
683 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
684 } else if (!strcmp(token, "noprefix")) {
685 set_bit(ROOT_NOPREFIX, &opts->flags);
686 } else {
687 struct cgroup_subsys *ss;
688 int i;
689 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
690 ss = subsys[i];
691 if (!strcmp(token, ss->name)) {
692 set_bit(i, &opts->subsys_bits);
693 break;
696 if (i == CGROUP_SUBSYS_COUNT)
697 return -ENOENT;
701 /* We can't have an empty hierarchy */
702 if (!opts->subsys_bits)
703 return -EINVAL;
705 return 0;
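/*
 * Example option strings accepted by parse_cgroupfs_options() above
 * (illustrative; "cpuset" and "memory" stand in for whichever subsystem
 * names are actually compiled in):
 *
 *	"all"			- bind every registered subsystem
 *	"cpuset,memory"		- bind only the named subsystems
 *	"noprefix,cpuset"	- as above, and drop the subsystem-name
 *				  prefix from that hierarchy's control files
 *	NULL			- a NULL data pointer is treated as "all";
 *				  an empty token returns -EINVAL
 */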
708 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
710 int ret = 0;
711 struct cgroupfs_root *root = sb->s_fs_info;
712 struct cgroup *cont = &root->top_cgroup;
713 struct cgroup_sb_opts opts;
715 mutex_lock(&cont->dentry->d_inode->i_mutex);
716 mutex_lock(&cgroup_mutex);
718 /* See what subsystems are wanted */
719 ret = parse_cgroupfs_options(data, &opts);
720 if (ret)
721 goto out_unlock;
723 /* Don't allow flags to change at remount */
724 if (opts.flags != root->flags) {
725 ret = -EINVAL;
726 goto out_unlock;
729 ret = rebind_subsystems(root, opts.subsys_bits);
731 /* (re)populate subsystem files */
732 if (!ret)
733 cgroup_populate_dir(cont);
735 out_unlock:
736 mutex_unlock(&cgroup_mutex);
737 mutex_unlock(&cont->dentry->d_inode->i_mutex);
738 return ret;
741 static struct super_operations cgroup_ops = {
742 .statfs = simple_statfs,
743 .drop_inode = generic_delete_inode,
744 .show_options = cgroup_show_options,
745 .remount_fs = cgroup_remount,
748 static void init_cgroup_root(struct cgroupfs_root *root)
750 struct cgroup *cont = &root->top_cgroup;
751 INIT_LIST_HEAD(&root->subsys_list);
752 INIT_LIST_HEAD(&root->root_list);
753 root->number_of_cgroups = 1;
754 cont->root = root;
755 cont->top_cgroup = cont;
756 INIT_LIST_HEAD(&cont->sibling);
757 INIT_LIST_HEAD(&cont->children);
758 INIT_LIST_HEAD(&cont->css_sets);
761 static int cgroup_test_super(struct super_block *sb, void *data)
763 struct cgroupfs_root *new = data;
764 struct cgroupfs_root *root = sb->s_fs_info;
766 /* First check subsystems */
767 if (new->subsys_bits != root->subsys_bits)
768 return 0;
770 /* Next check flags */
771 if (new->flags != root->flags)
772 return 0;
774 return 1;
777 static int cgroup_set_super(struct super_block *sb, void *data)
779 int ret;
780 struct cgroupfs_root *root = data;
782 ret = set_anon_super(sb, NULL);
783 if (ret)
784 return ret;
786 sb->s_fs_info = root;
787 root->sb = sb;
789 sb->s_blocksize = PAGE_CACHE_SIZE;
790 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
791 sb->s_magic = CGROUP_SUPER_MAGIC;
792 sb->s_op = &cgroup_ops;
794 return 0;
797 static int cgroup_get_rootdir(struct super_block *sb)
799 struct inode *inode =
800 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
801 struct dentry *dentry;
803 if (!inode)
804 return -ENOMEM;
806 inode->i_op = &simple_dir_inode_operations;
807 inode->i_fop = &simple_dir_operations;
808 inode->i_op = &cgroup_dir_inode_operations;
809 /* directories start off with i_nlink == 2 (for "." entry) */
810 inc_nlink(inode);
811 dentry = d_alloc_root(inode);
812 if (!dentry) {
813 iput(inode);
814 return -ENOMEM;
816 sb->s_root = dentry;
817 return 0;
820 static int cgroup_get_sb(struct file_system_type *fs_type,
821 int flags, const char *unused_dev_name,
822 void *data, struct vfsmount *mnt)
824 struct cgroup_sb_opts opts;
825 int ret = 0;
826 struct super_block *sb;
827 struct cgroupfs_root *root;
828 struct list_head tmp_cg_links, *l;
829 INIT_LIST_HEAD(&tmp_cg_links);
831 /* First find the desired set of subsystems */
832 ret = parse_cgroupfs_options(data, &opts);
833 if (ret)
834 return ret;
836 root = kzalloc(sizeof(*root), GFP_KERNEL);
837 if (!root)
838 return -ENOMEM;
840 init_cgroup_root(root);
841 root->subsys_bits = opts.subsys_bits;
842 root->flags = opts.flags;
844 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
846 if (IS_ERR(sb)) {
847 kfree(root);
848 return PTR_ERR(sb);
851 if (sb->s_fs_info != root) {
852 /* Reusing an existing superblock */
853 BUG_ON(sb->s_root == NULL);
854 kfree(root);
855 root = NULL;
856 } else {
857 /* New superblock */
858 struct cgroup *cont = &root->top_cgroup;
859 struct inode *inode;
861 BUG_ON(sb->s_root != NULL);
863 ret = cgroup_get_rootdir(sb);
864 if (ret)
865 goto drop_new_super;
866 inode = sb->s_root->d_inode;
868 mutex_lock(&inode->i_mutex);
869 mutex_lock(&cgroup_mutex);
872 * We're accessing css_set_count without locking
873 * css_set_lock here, but that's OK - it can only be
874 * increased by someone holding cgroup_lock, and
875 * that's us. The worst that can happen is that we
876 * have some link structures left over
878 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
879 if (ret) {
880 mutex_unlock(&cgroup_mutex);
881 mutex_unlock(&inode->i_mutex);
882 goto drop_new_super;
885 ret = rebind_subsystems(root, root->subsys_bits);
886 if (ret == -EBUSY) {
887 mutex_unlock(&cgroup_mutex);
888 mutex_unlock(&inode->i_mutex);
889 goto drop_new_super;
892 /* EBUSY should be the only error here */
893 BUG_ON(ret);
895 list_add(&root->root_list, &roots);
896 root_count++;
898 sb->s_root->d_fsdata = &root->top_cgroup;
899 root->top_cgroup.dentry = sb->s_root;
901 /* Link the top cgroup in this hierarchy into all
902 * the css_set objects */
903 write_lock(&css_set_lock);
904 l = &init_css_set.list;
905 do {
906 struct css_set *cg;
907 struct cg_cgroup_link *link;
908 cg = list_entry(l, struct css_set, list);
909 BUG_ON(list_empty(&tmp_cg_links));
910 link = list_entry(tmp_cg_links.next,
911 struct cg_cgroup_link,
912 cont_link_list);
913 list_del(&link->cont_link_list);
914 link->cg = cg;
915 list_add(&link->cont_link_list,
916 &root->top_cgroup.css_sets);
917 list_add(&link->cg_link_list, &cg->cg_links);
918 l = l->next;
919 } while (l != &init_css_set.list);
920 write_unlock(&css_set_lock);
922 free_cg_links(&tmp_cg_links);
924 BUG_ON(!list_empty(&cont->sibling));
925 BUG_ON(!list_empty(&cont->children));
926 BUG_ON(root->number_of_cgroups != 1);
928 cgroup_populate_dir(cont);
929 mutex_unlock(&inode->i_mutex);
930 mutex_unlock(&cgroup_mutex);
933 return simple_set_mnt(mnt, sb);
935 drop_new_super:
936 up_write(&sb->s_umount);
937 deactivate_super(sb);
938 free_cg_links(&tmp_cg_links);
939 return ret;
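/*
 * From userspace this mount path is typically exercised as follows
 * (illustrative; the mount point and subsystem names are arbitrary):
 *
 *	mkdir /dev/cgroup
 *	mount -t cgroup -o cpuset,memory cgroup /dev/cgroup
 *
 * Mounting again with the same subsystem set and flags reuses the existing
 * superblock via cgroup_test_super() rather than creating a new hierarchy.
 */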
942 static void cgroup_kill_sb(struct super_block *sb) {
943 struct cgroupfs_root *root = sb->s_fs_info;
944 struct cgroup *cont = &root->top_cgroup;
945 int ret;
947 BUG_ON(!root);
949 BUG_ON(root->number_of_cgroups != 1);
950 BUG_ON(!list_empty(&cont->children));
951 BUG_ON(!list_empty(&cont->sibling));
953 mutex_lock(&cgroup_mutex);
955 /* Rebind all subsystems back to the default hierarchy */
956 ret = rebind_subsystems(root, 0);
957 /* Shouldn't be able to fail ... */
958 BUG_ON(ret);
961 * Release all the links from css_sets to this hierarchy's
962 * root cgroup
964 write_lock(&css_set_lock);
965 while (!list_empty(&cont->css_sets)) {
966 struct cg_cgroup_link *link;
967 link = list_entry(cont->css_sets.next,
968 struct cg_cgroup_link, cont_link_list);
969 list_del(&link->cg_link_list);
970 list_del(&link->cont_link_list);
971 kfree(link);
973 write_unlock(&css_set_lock);
975 if (!list_empty(&root->root_list)) {
976 list_del(&root->root_list);
977 root_count--;
979 mutex_unlock(&cgroup_mutex);
981 kfree(root);
982 kill_litter_super(sb);
985 static struct file_system_type cgroup_fs_type = {
986 .name = "cgroup",
987 .get_sb = cgroup_get_sb,
988 .kill_sb = cgroup_kill_sb,
991 static inline struct cgroup *__d_cont(struct dentry *dentry)
993 return dentry->d_fsdata;
996 static inline struct cftype *__d_cft(struct dentry *dentry)
998 return dentry->d_fsdata;
1002 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1003 * Returns 0 on success, -errno on error.
1005 int cgroup_path(const struct cgroup *cont, char *buf, int buflen)
1007 char *start;
1009 if (cont == dummytop) {
1011 * Inactive subsystems have no dentry for their root
1012 * cgroup
1014 strcpy(buf, "/");
1015 return 0;
1018 start = buf + buflen;
1020 *--start = '\0';
1021 for (;;) {
1022 int len = cont->dentry->d_name.len;
1023 if ((start -= len) < buf)
1024 return -ENAMETOOLONG;
1025 memcpy(start, cont->dentry->d_name.name, len);
1026 cont = cont->parent;
1027 if (!cont)
1028 break;
1029 if (!cont->parent)
1030 continue;
1031 if (--start < buf)
1032 return -ENAMETOOLONG;
1033 *start = '/';
1035 memmove(buf, start, buf + buflen - start);
1036 return 0;
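/*
 * Illustrative caller of cgroup_path() (a sketch; assumes cgroup_mutex is
 * held and 'cont' is a valid cgroup in a mounted hierarchy):
 *
 *	char buf[PATH_MAX];
 *	if (cgroup_path(cont, buf, sizeof(buf)) == 0)
 *		printk(KERN_DEBUG "cgroup: %s\n", buf);
 */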
1040 * Return the first subsystem attached to a cgroup's hierarchy, and
1041 * its subsystem id.
1044 static void get_first_subsys(const struct cgroup *cont,
1045 struct cgroup_subsys_state **css, int *subsys_id)
1047 const struct cgroupfs_root *root = cont->root;
1048 const struct cgroup_subsys *test_ss;
1049 BUG_ON(list_empty(&root->subsys_list));
1050 test_ss = list_entry(root->subsys_list.next,
1051 struct cgroup_subsys, sibling);
1052 if (css) {
1053 *css = cont->subsys[test_ss->subsys_id];
1054 BUG_ON(!*css);
1056 if (subsys_id)
1057 *subsys_id = test_ss->subsys_id;
1061 * Attach task 'tsk' to cgroup 'cont'
1063 * Call holding cgroup_mutex. May take task_lock of
1064 * the task 'tsk' during the call.
1066 static int attach_task(struct cgroup *cont, struct task_struct *tsk)
1068 int retval = 0;
1069 struct cgroup_subsys *ss;
1070 struct cgroup *oldcont;
1071 struct css_set *cg = tsk->cgroups;
1072 struct css_set *newcg;
1073 struct cgroupfs_root *root = cont->root;
1074 int subsys_id;
1076 get_first_subsys(cont, NULL, &subsys_id);
1078 /* Nothing to do if the task is already in that cgroup */
1079 oldcont = task_cgroup(tsk, subsys_id);
1080 if (cont == oldcont)
1081 return 0;
1083 for_each_subsys(root, ss) {
1084 if (ss->can_attach) {
1085 retval = ss->can_attach(ss, cont, tsk);
1086 if (retval) {
1087 return retval;
1093 * Locate or allocate a new css_set for this task,
1094 * based on its final set of cgroups
1096 newcg = find_css_set(cg, cont);
1097 if (!newcg) {
1098 return -ENOMEM;
1101 task_lock(tsk);
1102 if (tsk->flags & PF_EXITING) {
1103 task_unlock(tsk);
1104 put_css_set(newcg);
1105 return -ESRCH;
1107 rcu_assign_pointer(tsk->cgroups, newcg);
1108 task_unlock(tsk);
1110 /* Update the css_set linked lists if we're using them */
1111 write_lock(&css_set_lock);
1112 if (!list_empty(&tsk->cg_list)) {
1113 list_del(&tsk->cg_list);
1114 list_add(&tsk->cg_list, &newcg->tasks);
1116 write_unlock(&css_set_lock);
1118 for_each_subsys(root, ss) {
1119 if (ss->attach) {
1120 ss->attach(ss, cont, oldcont, tsk);
1124 synchronize_rcu();
1125 put_css_set(cg);
1126 return 0;
1130 * Attach task with pid 'pid' to cgroup 'cont'. Call with
1131 * cgroup_mutex, may take task_lock of task
1133 static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
1135 pid_t pid;
1136 struct task_struct *tsk;
1137 int ret;
1139 if (sscanf(pidbuf, "%d", &pid) != 1)
1140 return -EIO;
1142 if (pid) {
1143 rcu_read_lock();
1144 tsk = find_task_by_pid(pid);
1145 if (!tsk || tsk->flags & PF_EXITING) {
1146 rcu_read_unlock();
1147 return -ESRCH;
1149 get_task_struct(tsk);
1150 rcu_read_unlock();
1152 if ((current->euid) && (current->euid != tsk->uid)
1153 && (current->euid != tsk->suid)) {
1154 put_task_struct(tsk);
1155 return -EACCES;
1157 } else {
1158 tsk = current;
1159 get_task_struct(tsk);
1162 ret = attach_task(cont, tsk);
1163 put_task_struct(tsk);
1164 return ret;
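/*
 * This is the handler behind writes to a cgroup's "tasks" file, so from
 * userspace a task is moved simply by (illustrative path):
 *
 *	echo 1234 > /dev/cgroup/mygroup/tasks
 *
 * Writing "0" attaches the writing task itself.
 */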
1167 /* The various types of files and directories in a cgroup file system */
1169 enum cgroup_filetype {
1170 FILE_ROOT,
1171 FILE_DIR,
1172 FILE_TASKLIST,
1175 static ssize_t cgroup_write_uint(struct cgroup *cont, struct cftype *cft,
1176 struct file *file,
1177 const char __user *userbuf,
1178 size_t nbytes, loff_t *unused_ppos)
1180 char buffer[64];
1181 int retval = 0;
1182 u64 val;
1183 char *end;
1185 if (!nbytes)
1186 return -EINVAL;
1187 if (nbytes >= sizeof(buffer))
1188 return -E2BIG;
1189 if (copy_from_user(buffer, userbuf, nbytes))
1190 return -EFAULT;
1192 buffer[nbytes] = 0; /* nul-terminate */
1194 /* strip newline if necessary */
1195 if (nbytes && (buffer[nbytes-1] == '\n'))
1196 buffer[nbytes-1] = 0;
1197 val = simple_strtoull(buffer, &end, 0);
1198 if (*end)
1199 return -EINVAL;
1201 /* Pass to subsystem */
1202 retval = cft->write_uint(cont, cft, val);
1203 if (!retval)
1204 retval = nbytes;
1205 return retval;
1208 static ssize_t cgroup_common_file_write(struct cgroup *cont,
1209 struct cftype *cft,
1210 struct file *file,
1211 const char __user *userbuf,
1212 size_t nbytes, loff_t *unused_ppos)
1214 enum cgroup_filetype type = cft->private;
1215 char *buffer;
1216 int retval = 0;
1218 if (nbytes >= PATH_MAX)
1219 return -E2BIG;
1221 /* +1 for nul-terminator */
1222 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1223 if (buffer == NULL)
1224 return -ENOMEM;
1226 if (copy_from_user(buffer, userbuf, nbytes)) {
1227 retval = -EFAULT;
1228 goto out1;
1230 buffer[nbytes] = 0; /* nul-terminate */
1232 mutex_lock(&cgroup_mutex);
1234 if (cgroup_is_removed(cont)) {
1235 retval = -ENODEV;
1236 goto out2;
1239 switch (type) {
1240 case FILE_TASKLIST:
1241 retval = attach_task_by_pid(cont, buffer);
1242 break;
1243 default:
1244 retval = -EINVAL;
1245 goto out2;
1248 if (retval == 0)
1249 retval = nbytes;
1250 out2:
1251 mutex_unlock(&cgroup_mutex);
1252 out1:
1253 kfree(buffer);
1254 return retval;
1257 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1258 size_t nbytes, loff_t *ppos)
1260 struct cftype *cft = __d_cft(file->f_dentry);
1261 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
1263 if (!cft)
1264 return -ENODEV;
1265 if (cft->write)
1266 return cft->write(cont, cft, file, buf, nbytes, ppos);
1267 if (cft->write_uint)
1268 return cgroup_write_uint(cont, cft, file, buf, nbytes, ppos);
1269 return -EINVAL;
1272 static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
1273 struct file *file,
1274 char __user *buf, size_t nbytes,
1275 loff_t *ppos)
1277 char tmp[64];
1278 u64 val = cft->read_uint(cont, cft);
1279 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1281 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1284 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1285 size_t nbytes, loff_t *ppos)
1287 struct cftype *cft = __d_cft(file->f_dentry);
1288 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
1290 if (!cft)
1291 return -ENODEV;
1293 if (cft->read)
1294 return cft->read(cont, cft, file, buf, nbytes, ppos);
1295 if (cft->read_uint)
1296 return cgroup_read_uint(cont, cft, file, buf, nbytes, ppos);
1297 return -EINVAL;
1300 static int cgroup_file_open(struct inode *inode, struct file *file)
1302 int err;
1303 struct cftype *cft;
1305 err = generic_file_open(inode, file);
1306 if (err)
1307 return err;
1309 cft = __d_cft(file->f_dentry);
1310 if (!cft)
1311 return -ENODEV;
1312 if (cft->open)
1313 err = cft->open(inode, file);
1314 else
1315 err = 0;
1317 return err;
1320 static int cgroup_file_release(struct inode *inode, struct file *file)
1322 struct cftype *cft = __d_cft(file->f_dentry);
1323 if (cft->release)
1324 return cft->release(inode, file);
1325 return 0;
1329 * cgroup_rename - Only allow simple rename of directories in place.
1331 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1332 struct inode *new_dir, struct dentry *new_dentry)
1334 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1335 return -ENOTDIR;
1336 if (new_dentry->d_inode)
1337 return -EEXIST;
1338 if (old_dir != new_dir)
1339 return -EIO;
1340 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1343 static struct file_operations cgroup_file_operations = {
1344 .read = cgroup_file_read,
1345 .write = cgroup_file_write,
1346 .llseek = generic_file_llseek,
1347 .open = cgroup_file_open,
1348 .release = cgroup_file_release,
1351 static struct inode_operations cgroup_dir_inode_operations = {
1352 .lookup = simple_lookup,
1353 .mkdir = cgroup_mkdir,
1354 .rmdir = cgroup_rmdir,
1355 .rename = cgroup_rename,
1358 static int cgroup_create_file(struct dentry *dentry, int mode,
1359 struct super_block *sb)
1361 static struct dentry_operations cgroup_dops = {
1362 .d_iput = cgroup_diput,
1365 struct inode *inode;
1367 if (!dentry)
1368 return -ENOENT;
1369 if (dentry->d_inode)
1370 return -EEXIST;
1372 inode = cgroup_new_inode(mode, sb);
1373 if (!inode)
1374 return -ENOMEM;
1376 if (S_ISDIR(mode)) {
1377 inode->i_op = &cgroup_dir_inode_operations;
1378 inode->i_fop = &simple_dir_operations;
1380 /* start off with i_nlink == 2 (for "." entry) */
1381 inc_nlink(inode);
1383 /* start with the directory inode held, so that we can
1384 * populate it without racing with another mkdir */
1385 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1386 } else if (S_ISREG(mode)) {
1387 inode->i_size = 0;
1388 inode->i_fop = &cgroup_file_operations;
1390 dentry->d_op = &cgroup_dops;
1391 d_instantiate(dentry, inode);
1392 dget(dentry); /* Extra count - pin the dentry in core */
1393 return 0;
1397 * cgroup_create_dir - create a directory for an object.
1398 * cont: the cgroup we create the directory for.
1399 * It must have a valid ->parent field
1400 * And we are going to fill its ->dentry field.
1401 * dentry: dentry of the new cgroup
1402 * mode: mode to set on new directory.
1404 static int cgroup_create_dir(struct cgroup *cont, struct dentry *dentry,
1405 int mode)
1407 struct dentry *parent;
1408 int error = 0;
1410 parent = cont->parent->dentry;
1411 error = cgroup_create_file(dentry, S_IFDIR | mode, cont->root->sb);
1412 if (!error) {
1413 dentry->d_fsdata = cont;
1414 inc_nlink(parent->d_inode);
1415 cont->dentry = dentry;
1416 dget(dentry);
1418 dput(dentry);
1420 return error;
1423 int cgroup_add_file(struct cgroup *cont,
1424 struct cgroup_subsys *subsys,
1425 const struct cftype *cft)
1427 struct dentry *dir = cont->dentry;
1428 struct dentry *dentry;
1429 int error;
1431 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1432 if (subsys && !test_bit(ROOT_NOPREFIX, &cont->root->flags)) {
1433 strcpy(name, subsys->name);
1434 strcat(name, ".");
1436 strcat(name, cft->name);
1437 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1438 dentry = lookup_one_len(name, dir, strlen(name));
1439 if (!IS_ERR(dentry)) {
1440 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1441 cont->root->sb);
1442 if (!error)
1443 dentry->d_fsdata = (void *)cft;
1444 dput(dentry);
1445 } else
1446 error = PTR_ERR(dentry);
1447 return error;
1450 int cgroup_add_files(struct cgroup *cont,
1451 struct cgroup_subsys *subsys,
1452 const struct cftype cft[],
1453 int count)
1455 int i, err;
1456 for (i = 0; i < count; i++) {
1457 err = cgroup_add_file(cont, subsys, &cft[i]);
1458 if (err)
1459 return err;
1461 return 0;
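/*
 * Sketch of how a subsystem would use cgroup_add_files() from its
 * populate() callback (hypothetical subsystem: "foo", foo_files and
 * foo_weight_read() are illustrative names, not part of this patch):
 *
 *	static u64 foo_weight_read(struct cgroup *cont, struct cftype *cft)
 *	{
 *		return 0;
 *	}
 *
 *	static struct cftype foo_files[] = {
 *		{
 *			.name = "weight",
 *			.read_uint = foo_weight_read,
 *		},
 *	};
 *
 *	static int foo_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 *	{
 *		return cgroup_add_files(cont, ss, foo_files,
 *					ARRAY_SIZE(foo_files));
 *	}
 */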
1464 /* Count the number of tasks in a cgroup. */
1466 int cgroup_task_count(const struct cgroup *cont)
1468 int count = 0;
1469 struct list_head *l;
1471 read_lock(&css_set_lock);
1472 l = cont->css_sets.next;
1473 while (l != &cont->css_sets) {
1474 struct cg_cgroup_link *link =
1475 list_entry(l, struct cg_cgroup_link, cont_link_list);
1476 count += atomic_read(&link->cg->ref.refcount);
1477 l = l->next;
1479 read_unlock(&css_set_lock);
1480 return count;
1484 * Advance a list_head iterator. The iterator should be positioned at
1485 * the start of a css_set
1487 static void cgroup_advance_iter(struct cgroup *cont,
1488 struct cgroup_iter *it)
1490 struct list_head *l = it->cg_link;
1491 struct cg_cgroup_link *link;
1492 struct css_set *cg;
1494 /* Advance to the next non-empty css_set */
1495 do {
1496 l = l->next;
1497 if (l == &cont->css_sets) {
1498 it->cg_link = NULL;
1499 return;
1501 link = list_entry(l, struct cg_cgroup_link, cont_link_list);
1502 cg = link->cg;
1503 } while (list_empty(&cg->tasks));
1504 it->cg_link = l;
1505 it->task = cg->tasks.next;
1508 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it)
1511 * The first time anyone tries to iterate across a cgroup,
1512 * we need to enable the list linking each css_set to its
1513 * tasks, and fix up all existing tasks.
1515 if (!use_task_css_set_links) {
1516 struct task_struct *p, *g;
1517 write_lock(&css_set_lock);
1518 use_task_css_set_links = 1;
1519 do_each_thread(g, p) {
1520 task_lock(p);
1521 if (list_empty(&p->cg_list))
1522 list_add(&p->cg_list, &p->cgroups->tasks);
1523 task_unlock(p);
1524 } while_each_thread(g, p);
1525 write_unlock(&css_set_lock);
1527 read_lock(&css_set_lock);
1528 it->cg_link = &cont->css_sets;
1529 cgroup_advance_iter(cont, it);
1532 struct task_struct *cgroup_iter_next(struct cgroup *cont,
1533 struct cgroup_iter *it)
1535 struct task_struct *res;
1536 struct list_head *l = it->task;
1538 /* If the iterator cg is NULL, we have no tasks */
1539 if (!it->cg_link)
1540 return NULL;
1541 res = list_entry(l, struct task_struct, cg_list);
1542 /* Advance iterator to find next entry */
1543 l = l->next;
1544 if (l == &res->cgroups->tasks) {
1545 /* We reached the end of this task list - move on to
1546 * the next cg_cgroup_link */
1547 cgroup_advance_iter(cont, it);
1548 } else {
1549 it->task = l;
1551 return res;
1554 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it)
1556 read_unlock(&css_set_lock);
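/*
 * Illustrative use of the iterator API above (a sketch): count the tasks
 * attached to 'cont'. css_set_lock is read-held for the whole walk, so the
 * loop body must not sleep.
 *
 *	struct cgroup_iter it;
 *	struct task_struct *tsk;
 *	int n = 0;
 *
 *	cgroup_iter_start(cont, &it);
 *	while ((tsk = cgroup_iter_next(cont, &it)))
 *		n++;
 *	cgroup_iter_end(cont, &it);
 */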
1560 * Stuff for reading the 'tasks' file.
1562 * Reading this file can return large amounts of data if a cgroup has
1563 * *lots* of attached tasks. So it may need several calls to read(),
1564 * but we cannot guarantee that the information we produce is correct
1565 * unless we produce it entirely atomically.
1567 * Upon tasks file open(), a struct ctr_struct is allocated, that
1568 * will have a pointer to an array (also allocated here). The struct
1569 * ctr_struct * is stored in file->private_data. Its resources will
1570 * be freed by release() when the file is closed. The array is used
1571 * to sprintf the PIDs and then used by read().
1573 struct ctr_struct {
1574 char *buf;
1575 int bufsz;
1579 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1580 * 'cont'. Return actual number of pids loaded. No need to
1581 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1582 * read section, so the css_set can't go away, and is
1583 * immutable after creation.
1585 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
1587 int n = 0;
1588 struct cgroup_iter it;
1589 struct task_struct *tsk;
1590 cgroup_iter_start(cont, &it);
1591 while ((tsk = cgroup_iter_next(cont, &it))) {
1592 if (unlikely(n == npids))
1593 break;
1594 pidarray[n++] = pid_nr(task_pid(tsk));
1596 cgroup_iter_end(cont, &it);
1597 return n;
1600 static int cmppid(const void *a, const void *b)
1602 return *(pid_t *)a - *(pid_t *)b;
1606 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1607 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1608 * count 'cnt' of how many chars would be written if buf were large enough.
1610 static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1612 int cnt = 0;
1613 int i;
1615 for (i = 0; i < npids; i++)
1616 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1617 return cnt;
1621 * Handle an open on 'tasks' file. Prepare a buffer listing the
1622 * process id's of tasks currently attached to the cgroup being opened.
1624 * Does not require any specific cgroup mutexes, and does not take any.
1626 static int cgroup_tasks_open(struct inode *unused, struct file *file)
1628 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
1629 struct ctr_struct *ctr;
1630 pid_t *pidarray;
1631 int npids;
1632 char c;
1634 if (!(file->f_mode & FMODE_READ))
1635 return 0;
1637 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1638 if (!ctr)
1639 goto err0;
1642 * If cgroup gets more users after we read count, we won't have
1643 * enough space - tough. This race is indistinguishable to the
1644 * caller from the case that the additional cgroup users didn't
1645 * show up until sometime later on.
1647 npids = cgroup_task_count(cont);
1648 if (npids) {
1649 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1650 if (!pidarray)
1651 goto err1;
1653 npids = pid_array_load(pidarray, npids, cont);
1654 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1656 /* Call pid_array_to_buf() twice, first just to get bufsz */
1657 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1658 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1659 if (!ctr->buf)
1660 goto err2;
1661 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1663 kfree(pidarray);
1664 } else {
1665 ctr->buf = 0;
1666 ctr->bufsz = 0;
1668 file->private_data = ctr;
1669 return 0;
1671 err2:
1672 kfree(pidarray);
1673 err1:
1674 kfree(ctr);
1675 err0:
1676 return -ENOMEM;
1679 static ssize_t cgroup_tasks_read(struct cgroup *cont,
1680 struct cftype *cft,
1681 struct file *file, char __user *buf,
1682 size_t nbytes, loff_t *ppos)
1684 struct ctr_struct *ctr = file->private_data;
1686 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1689 static int cgroup_tasks_release(struct inode *unused_inode,
1690 struct file *file)
1692 struct ctr_struct *ctr;
1694 if (file->f_mode & FMODE_READ) {
1695 ctr = file->private_data;
1696 kfree(ctr->buf);
1697 kfree(ctr);
1699 return 0;
1703 * for the common functions, 'private' gives the type of file
1705 static struct cftype cft_tasks = {
1706 .name = "tasks",
1707 .open = cgroup_tasks_open,
1708 .read = cgroup_tasks_read,
1709 .write = cgroup_common_file_write,
1710 .release = cgroup_tasks_release,
1711 .private = FILE_TASKLIST,
1714 static int cgroup_populate_dir(struct cgroup *cont)
1716 int err;
1717 struct cgroup_subsys *ss;
1719 /* First clear out any existing files */
1720 cgroup_clear_directory(cont->dentry);
1722 err = cgroup_add_file(cont, NULL, &cft_tasks);
1723 if (err < 0)
1724 return err;
1726 for_each_subsys(cont->root, ss) {
1727 if (ss->populate && (err = ss->populate(ss, cont)) < 0)
1728 return err;
1731 return 0;
1734 static void init_cgroup_css(struct cgroup_subsys_state *css,
1735 struct cgroup_subsys *ss,
1736 struct cgroup *cont)
1738 css->cgroup = cont;
1739 atomic_set(&css->refcnt, 0);
1740 css->flags = 0;
1741 if (cont == dummytop)
1742 set_bit(CSS_ROOT, &css->flags);
1743 BUG_ON(cont->subsys[ss->subsys_id]);
1744 cont->subsys[ss->subsys_id] = css;
1748 * cgroup_create - create a cgroup
1749 * parent: cgroup that will be parent of the new cgroup.
1750 * dentry: dentry of the new cgroup directory.
1751 * mode: mode to set on new inode
1753 * Must be called with the mutex on the parent inode held
1756 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
1757 int mode)
1759 struct cgroup *cont;
1760 struct cgroupfs_root *root = parent->root;
1761 int err = 0;
1762 struct cgroup_subsys *ss;
1763 struct super_block *sb = root->sb;
1765 cont = kzalloc(sizeof(*cont), GFP_KERNEL);
1766 if (!cont)
1767 return -ENOMEM;
1769 /* Grab a reference on the superblock so the hierarchy doesn't
1770 * get deleted on unmount if there are child cgroups. This
1771 * can be done outside cgroup_mutex, since the sb can't
1772 * disappear while someone has an open control file on the
1773 * fs */
1774 atomic_inc(&sb->s_active);
1776 mutex_lock(&cgroup_mutex);
1778 cont->flags = 0;
1779 INIT_LIST_HEAD(&cont->sibling);
1780 INIT_LIST_HEAD(&cont->children);
1781 INIT_LIST_HEAD(&cont->css_sets);
1783 cont->parent = parent;
1784 cont->root = parent->root;
1785 cont->top_cgroup = parent->top_cgroup;
1787 for_each_subsys(root, ss) {
1788 struct cgroup_subsys_state *css = ss->create(ss, cont);
1789 if (IS_ERR(css)) {
1790 err = PTR_ERR(css);
1791 goto err_destroy;
1793 init_cgroup_css(css, ss, cont);
1796 list_add(&cont->sibling, &cont->parent->children);
1797 root->number_of_cgroups++;
1799 err = cgroup_create_dir(cont, dentry, mode);
1800 if (err < 0)
1801 goto err_remove;
1803 /* The cgroup directory was pre-locked for us */
1804 BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex));
1806 err = cgroup_populate_dir(cont);
1807 /* If err < 0, we have a half-filled directory - oh well ;) */
1809 mutex_unlock(&cgroup_mutex);
1810 mutex_unlock(&cont->dentry->d_inode->i_mutex);
1812 return 0;
1814 err_remove:
1816 list_del(&cont->sibling);
1817 root->number_of_cgroups--;
1819 err_destroy:
1821 for_each_subsys(root, ss) {
1822 if (cont->subsys[ss->subsys_id])
1823 ss->destroy(ss, cont);
1826 mutex_unlock(&cgroup_mutex);
1828 /* Release the reference count that we took on the superblock */
1829 deactivate_super(sb);
1831 kfree(cont);
1832 return err;
1835 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1837 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
1839 /* the vfs holds inode->i_mutex already */
1840 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
1843 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
1845 struct cgroup *cont = dentry->d_fsdata;
1846 struct dentry *d;
1847 struct cgroup *parent;
1848 struct cgroup_subsys *ss;
1849 struct super_block *sb;
1850 struct cgroupfs_root *root;
1851 int css_busy = 0;
1853 /* the vfs holds both inode->i_mutex already */
1855 mutex_lock(&cgroup_mutex);
1856 if (atomic_read(&cont->count) != 0) {
1857 mutex_unlock(&cgroup_mutex);
1858 return -EBUSY;
1860 if (!list_empty(&cont->children)) {
1861 mutex_unlock(&cgroup_mutex);
1862 return -EBUSY;
1865 parent = cont->parent;
1866 root = cont->root;
1867 sb = root->sb;
1869 /* Check the reference count on each subsystem. Since we
1870 * already established that there are no tasks in the
1871 * cgroup, if the css refcount is also 0, then there should
1872 * be no outstanding references, so the subsystem is safe to
1873 * destroy */
1874 for_each_subsys(root, ss) {
1875 struct cgroup_subsys_state *css;
1876 css = cont->subsys[ss->subsys_id];
1877 if (atomic_read(&css->refcnt)) {
1878 css_busy = 1;
1879 break;
1882 if (css_busy) {
1883 mutex_unlock(&cgroup_mutex);
1884 return -EBUSY;
1887 for_each_subsys(root, ss) {
1888 if (cont->subsys[ss->subsys_id])
1889 ss->destroy(ss, cont);
1892 set_bit(CONT_REMOVED, &cont->flags);
1893 /* delete my sibling from parent->children */
1894 list_del(&cont->sibling);
1895 spin_lock(&cont->dentry->d_lock);
1896 d = dget(cont->dentry);
1897 cont->dentry = NULL;
1898 spin_unlock(&d->d_lock);
1900 cgroup_d_remove_dir(d);
1901 dput(d);
1902 root->number_of_cgroups--;
1904 mutex_unlock(&cgroup_mutex);
1905 /* Drop the active superblock reference that we took when we
1906 * created the cgroup */
1907 deactivate_super(sb);
1908 return 0;
1911 static void cgroup_init_subsys(struct cgroup_subsys *ss)
1913 struct cgroup_subsys_state *css;
1914 struct list_head *l;
1915 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
1917 /* Create the top cgroup state for this subsystem */
1918 ss->root = &rootnode;
1919 css = ss->create(ss, dummytop);
1920 /* We don't handle early failures gracefully */
1921 BUG_ON(IS_ERR(css));
1922 init_cgroup_css(css, ss, dummytop);
1924 /* Update all cgroup groups to contain a subsys
1925 * pointer to this state - since the subsystem is
1926 * newly registered, all tasks and hence all cgroup
1927 * groups are in the subsystem's top cgroup. */
1928 write_lock(&css_set_lock);
1929 l = &init_css_set.list;
1930 do {
1931 struct css_set *cg =
1932 list_entry(l, struct css_set, list);
1933 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
1934 l = l->next;
1935 } while (l != &init_css_set.list);
1936 write_unlock(&css_set_lock);
1938 /* If this subsystem requested that it be notified with fork
1939 * events, we should send it one now for every process in the
1940 * system */
1941 if (ss->fork) {
1942 struct task_struct *g, *p;
1944 read_lock(&tasklist_lock);
1945 do_each_thread(g, p) {
1946 ss->fork(ss, p);
1947 } while_each_thread(g, p);
1948 read_unlock(&tasklist_lock);
1951 need_forkexit_callback |= ss->fork || ss->exit;
1953 ss->active = 1;
1957 * cgroup_init_early - initialize cgroups at system boot, and
1958 * initialize any subsystems that request early init.
1960 int __init cgroup_init_early(void)
1962 int i;
1963 kref_init(&init_css_set.ref);
1964 kref_get(&init_css_set.ref);
1965 INIT_LIST_HEAD(&init_css_set.list);
1966 INIT_LIST_HEAD(&init_css_set.cg_links);
1967 INIT_LIST_HEAD(&init_css_set.tasks);
1968 css_set_count = 1;
1969 init_cgroup_root(&rootnode);
1970 list_add(&rootnode.root_list, &roots);
1971 root_count = 1;
1972 init_task.cgroups = &init_css_set;
1974 init_css_set_link.cg = &init_css_set;
1975 list_add(&init_css_set_link.cont_link_list,
1976 &rootnode.top_cgroup.css_sets);
1977 list_add(&init_css_set_link.cg_link_list,
1978 &init_css_set.cg_links);
1980 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1981 struct cgroup_subsys *ss = subsys[i];
1983 BUG_ON(!ss->name);
1984 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
1985 BUG_ON(!ss->create);
1986 BUG_ON(!ss->destroy);
1987 if (ss->subsys_id != i) {
1988 printk(KERN_ERR "Subsys %s id == %d\n",
1989 ss->name, ss->subsys_id);
1990 BUG();
1993 if (ss->early_init)
1994 cgroup_init_subsys(ss);
1996 return 0;
2000 * cgroup_init - register cgroup filesystem and /proc file, and
2001 * initialize any subsystems that didn't request early init.
2003 int __init cgroup_init(void)
2005 int err;
2006 int i;
2007 struct proc_dir_entry *entry;
2009 err = bdi_init(&cgroup_backing_dev_info);
2010 if (err)
2011 return err;
2013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2014 struct cgroup_subsys *ss = subsys[i];
2015 if (!ss->early_init)
2016 cgroup_init_subsys(ss);
2019 err = register_filesystem(&cgroup_fs_type);
2020 if (err < 0)
2021 goto out;
2023 entry = create_proc_entry("cgroups", 0, NULL);
2024 if (entry)
2025 entry->proc_fops = &proc_cgroupstats_operations;
2027 out:
2028 if (err)
2029 bdi_destroy(&cgroup_backing_dev_info);
2031 return err;
2035 * proc_cgroup_show()
2036 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2037 * - Used for /proc/<pid>/cgroup.
2038 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2039 * doesn't really matter if tsk->cgroup changes after we read it,
2040 * and we take cgroup_mutex, keeping attach_task() from changing it
2041 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2042 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2043 * cgroup to top_cgroup.
2046 /* TODO: Use a proper seq_file iterator */
2047 static int proc_cgroup_show(struct seq_file *m, void *v)
2049 struct pid *pid;
2050 struct task_struct *tsk;
2051 char *buf;
2052 int retval;
2053 struct cgroupfs_root *root;
2055 retval = -ENOMEM;
2056 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2057 if (!buf)
2058 goto out;
2060 retval = -ESRCH;
2061 pid = m->private;
2062 tsk = get_pid_task(pid, PIDTYPE_PID);
2063 if (!tsk)
2064 goto out_free;
2066 retval = 0;
2068 mutex_lock(&cgroup_mutex);
2070 for_each_root(root) {
2071 struct cgroup_subsys *ss;
2072 struct cgroup *cont;
2073 int subsys_id;
2074 int count = 0;
2076 /* Skip this hierarchy if it has no active subsystems */
2077 if (!root->actual_subsys_bits)
2078 continue;
2079 for_each_subsys(root, ss)
2080 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2081 seq_putc(m, ':');
2082 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2083 cont = task_cgroup(tsk, subsys_id);
2084 retval = cgroup_path(cont, buf, PAGE_SIZE);
2085 if (retval < 0)
2086 goto out_unlock;
2087 seq_puts(m, buf);
2088 seq_putc(m, '\n');
2091 out_unlock:
2092 mutex_unlock(&cgroup_mutex);
2093 put_task_struct(tsk);
2094 out_free:
2095 kfree(buf);
2096 out:
2097 return retval;
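/*
 * Example /proc/<pid>/cgroup output produced by the code above, for a task
 * in two mounted hierarchies (illustrative subsystem names and paths):
 *
 *	cpuset:/
 *	cpu,memory:/batch/job1
 *
 * One line per active hierarchy: the comma-separated subsystem names, a
 * ':', then the task's cgroup path within that hierarchy.
 */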
2100 static int cgroup_open(struct inode *inode, struct file *file)
2102 struct pid *pid = PROC_I(inode)->pid;
2103 return single_open(file, proc_cgroup_show, pid);
2106 struct file_operations proc_cgroup_operations = {
2107 .open = cgroup_open,
2108 .read = seq_read,
2109 .llseek = seq_lseek,
2110 .release = single_release,
2113 /* Display information about each subsystem and each hierarchy */
2114 static int proc_cgroupstats_show(struct seq_file *m, void *v)
2116 int i;
2117 struct cgroupfs_root *root;
2119 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
2120 mutex_lock(&cgroup_mutex);
2121 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2122 struct cgroup_subsys *ss = subsys[i];
2123 seq_printf(m, "%s\t%lu\t%d\n",
2124 ss->name, ss->root->subsys_bits,
2125 ss->root->number_of_cgroups);
2126 }
2127 mutex_unlock(&cgroup_mutex);
2128 return 0;
2129 }
2131 static int cgroupstats_open(struct inode *inode, struct file *file)
2132 {
2133 return single_open(file, proc_cgroupstats_show, 0);
2134 }
2136 static struct file_operations proc_cgroupstats_operations = {
2137 .open = cgroupstats_open,
2138 .read = seq_read,
2139 .llseek = seq_lseek,
2140 .release = single_release,
2141 };
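/*
 * Example /proc/cgroups output produced by the function above (values
 * and subsystem names are illustrative). In this version the
 * "hierarchy" column is the subsys_bits mask of the root the subsystem
 * is attached to; an unattached subsystem reports the dummy root's
 * mask of 0 and a single (top) cgroup:
 *
 *	#subsys_name	hierarchy	num_cgroups
 *	cpuset		1		4
 *	debug		0		1
 */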
2143 /**
2144 * cgroup_fork - attach a newly forked task to its parent's cgroup.
2145 * @child: pointer to task_struct of the newly forked child process.
2146 *
2147 * Description: A task inherits its parent's cgroup at fork().
2148 *
2149 * A pointer to the shared css_set was automatically copied in
2150 * fork.c by dup_task_struct(). However, we ignore that copy, since
2151 * it was not made under the protection of RCU or cgroup_mutex, so
2152 * might no longer be a valid css_set pointer. attach_task() might
2153 * have already changed current->cgroups, allowing the previously
2154 * referenced css_set to be removed and freed.
2155 *
2156 * At the point that cgroup_fork() is called, 'current' is the parent
2157 * task, and the passed argument 'child' points to the child task.
2158 */
2159 void cgroup_fork(struct task_struct *child)
2160 {
2161 task_lock(current);
2162 child->cgroups = current->cgroups;
2163 get_css_set(child->cgroups);
2164 task_unlock(current);
2165 INIT_LIST_HEAD(&child->cg_list);
2166 }
2168 /**
2169 * cgroup_fork_callbacks - called on a new task very soon before
2170 * adding it to the tasklist. No need to take any locks since no-one
2171 * can be operating on this task.
2172 */
2173 void cgroup_fork_callbacks(struct task_struct *child)
2174 {
2175 if (need_forkexit_callback) {
2176 int i;
2177 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2178 struct cgroup_subsys *ss = subsys[i];
2179 if (ss->fork)
2180 ss->fork(ss, child);
2181 }
2182 }
2183 }
2185 /**
2186 * cgroup_post_fork - called on a new task after adding it to the
2187 * task list. Adds the task to the list running through its css_set
2188 * if necessary. Has to be after the task is visible on the task list
2189 * in case we race with the first call to cgroup_iter_start() - to
2190 * guarantee that the new task ends up on its list. */
2191 void cgroup_post_fork(struct task_struct *child)
2192 {
2193 if (use_task_css_set_links) {
2194 write_lock(&css_set_lock);
2195 if (list_empty(&child->cg_list))
2196 list_add(&child->cg_list, &child->cgroups->tasks);
2197 write_unlock(&css_set_lock);
2198 }
2199 }
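/*
 * Rough sketch of how the three fork-time hooks above are ordered by
 * copy_process() in kernel/fork.c (simplified; the authoritative
 * sequence lives in kernel/fork.c):
 *
 *	cgroup_fork(p);			inherit the parent's css_set
 *	...
 *	cgroup_fork_callbacks(p);	subsystem ->fork() callbacks,
 *					just before tasklist insertion
 *	...add p to the tasklist...
 *	cgroup_post_fork(p);		link p into its css_set task list
 *
 * If the fork fails after cgroup_fork(), the cleanup path calls
 * cgroup_exit() so the css_set reference taken above is dropped.
 */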
2200 /**
2201 * cgroup_exit - detach cgroup from exiting task
2202 * @tsk: pointer to task_struct of exiting process
2203 * @run_callbacks: whether to invoke the subsystem ->exit() callbacks
2204 * Description: Detach cgroup from @tsk and release it.
2206 * Note that cgroups marked notify_on_release force every task in
2207 * them to take the global cgroup_mutex when exiting.
2208 * This could impact scaling on very large systems. Be reluctant to
2209 * use notify_on_release cgroups where very high task exit scaling
2210 * is required on large systems.
2212 * the_top_cgroup_hack:
2214 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2216 * We call cgroup_exit() while the task is still competent to
2217 * handle notify_on_release(), then leave the task attached to the
2218 * root cgroup in each hierarchy for the remainder of its exit.
2220 * To do this properly, we would increment the reference count on
2221 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2222 * code we would add a second cgroup function call, to drop that
2223 * reference. This would just create an unnecessary hot spot on
2224 * the top_cgroup reference count, to no avail.
2226 * Normally, holding a reference to a cgroup without bumping its
2227 * count is unsafe. The cgroup could go away, or someone could
2228 * attach us to a different cgroup, decrementing the count on
2229 * the first cgroup that we never incremented. But in this case,
2230 * top_cgroup isn't going away, and either the task has PF_EXITING set,
2231 * which wards off any attach_task() attempts, or the task is a failed
2232 * fork, never visible to attach_task.
2233 */
2235 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2236 {
2237 int i;
2238 struct css_set *cg;
2240 if (run_callbacks && need_forkexit_callback) {
2241 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2242 struct cgroup_subsys *ss = subsys[i];
2243 if (ss->exit)
2244 ss->exit(ss, tsk);
2245 }
2246 }
2248 /*
2249 * Unlink from the css_set task list if necessary.
2250 * Optimistically check cg_list before taking
2251 * css_set_lock
2252 */
2253 if (!list_empty(&tsk->cg_list)) {
2254 write_lock(&css_set_lock);
2255 if (!list_empty(&tsk->cg_list))
2256 list_del(&tsk->cg_list);
2257 write_unlock(&css_set_lock);
2258 }
2260 /* Reassign the task to the init_css_set. */
2261 task_lock(tsk);
2262 cg = tsk->cgroups;
2263 tsk->cgroups = &init_css_set;
2264 task_unlock(tsk);
2265 if (cg)
2266 put_css_set(cg);
2267 }
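/*
 * Sketch of the normal caller (simplified; the real call site is
 * do_exit() in kernel/exit.c):
 *
 *	cgroup_exit(tsk, 1);	run the subsystem ->exit() callbacks
 *
 * The failed-fork cleanup in copy_process() also reaches here, with
 * run_callbacks reflecting whether the fork callbacks were ever made.
 */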
2269 /**
2270 * cgroup_clone - duplicate the current cgroup in the hierarchy
2271 * that the given subsystem is attached to, and move this task into
2272 * the new child
2273 */
2274 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2275 {
2276 struct dentry *dentry;
2277 int ret = 0;
2278 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2279 struct cgroup *parent, *child;
2280 struct inode *inode;
2281 struct css_set *cg;
2282 struct cgroupfs_root *root;
2283 struct cgroup_subsys *ss;
2285 /* We shouldn't be called by an unregistered subsystem */
2286 BUG_ON(!subsys->active);
2288 /* First figure out what hierarchy and cgroup we're dealing
2289 * with, and pin them so we can drop cgroup_mutex */
2290 mutex_lock(&cgroup_mutex);
2291 again:
2292 root = subsys->root;
2293 if (root == &rootnode) {
2294 printk(KERN_INFO
2295 "Not cloning cgroup for unused subsystem %s\n",
2296 subsys->name);
2297 mutex_unlock(&cgroup_mutex);
2298 return 0;
2299 }
2300 cg = tsk->cgroups;
2301 parent = task_cgroup(tsk, subsys->subsys_id);
2303 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2305 /* Pin the hierarchy */
2306 atomic_inc(&parent->root->sb->s_active);
2308 /* Keep the cgroup alive */
2309 get_css_set(cg);
2310 mutex_unlock(&cgroup_mutex);
2312 /* Now do the VFS work to create a cgroup */
2313 inode = parent->dentry->d_inode;
2315 /* Hold the parent directory mutex across this operation to
2316 * stop anyone else deleting the new cgroup */
2317 mutex_lock(&inode->i_mutex);
2318 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2319 if (IS_ERR(dentry)) {
2320 printk(KERN_INFO
2321 "Couldn't allocate dentry for %s: %ld\n", nodename,
2322 PTR_ERR(dentry));
2323 ret = PTR_ERR(dentry);
2324 goto out_release;
2325 }
2327 /* Create the cgroup directory, which also creates the cgroup */
2328 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2329 child = __d_cont(dentry);
2330 dput(dentry);
2331 if (ret) {
2332 printk(KERN_INFO
2333 "Failed to create cgroup %s: %d\n", nodename,
2334 ret);
2335 goto out_release;
2336 }
2338 if (!child) {
2339 printk(KERN_INFO
2340 "Couldn't find new cgroup %s\n", nodename);
2341 ret = -ENOMEM;
2342 goto out_release;
2343 }
2345 /* The cgroup now exists. Retake cgroup_mutex and check
2346 * that we're still in the same state that we thought we
2347 * were. */
2348 mutex_lock(&cgroup_mutex);
2349 if ((root != subsys->root) ||
2350 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2351 /* Aargh, we raced ... */
2352 mutex_unlock(&inode->i_mutex);
2353 put_css_set(cg);
2355 deactivate_super(parent->root->sb);
2356 /* The cgroup is still accessible in the VFS, but
2357 * we're not going to try to rmdir() it at this
2358 * point. */
2359 printk(KERN_INFO
2360 "Race in cgroup_clone() - leaking cgroup %s\n",
2361 nodename);
2362 goto again;
2363 }
2365 /* do any required auto-setup */
2366 for_each_subsys(root, ss) {
2367 if (ss->post_clone)
2368 ss->post_clone(ss, child);
2369 }
2371 /* All seems fine. Finish by moving the task into the new cgroup */
2372 ret = attach_task(child, tsk);
2373 mutex_unlock(&cgroup_mutex);
2375 out_release:
2376 mutex_unlock(&inode->i_mutex);
2377 put_css_set(cg);
2378 deactivate_super(parent->root->sb);
2379 return ret;
2380 }
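/*
 * Assumed caller sketch: the ns (nsproxy) subsystem is the intended
 * user of cgroup_clone(). In kernel/ns_cgroup.c of this era it is
 * wrapped roughly as
 *
 *	int ns_cgroup_clone(struct task_struct *task)
 *	{
 *		return cgroup_clone(task, &ns_subsys);
 *	}
 *
 * so a task that unshares its namespaces lands in a fresh
 * "node_<pid>" child of its current cgroup in the ns hierarchy.
 * Names here are indicative; see kernel/ns_cgroup.c for the
 * authoritative version.
 */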
2383 * See if "cont" is a descendant of the current task's cgroup in
2384 * the appropriate hierarchy
2386 * If we are sending in dummytop, then presumably we are creating
2387 * the top cgroup in the subsystem.
2389 * Called only by the ns (nsproxy) cgroup.
2391 int cgroup_is_descendant(const struct cgroup *cont)
2393 int ret;
2394 struct cgroup *target;
2395 int subsys_id;
2397 if (cont == dummytop)
2398 return 1;
2400 get_first_subsys(cont, NULL, &subsys_id);
2401 target = task_cgroup(current, subsys_id);
2402 while (cont != target && cont != cont->top_cgroup)
2403 cont = cont->parent;
2404 ret = (cont == target);
2405 return ret;
2406 }
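/*
 * Assumed usage sketch: the ns subsystem's create callback can use
 * this check to refuse creation of a cgroup that is not below the
 * creating task's own cgroup, along the lines of
 *
 *	if (!capable(CAP_SYS_ADMIN))
 *		return ERR_PTR(-EPERM);
 *	if (!cgroup_is_descendant(cgroup))
 *		return ERR_PTR(-EPERM);
 *
 * The exact policy lives in kernel/ns_cgroup.c; the snippet above is
 * only illustrative.
 */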