4 * Generic process-grouping system.
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
27 #include <linux/cgroup.h>
28 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/list.h>
33 #include <linux/mutex.h>
34 #include <linux/mount.h>
35 #include <linux/pagemap.h>
36 #include <linux/proc_fs.h>
37 #include <linux/rcupdate.h>
38 #include <linux/sched.h>
39 #include <linux/backing-dev.h>
40 #include <linux/seq_file.h>
41 #include <linux/slab.h>
42 #include <linux/magic.h>
43 #include <linux/spinlock.h>
44 #include <linux/string.h>
45 #include <linux/sort.h>
46 #include <asm/atomic.h>
48 /* Generate an array of cgroup subsystem pointers */
49 #define SUBSYS(_x) &_x ## _subsys,
51 static struct cgroup_subsys
*subsys
[] = {
52 #include <linux/cgroup_subsys.h>
56 * A cgroupfs_root represents the root of a cgroup hierarchy,
57 * and may be associated with a superblock to form an active
60 struct cgroupfs_root
{
61 struct super_block
*sb
;
64 * The bitmask of subsystems intended to be attached to this
67 unsigned long subsys_bits
;
69 /* The bitmask of subsystems currently attached to this hierarchy */
70 unsigned long actual_subsys_bits
;
72 /* A list running through the attached subsystems */
73 struct list_head subsys_list
;
75 /* The root cgroup for this hierarchy */
76 struct cgroup top_cgroup
;
78 /* Tracks how many cgroups are currently defined in hierarchy.*/
79 int number_of_cgroups
;
81 /* A list running through the mounted hierarchies */
82 struct list_head root_list
;
84 /* Hierarchy-specific flags */
90 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
91 * subsystems that are otherwise unattached - it never has more than a
92 * single cgroup, and all tasks are part of that cgroup.
94 static struct cgroupfs_root rootnode
;
96 /* The list of hierarchy roots */
98 static LIST_HEAD(roots
);
99 static int root_count
;
101 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
102 #define dummytop (&rootnode.top_cgroup)
104 /* This flag indicates whether tasks in the fork and exit paths should
105 * take callback_mutex and check for fork/exit handlers to call. This
106 * avoids us having to do extra work in the fork/exit path if none of the
107 * subsystems need to be called.
109 static int need_forkexit_callback
;
111 /* bits in struct cgroup flags field */
116 /* convenient tests for these bits */
117 inline int cgroup_is_removed(const struct cgroup
*cont
)
119 return test_bit(CONT_REMOVED
, &cont
->flags
);
122 /* bits in struct cgroupfs_root flags field */
124 ROOT_NOPREFIX
, /* mounted subsystems have no named prefix */
/*
 * for_each_subsys() iterates over each subsystem attached to an active
 * hierarchy.
 *
 * @_root: pointer expression yielding the struct cgroupfs_root to walk
 * @_ss:   struct cgroup_subsys * cursor, advanced along the 'sibling' links
 *
 * _root is parenthesised before '->' is applied so that any pointer
 * expression (for example '&some_root' or a conditional) can be passed,
 * not just a plain identifier.
 */
#define for_each_subsys(_root, _ss) \
	list_for_each_entry(_ss, &(_root)->subsys_list, sibling)
/*
 * for_each_root() walks every entry on the global 'roots' list, using
 * _root as the cursor and following each entry's root_list member.
 */
#define for_each_root(_root)					\
	list_for_each_entry(_root, &roots, root_list)
138 /* Link structure for associating css_set objects with cgroups */
139 struct cg_cgroup_link
{
141 * List running through cg_cgroup_links associated with a
142 * cgroup, anchored on cgroup->css_sets
144 struct list_head cont_link_list
;
146 * List running through cg_cgroup_links pointing at a
147 * single css_set object, anchored on css_set->cg_links
149 struct list_head cg_link_list
;
153 /* The default css_set - used by init and its children prior to any
154 * hierarchies being mounted. It contains a pointer to the root state
155 * for each subsystem. Also used to anchor the list of css_sets. Not
156 * reference-counted, to improve performance when child cgroups
157 * haven't been created.
160 static struct css_set init_css_set
;
161 static struct cg_cgroup_link init_css_set_link
;
163 /* css_set_lock protects the list of css_set objects, and the
164 * chain of tasks off each css_set. Nests outside task->alloc_lock
165 * due to cgroup_iter_start() */
166 static DEFINE_RWLOCK(css_set_lock
);
167 static int css_set_count
;
169 /* We don't maintain the lists running through each css_set to its
170 * task until after the first call to cgroup_iter_start(). This
171 * reduces the fork()/exit() overhead for people who have cgroups
172 * compiled into their kernel but not actually in use */
173 static int use_task_css_set_links
;
175 /* When we create or destroy a css_set, the operation simply
176 * takes/releases a reference count on all the cgroups referenced
177 * by subsystems in this css_set. This can end up multiple-counting
178 * some cgroups, but that's OK - the ref-count is just a
179 * busy/not-busy indicator; ensuring that we only count each cgroup
180 * once would require taking a global lock to ensure that no
181 * subsystems moved between hierarchies while we were doing so.
183 * Possible TODO: decide at boot time based on the number of
184 * registered subsystems and the number of CPUs or NUMA nodes whether
185 * it's better for performance to ref-count every subsystem, or to
186 * take a global lock and only add one ref count to each hierarchy.
190 * unlink a css_set from the list and free it
192 static void release_css_set(struct kref
*k
)
194 struct css_set
*cg
= container_of(k
, struct css_set
, ref
);
197 write_lock(&css_set_lock
);
200 while (!list_empty(&cg
->cg_links
)) {
201 struct cg_cgroup_link
*link
;
202 link
= list_entry(cg
->cg_links
.next
,
203 struct cg_cgroup_link
, cg_link_list
);
204 list_del(&link
->cg_link_list
);
205 list_del(&link
->cont_link_list
);
208 write_unlock(&css_set_lock
);
209 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++)
210 atomic_dec(&cg
->subsys
[i
]->cgroup
->count
);
215 * refcounted get/put for css_set objects
217 static inline void get_css_set(struct css_set
*cg
)
222 static inline void put_css_set(struct css_set
*cg
)
224 kref_put(&cg
->ref
, release_css_set
);
228 * find_existing_css_set() is a helper for
229 * find_css_set(), and checks to see whether an existing
230 * css_set is suitable. This currently walks a linked-list for
231 * simplicity; a later patch will use a hash table for better
234 * oldcg: the cgroup group that we're using before the cgroup
237 * cont: the cgroup that we're moving into
239 * template: location in which to build the desired set of subsystem
240 * state objects for the new cgroup group
243 static struct css_set
*find_existing_css_set(
244 struct css_set
*oldcg
,
246 struct cgroup_subsys_state
*template[])
249 struct cgroupfs_root
*root
= cont
->root
;
250 struct list_head
*l
= &init_css_set
.list
;
252 /* Build the set of subsystem state objects that we want to
253 * see in the new css_set */
254 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
255 if (root
->subsys_bits
& (1ull << i
)) {
256 /* Subsystem is in this hierarchy. So we want
257 * the subsystem state from the new
259 template[i
] = cont
->subsys
[i
];
261 /* Subsystem is not in this hierarchy, so we
262 * don't want to change the subsystem state */
263 template[i
] = oldcg
->subsys
[i
];
267 /* Look through existing cgroup groups to find one to reuse */
270 list_entry(l
, struct css_set
, list
);
272 if (!memcmp(template, cg
->subsys
, sizeof(cg
->subsys
))) {
273 /* All subsystems matched */
276 /* Try the next cgroup group */
278 } while (l
!= &init_css_set
.list
);
280 /* No existing cgroup group matched */
285 * allocate_cg_links() allocates "count" cg_cgroup_link structures
286 * and chains them on tmp through their cont_link_list fields. Returns 0 on
287 * success or a negative error
290 static int allocate_cg_links(int count
, struct list_head
*tmp
)
292 struct cg_cgroup_link
*link
;
295 for (i
= 0; i
< count
; i
++) {
296 link
= kmalloc(sizeof(*link
), GFP_KERNEL
);
298 while (!list_empty(tmp
)) {
299 link
= list_entry(tmp
->next
,
300 struct cg_cgroup_link
,
302 list_del(&link
->cont_link_list
);
307 list_add(&link
->cont_link_list
, tmp
);
/*
 * free_cg_links - drain a temporary list of cg_cgroup_link structures.
 * tmp: head of the list to empty.
 *
 * While 'tmp' is non-empty, takes the entry at tmp->next as a
 * struct cg_cgroup_link and unlinks it through its cont_link_list field.
 *
 * NOTE(review): the member argument of list_entry() and whatever is done
 * with 'link' after list_del() are not visible in this extract - confirm
 * against the full source.
 */
312 static void free_cg_links(struct list_head
*tmp
)
314 while (!list_empty(tmp
)) {
315 struct cg_cgroup_link
*link
;
/* Always operate on the current first element of the list. */
316 link
= list_entry(tmp
->next
,
317 struct cg_cgroup_link
,
319 list_del(&link
->cont_link_list
);
325 * find_css_set() takes an existing cgroup group and a
326 * cgroup object, and returns a css_set object that's
327 * equivalent to the old group, but with the given cgroup
328 * substituted into the appropriate hierarchy. Must be called with
332 static struct css_set
*find_css_set(
333 struct css_set
*oldcg
, struct cgroup
*cont
)
336 struct cgroup_subsys_state
*template[CGROUP_SUBSYS_COUNT
];
339 struct list_head tmp_cg_links
;
340 struct cg_cgroup_link
*link
;
342 /* First see if we already have a cgroup group that matches
344 write_lock(&css_set_lock
);
345 res
= find_existing_css_set(oldcg
, cont
, template);
348 write_unlock(&css_set_lock
);
353 res
= kmalloc(sizeof(*res
), GFP_KERNEL
);
357 /* Allocate all the cg_cgroup_link objects that we'll need */
358 if (allocate_cg_links(root_count
, &tmp_cg_links
) < 0) {
363 kref_init(&res
->ref
);
364 INIT_LIST_HEAD(&res
->cg_links
);
365 INIT_LIST_HEAD(&res
->tasks
);
367 /* Copy the set of subsystem state objects generated in
368 * find_existing_css_set() */
369 memcpy(res
->subsys
, template, sizeof(res
->subsys
));
371 write_lock(&css_set_lock
);
372 /* Add reference counts and links from the new css_set. */
373 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
374 struct cgroup
*cont
= res
->subsys
[i
]->cgroup
;
375 struct cgroup_subsys
*ss
= subsys
[i
];
376 atomic_inc(&cont
->count
);
378 * We want to add a link once per cgroup, so we
379 * only do it for the first subsystem in each
382 if (ss
->root
->subsys_list
.next
== &ss
->sibling
) {
383 BUG_ON(list_empty(&tmp_cg_links
));
384 link
= list_entry(tmp_cg_links
.next
,
385 struct cg_cgroup_link
,
387 list_del(&link
->cont_link_list
);
388 list_add(&link
->cont_link_list
, &cont
->css_sets
);
390 list_add(&link
->cg_link_list
, &res
->cg_links
);
393 if (list_empty(&rootnode
.subsys_list
)) {
394 link
= list_entry(tmp_cg_links
.next
,
395 struct cg_cgroup_link
,
397 list_del(&link
->cont_link_list
);
398 list_add(&link
->cont_link_list
, &dummytop
->css_sets
);
400 list_add(&link
->cg_link_list
, &res
->cg_links
);
403 BUG_ON(!list_empty(&tmp_cg_links
));
405 /* Link this cgroup group into the list */
406 list_add(&res
->list
, &init_css_set
.list
);
408 INIT_LIST_HEAD(&res
->tasks
);
409 write_unlock(&css_set_lock
);
415 * There is one global cgroup mutex. We also require taking
416 * task_lock() when dereferencing a task's cgroup subsys pointers.
417 * See "The task_lock() exception", at the end of this comment.
419 * A task must hold cgroup_mutex to modify cgroups.
421 * Any task can increment and decrement the count field without lock.
422 * So in general, code holding cgroup_mutex can't rely on the count
423 * field not changing. However, if the count goes to zero, then only
424 * attach_task() can increment it again. Because a count of zero
425 * means that no tasks are currently attached, therefore there is no
426 * way a task attached to that cgroup can fork (the other way to
427 * increment the count). So code holding cgroup_mutex can safely
428 * assume that if the count is zero, it will stay zero. Similarly, if
429 * a task holds cgroup_mutex on a cgroup with zero count, it
430 * knows that the cgroup won't be removed, as cgroup_rmdir()
433 * The cgroup_common_file_write handler for operations that modify
434 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
435 * single threading all such cgroup modifications across the system.
437 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
438 * (usually) take cgroup_mutex. These are the two most performance
439 * critical pieces of code here. The exception occurs on cgroup_exit(),
440 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
441 * is taken, and if the cgroup count is zero, a usermode call made
442 * to /sbin/cgroup_release_agent with the name of the cgroup (path
443 * relative to the root of cgroup file system) as the argument.
445 * A cgroup can only be deleted if both its 'count' of using tasks
446 * is zero, and its list of 'children' cgroups is empty. Since all
447 * tasks in the system use _some_ cgroup, and since there is always at
448 * least one task in the system (init, pid == 1), therefore, top_cgroup
449 * always has either children cgroups and/or using tasks. So we don't
450 * need a special hack to ensure that top_cgroup cannot be deleted.
452 * The task_lock() exception
454 * The need for this exception arises from the action of
455 * attach_task(), which overwrites one task's cgroup pointer with
456 * another. It does so using cgroup_mutex, however there are
457 * several performance critical places that need to reference
458 * task->cgroup without the expense of grabbing a system global
459 * mutex. Therefore except as noted below, when dereferencing or, as
460 * in attach_task(), modifying a task's cgroup pointer we use
461 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
462 * the task_struct routinely used for such matters.
464 * P.S. One more locking exception. RCU is used to guard the
465 * update of a task's cgroup pointer by attach_task()
468 static DEFINE_MUTEX(cgroup_mutex
);
471 * cgroup_lock - lock out any changes to cgroup structures
475 void cgroup_lock(void)
477 mutex_lock(&cgroup_mutex
);
481 * cgroup_unlock - release lock on cgroup changes
483 * Undo the lock taken in a previous cgroup_lock() call.
486 void cgroup_unlock(void)
488 mutex_unlock(&cgroup_mutex
);
492 * A couple of forward declarations required, due to cyclic reference loop:
493 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
494 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
498 static int cgroup_mkdir(struct inode
*dir
, struct dentry
*dentry
, int mode
);
499 static int cgroup_rmdir(struct inode
*unused_dir
, struct dentry
*dentry
);
500 static int cgroup_populate_dir(struct cgroup
*cont
);
501 static struct inode_operations cgroup_dir_inode_operations
;
502 static struct file_operations proc_cgroupstats_operations
;
504 static struct backing_dev_info cgroup_backing_dev_info
= {
505 .capabilities
= BDI_CAP_NO_ACCT_DIRTY
| BDI_CAP_NO_WRITEBACK
,
508 static struct inode
*cgroup_new_inode(mode_t mode
, struct super_block
*sb
)
510 struct inode
*inode
= new_inode(sb
);
513 inode
->i_mode
= mode
;
514 inode
->i_uid
= current
->fsuid
;
515 inode
->i_gid
= current
->fsgid
;
517 inode
->i_atime
= inode
->i_mtime
= inode
->i_ctime
= CURRENT_TIME
;
518 inode
->i_mapping
->backing_dev_info
= &cgroup_backing_dev_info
;
523 static void cgroup_diput(struct dentry
*dentry
, struct inode
*inode
)
525 /* is dentry a directory ? if so, kfree() associated cgroup */
526 if (S_ISDIR(inode
->i_mode
)) {
527 struct cgroup
*cont
= dentry
->d_fsdata
;
528 BUG_ON(!(cgroup_is_removed(cont
)));
/*
 * remove_dir - remove the directory dentry 'd' from its parent.
 * d: dentry of the directory to remove.
 *
 * Takes a reference on d->d_parent with dget() and calls simple_rmdir()
 * on the parent's inode for 'd'.
 *
 * NOTE(review): only these two statements are visible in this extract;
 * confirm against the full source that the dget() on the parent is
 * balanced and whether anything else is done to 'd'.
 */
534 static void remove_dir(struct dentry
*d
)
/* Pin the parent dentry for the duration of the removal. */
536 struct dentry
*parent
= dget(d
->d_parent
);
539 simple_rmdir(parent
->d_inode
, d
);
543 static void cgroup_clear_directory(struct dentry
*dentry
)
545 struct list_head
*node
;
547 BUG_ON(!mutex_is_locked(&dentry
->d_inode
->i_mutex
));
548 spin_lock(&dcache_lock
);
549 node
= dentry
->d_subdirs
.next
;
550 while (node
!= &dentry
->d_subdirs
) {
551 struct dentry
*d
= list_entry(node
, struct dentry
, d_u
.d_child
);
554 /* This should never be called on a cgroup
555 * directory with child cgroups */
556 BUG_ON(d
->d_inode
->i_mode
& S_IFDIR
);
558 spin_unlock(&dcache_lock
);
560 simple_unlink(dentry
->d_inode
, d
);
562 spin_lock(&dcache_lock
);
564 node
= dentry
->d_subdirs
.next
;
566 spin_unlock(&dcache_lock
);
570 * NOTE : the dentry must have been dget()'ed
572 static void cgroup_d_remove_dir(struct dentry
*dentry
)
574 cgroup_clear_directory(dentry
);
576 spin_lock(&dcache_lock
);
577 list_del_init(&dentry
->d_u
.d_child
);
578 spin_unlock(&dcache_lock
);
582 static int rebind_subsystems(struct cgroupfs_root
*root
,
583 unsigned long final_bits
)
585 unsigned long added_bits
, removed_bits
;
586 struct cgroup
*cont
= &root
->top_cgroup
;
589 removed_bits
= root
->actual_subsys_bits
& ~final_bits
;
590 added_bits
= final_bits
& ~root
->actual_subsys_bits
;
591 /* Check that any added subsystems are currently free */
592 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
593 unsigned long long bit
= 1ull << i
;
594 struct cgroup_subsys
*ss
= subsys
[i
];
595 if (!(bit
& added_bits
))
597 if (ss
->root
!= &rootnode
) {
598 /* Subsystem isn't free */
603 /* Currently we don't handle adding/removing subsystems when
604 * any child cgroups exist. This is theoretically supportable
605 * but involves complex error handling, so it's being left until
607 if (!list_empty(&cont
->children
))
610 /* Process each subsystem */
611 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
612 struct cgroup_subsys
*ss
= subsys
[i
];
613 unsigned long bit
= 1UL << i
;
614 if (bit
& added_bits
) {
615 /* We're binding this subsystem to this hierarchy */
616 BUG_ON(cont
->subsys
[i
]);
617 BUG_ON(!dummytop
->subsys
[i
]);
618 BUG_ON(dummytop
->subsys
[i
]->cgroup
!= dummytop
);
619 cont
->subsys
[i
] = dummytop
->subsys
[i
];
620 cont
->subsys
[i
]->cgroup
= cont
;
621 list_add(&ss
->sibling
, &root
->subsys_list
);
622 rcu_assign_pointer(ss
->root
, root
);
626 } else if (bit
& removed_bits
) {
627 /* We're removing this subsystem */
628 BUG_ON(cont
->subsys
[i
] != dummytop
->subsys
[i
]);
629 BUG_ON(cont
->subsys
[i
]->cgroup
!= cont
);
631 ss
->bind(ss
, dummytop
);
632 dummytop
->subsys
[i
]->cgroup
= dummytop
;
633 cont
->subsys
[i
] = NULL
;
634 rcu_assign_pointer(subsys
[i
]->root
, &rootnode
);
635 list_del(&ss
->sibling
);
636 } else if (bit
& final_bits
) {
637 /* Subsystem state should already exist */
638 BUG_ON(!cont
->subsys
[i
]);
640 /* Subsystem state shouldn't exist */
641 BUG_ON(cont
->subsys
[i
]);
644 root
->subsys_bits
= root
->actual_subsys_bits
= final_bits
;
/*
 * cgroup_show_options - emit the mount options of a cgroup hierarchy.
 * seq: seq_file to write into.
 * vfs: mount whose superblock's s_fs_info holds the cgroupfs_root.
 *
 * Installed as cgroup_ops.show_options.  Under cgroup_mutex, writes
 * ",<name>" for every subsystem on the root's subsys_list, followed by
 * ",noprefix" when ROOT_NOPREFIX is set in root->flags.
 *
 * NOTE(review): the return statement is not visible in this extract.
 */
650 static int cgroup_show_options(struct seq_file
*seq
, struct vfsmount
*vfs
)
652 struct cgroupfs_root
*root
= vfs
->mnt_sb
->s_fs_info
;
653 struct cgroup_subsys
*ss
;
/* cgroup_mutex is held across the walk of the subsystem list. */
655 mutex_lock(&cgroup_mutex
);
656 for_each_subsys(root
, ss
)
657 seq_printf(seq
, ",%s", ss
->name
);
658 if (test_bit(ROOT_NOPREFIX
, &root
->flags
))
659 seq_puts(seq
, ",noprefix");
660 mutex_unlock(&cgroup_mutex
);
664 struct cgroup_sb_opts
{
665 unsigned long subsys_bits
;
669 /* Convert a hierarchy specifier into a bitmask of subsystems and
671 static int parse_cgroupfs_options(char *data
,
672 struct cgroup_sb_opts
*opts
)
674 char *token
, *o
= data
?: "all";
676 opts
->subsys_bits
= 0;
679 while ((token
= strsep(&o
, ",")) != NULL
) {
682 if (!strcmp(token
, "all")) {
683 opts
->subsys_bits
= (1 << CGROUP_SUBSYS_COUNT
) - 1;
684 } else if (!strcmp(token
, "noprefix")) {
685 set_bit(ROOT_NOPREFIX
, &opts
->flags
);
687 struct cgroup_subsys
*ss
;
689 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
691 if (!strcmp(token
, ss
->name
)) {
692 set_bit(i
, &opts
->subsys_bits
);
696 if (i
== CGROUP_SUBSYS_COUNT
)
701 /* We can't have an empty hierarchy */
702 if (!opts
->subsys_bits
)
708 static int cgroup_remount(struct super_block
*sb
, int *flags
, char *data
)
711 struct cgroupfs_root
*root
= sb
->s_fs_info
;
712 struct cgroup
*cont
= &root
->top_cgroup
;
713 struct cgroup_sb_opts opts
;
715 mutex_lock(&cont
->dentry
->d_inode
->i_mutex
);
716 mutex_lock(&cgroup_mutex
);
718 /* See what subsystems are wanted */
719 ret
= parse_cgroupfs_options(data
, &opts
);
723 /* Don't allow flags to change at remount */
724 if (opts
.flags
!= root
->flags
) {
729 ret
= rebind_subsystems(root
, opts
.subsys_bits
);
731 /* (re)populate subsystem files */
733 cgroup_populate_dir(cont
);
736 mutex_unlock(&cgroup_mutex
);
737 mutex_unlock(&cont
->dentry
->d_inode
->i_mutex
);
741 static struct super_operations cgroup_ops
= {
742 .statfs
= simple_statfs
,
743 .drop_inode
= generic_delete_inode
,
744 .show_options
= cgroup_show_options
,
745 .remount_fs
= cgroup_remount
,
748 static void init_cgroup_root(struct cgroupfs_root
*root
)
750 struct cgroup
*cont
= &root
->top_cgroup
;
751 INIT_LIST_HEAD(&root
->subsys_list
);
752 INIT_LIST_HEAD(&root
->root_list
);
753 root
->number_of_cgroups
= 1;
755 cont
->top_cgroup
= cont
;
756 INIT_LIST_HEAD(&cont
->sibling
);
757 INIT_LIST_HEAD(&cont
->children
);
758 INIT_LIST_HEAD(&cont
->css_sets
);
/*
 * cgroup_test_super - superblock comparison callback handed to sget()
 * by cgroup_get_sb().
 * sb:   an existing superblock; its s_fs_info is read as a cgroupfs_root.
 * data: the candidate cgroupfs_root being mounted.
 *
 * Compares the candidate with the existing root, first on subsys_bits
 * and then on flags.
 *
 * NOTE(review): the values returned for a mismatch and for a match are
 * not visible in this extract - confirm against the full source.
 */
761 static int cgroup_test_super(struct super_block
*sb
, void *data
)
763 struct cgroupfs_root
*new = data
;
764 struct cgroupfs_root
*root
= sb
->s_fs_info
;
766 /* First check subsystems */
767 if (new->subsys_bits
!= root
->subsys_bits
)
770 /* Next check flags */
771 if (new->flags
!= root
->flags
)
777 static int cgroup_set_super(struct super_block
*sb
, void *data
)
780 struct cgroupfs_root
*root
= data
;
782 ret
= set_anon_super(sb
, NULL
);
786 sb
->s_fs_info
= root
;
789 sb
->s_blocksize
= PAGE_CACHE_SIZE
;
790 sb
->s_blocksize_bits
= PAGE_CACHE_SHIFT
;
791 sb
->s_magic
= CGROUP_SUPER_MAGIC
;
792 sb
->s_op
= &cgroup_ops
;
797 static int cgroup_get_rootdir(struct super_block
*sb
)
799 struct inode
*inode
=
800 cgroup_new_inode(S_IFDIR
| S_IRUGO
| S_IXUGO
| S_IWUSR
, sb
);
801 struct dentry
*dentry
;
806 inode
->i_op
= &simple_dir_inode_operations
;
807 inode
->i_fop
= &simple_dir_operations
;
808 inode
->i_op
= &cgroup_dir_inode_operations
;
809 /* directories start off with i_nlink == 2 (for "." entry) */
811 dentry
= d_alloc_root(inode
);
820 static int cgroup_get_sb(struct file_system_type
*fs_type
,
821 int flags
, const char *unused_dev_name
,
822 void *data
, struct vfsmount
*mnt
)
824 struct cgroup_sb_opts opts
;
826 struct super_block
*sb
;
827 struct cgroupfs_root
*root
;
828 struct list_head tmp_cg_links
, *l
;
829 INIT_LIST_HEAD(&tmp_cg_links
);
831 /* First find the desired set of subsystems */
832 ret
= parse_cgroupfs_options(data
, &opts
);
836 root
= kzalloc(sizeof(*root
), GFP_KERNEL
);
840 init_cgroup_root(root
);
841 root
->subsys_bits
= opts
.subsys_bits
;
842 root
->flags
= opts
.flags
;
844 sb
= sget(fs_type
, cgroup_test_super
, cgroup_set_super
, root
);
851 if (sb
->s_fs_info
!= root
) {
852 /* Reusing an existing superblock */
853 BUG_ON(sb
->s_root
== NULL
);
858 struct cgroup
*cont
= &root
->top_cgroup
;
861 BUG_ON(sb
->s_root
!= NULL
);
863 ret
= cgroup_get_rootdir(sb
);
866 inode
= sb
->s_root
->d_inode
;
868 mutex_lock(&inode
->i_mutex
);
869 mutex_lock(&cgroup_mutex
);
872 * We're accessing css_set_count without locking
873 * css_set_lock here, but that's OK - it can only be
874 * increased by someone holding cgroup_lock, and
875 * that's us. The worst that can happen is that we
876 * have some link structures left over
878 ret
= allocate_cg_links(css_set_count
, &tmp_cg_links
);
880 mutex_unlock(&cgroup_mutex
);
881 mutex_unlock(&inode
->i_mutex
);
885 ret
= rebind_subsystems(root
, root
->subsys_bits
);
887 mutex_unlock(&cgroup_mutex
);
888 mutex_unlock(&inode
->i_mutex
);
892 /* EBUSY should be the only error here */
895 list_add(&root
->root_list
, &roots
);
898 sb
->s_root
->d_fsdata
= &root
->top_cgroup
;
899 root
->top_cgroup
.dentry
= sb
->s_root
;
901 /* Link the top cgroup in this hierarchy into all
902 * the css_set objects */
903 write_lock(&css_set_lock
);
904 l
= &init_css_set
.list
;
907 struct cg_cgroup_link
*link
;
908 cg
= list_entry(l
, struct css_set
, list
);
909 BUG_ON(list_empty(&tmp_cg_links
));
910 link
= list_entry(tmp_cg_links
.next
,
911 struct cg_cgroup_link
,
913 list_del(&link
->cont_link_list
);
915 list_add(&link
->cont_link_list
,
916 &root
->top_cgroup
.css_sets
);
917 list_add(&link
->cg_link_list
, &cg
->cg_links
);
919 } while (l
!= &init_css_set
.list
);
920 write_unlock(&css_set_lock
);
922 free_cg_links(&tmp_cg_links
);
924 BUG_ON(!list_empty(&cont
->sibling
));
925 BUG_ON(!list_empty(&cont
->children
));
926 BUG_ON(root
->number_of_cgroups
!= 1);
928 cgroup_populate_dir(cont
);
929 mutex_unlock(&inode
->i_mutex
);
930 mutex_unlock(&cgroup_mutex
);
933 return simple_set_mnt(mnt
, sb
);
936 up_write(&sb
->s_umount
);
937 deactivate_super(sb
);
938 free_cg_links(&tmp_cg_links
);
942 static void cgroup_kill_sb(struct super_block
*sb
) {
943 struct cgroupfs_root
*root
= sb
->s_fs_info
;
944 struct cgroup
*cont
= &root
->top_cgroup
;
949 BUG_ON(root
->number_of_cgroups
!= 1);
950 BUG_ON(!list_empty(&cont
->children
));
951 BUG_ON(!list_empty(&cont
->sibling
));
953 mutex_lock(&cgroup_mutex
);
955 /* Rebind all subsystems back to the default hierarchy */
956 ret
= rebind_subsystems(root
, 0);
957 /* Shouldn't be able to fail ... */
961 * Release all the links from css_sets to this hierarchy's
964 write_lock(&css_set_lock
);
965 while (!list_empty(&cont
->css_sets
)) {
966 struct cg_cgroup_link
*link
;
967 link
= list_entry(cont
->css_sets
.next
,
968 struct cg_cgroup_link
, cont_link_list
);
969 list_del(&link
->cg_link_list
);
970 list_del(&link
->cont_link_list
);
973 write_unlock(&css_set_lock
);
975 if (!list_empty(&root
->root_list
)) {
976 list_del(&root
->root_list
);
979 mutex_unlock(&cgroup_mutex
);
982 kill_litter_super(sb
);
985 static struct file_system_type cgroup_fs_type
= {
987 .get_sb
= cgroup_get_sb
,
988 .kill_sb
= cgroup_kill_sb
,
991 static inline struct cgroup
*__d_cont(struct dentry
*dentry
)
993 return dentry
->d_fsdata
;
996 static inline struct cftype
*__d_cft(struct dentry
*dentry
)
998 return dentry
->d_fsdata
;
1002 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1003 * Returns 0 on success, -errno on error.
1005 int cgroup_path(const struct cgroup
*cont
, char *buf
, int buflen
)
1009 if (cont
== dummytop
) {
1011 * Inactive subsystems have no dentry for their root
1018 start
= buf
+ buflen
;
1022 int len
= cont
->dentry
->d_name
.len
;
1023 if ((start
-= len
) < buf
)
1024 return -ENAMETOOLONG
;
1025 memcpy(start
, cont
->dentry
->d_name
.name
, len
);
1026 cont
= cont
->parent
;
1032 return -ENAMETOOLONG
;
1035 memmove(buf
, start
, buf
+ buflen
- start
);
1040 * Return the first subsystem attached to a cgroup's hierarchy, and
1044 static void get_first_subsys(const struct cgroup
*cont
,
1045 struct cgroup_subsys_state
**css
, int *subsys_id
)
1047 const struct cgroupfs_root
*root
= cont
->root
;
1048 const struct cgroup_subsys
*test_ss
;
1049 BUG_ON(list_empty(&root
->subsys_list
));
1050 test_ss
= list_entry(root
->subsys_list
.next
,
1051 struct cgroup_subsys
, sibling
);
1053 *css
= cont
->subsys
[test_ss
->subsys_id
];
1057 *subsys_id
= test_ss
->subsys_id
;
1061 * Attach task 'tsk' to cgroup 'cont'
1063 * Call holding cgroup_mutex. May take task_lock of
1064 * the task 'tsk' during call.
1066 static int attach_task(struct cgroup
*cont
, struct task_struct
*tsk
)
1069 struct cgroup_subsys
*ss
;
1070 struct cgroup
*oldcont
;
1071 struct css_set
*cg
= tsk
->cgroups
;
1072 struct css_set
*newcg
;
1073 struct cgroupfs_root
*root
= cont
->root
;
1076 get_first_subsys(cont
, NULL
, &subsys_id
);
1078 /* Nothing to do if the task is already in that cgroup */
1079 oldcont
= task_cgroup(tsk
, subsys_id
);
1080 if (cont
== oldcont
)
1083 for_each_subsys(root
, ss
) {
1084 if (ss
->can_attach
) {
1085 retval
= ss
->can_attach(ss
, cont
, tsk
);
1093 * Locate or allocate a new css_set for this task,
1094 * based on its final set of cgroups
1096 newcg
= find_css_set(cg
, cont
);
1102 if (tsk
->flags
& PF_EXITING
) {
1107 rcu_assign_pointer(tsk
->cgroups
, newcg
);
1110 /* Update the css_set linked lists if we're using them */
1111 write_lock(&css_set_lock
);
1112 if (!list_empty(&tsk
->cg_list
)) {
1113 list_del(&tsk
->cg_list
);
1114 list_add(&tsk
->cg_list
, &newcg
->tasks
);
1116 write_unlock(&css_set_lock
);
1118 for_each_subsys(root
, ss
) {
1120 ss
->attach(ss
, cont
, oldcont
, tsk
);
1130 * Attach task with pid 'pid' to cgroup 'cont'. Call with
1131 * cgroup_mutex, may take task_lock of task
1133 static int attach_task_by_pid(struct cgroup
*cont
, char *pidbuf
)
1136 struct task_struct
*tsk
;
1139 if (sscanf(pidbuf
, "%d", &pid
) != 1)
1144 tsk
= find_task_by_pid(pid
);
1145 if (!tsk
|| tsk
->flags
& PF_EXITING
) {
1149 get_task_struct(tsk
);
1152 if ((current
->euid
) && (current
->euid
!= tsk
->uid
)
1153 && (current
->euid
!= tsk
->suid
)) {
1154 put_task_struct(tsk
);
1159 get_task_struct(tsk
);
1162 ret
= attach_task(cont
, tsk
);
1163 put_task_struct(tsk
);
1167 /* The various types of files and directories in a cgroup file system */
1169 enum cgroup_filetype
{
1175 static ssize_t
cgroup_write_uint(struct cgroup
*cont
, struct cftype
*cft
,
1177 const char __user
*userbuf
,
1178 size_t nbytes
, loff_t
*unused_ppos
)
1187 if (nbytes
>= sizeof(buffer
))
1189 if (copy_from_user(buffer
, userbuf
, nbytes
))
1192 buffer
[nbytes
] = 0; /* nul-terminate */
1194 /* strip newline if necessary */
1195 if (nbytes
&& (buffer
[nbytes
-1] == '\n'))
1196 buffer
[nbytes
-1] = 0;
1197 val
= simple_strtoull(buffer
, &end
, 0);
1201 /* Pass to subsystem */
1202 retval
= cft
->write_uint(cont
, cft
, val
);
1208 static ssize_t
cgroup_common_file_write(struct cgroup
*cont
,
1211 const char __user
*userbuf
,
1212 size_t nbytes
, loff_t
*unused_ppos
)
1214 enum cgroup_filetype type
= cft
->private;
1218 if (nbytes
>= PATH_MAX
)
1221 /* +1 for nul-terminator */
1222 buffer
= kmalloc(nbytes
+ 1, GFP_KERNEL
);
1226 if (copy_from_user(buffer
, userbuf
, nbytes
)) {
1230 buffer
[nbytes
] = 0; /* nul-terminate */
1232 mutex_lock(&cgroup_mutex
);
1234 if (cgroup_is_removed(cont
)) {
1241 retval
= attach_task_by_pid(cont
, buffer
);
1251 mutex_unlock(&cgroup_mutex
);
1257 static ssize_t
cgroup_file_write(struct file
*file
, const char __user
*buf
,
1258 size_t nbytes
, loff_t
*ppos
)
1260 struct cftype
*cft
= __d_cft(file
->f_dentry
);
1261 struct cgroup
*cont
= __d_cont(file
->f_dentry
->d_parent
);
1266 return cft
->write(cont
, cft
, file
, buf
, nbytes
, ppos
);
1267 if (cft
->write_uint
)
1268 return cgroup_write_uint(cont
, cft
, file
, buf
, nbytes
, ppos
);
1272 static ssize_t
cgroup_read_uint(struct cgroup
*cont
, struct cftype
*cft
,
1274 char __user
*buf
, size_t nbytes
,
1278 u64 val
= cft
->read_uint(cont
, cft
);
1279 int len
= sprintf(tmp
, "%llu\n", (unsigned long long) val
);
1281 return simple_read_from_buffer(buf
, nbytes
, ppos
, tmp
, len
);
1284 static ssize_t
cgroup_file_read(struct file
*file
, char __user
*buf
,
1285 size_t nbytes
, loff_t
*ppos
)
1287 struct cftype
*cft
= __d_cft(file
->f_dentry
);
1288 struct cgroup
*cont
= __d_cont(file
->f_dentry
->d_parent
);
1294 return cft
->read(cont
, cft
, file
, buf
, nbytes
, ppos
);
1296 return cgroup_read_uint(cont
, cft
, file
, buf
, nbytes
, ppos
);
1300 static int cgroup_file_open(struct inode
*inode
, struct file
*file
)
1305 err
= generic_file_open(inode
, file
);
1309 cft
= __d_cft(file
->f_dentry
);
1313 err
= cft
->open(inode
, file
);
/*
 * cgroup_file_release - release handler installed in
 * cgroup_file_operations.
 * inode: inode of the file being released.
 * file:  the open file; its dentry's d_fsdata is read as the cftype.
 *
 * Looks up the cftype via __d_cft() and returns the result of its
 * release() callback.
 *
 * NOTE(review): any guard for a cftype without a release() callback is
 * not visible in this extract - confirm against the full source.
 */
1320 static int cgroup_file_release(struct inode
*inode
, struct file
*file
)
1322 struct cftype
*cft
= __d_cft(file
->f_dentry
);
1324 return cft
->release(inode
, file
);
1329 * cgroup_rename - Only allow simple rename of directories in place.
/*
 * Installed as cgroup_dir_inode_operations.rename.  Performs three
 * checks and then delegates to simple_rename():
 *   1. old_dentry's inode must be a directory;
 *   2. new_dentry must not already have an inode;
 *   3. old_dir and new_dir must be the same inode.
 *
 * NOTE(review): the statement taken when each check fails is not
 * visible in this extract - confirm the error codes in the full source.
 */
1331 static int cgroup_rename(struct inode
*old_dir
, struct dentry
*old_dentry
,
1332 struct inode
*new_dir
, struct dentry
*new_dentry
)
/* Check 1: the object being renamed is a directory. */
1334 if (!S_ISDIR(old_dentry
->d_inode
->i_mode
))
/* Check 2: the target name is not already backed by an inode. */
1336 if (new_dentry
->d_inode
)
/* Check 3: source and target parent directories are the same. */
1338 if (old_dir
!= new_dir
)
1340 return simple_rename(old_dir
, old_dentry
, new_dir
, new_dentry
);
1343 static struct file_operations cgroup_file_operations
= {
1344 .read
= cgroup_file_read
,
1345 .write
= cgroup_file_write
,
1346 .llseek
= generic_file_llseek
,
1347 .open
= cgroup_file_open
,
1348 .release
= cgroup_file_release
,
1351 static struct inode_operations cgroup_dir_inode_operations
= {
1352 .lookup
= simple_lookup
,
1353 .mkdir
= cgroup_mkdir
,
1354 .rmdir
= cgroup_rmdir
,
1355 .rename
= cgroup_rename
,
1358 static int cgroup_create_file(struct dentry
*dentry
, int mode
,
1359 struct super_block
*sb
)
1361 static struct dentry_operations cgroup_dops
= {
1362 .d_iput
= cgroup_diput
,
1365 struct inode
*inode
;
1369 if (dentry
->d_inode
)
1372 inode
= cgroup_new_inode(mode
, sb
);
1376 if (S_ISDIR(mode
)) {
1377 inode
->i_op
= &cgroup_dir_inode_operations
;
1378 inode
->i_fop
= &simple_dir_operations
;
1380 /* start off with i_nlink == 2 (for "." entry) */
1383 /* start with the directory inode held, so that we can
1384 * populate it without racing with another mkdir */
1385 mutex_lock_nested(&inode
->i_mutex
, I_MUTEX_CHILD
);
1386 } else if (S_ISREG(mode
)) {
1388 inode
->i_fop
= &cgroup_file_operations
;
1390 dentry
->d_op
= &cgroup_dops
;
1391 d_instantiate(dentry
, inode
);
1392 dget(dentry
); /* Extra count - pin the dentry in core */
1397 * cgroup_create_dir - create a directory for an object.
1398 * cont: the cgroup we create the directory for.
1399 * It must have a valid ->parent field
1400 * And we are going to fill its ->dentry field.
1401 * dentry: dentry of the new container
1402 * mode: mode to set on new directory.
1404 static int cgroup_create_dir(struct cgroup
*cont
, struct dentry
*dentry
,
1407 struct dentry
*parent
;
1410 parent
= cont
->parent
->dentry
;
1411 error
= cgroup_create_file(dentry
, S_IFDIR
| mode
, cont
->root
->sb
);
1413 dentry
->d_fsdata
= cont
;
1414 inc_nlink(parent
->d_inode
);
1415 cont
->dentry
= dentry
;
1423 int cgroup_add_file(struct cgroup
*cont
,
1424 struct cgroup_subsys
*subsys
,
1425 const struct cftype
*cft
)
1427 struct dentry
*dir
= cont
->dentry
;
1428 struct dentry
*dentry
;
1431 char name
[MAX_CGROUP_TYPE_NAMELEN
+ MAX_CFTYPE_NAME
+ 2] = { 0 };
1432 if (subsys
&& !test_bit(ROOT_NOPREFIX
, &cont
->root
->flags
)) {
1433 strcpy(name
, subsys
->name
);
1436 strcat(name
, cft
->name
);
1437 BUG_ON(!mutex_is_locked(&dir
->d_inode
->i_mutex
));
1438 dentry
= lookup_one_len(name
, dir
, strlen(name
));
1439 if (!IS_ERR(dentry
)) {
1440 error
= cgroup_create_file(dentry
, 0644 | S_IFREG
,
1443 dentry
->d_fsdata
= (void *)cft
;
1446 error
= PTR_ERR(dentry
);
1450 int cgroup_add_files(struct cgroup
*cont
,
1451 struct cgroup_subsys
*subsys
,
1452 const struct cftype cft
[],
1456 for (i
= 0; i
< count
; i
++) {
1457 err
= cgroup_add_file(cont
, subsys
, &cft
[i
]);
1464 /* Count the number of tasks in a cgroup. */
1466 int cgroup_task_count(const struct cgroup
*cont
)
1469 struct list_head
*l
;
1471 read_lock(&css_set_lock
);
1472 l
= cont
->css_sets
.next
;
1473 while (l
!= &cont
->css_sets
) {
1474 struct cg_cgroup_link
*link
=
1475 list_entry(l
, struct cg_cgroup_link
, cont_link_list
);
1476 count
+= atomic_read(&link
->cg
->ref
.refcount
);
1479 read_unlock(&css_set_lock
);
1484 * Advance a list_head iterator. The iterator should be positioned at
1485 * the start of a css_set
1487 static void cgroup_advance_iter(struct cgroup
*cont
,
1488 struct cgroup_iter
*it
)
1490 struct list_head
*l
= it
->cg_link
;
1491 struct cg_cgroup_link
*link
;
1494 /* Advance to the next non-empty css_set */
1497 if (l
== &cont
->css_sets
) {
1501 link
= list_entry(l
, struct cg_cgroup_link
, cont_link_list
);
1503 } while (list_empty(&cg
->tasks
));
1505 it
->task
= cg
->tasks
.next
;
1508 void cgroup_iter_start(struct cgroup
*cont
, struct cgroup_iter
*it
)
1511 * The first time anyone tries to iterate across a cgroup,
1512 * we need to enable the list linking each css_set to its
1513 * tasks, and fix up all existing tasks.
1515 if (!use_task_css_set_links
) {
1516 struct task_struct
*p
, *g
;
1517 write_lock(&css_set_lock
);
1518 use_task_css_set_links
= 1;
1519 do_each_thread(g
, p
) {
1521 if (list_empty(&p
->cg_list
))
1522 list_add(&p
->cg_list
, &p
->cgroups
->tasks
);
1524 } while_each_thread(g
, p
);
1525 write_unlock(&css_set_lock
);
1527 read_lock(&css_set_lock
);
1528 it
->cg_link
= &cont
->css_sets
;
1529 cgroup_advance_iter(cont
, it
);
1532 struct task_struct
*cgroup_iter_next(struct cgroup
*cont
,
1533 struct cgroup_iter
*it
)
1535 struct task_struct
*res
;
1536 struct list_head
*l
= it
->task
;
1538 /* If the iterator cg is NULL, we have no tasks */
1541 res
= list_entry(l
, struct task_struct
, cg_list
);
1542 /* Advance iterator to find next entry */
1544 if (l
== &res
->cgroups
->tasks
) {
1545 /* We reached the end of this task list - move on to
1546 * the next cg_cgroup_link */
1547 cgroup_advance_iter(cont
, it
);
1554 void cgroup_iter_end(struct cgroup
*cont
, struct cgroup_iter
*it
)
1556 read_unlock(&css_set_lock
);
/*
 * Stuff for reading the 'tasks' file.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 * Upon tasks file open(), a struct ctr_struct is allocated, that
 * will have a pointer to an array (also allocated here).  The struct
 * ctr_struct * is stored in file->private_data.  Its resources will
 * be freed by release() when the file is closed.  The array is used
 * to sprintf the PIDs and then used by read().
 */
1579 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1580 * 'cont'. Return actual number of pids loaded. No need to
1581 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1582 * read section, so the css_set can't go away, and is
1583 * immutable after creation.
1585 static int pid_array_load(pid_t
*pidarray
, int npids
, struct cgroup
*cont
)
1588 struct cgroup_iter it
;
1589 struct task_struct
*tsk
;
1590 cgroup_iter_start(cont
, &it
);
1591 while ((tsk
= cgroup_iter_next(cont
, &it
))) {
1592 if (unlikely(n
== npids
))
1594 pidarray
[n
++] = pid_nr(task_pid(tsk
));
1596 cgroup_iter_end(cont
, &it
);
/*
 * sort() comparator for an array of pid_t: returns a negative, zero or
 * positive value when *a is less than, equal to or greater than *b.
 *
 * Compares instead of subtracting so the result cannot overflow for
 * operands of opposite sign, and reads through const pointers rather
 * than casting the qualifier away.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t lhs = *(const pid_t *)a;
	const pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
 * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
 * count 'cnt' of how many chars would be written if buf were large enough.
 *
 * Once the buffer is full the remaining pids are only measured (size 0),
 * and 'buf' itself is passed as the destination so that no pointer beyond
 * the end of the buffer is ever formed.
 */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
{
	int cnt = 0;
	int i;

	for (i = 0; i < npids; i++) {
		int room = sz - cnt;

		if (room > 0)
			cnt += snprintf(buf + cnt, room, "%d\n", a[i]);
		else
			cnt += snprintf(buf, 0, "%d\n", a[i]);
	}
	return cnt;
}
1621 * Handle an open on 'tasks' file. Prepare a buffer listing the
1622 * process id's of tasks currently attached to the cgroup being opened.
1624 * Does not require any specific cgroup mutexes, and does not take any.
1626 static int cgroup_tasks_open(struct inode
*unused
, struct file
*file
)
1628 struct cgroup
*cont
= __d_cont(file
->f_dentry
->d_parent
);
1629 struct ctr_struct
*ctr
;
1634 if (!(file
->f_mode
& FMODE_READ
))
1637 ctr
= kmalloc(sizeof(*ctr
), GFP_KERNEL
);
1642 * If cgroup gets more users after we read count, we won't have
1643 * enough space - tough. This race is indistinguishable to the
1644 * caller from the case that the additional cgroup users didn't
1645 * show up until sometime later on.
1647 npids
= cgroup_task_count(cont
);
1649 pidarray
= kmalloc(npids
* sizeof(pid_t
), GFP_KERNEL
);
1653 npids
= pid_array_load(pidarray
, npids
, cont
);
1654 sort(pidarray
, npids
, sizeof(pid_t
), cmppid
, NULL
);
1656 /* Call pid_array_to_buf() twice, first just to get bufsz */
1657 ctr
->bufsz
= pid_array_to_buf(&c
, sizeof(c
), pidarray
, npids
) + 1;
1658 ctr
->buf
= kmalloc(ctr
->bufsz
, GFP_KERNEL
);
1661 ctr
->bufsz
= pid_array_to_buf(ctr
->buf
, ctr
->bufsz
, pidarray
, npids
);
1668 file
->private_data
= ctr
;
1679 static ssize_t
cgroup_tasks_read(struct cgroup
*cont
,
1681 struct file
*file
, char __user
*buf
,
1682 size_t nbytes
, loff_t
*ppos
)
1684 struct ctr_struct
*ctr
= file
->private_data
;
1686 return simple_read_from_buffer(buf
, nbytes
, ppos
, ctr
->buf
, ctr
->bufsz
);
1689 static int cgroup_tasks_release(struct inode
*unused_inode
,
1692 struct ctr_struct
*ctr
;
1694 if (file
->f_mode
& FMODE_READ
) {
1695 ctr
= file
->private_data
;
1703 * for the common functions, 'private' gives the type of file
1705 static struct cftype cft_tasks
= {
1707 .open
= cgroup_tasks_open
,
1708 .read
= cgroup_tasks_read
,
1709 .write
= cgroup_common_file_write
,
1710 .release
= cgroup_tasks_release
,
1711 .private = FILE_TASKLIST
,
1714 static int cgroup_populate_dir(struct cgroup
*cont
)
1717 struct cgroup_subsys
*ss
;
1719 /* First clear out any existing files */
1720 cgroup_clear_directory(cont
->dentry
);
1722 err
= cgroup_add_file(cont
, NULL
, &cft_tasks
);
1726 for_each_subsys(cont
->root
, ss
) {
1727 if (ss
->populate
&& (err
= ss
->populate(ss
, cont
)) < 0)
1734 static void init_cgroup_css(struct cgroup_subsys_state
*css
,
1735 struct cgroup_subsys
*ss
,
1736 struct cgroup
*cont
)
1739 atomic_set(&css
->refcnt
, 0);
1741 if (cont
== dummytop
)
1742 set_bit(CSS_ROOT
, &css
->flags
);
1743 BUG_ON(cont
->subsys
[ss
->subsys_id
]);
1744 cont
->subsys
[ss
->subsys_id
] = css
;
1748 * cgroup_create - create a cgroup
1749 * parent: cgroup that will be parent of the new cgroup.
1750 * name: name of the new cgroup. Will be strcpy'ed.
1751 * mode: mode to set on new inode
1753 * Must be called with the mutex on the parent inode held
1756 static long cgroup_create(struct cgroup
*parent
, struct dentry
*dentry
,
1759 struct cgroup
*cont
;
1760 struct cgroupfs_root
*root
= parent
->root
;
1762 struct cgroup_subsys
*ss
;
1763 struct super_block
*sb
= root
->sb
;
1765 cont
= kzalloc(sizeof(*cont
), GFP_KERNEL
);
1769 /* Grab a reference on the superblock so the hierarchy doesn't
1770 * get deleted on unmount if there are child cgroups. This
1771 * can be done outside cgroup_mutex, since the sb can't
1772 * disappear while someone has an open control file on the
1774 atomic_inc(&sb
->s_active
);
1776 mutex_lock(&cgroup_mutex
);
1779 INIT_LIST_HEAD(&cont
->sibling
);
1780 INIT_LIST_HEAD(&cont
->children
);
1781 INIT_LIST_HEAD(&cont
->css_sets
);
1783 cont
->parent
= parent
;
1784 cont
->root
= parent
->root
;
1785 cont
->top_cgroup
= parent
->top_cgroup
;
1787 for_each_subsys(root
, ss
) {
1788 struct cgroup_subsys_state
*css
= ss
->create(ss
, cont
);
1793 init_cgroup_css(css
, ss
, cont
);
1796 list_add(&cont
->sibling
, &cont
->parent
->children
);
1797 root
->number_of_cgroups
++;
1799 err
= cgroup_create_dir(cont
, dentry
, mode
);
1803 /* The cgroup directory was pre-locked for us */
1804 BUG_ON(!mutex_is_locked(&cont
->dentry
->d_inode
->i_mutex
));
1806 err
= cgroup_populate_dir(cont
);
1807 /* If err < 0, we have a half-filled directory - oh well ;) */
1809 mutex_unlock(&cgroup_mutex
);
1810 mutex_unlock(&cont
->dentry
->d_inode
->i_mutex
);
1816 list_del(&cont
->sibling
);
1817 root
->number_of_cgroups
--;
1821 for_each_subsys(root
, ss
) {
1822 if (cont
->subsys
[ss
->subsys_id
])
1823 ss
->destroy(ss
, cont
);
1826 mutex_unlock(&cgroup_mutex
);
1828 /* Release the reference count that we took on the superblock */
1829 deactivate_super(sb
);
1835 static int cgroup_mkdir(struct inode
*dir
, struct dentry
*dentry
, int mode
)
1837 struct cgroup
*c_parent
= dentry
->d_parent
->d_fsdata
;
1839 /* the vfs holds inode->i_mutex already */
1840 return cgroup_create(c_parent
, dentry
, mode
| S_IFDIR
);
1843 static int cgroup_rmdir(struct inode
*unused_dir
, struct dentry
*dentry
)
1845 struct cgroup
*cont
= dentry
->d_fsdata
;
1847 struct cgroup
*parent
;
1848 struct cgroup_subsys
*ss
;
1849 struct super_block
*sb
;
1850 struct cgroupfs_root
*root
;
1853 /* the vfs holds both inode->i_mutex already */
1855 mutex_lock(&cgroup_mutex
);
1856 if (atomic_read(&cont
->count
) != 0) {
1857 mutex_unlock(&cgroup_mutex
);
1860 if (!list_empty(&cont
->children
)) {
1861 mutex_unlock(&cgroup_mutex
);
1865 parent
= cont
->parent
;
1869 /* Check the reference count on each subsystem. Since we
1870 * already established that there are no tasks in the
1871 * cgroup, if the css refcount is also 0, then there should
1872 * be no outstanding references, so the subsystem is safe to
1874 for_each_subsys(root
, ss
) {
1875 struct cgroup_subsys_state
*css
;
1876 css
= cont
->subsys
[ss
->subsys_id
];
1877 if (atomic_read(&css
->refcnt
)) {
1883 mutex_unlock(&cgroup_mutex
);
1887 for_each_subsys(root
, ss
) {
1888 if (cont
->subsys
[ss
->subsys_id
])
1889 ss
->destroy(ss
, cont
);
1892 set_bit(CONT_REMOVED
, &cont
->flags
);
1893 /* delete my sibling from parent->children */
1894 list_del(&cont
->sibling
);
1895 spin_lock(&cont
->dentry
->d_lock
);
1896 d
= dget(cont
->dentry
);
1897 cont
->dentry
= NULL
;
1898 spin_unlock(&d
->d_lock
);
1900 cgroup_d_remove_dir(d
);
1902 root
->number_of_cgroups
--;
1904 mutex_unlock(&cgroup_mutex
);
1905 /* Drop the active superblock reference that we took when we
1906 * created the cgroup */
1907 deactivate_super(sb
);
1911 static void cgroup_init_subsys(struct cgroup_subsys
*ss
)
1913 struct cgroup_subsys_state
*css
;
1914 struct list_head
*l
;
1915 printk(KERN_ERR
"Initializing cgroup subsys %s\n", ss
->name
);
1917 /* Create the top cgroup state for this subsystem */
1918 ss
->root
= &rootnode
;
1919 css
= ss
->create(ss
, dummytop
);
1920 /* We don't handle early failures gracefully */
1921 BUG_ON(IS_ERR(css
));
1922 init_cgroup_css(css
, ss
, dummytop
);
1924 /* Update all cgroup groups to contain a subsys
1925 * pointer to this state - since the subsystem is
1926 * newly registered, all tasks and hence all cgroup
1927 * groups are in the subsystem's top cgroup. */
1928 write_lock(&css_set_lock
);
1929 l
= &init_css_set
.list
;
1931 struct css_set
*cg
=
1932 list_entry(l
, struct css_set
, list
);
1933 cg
->subsys
[ss
->subsys_id
] = dummytop
->subsys
[ss
->subsys_id
];
1935 } while (l
!= &init_css_set
.list
);
1936 write_unlock(&css_set_lock
);
1938 /* If this subsystem requested that it be notified with fork
1939 * events, we should send it one now for every process in the
1942 struct task_struct
*g
, *p
;
1944 read_lock(&tasklist_lock
);
1945 do_each_thread(g
, p
) {
1947 } while_each_thread(g
, p
);
1948 read_unlock(&tasklist_lock
);
1951 need_forkexit_callback
|= ss
->fork
|| ss
->exit
;
1957 * cgroup_init_early - initialize cgroups at system boot, and
1958 * initialize any subsystems that request early init.
1960 int __init
cgroup_init_early(void)
1963 kref_init(&init_css_set
.ref
);
1964 kref_get(&init_css_set
.ref
);
1965 INIT_LIST_HEAD(&init_css_set
.list
);
1966 INIT_LIST_HEAD(&init_css_set
.cg_links
);
1967 INIT_LIST_HEAD(&init_css_set
.tasks
);
1969 init_cgroup_root(&rootnode
);
1970 list_add(&rootnode
.root_list
, &roots
);
1972 init_task
.cgroups
= &init_css_set
;
1974 init_css_set_link
.cg
= &init_css_set
;
1975 list_add(&init_css_set_link
.cont_link_list
,
1976 &rootnode
.top_cgroup
.css_sets
);
1977 list_add(&init_css_set_link
.cg_link_list
,
1978 &init_css_set
.cg_links
);
1980 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
1981 struct cgroup_subsys
*ss
= subsys
[i
];
1984 BUG_ON(strlen(ss
->name
) > MAX_CGROUP_TYPE_NAMELEN
);
1985 BUG_ON(!ss
->create
);
1986 BUG_ON(!ss
->destroy
);
1987 if (ss
->subsys_id
!= i
) {
1988 printk(KERN_ERR
"Subsys %s id == %d\n",
1989 ss
->name
, ss
->subsys_id
);
1994 cgroup_init_subsys(ss
);
2000 * cgroup_init - register cgroup filesystem and /proc file, and
2001 * initialize any subsystems that didn't request early init.
2003 int __init
cgroup_init(void)
2007 struct proc_dir_entry
*entry
;
2009 err
= bdi_init(&cgroup_backing_dev_info
);
2013 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
2014 struct cgroup_subsys
*ss
= subsys
[i
];
2015 if (!ss
->early_init
)
2016 cgroup_init_subsys(ss
);
2019 err
= register_filesystem(&cgroup_fs_type
);
2023 entry
= create_proc_entry("cgroups", 0, NULL
);
2025 entry
->proc_fops
= &proc_cgroupstats_operations
;
2029 bdi_destroy(&cgroup_backing_dev_info
);
2035 * proc_cgroup_show()
2036 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2037 * - Used for /proc/<pid>/cgroup.
2038 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2039 * doesn't really matter if tsk->cgroup changes after we read it,
2040 * and we take cgroup_mutex, keeping attach_task() from changing it
2041 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2042 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
2043 * cgroup to top_cgroup.
2046 /* TODO: Use a proper seq_file iterator */
2047 static int proc_cgroup_show(struct seq_file
*m
, void *v
)
2050 struct task_struct
*tsk
;
2053 struct cgroupfs_root
*root
;
2056 buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
2062 tsk
= get_pid_task(pid
, PIDTYPE_PID
);
2068 mutex_lock(&cgroup_mutex
);
2070 for_each_root(root
) {
2071 struct cgroup_subsys
*ss
;
2072 struct cgroup
*cont
;
2076 /* Skip this hierarchy if it has no active subsystems */
2077 if (!root
->actual_subsys_bits
)
2079 for_each_subsys(root
, ss
)
2080 seq_printf(m
, "%s%s", count
++ ? "," : "", ss
->name
);
2082 get_first_subsys(&root
->top_cgroup
, NULL
, &subsys_id
);
2083 cont
= task_cgroup(tsk
, subsys_id
);
2084 retval
= cgroup_path(cont
, buf
, PAGE_SIZE
);
2092 mutex_unlock(&cgroup_mutex
);
2093 put_task_struct(tsk
);
2100 static int cgroup_open(struct inode
*inode
, struct file
*file
)
2102 struct pid
*pid
= PROC_I(inode
)->pid
;
2103 return single_open(file
, proc_cgroup_show
, pid
);
2106 struct file_operations proc_cgroup_operations
= {
2107 .open
= cgroup_open
,
2109 .llseek
= seq_lseek
,
2110 .release
= single_release
,
2113 /* Display information about each subsystem and each hierarchy */
2114 static int proc_cgroupstats_show(struct seq_file
*m
, void *v
)
2117 struct cgroupfs_root
*root
;
2119 seq_puts(m
, "#subsys_name\thierarchy\tnum_cgroups\n");
2120 mutex_lock(&cgroup_mutex
);
2121 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
2122 struct cgroup_subsys
*ss
= subsys
[i
];
2123 seq_printf(m
, "%s\t%lu\t%d\n",
2124 ss
->name
, ss
->root
->subsys_bits
,
2125 ss
->root
->number_of_cgroups
);
2127 mutex_unlock(&cgroup_mutex
);
2131 static int cgroupstats_open(struct inode
*inode
, struct file
*file
)
2133 return single_open(file
, proc_cgroupstats_show
, 0);
2136 static struct file_operations proc_cgroupstats_operations
= {
2137 .open
= cgroupstats_open
,
2139 .llseek
= seq_lseek
,
2140 .release
= single_release
,
2144 * cgroup_fork - attach newly forked task to its parents cgroup.
2145 * @tsk: pointer to task_struct of forking parent process.
2147 * Description: A task inherits its parent's cgroup at fork().
2149 * A pointer to the shared css_set was automatically copied in
2150 * fork.c by dup_task_struct(). However, we ignore that copy, since
2151 * it was not made under the protection of RCU or cgroup_mutex, so
2152 * might no longer be a valid cgroup pointer. attach_task() might
2153 * have already changed current->cgroups, allowing the previously
2154 * referenced cgroup group to be removed and freed.
2156 * At the point that cgroup_fork() is called, 'current' is the parent
2157 * task, and the passed argument 'child' points to the child task.
2159 void cgroup_fork(struct task_struct
*child
)
2162 child
->cgroups
= current
->cgroups
;
2163 get_css_set(child
->cgroups
);
2164 task_unlock(current
);
2165 INIT_LIST_HEAD(&child
->cg_list
);
2169 * cgroup_fork_callbacks - called on a new task very soon before
2170 * adding it to the tasklist. No need to take any locks since no-one
2171 * can be operating on this task
2173 void cgroup_fork_callbacks(struct task_struct
*child
)
2175 if (need_forkexit_callback
) {
2177 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
2178 struct cgroup_subsys
*ss
= subsys
[i
];
2180 ss
->fork(ss
, child
);
2186 * cgroup_post_fork - called on a new task after adding it to the
2187 * task list. Adds the task to the list running through its css_set
2188 * if necessary. Has to be after the task is visible on the task list
2189 * in case we race with the first call to cgroup_iter_start() - to
2190 * guarantee that the new task ends up on its list. */
2191 void cgroup_post_fork(struct task_struct
*child
)
2193 if (use_task_css_set_links
) {
2194 write_lock(&css_set_lock
);
2195 if (list_empty(&child
->cg_list
))
2196 list_add(&child
->cg_list
, &child
->cgroups
->tasks
);
2197 write_unlock(&css_set_lock
);
2201 * cgroup_exit - detach cgroup from exiting task
2202 * @tsk: pointer to task_struct of exiting process
2204 * Description: Detach cgroup from @tsk and release it.
2206 * Note that cgroups marked notify_on_release force every task in
2207 * them to take the global cgroup_mutex mutex when exiting.
2208 * This could impact scaling on very large systems. Be reluctant to
2209 * use notify_on_release cgroups where very high task exit scaling
2210 * is required on large systems.
2212 * the_top_cgroup_hack:
2214 * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
2216 * We call cgroup_exit() while the task is still competent to
2217 * handle notify_on_release(), then leave the task attached to the
2218 * root cgroup in each hierarchy for the remainder of its exit.
2220 * To do this properly, we would increment the reference count on
2221 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2222 * code we would add a second cgroup function call, to drop that
2223 * reference. This would just create an unnecessary hot spot on
2224 * the top_cgroup reference count, to no avail.
2226 * Normally, holding a reference to a cgroup without bumping its
2227 * count is unsafe. The cgroup could go away, or someone could
2228 * attach us to a different cgroup, decrementing the count on
2229 * the first cgroup that we never incremented. But in this case,
2230 * top_cgroup isn't going away, and either task has PF_EXITING set,
2231 * which wards off any attach_task() attempts, or task is a failed
2232 * fork, never visible to attach_task.
2235 void cgroup_exit(struct task_struct
*tsk
, int run_callbacks
)
2240 if (run_callbacks
&& need_forkexit_callback
) {
2241 for (i
= 0; i
< CGROUP_SUBSYS_COUNT
; i
++) {
2242 struct cgroup_subsys
*ss
= subsys
[i
];
2249 * Unlink from the css_set task list if necessary.
2250 * Optimistically check cg_list before taking
2253 if (!list_empty(&tsk
->cg_list
)) {
2254 write_lock(&css_set_lock
);
2255 if (!list_empty(&tsk
->cg_list
))
2256 list_del(&tsk
->cg_list
);
2257 write_unlock(&css_set_lock
);
2260 /* Reassign the task to the init_css_set. */
2263 tsk
->cgroups
= &init_css_set
;
2270 * cgroup_clone - duplicate the current cgroup in the hierarchy
2271 * that the given subsystem is attached to, and move this task into
2274 int cgroup_clone(struct task_struct
*tsk
, struct cgroup_subsys
*subsys
)
2276 struct dentry
*dentry
;
2278 char nodename
[MAX_CGROUP_TYPE_NAMELEN
];
2279 struct cgroup
*parent
, *child
;
2280 struct inode
*inode
;
2282 struct cgroupfs_root
*root
;
2283 struct cgroup_subsys
*ss
;
2285 /* We shouldn't be called by an unregistered subsystem */
2286 BUG_ON(!subsys
->active
);
2288 /* First figure out what hierarchy and cgroup we're dealing
2289 * with, and pin them so we can drop cgroup_mutex */
2290 mutex_lock(&cgroup_mutex
);
2292 root
= subsys
->root
;
2293 if (root
== &rootnode
) {
2295 "Not cloning cgroup for unused subsystem %s\n",
2297 mutex_unlock(&cgroup_mutex
);
2301 parent
= task_cgroup(tsk
, subsys
->subsys_id
);
2303 snprintf(nodename
, MAX_CGROUP_TYPE_NAMELEN
, "node_%d", tsk
->pid
);
2305 /* Pin the hierarchy */
2306 atomic_inc(&parent
->root
->sb
->s_active
);
2308 /* Keep the cgroup alive */
2310 mutex_unlock(&cgroup_mutex
);
2312 /* Now do the VFS work to create a cgroup */
2313 inode
= parent
->dentry
->d_inode
;
2315 /* Hold the parent directory mutex across this operation to
2316 * stop anyone else deleting the new cgroup */
2317 mutex_lock(&inode
->i_mutex
);
2318 dentry
= lookup_one_len(nodename
, parent
->dentry
, strlen(nodename
));
2319 if (IS_ERR(dentry
)) {
2321 "Couldn't allocate dentry for %s: %ld\n", nodename
,
2323 ret
= PTR_ERR(dentry
);
2327 /* Create the cgroup directory, which also creates the cgroup */
2328 ret
= vfs_mkdir(inode
, dentry
, S_IFDIR
| 0755);
2329 child
= __d_cont(dentry
);
2333 "Failed to create cgroup %s: %d\n", nodename
,
2340 "Couldn't find new cgroup %s\n", nodename
);
2345 /* The cgroup now exists. Retake cgroup_mutex and check
2346 * that we're still in the same state that we thought we
2348 mutex_lock(&cgroup_mutex
);
2349 if ((root
!= subsys
->root
) ||
2350 (parent
!= task_cgroup(tsk
, subsys
->subsys_id
))) {
2351 /* Aargh, we raced ... */
2352 mutex_unlock(&inode
->i_mutex
);
2355 deactivate_super(parent
->root
->sb
);
2356 /* The cgroup is still accessible in the VFS, but
2357 * we're not going to try to rmdir() it at this
2360 "Race in cgroup_clone() - leaking cgroup %s\n",
2365 /* do any required auto-setup */
2366 for_each_subsys(root
, ss
) {
2368 ss
->post_clone(ss
, child
);
2371 /* All seems fine. Finish by moving the task into the new cgroup */
2372 ret
= attach_task(child
, tsk
);
2373 mutex_unlock(&cgroup_mutex
);
2376 mutex_unlock(&inode
->i_mutex
);
2378 deactivate_super(parent
->root
->sb
);
2383 * See if "cont" is a descendant of the current task's cgroup in
2384 * the appropriate hierarchy
2386 * If we are sending in dummytop, then presumably we are creating
2387 * the top cgroup in the subsystem.
2389 * Called only by the ns (nsproxy) cgroup.
2391 int cgroup_is_descendant(const struct cgroup
*cont
)
2394 struct cgroup
*target
;
2397 if (cont
== dummytop
)
2400 get_first_subsys(cont
, NULL
, &subsys_id
);
2401 target
= task_cgroup(current
, subsys_id
);
2402 while (cont
!= target
&& cont
!= cont
->top_cgroup
)
2403 cont
= cont
->parent
;
2404 ret
= (cont
== target
);