- pre3:
[davej-history.git] / fs / super.c
blob81a3fafc2b19dd8577efef44c84d694e7780da58
1 /*
2 * linux/fs/super.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
6 * super.c contains code to handle: - mount structures
7 * - super-block tables
8 * - filesystem drivers list
9 * - mount system call
10 * - umount system call
11 * - ustat system call
13 * GK 2/5/95 - Changed to support mounting the root fs via NFS
15 * Added kerneld support: Jacques Gelinas and Bjorn Ekwall
16 * Added change_root: Werner Almesberger & Hans Lermen, Feb '96
17 * Added options to /proc/mounts:
18 * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
19 * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
23 #include <linux/config.h>
24 #include <linux/string.h>
25 #include <linux/malloc.h>
26 #include <linux/locks.h>
27 #include <linux/smp_lock.h>
28 #include <linux/devfs_fs_kernel.h>
29 #include <linux/fd.h>
30 #include <linux/init.h>
31 #include <linux/quotaops.h>
32 #include <linux/acct.h>
34 #include <asm/uaccess.h>
36 #include <linux/nfs_fs.h>
37 #include <linux/nfs_fs_sb.h>
38 #include <linux/nfs_mount.h>
40 #include <linux/kmod.h>
41 #define __NO_VERSION__
42 #include <linux/module.h>
45 * We use a semaphore to synchronize all mount/umount
46 * activity - imagine the mess if we have a race between
47 * unmounting a filesystem and re-mounting it (or something
48 * else).
50 static DECLARE_MUTEX(mount_sem);
52 extern void wait_for_keypress(void);
54 extern int root_mountflags;
56 static int do_remount_sb(struct super_block *sb, int flags, char * data);
58 /* this is initialized in init/main.c */
59 kdev_t ROOT_DEV;
61 int nr_super_blocks;
62 int max_super_blocks = NR_SUPER;
63 LIST_HEAD(super_blocks);
66 * Handling of filesystem drivers list.
67 * Rules:
68 * Inclusion to/removals from/scanning of list are protected by spinlock.
69 * During the unload module must call unregister_filesystem().
70 * We can access the fields of list element if:
71 * 1) spinlock is held or
72 * 2) we hold the reference to the module.
73 * The latter can be guaranteed by call of try_inc_mod_count(); if it
74 * returned 0 we must skip the element, otherwise we got the reference.
75 * Once the reference is obtained we can drop the spinlock.
78 static struct file_system_type *file_systems;
79 static rwlock_t file_systems_lock = RW_LOCK_UNLOCKED;
81 /* WARNING: This can be used only if we _already_ own a reference */
82 static void get_filesystem(struct file_system_type *fs)
84 if (fs->owner)
85 __MOD_INC_USE_COUNT(fs->owner);
88 static void put_filesystem(struct file_system_type *fs)
90 if (fs->owner)
91 __MOD_DEC_USE_COUNT(fs->owner);
94 static struct file_system_type **find_filesystem(const char *name)
96 struct file_system_type **p;
97 for (p=&file_systems; *p; p=&(*p)->next)
98 if (strcmp((*p)->name,name) == 0)
99 break;
100 return p;
104 * register_filesystem - register a new filesystem
105 * @fs: the file system structure
107 * Adds the file system passed to the list of file systems the kernel
108 * is aware of for mount and other syscalls. Returns 0 on success,
109 * or a negative errno code on an error.
111 * The &struct file_system_type that is passed is linked into the kernel
112 * structures and must not be freed until the file system has been
113 * unregistered.
116 int register_filesystem(struct file_system_type * fs)
118 int res = 0;
119 struct file_system_type ** p;
121 if (!fs)
122 return -EINVAL;
123 if (fs->next)
124 return -EBUSY;
125 write_lock(&file_systems_lock);
126 p = find_filesystem(fs->name);
127 if (*p)
128 res = -EBUSY;
129 else
130 *p = fs;
131 write_unlock(&file_systems_lock);
132 return res;
136 * unregister_filesystem - unregister a file system
137 * @fs: filesystem to unregister
139 * Remove a file system that was previously successfully registered
140 * with the kernel. An error is returned if the file system is not found.
141 * Zero is returned on a success.
143 * Once this function has returned the &struct file_system_type structure
144 * may be freed or reused.
147 int unregister_filesystem(struct file_system_type * fs)
149 struct file_system_type ** tmp;
151 write_lock(&file_systems_lock);
152 tmp = &file_systems;
153 while (*tmp) {
154 if (fs == *tmp) {
155 *tmp = fs->next;
156 fs->next = NULL;
157 write_unlock(&file_systems_lock);
158 return 0;
160 tmp = &(*tmp)->next;
162 write_unlock(&file_systems_lock);
163 return -EINVAL;
166 static int fs_index(const char * __name)
168 struct file_system_type * tmp;
169 char * name;
170 int err, index;
172 name = getname(__name);
173 err = PTR_ERR(name);
174 if (IS_ERR(name))
175 return err;
177 err = -EINVAL;
178 read_lock(&file_systems_lock);
179 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
180 if (strcmp(tmp->name,name) == 0) {
181 err = index;
182 break;
185 read_unlock(&file_systems_lock);
186 putname(name);
187 return err;
190 static int fs_name(unsigned int index, char * buf)
192 struct file_system_type * tmp;
193 int len, res;
195 read_lock(&file_systems_lock);
196 for (tmp = file_systems; tmp; tmp = tmp->next, index--)
197 if (index <= 0 && try_inc_mod_count(tmp->owner))
198 break;
199 read_unlock(&file_systems_lock);
200 if (!tmp)
201 return -EINVAL;
203 /* OK, we got the reference, so we can safely block */
204 len = strlen(tmp->name) + 1;
205 res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
206 put_filesystem(tmp);
207 return res;
210 static int fs_maxindex(void)
212 struct file_system_type * tmp;
213 int index;
215 read_lock(&file_systems_lock);
216 for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
218 read_unlock(&file_systems_lock);
219 return index;
223 * Whee.. Weird sysv syscall.
225 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
227 int retval = -EINVAL;
229 switch (option) {
230 case 1:
231 retval = fs_index((const char *) arg1);
232 break;
234 case 2:
235 retval = fs_name(arg1, (char *) arg2);
236 break;
238 case 3:
239 retval = fs_maxindex();
240 break;
242 return retval;
245 int get_filesystem_list(char * buf)
247 int len = 0;
248 struct file_system_type * tmp;
250 read_lock(&file_systems_lock);
251 tmp = file_systems;
252 while (tmp && len < PAGE_SIZE - 80) {
253 len += sprintf(buf+len, "%s\t%s\n",
254 (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
255 tmp->name);
256 tmp = tmp->next;
258 read_unlock(&file_systems_lock);
259 return len;
262 struct file_system_type *get_fs_type(const char *name)
264 struct file_system_type *fs;
266 read_lock(&file_systems_lock);
267 fs = *(find_filesystem(name));
268 if (fs && !try_inc_mod_count(fs->owner))
269 fs = NULL;
270 read_unlock(&file_systems_lock);
271 if (!fs && (request_module(name) == 0)) {
272 read_lock(&file_systems_lock);
273 fs = *(find_filesystem(name));
274 if (fs && !try_inc_mod_count(fs->owner))
275 fs = NULL;
276 read_unlock(&file_systems_lock);
278 return fs;
281 static LIST_HEAD(vfsmntlist);
284 * add_vfsmnt - add a new mount node
285 * @nd: location of mountpoint or %NULL if we want a root node
286 * @root: root of (sub)tree to be mounted
287 * @dev_name: device name to show in /proc/mounts or %NULL (for "none").
289 * This is VFS idea of mount. New node is allocated, bound to a tree
290 * we are mounting and optionally (OK, usually) registered as mounted
291 * on a given mountpoint. Returns a pointer to new node or %NULL in
292 * case of failure.
294 * Potential reason for failure (aside of trivial lack of memory) is a
295 * deleted mountpoint. Caller must hold ->i_zombie on mountpoint
296 * dentry (if any).
298 * Node is marked as MNT_VISIBLE (visible in /proc/mounts) unless both
299 * @nd and @devname are %NULL. It works since we pass non-%NULL @devname
300 * when we are mounting root and kern_mount() filesystems are deviceless.
301 * If we will get a kern_mount() filesystem with nontrivial @devname we
302 * will have to pass the visibility flag explicitly, so if we will add
303 * support for such beasts we'll have to change prototype.
306 static struct vfsmount *add_vfsmnt(struct nameidata *nd,
307 struct dentry *root,
308 const char *dev_name)
310 struct vfsmount *mnt;
311 struct super_block *sb = root->d_inode->i_sb;
312 char *name;
314 mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
315 if (!mnt)
316 goto out;
317 memset(mnt, 0, sizeof(struct vfsmount));
319 if (nd || dev_name)
320 mnt->mnt_flags = MNT_VISIBLE;
322 /* It may be NULL, but who cares? */
323 if (dev_name) {
324 name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
325 if (name) {
326 strcpy(name, dev_name);
327 mnt->mnt_devname = name;
330 mnt->mnt_owner = current->uid;
331 atomic_set(&mnt->mnt_count,1);
332 mnt->mnt_sb = sb;
334 spin_lock(&dcache_lock);
335 if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
336 goto fail;
337 mnt->mnt_root = dget(root);
338 mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);
339 mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;
341 if (nd) {
342 list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
343 list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
344 } else {
345 INIT_LIST_HEAD(&mnt->mnt_child);
346 INIT_LIST_HEAD(&mnt->mnt_clash);
348 INIT_LIST_HEAD(&mnt->mnt_mounts);
349 list_add(&mnt->mnt_instances, &sb->s_mounts);
350 list_add(&mnt->mnt_list, vfsmntlist.prev);
351 spin_unlock(&dcache_lock);
352 out:
353 return mnt;
354 fail:
355 spin_unlock(&dcache_lock);
356 if (mnt->mnt_devname)
357 kfree(mnt->mnt_devname);
358 kfree(mnt);
359 return NULL;
362 static void move_vfsmnt(struct vfsmount *mnt,
363 struct dentry *mountpoint,
364 struct vfsmount *parent,
365 const char *dev_name)
367 struct dentry *old_mountpoint;
368 struct vfsmount *old_parent;
369 char *new_devname = NULL;
371 if (dev_name) {
372 new_devname = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
373 if (new_devname)
374 strcpy(new_devname, dev_name);
377 spin_lock(&dcache_lock);
378 old_mountpoint = mnt->mnt_mountpoint;
379 old_parent = mnt->mnt_parent;
381 /* flip names */
382 if (new_devname) {
383 if (mnt->mnt_devname)
384 kfree(mnt->mnt_devname);
385 mnt->mnt_devname = new_devname;
388 /* flip the linkage */
389 mnt->mnt_mountpoint = dget(mountpoint);
390 mnt->mnt_parent = parent ? mntget(parent) : mnt;
391 list_del(&mnt->mnt_clash);
392 list_del(&mnt->mnt_child);
393 if (parent) {
394 list_add(&mnt->mnt_child, &parent->mnt_mounts);
395 list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
396 } else {
397 INIT_LIST_HEAD(&mnt->mnt_child);
398 INIT_LIST_HEAD(&mnt->mnt_clash);
400 spin_unlock(&dcache_lock);
402 /* put the old stuff */
403 dput(old_mountpoint);
404 if (old_parent != mnt)
405 mntput(old_parent);
409 * Called with spinlock held, releases it.
411 static void remove_vfsmnt(struct vfsmount *mnt)
413 /* First of all, remove it from all lists */
414 list_del(&mnt->mnt_instances);
415 list_del(&mnt->mnt_clash);
416 list_del(&mnt->mnt_list);
417 list_del(&mnt->mnt_child);
418 spin_unlock(&dcache_lock);
419 /* Now we can work safely */
420 if (mnt->mnt_parent != mnt)
421 mntput(mnt->mnt_parent);
423 dput(mnt->mnt_mountpoint);
424 dput(mnt->mnt_root);
425 if (mnt->mnt_devname)
426 kfree(mnt->mnt_devname);
427 kfree(mnt);
/* Use octal escapes, like mount does, for embedded spaces etc. */
static unsigned char need_escaping[] = { ' ', '\t', '\n', '\\' };

/*
 * Copy the string @s into @buf, replacing every character listed in
 * need_escaping[] by a backslash followed by its three-digit octal code.
 *
 * Copying stops at the end of @s or as soon as fewer than four bytes of
 * the @len-byte buffer remain, so an escape sequence is never split.
 * Returns the number of bytes written; no trailing NUL is stored.
 */
static int
mangle(const unsigned char *s, char *buf, int len)
{
	char *out = buf;

	for (; *s && out - buf < len - 3; s++) {
		unsigned char c = *s;
		unsigned int k;
		int escape = 0;

		for (k = 0; k < sizeof(need_escaping); k++) {
			if (need_escaping[k] == c) {
				escape = 1;
				break;
			}
		}
		if (!escape) {
			*out++ = c;
			continue;
		}
		*out++ = '\\';
		*out++ = '0' + ((c & 0300) >> 6);
		*out++ = '0' + ((c & 070) >> 3);
		*out++ = '0' + (c & 07);
	}
	return out - buf;	/* no trailing NUL */
}
457 static struct proc_fs_info {
458 int flag;
459 char *str;
460 } fs_info[] = {
461 { MS_NOEXEC, ",noexec" },
462 { MS_NOSUID, ",nosuid" },
463 { MS_NODEV, ",nodev" },
464 { MS_SYNCHRONOUS, ",sync" },
465 { MS_MANDLOCK, ",mand" },
466 { MS_NOATIME, ",noatime" },
467 { MS_NODIRATIME, ",nodiratime" },
468 #ifdef MS_NOSUB /* Can't find this except in mount.c */
469 { MS_NOSUB, ",nosub" },
470 #endif
471 { 0, NULL }
474 static struct proc_nfs_info {
475 int flag;
476 char *str;
477 char *nostr;
478 } nfs_info[] = {
479 { NFS_MOUNT_SOFT, ",soft", ",hard" },
480 { NFS_MOUNT_INTR, ",intr", "" },
481 { NFS_MOUNT_POSIX, ",posix", "" },
482 { NFS_MOUNT_TCP, ",tcp", ",udp" },
483 { NFS_MOUNT_NOCTO, ",nocto", "" },
484 { NFS_MOUNT_NOAC, ",noac", "" },
485 { NFS_MOUNT_NONLM, ",nolock", ",lock" },
486 { 0, NULL, NULL }
489 int get_filesystem_info( char *buf )
491 struct list_head *p;
492 struct proc_fs_info *fs_infop;
493 struct proc_nfs_info *nfs_infop;
494 struct nfs_server *nfss;
495 int len, prevlen;
496 char *path, *buffer = (char *) __get_free_page(GFP_KERNEL);
498 if (!buffer) return 0;
499 len = prevlen = 0;
501 #define FREEROOM ((int)PAGE_SIZE-200-len)
502 #define MANGLE(s) len += mangle((s), buf+len, FREEROOM);
504 for (p = vfsmntlist.next; p != &vfsmntlist; p = p->next) {
505 struct vfsmount *tmp = list_entry(p, struct vfsmount, mnt_list);
506 if (!(tmp->mnt_flags & MNT_VISIBLE))
507 continue;
508 path = d_path(tmp->mnt_root, tmp, buffer, PAGE_SIZE);
509 if (!path)
510 continue;
511 MANGLE(tmp->mnt_devname ? tmp->mnt_devname : "none");
512 buf[len++] = ' ';
513 MANGLE(path);
514 buf[len++] = ' ';
515 MANGLE(tmp->mnt_sb->s_type->name);
516 len += sprintf(buf+len, " %s",
517 tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
518 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
519 if (tmp->mnt_sb->s_flags & fs_infop->flag)
520 MANGLE(fs_infop->str);
522 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
523 nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
524 len += sprintf(buf+len, ",v%d", nfss->rpc_ops->version);
526 len += sprintf(buf+len, ",rsize=%d", nfss->rsize);
528 len += sprintf(buf+len, ",wsize=%d", nfss->wsize);
529 #if 0
530 if (nfss->timeo != 7*HZ/10) {
531 len += sprintf(buf+len, ",timeo=%d",
532 nfss->timeo*10/HZ);
534 if (nfss->retrans != 3) {
535 len += sprintf(buf+len, ",retrans=%d",
536 nfss->retrans);
538 #endif
539 if (nfss->acregmin != 3*HZ) {
540 len += sprintf(buf+len, ",acregmin=%d",
541 nfss->acregmin/HZ);
543 if (nfss->acregmax != 60*HZ) {
544 len += sprintf(buf+len, ",acregmax=%d",
545 nfss->acregmax/HZ);
547 if (nfss->acdirmin != 30*HZ) {
548 len += sprintf(buf+len, ",acdirmin=%d",
549 nfss->acdirmin/HZ);
551 if (nfss->acdirmax != 60*HZ) {
552 len += sprintf(buf+len, ",acdirmax=%d",
553 nfss->acdirmax/HZ);
555 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
556 char *str;
557 if (nfss->flags & nfs_infop->flag)
558 str = nfs_infop->str;
559 else
560 str = nfs_infop->nostr;
561 MANGLE(str);
563 len += sprintf(buf+len, ",addr=");
564 MANGLE(nfss->hostname);
566 len += sprintf(buf + len, " 0 0\n");
567 if (FREEROOM <= 3) {
568 len = prevlen;
569 len += sprintf(buf+len, "# truncated\n");
570 break;
572 prevlen = len;
575 free_page((unsigned long) buffer);
576 return len;
577 #undef MANGLE
578 #undef FREEROOM
582 * __wait_on_super - wait on a superblock
583 * @sb: superblock to wait on
585 * Waits for a superblock to become unlocked and then returns. It does
586 * not take the lock. This is an internal function. See wait_on_super().
589 void __wait_on_super(struct super_block * sb)
591 DECLARE_WAITQUEUE(wait, current);
593 add_wait_queue(&sb->s_wait, &wait);
594 repeat:
595 set_current_state(TASK_UNINTERRUPTIBLE);
596 if (sb->s_lock) {
597 schedule();
598 goto repeat;
600 remove_wait_queue(&sb->s_wait, &wait);
601 current->state = TASK_RUNNING;
605 * Note: check the dirty flag before waiting, so we don't
606 * hold up the sync while mounting a device. (The newly
607 * mounted device won't need syncing.)
609 void sync_supers(kdev_t dev)
611 struct super_block * sb;
613 for (sb = sb_entry(super_blocks.next);
614 sb != sb_entry(&super_blocks);
615 sb = sb_entry(sb->s_list.next)) {
616 if (!sb->s_dev)
617 continue;
618 if (dev && sb->s_dev != dev)
619 continue;
620 if (!sb->s_dirt)
621 continue;
622 lock_super(sb);
623 if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev))
624 if (sb->s_op && sb->s_op->write_super)
625 sb->s_op->write_super(sb);
626 unlock_super(sb);
631 * get_super - get the superblock of a device
632 * @dev: device to get the superblock for
634 * Scans the superblock list and finds the superblock of the file system
635 * mounted on the device given. %NULL is returned if no match is found.
638 struct super_block * get_super(kdev_t dev)
640 struct super_block * s;
642 if (!dev)
643 return NULL;
644 restart:
645 s = sb_entry(super_blocks.next);
646 while (s != sb_entry(&super_blocks))
647 if (s->s_dev == dev) {
648 wait_on_super(s);
649 if (s->s_dev == dev)
650 return s;
651 goto restart;
652 } else
653 s = sb_entry(s->s_list.next);
654 return NULL;
657 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
659 struct super_block *s;
660 struct ustat tmp;
661 struct statfs sbuf;
662 int err = -EINVAL;
664 lock_kernel();
665 s = get_super(to_kdev_t(dev));
666 unlock_kernel();
667 if (s == NULL)
668 goto out;
669 err = vfs_statfs(s, &sbuf);
670 if (err)
671 goto out;
673 memset(&tmp,0,sizeof(struct ustat));
674 tmp.f_tfree = sbuf.f_bfree;
675 tmp.f_tinode = sbuf.f_ffree;
677 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
678 out:
679 return err;
683 * get_empty_super - find empty superblocks
685 * Find a superblock with no device assigned. A free superblock is
686 * found and returned. If neccessary new superblocks are allocated.
687 * %NULL is returned if there are insufficient resources to complete
688 * the request.
691 struct super_block *get_empty_super(void)
693 struct super_block *s;
695 for (s = sb_entry(super_blocks.next);
696 s != sb_entry(&super_blocks);
697 s = sb_entry(s->s_list.next)) {
698 if (s->s_dev)
699 continue;
700 if (!s->s_lock)
701 return s;
702 printk("VFS: empty superblock %p locked!\n", s);
704 /* Need a new one... */
705 if (nr_super_blocks >= max_super_blocks)
706 return NULL;
707 s = kmalloc(sizeof(struct super_block), GFP_USER);
708 if (s) {
709 nr_super_blocks++;
710 memset(s, 0, sizeof(struct super_block));
711 INIT_LIST_HEAD(&s->s_dirty);
712 list_add (&s->s_list, super_blocks.prev);
713 init_waitqueue_head(&s->s_wait);
714 INIT_LIST_HEAD(&s->s_files);
715 INIT_LIST_HEAD(&s->s_mounts);
717 return s;
720 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
721 struct file_system_type *type, int flags,
722 void *data, int silent)
724 struct super_block * s;
725 s = get_empty_super();
726 if (!s)
727 goto out;
728 s->s_dev = dev;
729 s->s_bdev = bdev;
730 s->s_flags = flags;
731 s->s_dirt = 0;
732 sema_init(&s->s_vfs_rename_sem,1);
733 sema_init(&s->s_nfsd_free_path_sem,1);
734 s->s_type = type;
735 sema_init(&s->s_dquot.dqio_sem, 1);
736 sema_init(&s->s_dquot.dqoff_sem, 1);
737 s->s_dquot.flags = 0;
738 lock_super(s);
739 if (!type->read_super(s, data, silent))
740 goto out_fail;
741 unlock_super(s);
742 /* tell bdcache that we are going to keep this one */
743 if (bdev)
744 atomic_inc(&bdev->bd_count);
745 out:
746 return s;
748 out_fail:
749 s->s_dev = 0;
750 s->s_bdev = 0;
751 s->s_type = NULL;
752 unlock_super(s);
753 return NULL;
757 * Unnamed block devices are dummy devices used by virtual
758 * filesystems which don't use real block-devices. -- jrs
761 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))];
763 kdev_t get_unnamed_dev(void)
765 int i;
767 for (i = 1; i < 256; i++) {
768 if (!test_and_set_bit(i,unnamed_dev_in_use))
769 return MKDEV(UNNAMED_MAJOR, i);
771 return 0;
774 void put_unnamed_dev(kdev_t dev)
776 if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
777 return;
778 if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
779 return;
780 printk("VFS: put_unnamed_dev: freeing unused device %s\n",
781 kdevname(dev));
784 static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
785 char *dev_name, int flags, void * data)
787 struct inode *inode;
788 struct block_device *bdev;
789 struct block_device_operations *bdops;
790 struct super_block * sb;
791 struct nameidata nd;
792 kdev_t dev;
793 int error = 0;
794 /* What device it is? */
795 if (!dev_name || !*dev_name)
796 return ERR_PTR(-EINVAL);
797 if (path_init(dev_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
798 error = path_walk(dev_name, &nd);
799 if (error)
800 return ERR_PTR(error);
801 inode = nd.dentry->d_inode;
802 error = -ENOTBLK;
803 if (!S_ISBLK(inode->i_mode))
804 goto out;
805 error = -EACCES;
806 if (IS_NODEV(inode))
807 goto out;
808 bdev = inode->i_bdev;
809 bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
810 if (bdops) bdev->bd_op = bdops;
811 /* Done with lookups, semaphore down */
812 down(&mount_sem);
813 dev = to_kdev_t(bdev->bd_dev);
814 sb = get_super(dev);
815 if (sb) {
816 if (fs_type == sb->s_type &&
817 ((flags ^ sb->s_flags) & MS_RDONLY) == 0) {
818 path_release(&nd);
819 return sb;
821 } else {
822 mode_t mode = FMODE_READ; /* we always need it ;-) */
823 if (!(flags & MS_RDONLY))
824 mode |= FMODE_WRITE;
825 error = blkdev_get(bdev, mode, 0, BDEV_FS);
826 if (error)
827 goto out;
828 check_disk_change(dev);
829 error = -EACCES;
830 if (!(flags & MS_RDONLY) && is_read_only(dev))
831 goto out1;
832 error = -EINVAL;
833 sb = read_super(dev, bdev, fs_type, flags, data, 0);
834 if (sb) {
835 get_filesystem(fs_type);
836 path_release(&nd);
837 return sb;
839 out1:
840 blkdev_put(bdev, BDEV_FS);
842 out:
843 path_release(&nd);
844 up(&mount_sem);
845 return ERR_PTR(error);
848 static struct super_block *get_sb_nodev(struct file_system_type *fs_type,
849 int flags, void * data)
851 kdev_t dev;
852 int error = -EMFILE;
853 down(&mount_sem);
854 dev = get_unnamed_dev();
855 if (dev) {
856 struct super_block * sb;
857 error = -EINVAL;
858 sb = read_super(dev, NULL, fs_type, flags, data, 0);
859 if (sb) {
860 get_filesystem(fs_type);
861 return sb;
863 put_unnamed_dev(dev);
865 up(&mount_sem);
866 return ERR_PTR(error);
869 static struct super_block *get_sb_single(struct file_system_type *fs_type,
870 int flags, void *data)
872 struct super_block * sb;
874 * Get the superblock of kernel-wide instance, but
875 * keep the reference to fs_type.
877 down(&mount_sem);
878 sb = fs_type->kern_mnt->mnt_sb;
879 if (!sb)
880 BUG();
881 get_filesystem(fs_type);
882 do_remount_sb(sb, flags, data);
883 return sb;
886 static struct block_device *kill_super(struct super_block *sb, int umount_root)
888 struct block_device *bdev;
889 kdev_t dev;
890 struct dentry *root = sb->s_root;
891 sb->s_root = NULL;
892 /* Need to clean after the sucker */
893 if (sb->s_type->fs_flags & FS_LITTER)
894 d_genocide(root);
895 if (sb->s_type->fs_flags & (FS_SINGLE|FS_LITTER))
896 shrink_dcache_parent(root);
897 dput(root);
898 lock_super(sb);
899 if (sb->s_op) {
900 if (sb->s_op->write_super && sb->s_dirt)
901 sb->s_op->write_super(sb);
902 if (sb->s_op->put_super)
903 sb->s_op->put_super(sb);
906 /* Forget any remaining inodes */
907 if (invalidate_inodes(sb)) {
908 printk("VFS: Busy inodes after unmount. "
909 "Self-destruct in 5 seconds. Have a nice day...\n");
912 dev = sb->s_dev;
913 sb->s_dev = 0; /* Free the superblock */
914 bdev = sb->s_bdev;
915 sb->s_bdev = NULL;
916 put_filesystem(sb->s_type);
917 sb->s_type = NULL;
918 unlock_super(sb);
919 if (umount_root) {
920 /* special: the old device driver is going to be
921 a ramdisk and the point of this call is to free its
922 protected memory (even if dirty). */
923 destroy_buffers(dev);
925 if (bdev) {
926 blkdev_put(bdev, BDEV_FS);
927 bdput(bdev);
928 } else
929 put_unnamed_dev(dev);
930 return bdev;
934 * Alters the mount flags of a mounted file system. Only the mount point
935 * is used as a reference - file system type and the device are ignored.
938 static int do_remount_sb(struct super_block *sb, int flags, char *data)
940 int retval;
942 if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
943 return -EACCES;
944 /*flags |= MS_RDONLY;*/
945 /* If we are remounting RDONLY, make sure there are no rw files open */
946 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
947 if (!fs_may_remount_ro(sb))
948 return -EBUSY;
949 if (sb->s_op && sb->s_op->remount_fs) {
950 lock_super(sb);
951 retval = sb->s_op->remount_fs(sb, &flags, data);
952 unlock_super(sb);
953 if (retval)
954 return retval;
956 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
959 * We can't invalidate inodes as we can loose data when remounting
960 * (someone might manage to alter data while we are waiting in lock_super()
961 * or in foo_remount_fs()))
964 return 0;
967 struct vfsmount *kern_mount(struct file_system_type *type)
969 kdev_t dev = get_unnamed_dev();
970 struct super_block *sb;
971 struct vfsmount *mnt;
972 if (!dev)
973 return ERR_PTR(-EMFILE);
974 sb = read_super(dev, NULL, type, 0, NULL, 0);
975 if (!sb) {
976 put_unnamed_dev(dev);
977 return ERR_PTR(-EINVAL);
979 mnt = add_vfsmnt(NULL, sb->s_root, NULL);
980 if (!mnt) {
981 kill_super(sb, 0);
982 return ERR_PTR(-ENOMEM);
984 type->kern_mnt = mnt;
985 return mnt;
988 /* Call only after unregister_filesystem() - it's a final cleanup */
990 void kern_umount(struct vfsmount *mnt)
992 struct super_block *sb = mnt->mnt_sb;
993 spin_lock(&dcache_lock);
994 remove_vfsmnt(mnt);
995 kill_super(sb, 0);
999 * Doesn't take quota and stuff into account. IOW, in some cases it will
1000 * give false negatives. The main reason why it's here is that we need
1001 * a non-destructive way to look for easily umountable filesystems.
1003 int may_umount(struct vfsmount *mnt)
1005 if (atomic_read(&mnt->mnt_count) > 2)
1006 return -EBUSY;
1007 return 0;
1010 static int do_umount(struct vfsmount *mnt, int umount_root, int flags)
1012 struct super_block * sb = mnt->mnt_sb;
1015 * No sense to grab the lock for this test, but test itself looks
1016 * somewhat bogus. Suggestions for better replacement?
1017 * Ho-hum... In principle, we might treat that as umount + switch
1018 * to rootfs. GC would eventually take care of the old vfsmount.
1019 * The problem being: we have to implement rootfs and GC for that ;-)
1020 * Actually it makes sense, especially if rootfs would contain a
1021 * /reboot - static binary that would close all descriptors and
1022 * call reboot(9). Then init(8) could umount root and exec /reboot.
1024 if (mnt == current->fs->rootmnt && !umount_root) {
1025 int retval = 0;
1027 * Special case for "unmounting" root ...
1028 * we just try to remount it readonly.
1030 mntput(mnt);
1031 if (!(sb->s_flags & MS_RDONLY))
1032 retval = do_remount_sb(sb, MS_RDONLY, 0);
1033 return retval;
1036 spin_lock(&dcache_lock);
1037 if (atomic_read(&mnt->mnt_count) > 2) {
1038 spin_unlock(&dcache_lock);
1039 mntput(mnt);
1040 return -EBUSY;
1043 if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {
1044 if (sb->s_type->fs_flags & FS_SINGLE)
1045 put_filesystem(sb->s_type);
1046 /* We hold two references, so mntput() is safe */
1047 mntput(mnt);
1048 remove_vfsmnt(mnt);
1049 return 0;
1051 spin_unlock(&dcache_lock);
1054 * Before checking whether the filesystem is still busy,
1055 * make sure the kernel doesn't hold any quota files open
1056 * on the device. If the umount fails, too bad -- there
1057 * are no quotas running any more. Just turn them on again.
1059 DQUOT_OFF(sb);
1060 acct_auto_close(sb->s_dev);
1063 * If we may have to abort operations to get out of this
1064 * mount, and they will themselves hold resources we must
1065 * allow the fs to do things. In the Unix tradition of
1066 * 'Gee thats tricky lets do it in userspace' the umount_begin
1067 * might fail to complete on the first run through as other tasks
1068 * must return, and the like. Thats for the mount program to worry
1069 * about for the moment.
1072 if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
1073 sb->s_op->umount_begin(sb);
1076 * Shrink dcache, then fsync. This guarantees that if the
1077 * filesystem is quiescent at this point, then (a) only the
1078 * root entry should be in use and (b) that root entry is
1079 * clean.
1081 shrink_dcache_sb(sb);
1082 fsync_dev(sb->s_dev);
1084 if (sb->s_root->d_inode->i_state) {
1085 mntput(mnt);
1086 return -EBUSY;
1089 /* Something might grab it again - redo checks */
1091 spin_lock(&dcache_lock);
1092 if (atomic_read(&mnt->mnt_count) > 2) {
1093 spin_unlock(&dcache_lock);
1094 mntput(mnt);
1095 return -EBUSY;
1098 /* OK, that's the point of no return */
1099 mntput(mnt);
1100 remove_vfsmnt(mnt);
1102 kill_super(sb, umount_root);
1103 return 0;
1107 * Now umount can handle mount points as well as block devices.
1108 * This is important for filesystems which use unnamed block devices.
1110 * We now support a flag for forced unmount like the other 'big iron'
1111 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1114 asmlinkage long sys_umount(char * name, int flags)
1116 struct nameidata nd;
1117 char *kname;
1118 int retval;
1120 lock_kernel();
1121 kname = getname(name);
1122 retval = PTR_ERR(kname);
1123 if (IS_ERR(kname))
1124 goto out;
1125 retval = 0;
1126 if (path_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
1127 retval = path_walk(kname, &nd);
1128 putname(kname);
1129 if (retval)
1130 goto out;
1131 retval = -EINVAL;
1132 if (nd.dentry != nd.mnt->mnt_root)
1133 goto dput_and_out;
1135 retval = -EPERM;
1136 if (!capable(CAP_SYS_ADMIN) && current->uid!=nd.mnt->mnt_owner)
1137 goto dput_and_out;
1139 dput(nd.dentry);
1140 /* puts nd.mnt */
1141 down(&mount_sem);
1142 retval = do_umount(nd.mnt, 0, flags);
1143 up(&mount_sem);
1144 goto out;
1145 dput_and_out:
1146 path_release(&nd);
1147 out:
1148 unlock_kernel();
1149 return retval;
1153 * The 2.0 compatible umount. No flags.
1156 asmlinkage long sys_oldumount(char * name)
1158 return sys_umount(name,0);
1161 static int mount_is_safe(struct nameidata *nd)
1163 if (capable(CAP_SYS_ADMIN))
1164 return 0;
1165 return -EPERM;
1166 #ifdef notyet
1167 if (S_ISLNK(nd->dentry->d_inode->i_mode))
1168 return -EPERM;
1169 if (nd->dentry->d_inode->i_mode & S_ISVTX) {
1170 if (current->uid != nd->dentry->d_inode->i_uid)
1171 return -EPERM;
1173 if (permission(nd->dentry->d_inode, MAY_WRITE))
1174 return -EPERM;
1175 return 0;
1176 #endif
1180 * do loopback mount.
1182 static int do_loopback(char *old_name, char *new_name)
1184 struct nameidata old_nd, new_nd;
1185 int err = 0;
1186 if (!old_name || !*old_name)
1187 return -EINVAL;
1188 if (path_init(old_name, LOOKUP_POSITIVE, &old_nd))
1189 err = path_walk(old_name, &old_nd);
1190 if (err)
1191 goto out;
1192 if (path_init(new_name, LOOKUP_POSITIVE, &new_nd))
1193 err = path_walk(new_name, &new_nd);
1194 if (err)
1195 goto out1;
1196 err = mount_is_safe(&new_nd);
1197 if (err)
1198 goto out2;
1199 err = -EINVAL;
1200 if (S_ISDIR(new_nd.dentry->d_inode->i_mode) !=
1201 S_ISDIR(old_nd.dentry->d_inode->i_mode))
1202 goto out2;
1204 err = -ENOMEM;
1205 if (old_nd.mnt->mnt_sb->s_type->fs_flags & FS_SINGLE)
1206 get_filesystem(old_nd.mnt->mnt_sb->s_type);
1208 down(&mount_sem);
1209 /* there we go */
1210 down(&new_nd.dentry->d_inode->i_zombie);
1211 if (IS_DEADDIR(new_nd.dentry->d_inode))
1212 err = -ENOENT;
1213 else if (add_vfsmnt(&new_nd, old_nd.dentry, old_nd.mnt->mnt_devname))
1214 err = 0;
1215 up(&new_nd.dentry->d_inode->i_zombie);
1216 up(&mount_sem);
1217 if (err && old_nd.mnt->mnt_sb->s_type->fs_flags & FS_SINGLE)
1218 put_filesystem(old_nd.mnt->mnt_sb->s_type);
1219 out2:
1220 path_release(&new_nd);
1221 out1:
1222 path_release(&old_nd);
1223 out:
1224 return err;
1228 * change filesystem flags. dir should be a physical root of filesystem.
1229 * If you've mounted a non-root directory somewhere and want to do remount
1230 * on it - tough luck.
1233 static int do_remount(const char *dir,int flags,char *data)
1235 struct nameidata nd;
1236 int retval = 0;
1238 if (!capable(CAP_SYS_ADMIN))
1239 return -EPERM;
1241 if (path_init(dir, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
1242 retval = path_walk(dir, &nd);
1243 if (!retval) {
1244 struct super_block * sb = nd.dentry->d_inode->i_sb;
1245 retval = -ENODEV;
1246 if (sb) {
1247 retval = -EINVAL;
1248 if (nd.dentry == sb->s_root) {
1250 * Shrink the dcache and sync the device.
1252 shrink_dcache_sb(sb);
1253 fsync_dev(sb->s_dev);
1254 if (flags & MS_RDONLY)
1255 acct_auto_close(sb->s_dev);
1256 retval = do_remount_sb(sb, flags, data);
1259 path_release(&nd);
1261 return retval;
1264 static int copy_mount_options (const void *data, unsigned long *where)
1266 int i;
1267 unsigned long page;
1268 unsigned long size;
1270 *where = 0;
1271 if (!data)
1272 return 0;
1274 if (!(page = __get_free_page(GFP_KERNEL)))
1275 return -ENOMEM;
1277 /* We only care that *some* data at the address the user
1278 * gave us is valid. Just in case, we'll zero
1279 * the remainder of the page.
1281 /* copy_from_user cannot cross TASK_SIZE ! */
1282 size = TASK_SIZE - (unsigned long)data;
1283 if (size > PAGE_SIZE)
1284 size = PAGE_SIZE;
1286 i = size - copy_from_user((void *)page, data, size);
1287 if (!i) {
1288 free_page(page);
1289 return -EFAULT;
1291 if (i != PAGE_SIZE)
1292 memset((char *)page + i, 0, PAGE_SIZE - i);
1293 *where = page;
1294 return 0;
1298 * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1299 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1301 * data is a (void *) that can point to any structure up to
1302 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1303 * information (or be NULL).
1305 * NOTE! As pre-0.97 versions of mount() didn't use this setup, the
1306 * flags have to have a special 16-bit magic number in the high word:
1307 * 0xC0ED. If this magic word isn't present, the flags and data info
1308 * aren't used, as the syscall assumes we are talking to an older
1309 * version that didn't understand them.
1311 long do_mount(char * dev_name, char * dir_name, char *type_page,
1312 unsigned long new_flags, void *data_page)
1314 struct file_system_type * fstype;
1315 struct nameidata nd;
1316 struct vfsmount *mnt = NULL;
1317 struct super_block *sb;
1318 int retval = 0;
1319 unsigned long flags = 0;
1321 /* Basic sanity checks */
1323 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1324 return -EINVAL;
1325 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1326 return -EINVAL;
1328 /* OK, looks good, now let's see what do they want */
1330 /* just change the flags? - capabilities are checked in do_remount() */
1331 if ((new_flags & (MS_MGC_MSK|MS_REMOUNT)) == (MS_MGC_VAL|MS_REMOUNT))
1332 return do_remount(dir_name, new_flags&~(MS_MGC_MSK|MS_REMOUNT),
1333 (char *) data_page);
1335 if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1336 flags = new_flags & ~MS_MGC_MSK;
1338 /* For the rest we need the type */
1340 if (!type_page || !memchr(type_page, 0, PAGE_SIZE))
1341 return -EINVAL;
1343 /* loopback mount? This is special - requires fewer capabilities */
1344 if (strcmp(type_page, "bind")==0)
1345 return do_loopback(dev_name, dir_name);
1347 /* for the rest we _really_ need capabilities... */
1348 if (!capable(CAP_SYS_ADMIN))
1349 return -EPERM;
1351 /* ... filesystem driver... */
1352 fstype = get_fs_type(type_page);
1353 if (!fstype)
1354 return -ENODEV;
1356 /* ... and mountpoint. Do the lookup first to force automounting. */
1357 if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1358 retval = path_walk(dir_name, &nd);
1359 if (retval)
1360 goto fs_out;
1362 /* get superblock, locks mount_sem on success */
1363 if (fstype->fs_flags & FS_NOMOUNT)
1364 sb = ERR_PTR(-EINVAL);
1365 else if (fstype->fs_flags & FS_REQUIRES_DEV)
1366 sb = get_sb_bdev(fstype, dev_name,flags, data_page);
1367 else if (fstype->fs_flags & FS_SINGLE)
1368 sb = get_sb_single(fstype, flags, data_page);
1369 else
1370 sb = get_sb_nodev(fstype, flags, data_page);
1372 retval = PTR_ERR(sb);
1373 if (IS_ERR(sb))
1374 goto dput_out;
1376 /* Something was mounted here while we slept */
1377 while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
1379 retval = -ENOENT;
1380 if (!nd.dentry->d_inode)
1381 goto fail;
1382 down(&nd.dentry->d_inode->i_zombie);
1383 if (!IS_DEADDIR(nd.dentry->d_inode)) {
1384 retval = -ENOMEM;
1385 mnt = add_vfsmnt(&nd, sb->s_root, dev_name);
1387 up(&nd.dentry->d_inode->i_zombie);
1388 if (!mnt)
1389 goto fail;
1390 retval = 0;
1391 unlock_out:
1392 up(&mount_sem);
1393 dput_out:
1394 path_release(&nd);
1395 fs_out:
1396 put_filesystem(fstype);
1397 return retval;
1399 fail:
1400 if (list_empty(&sb->s_mounts))
1401 kill_super(sb, 0);
1402 goto unlock_out;
1405 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1406 unsigned long new_flags, void * data)
1408 int retval;
1409 unsigned long data_page;
1410 unsigned long type_page;
1411 unsigned long dev_page;
1412 char *dir_page;
1414 retval = copy_mount_options (type, &type_page);
1415 if (retval < 0)
1416 return retval;
1418 dir_page = getname(dir_name);
1419 retval = PTR_ERR(dir_page);
1420 if (IS_ERR(dir_page))
1421 goto out1;
1423 retval = copy_mount_options (dev_name, &dev_page);
1424 if (retval < 0)
1425 goto out2;
1426 retval = copy_mount_options (data, &data_page);
1427 if (retval >= 0) {
1428 lock_kernel();
1429 retval = do_mount((char*)dev_page,dir_page,(char*)type_page,
1430 new_flags, (void*)data_page);
1431 unlock_kernel();
1432 free_page(data_page);
1434 free_page(dev_page);
1435 out2:
1436 putname(dir_page);
1437 out1:
1438 free_page(type_page);
1439 return retval;
1442 void __init mount_root(void)
1444 struct file_system_type * fs_type;
1445 struct super_block * sb;
1446 struct vfsmount *vfsmnt;
1447 struct block_device *bdev = NULL;
1448 mode_t mode;
1449 int retval;
1450 void *handle;
1451 char path[64];
1452 int path_start = -1;
1454 #ifdef CONFIG_ROOT_NFS
1455 void *data;
1456 if (MAJOR(ROOT_DEV) != UNNAMED_MAJOR)
1457 goto skip_nfs;
1458 fs_type = get_fs_type("nfs");
1459 if (!fs_type)
1460 goto no_nfs;
1461 ROOT_DEV = get_unnamed_dev();
1462 if (!ROOT_DEV)
1464 * Your /linuxrc sucks worse than MSExchange - that's the
1465 * only way you could run out of anon devices at that point.
1467 goto no_anon;
1468 data = nfs_root_data();
1469 if (!data)
1470 goto no_server;
1471 sb = read_super(ROOT_DEV, NULL, fs_type, root_mountflags, data, 1);
1472 if (sb)
1474 * We _can_ fail there, but if that will happen we have no
1475 * chance anyway (no memory for vfsmnt and we _will_ need it,
1476 * no matter which fs we try to mount).
1478 goto mount_it;
1479 no_server:
1480 put_unnamed_dev(ROOT_DEV);
1481 no_anon:
1482 put_filesystem(fs_type);
1483 no_nfs:
1484 printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1485 ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1486 skip_nfs:
1487 #endif
1489 #ifdef CONFIG_BLK_DEV_FD
1490 if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1491 #ifdef CONFIG_BLK_DEV_RAM
1492 extern int rd_doload;
1493 extern void rd_load_secondary(void);
1494 #endif
1495 floppy_eject();
1496 #ifndef CONFIG_BLK_DEV_RAM
1497 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1498 #else
1499 /* rd_doload is 2 for a dual initrd/ramload setup */
1500 if(rd_doload==2)
1501 rd_load_secondary();
1502 else
1503 #endif
1505 printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1506 wait_for_keypress();
1509 #endif
1511 devfs_make_root (root_device_name);
1512 handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME,
1513 MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1514 DEVFS_SPECIAL_BLK, 1);
1515 if (handle) /* Sigh: bd*() functions only paper over the cracks */
1517 unsigned major, minor;
1519 devfs_get_maj_min (handle, &major, &minor);
1520 ROOT_DEV = MKDEV (major, minor);
1524 * Probably pure paranoia, but I'm less than happy about delving into
1525 * devfs crap and checking it right now. Later.
1527 if (!ROOT_DEV)
1528 panic("I have no root and I want to scream");
1530 bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1531 if (!bdev)
1532 panic(__FUNCTION__ ": unable to allocate root device");
1533 bdev->bd_op = devfs_get_ops (handle);
1534 path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1535 mode = FMODE_READ;
1536 if (!(root_mountflags & MS_RDONLY))
1537 mode |= FMODE_WRITE;
1538 retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1539 if (retval == -EROFS) {
1540 root_mountflags |= MS_RDONLY;
1541 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1543 if (retval) {
1545 * Allow the user to distinguish between failed open
1546 * and bad superblock on root device.
1548 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1549 root_device_name, kdevname (ROOT_DEV));
1550 printk ("Please append a correct \"root=\" boot option\n");
1551 panic("VFS: Unable to mount root fs on %s",
1552 kdevname(ROOT_DEV));
1555 check_disk_change(ROOT_DEV);
1556 sb = get_super(ROOT_DEV);
1557 if (sb) {
1558 fs_type = sb->s_type;
1559 goto mount_it;
1562 read_lock(&file_systems_lock);
1563 for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1564 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1565 continue;
1566 if (!try_inc_mod_count(fs_type->owner))
1567 continue;
1568 read_unlock(&file_systems_lock);
1569 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1570 if (sb)
1571 goto mount_it;
1572 read_lock(&file_systems_lock);
1573 put_filesystem(fs_type);
1575 read_unlock(&file_systems_lock);
1576 panic("VFS: Unable to mount root fs on %s", kdevname(ROOT_DEV));
1578 mount_it:
1579 printk ("VFS: Mounted root (%s filesystem)%s.\n",
1580 fs_type->name,
1581 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1582 if (path_start >= 0) {
1583 devfs_mk_symlink (NULL, "root", DEVFS_FL_DEFAULT,
1584 path + 5 + path_start, NULL, NULL);
1585 memcpy (path + path_start, "/dev/", 5);
1586 vfsmnt = add_vfsmnt(NULL, sb->s_root, path + path_start);
1588 else
1589 vfsmnt = add_vfsmnt(NULL, sb->s_root, "/dev/root");
1590 /* FIXME: if something will try to umount us right now... */
1591 if (vfsmnt) {
1592 set_fs_root(current->fs, vfsmnt, sb->s_root);
1593 set_fs_pwd(current->fs, vfsmnt, sb->s_root);
1594 if (bdev)
1595 bdput(bdev); /* sb holds a reference */
1596 return;
1598 panic("VFS: add_vfsmnt failed for root fs");
1602 static void chroot_fs_refs(struct dentry *old_root,
1603 struct vfsmount *old_rootmnt,
1604 struct dentry *new_root,
1605 struct vfsmount *new_rootmnt)
1607 struct task_struct *p;
1608 struct fs_struct *fs;
1610 read_lock(&tasklist_lock);
1611 for_each_task(p) {
1612 task_lock(p);
1613 fs = p->fs;
1614 if (fs) {
1615 atomic_inc(&fs->count);
1616 task_unlock(p);
1617 if (fs->root==old_root && fs->rootmnt==old_rootmnt)
1618 set_fs_root(fs, new_rootmnt, new_root);
1619 if (fs->pwd==old_root && fs->pwdmnt==old_rootmnt)
1620 set_fs_pwd(fs, new_rootmnt, new_root);
1621 put_fs_struct(fs);
1622 } else
1623 task_unlock(p);
1625 read_unlock(&tasklist_lock);
1629 * Moves the current root to put_root, and sets root/cwd of all processes
1630 * which had them on the old root to new_root.
1632 * Note:
1633 * - we don't move root/cwd if they are not at the root (reason: if something
1634 * cared enough to change them, it's probably wrong to force them elsewhere)
1635 * - it's okay to pick a root that isn't the root of a file system, e.g.
1636 * /nfs/my_root where /nfs is the mount point. Better avoid creating
1637 * unreachable mount points this way, though.
1640 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1642 struct dentry *root;
1643 struct vfsmount *root_mnt;
1644 struct vfsmount *tmp;
1645 struct nameidata new_nd, old_nd;
1646 char *name;
1647 int error;
1649 if (!capable(CAP_SYS_ADMIN))
1650 return -EPERM;
1652 lock_kernel();
1654 name = getname(new_root);
1655 error = PTR_ERR(name);
1656 if (IS_ERR(name))
1657 goto out0;
1658 error = 0;
1659 if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
1660 error = path_walk(name, &new_nd);
1661 putname(name);
1662 if (error)
1663 goto out0;
1665 name = getname(put_old);
1666 error = PTR_ERR(name);
1667 if (IS_ERR(name))
1668 goto out0;
1669 error = 0;
1670 if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
1671 error = path_walk(name, &old_nd);
1672 putname(name);
1673 if (error)
1674 goto out1;
1676 read_lock(&current->fs->lock);
1677 root_mnt = mntget(current->fs->rootmnt);
1678 root = dget(current->fs->root);
1679 read_unlock(&current->fs->lock);
1680 down(&mount_sem);
1681 down(&old_nd.dentry->d_inode->i_zombie);
1682 error = -ENOENT;
1683 if (IS_DEADDIR(new_nd.dentry->d_inode))
1684 goto out2;
1685 if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
1686 goto out2;
1687 if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
1688 goto out2;
1689 error = -EBUSY;
1690 if (new_nd.mnt == root_mnt || old_nd.mnt == root_mnt)
1691 goto out2; /* loop */
1692 error = -EINVAL;
1693 tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
1694 spin_lock(&dcache_lock);
1695 if (tmp != new_nd.mnt) {
1696 for (;;) {
1697 if (tmp->mnt_parent == tmp)
1698 goto out3;
1699 if (tmp->mnt_parent == new_nd.mnt)
1700 break;
1701 tmp = tmp->mnt_parent;
1703 if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry))
1704 goto out3;
1705 } else if (!is_subdir(old_nd.dentry, new_nd.dentry))
1706 goto out3;
1707 spin_unlock(&dcache_lock);
1709 move_vfsmnt(new_nd.mnt, new_nd.dentry, NULL, NULL);
1710 move_vfsmnt(root_mnt, old_nd.dentry, old_nd.mnt, NULL);
1711 chroot_fs_refs(root,root_mnt,new_nd.dentry,new_nd.mnt);
1712 error = 0;
1713 out2:
1714 up(&old_nd.dentry->d_inode->i_zombie);
1715 up(&mount_sem);
1716 dput(root);
1717 mntput(root_mnt);
1718 path_release(&old_nd);
1719 out1:
1720 path_release(&new_nd);
1721 out0:
1722 unlock_kernel();
1723 return error;
1724 out3:
1725 spin_unlock(&dcache_lock);
1726 goto out2;
#ifdef CONFIG_BLK_DEV_INITRD

/*
 * Switch the root filesystem to new_root_dev and move the previous root
 * mount onto put_old.
 *
 * devfs is unmounted from /dev first if it is mounted there, the new root
 * is mounted with mount_root(), and devfs is mounted again.  If put_old
 * cannot be looked up in the new root, an unmount of the old root is
 * attempted instead: returns 0 if that succeeds, otherwise the lookup
 * error.  Returns 0 after the old root has been moved to put_old.
 */
int __init change_root(kdev_t new_root_dev,const char *put_old)
{
	struct vfsmount *old_rootmnt;
	struct nameidata devfs_nd, nd;
	int error = 0;

	read_lock(&current->fs->lock);
	old_rootmnt = mntget(current->fs->rootmnt);
	read_unlock(&current->fs->lock);
	/*  First unmount devfs if mounted  */
	if (path_init("/dev", LOOKUP_FOLLOW|LOOKUP_POSITIVE, &devfs_nd))
		error = path_walk("/dev", &devfs_nd);
	if (!error) {
		if (devfs_nd.mnt->mnt_sb->s_magic == DEVFS_SUPER_MAGIC &&
		    devfs_nd.dentry == devfs_nd.mnt->mnt_root) {
			dput(devfs_nd.dentry);
			down(&mount_sem);
			/* puts devfs_nd.mnt */
			do_umount(devfs_nd.mnt, 0, 0);
			up(&mount_sem);
		} else
			path_release(&devfs_nd);
	}
	ROOT_DEV = new_root_dev;
	mount_root();
#if 1
	shrink_dcache();
	printk("change_root: old root has d_count=%d\n",
	       atomic_read(&old_rootmnt->mnt_root->d_count));
#endif
	mount_devfs_fs ();
	/*
	 * Get the new mount directory
	 */
	error = 0;
	if (path_init(put_old, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
		error = path_walk(put_old, &nd);
	if (error) {
		int blivet;

		printk(KERN_NOTICE "Trying to unmount old root ... ");
		blivet = do_umount(old_rootmnt, 1, 0);
		if (!blivet) {
			printk("okay\n");
			return 0;
		}
		printk(KERN_ERR "error %d\n", blivet);
		return error;
	}
	/* FIXME: we should hold i_zombie on nd.dentry */
	move_vfsmnt(old_rootmnt, nd.dentry, nd.mnt, "/dev/root.old");
	mntput(old_rootmnt);
	path_release(&nd);
	return 0;
}

#endif