Fix semaphores in modules.
[linux-2.6/linux-mips.git] / fs / super.c
blob30248780771ab0d785cd629e11748e503aa3fac2
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables.
 *                                   - mount system call
 *                                   - umount system call
 *
 *  Added options to /proc/mounts
 *  Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 */
21 #include <linux/config.h>
22 #include <linux/string.h>
23 #include <linux/malloc.h>
24 #include <linux/locks.h>
25 #include <linux/smp_lock.h>
26 #include <linux/devfs_fs_kernel.h>
27 #include <linux/fd.h>
28 #include <linux/init.h>
29 #include <linux/quotaops.h>
30 #include <linux/acct.h>
32 #include <asm/uaccess.h>
34 #include <linux/nfs_fs.h>
35 #include <linux/nfs_fs_sb.h>
36 #include <linux/nfs_mount.h>
38 #include <linux/kmod.h>
39 #define __NO_VERSION__
40 #include <linux/module.h>
43 * We use a semaphore to synchronize all mount/umount
44 * activity - imagine the mess if we have a race between
45 * unmounting a filesystem and re-mounting it (or something
46 * else).
48 static DECLARE_MUTEX(mount_sem);
50 extern void wait_for_keypress(void);
52 extern int root_mountflags;
54 static int do_remount_sb(struct super_block *sb, int flags, char * data);
56 /* this is initialized in init/main.c */
57 kdev_t ROOT_DEV;
59 int nr_super_blocks = 0;
60 int max_super_blocks = NR_SUPER;
61 LIST_HEAD(super_blocks);
64 * Handling of filesystem drivers list.
65 * Rules:
66 * Inclusion to/removals from/scanning of list are protected by spinlock.
67 * During the unload module must call unregister_filesystem().
68 * We can access the fields of list element if:
69 * 1) spinlock is held or
70 * 2) we hold the reference to the module.
71 * The latter can be guaranteed by call of try_inc_mod_count(); if it
72 * returned 0 we must skip the element, otherwise we got the reference.
73 * Once the reference is obtained we can drop the spinlock.
76 static struct file_system_type *file_systems = NULL;
77 static spinlock_t file_systems_lock = SPIN_LOCK_UNLOCKED;
79 static void put_filesystem(struct file_system_type *fs)
81 if (fs->owner)
82 __MOD_DEC_USE_COUNT(fs->owner);
85 static struct file_system_type **find_filesystem(const char *name)
87 struct file_system_type **p;
88 for (p=&file_systems; *p; p=&(*p)->next)
89 if (strcmp((*p)->name,name) == 0)
90 break;
91 return p;
94 int register_filesystem(struct file_system_type * fs)
96 int res = 0;
97 struct file_system_type ** p;
99 if (!fs)
100 return -EINVAL;
101 if (fs->next)
102 return -EBUSY;
103 spin_lock(&file_systems_lock);
104 p = find_filesystem(fs->name);
105 if (*p)
106 res = -EBUSY;
107 else
108 *p = fs;
109 spin_unlock(&file_systems_lock);
110 return res;
113 int unregister_filesystem(struct file_system_type * fs)
115 struct file_system_type ** tmp;
117 spin_lock(&file_systems_lock);
118 tmp = &file_systems;
119 while (*tmp) {
120 if (fs == *tmp) {
121 *tmp = fs->next;
122 fs->next = NULL;
123 spin_unlock(&file_systems_lock);
124 return 0;
126 tmp = &(*tmp)->next;
128 spin_unlock(&file_systems_lock);
129 return -EINVAL;
132 static int fs_index(const char * __name)
134 struct file_system_type * tmp;
135 char * name;
136 int err, index;
138 name = getname(__name);
139 err = PTR_ERR(name);
140 if (IS_ERR(name))
141 return err;
143 err = -EINVAL;
144 spin_lock(&file_systems_lock);
145 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
146 if (strcmp(tmp->name,name) == 0) {
147 err = index;
148 break;
151 spin_unlock(&file_systems_lock);
152 putname(name);
153 return err;
156 static int fs_name(unsigned int index, char * buf)
158 struct file_system_type * tmp;
159 int len, res;
161 spin_lock(&file_systems_lock);
162 for (tmp = file_systems; tmp; tmp = tmp->next, index--)
163 if (index <= 0 && try_inc_mod_count(tmp->owner))
164 break;
165 spin_unlock(&file_systems_lock);
166 if (!tmp)
167 return -EINVAL;
169 /* OK, we got the reference, so we can safely block */
170 len = strlen(tmp->name) + 1;
171 res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
172 put_filesystem(tmp);
173 return res;
176 static int fs_maxindex(void)
178 struct file_system_type * tmp;
179 int index;
181 spin_lock(&file_systems_lock);
182 for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
184 spin_unlock(&file_systems_lock);
185 return index;
189 * Whee.. Weird sysv syscall.
191 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
193 int retval = -EINVAL;
195 lock_kernel();
196 switch (option) {
197 case 1:
198 retval = fs_index((const char *) arg1);
199 break;
201 case 2:
202 retval = fs_name(arg1, (char *) arg2);
203 break;
205 case 3:
206 retval = fs_maxindex();
207 break;
209 unlock_kernel();
210 return retval;
213 int get_filesystem_list(char * buf)
215 int len = 0;
216 struct file_system_type * tmp;
218 spin_lock(&file_systems_lock);
219 tmp = file_systems;
220 while (tmp && len < PAGE_SIZE - 80) {
221 len += sprintf(buf+len, "%s\t%s\n",
222 (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
223 tmp->name);
224 tmp = tmp->next;
226 spin_unlock(&file_systems_lock);
227 return len;
230 static struct file_system_type *get_fs_type(const char *name)
232 struct file_system_type *fs;
234 spin_lock(&file_systems_lock);
235 fs = *(find_filesystem(name));
236 if (fs && !try_inc_mod_count(fs->owner))
237 fs = NULL;
238 spin_unlock(&file_systems_lock);
239 if (!fs && (request_module(name) == 0)) {
240 spin_lock(&file_systems_lock);
241 fs = *(find_filesystem(name));
242 if (fs && !try_inc_mod_count(fs->owner))
243 fs = NULL;
244 spin_unlock(&file_systems_lock);
246 return fs;
249 struct vfsmount *vfsmntlist = NULL;
250 static struct vfsmount *vfsmnttail = NULL, *mru_vfsmnt = NULL;
252 static struct vfsmount *add_vfsmnt(struct super_block *sb,
253 const char *dev_name, const char *dir_name)
255 struct vfsmount *lptr;
256 char *tmp, *name;
258 lptr = (struct vfsmount *)kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
259 if (!lptr)
260 goto out;
261 memset(lptr, 0, sizeof(struct vfsmount));
263 lptr->mnt_sb = sb;
264 lptr->mnt_dev = sb->s_dev;
266 /* N.B. Is it really OK to have a vfsmount without names? */
267 if (dev_name && !IS_ERR(tmp = getname(dev_name))) {
268 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
269 if (name) {
270 strcpy(name, tmp);
271 lptr->mnt_devname = name;
273 putname(tmp);
275 if (dir_name && !IS_ERR(tmp = getname(dir_name))) {
276 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
277 if (name) {
278 strcpy(name, tmp);
279 lptr->mnt_dirname = name;
281 putname(tmp);
284 if (vfsmntlist == (struct vfsmount *)NULL) {
285 vfsmntlist = vfsmnttail = lptr;
286 } else {
287 vfsmnttail->mnt_next = lptr;
288 vfsmnttail = lptr;
290 out:
291 return lptr;
294 void remove_vfsmnt(kdev_t dev)
296 struct vfsmount *lptr, *tofree;
298 if (vfsmntlist == NULL)
299 return;
300 lptr = vfsmntlist;
301 if (lptr->mnt_dev == dev) {
302 tofree = lptr;
303 vfsmntlist = lptr->mnt_next;
304 if (vfsmnttail->mnt_dev == dev)
305 vfsmnttail = vfsmntlist;
306 } else {
307 while (lptr->mnt_next != NULL) {
308 if (lptr->mnt_next->mnt_dev == dev)
309 break;
310 lptr = lptr->mnt_next;
312 tofree = lptr->mnt_next;
313 if (tofree == NULL)
314 return;
315 lptr->mnt_next = lptr->mnt_next->mnt_next;
316 if (vfsmnttail->mnt_dev == dev)
317 vfsmnttail = lptr;
319 if (tofree == mru_vfsmnt)
320 mru_vfsmnt = NULL;
321 kfree(tofree->mnt_devname);
322 kfree(tofree->mnt_dirname);
323 kfree_s(tofree, sizeof(struct vfsmount));
326 static struct proc_fs_info {
327 int flag;
328 char *str;
329 } fs_info[] = {
330 { MS_NOEXEC, ",noexec" },
331 { MS_NOSUID, ",nosuid" },
332 { MS_NODEV, ",nodev" },
333 { MS_SYNCHRONOUS, ",sync" },
334 { MS_MANDLOCK, ",mand" },
335 { MS_NOATIME, ",noatime" },
336 { MS_NODIRATIME, ",nodiratime" },
337 #ifdef MS_NOSUB /* Can't find this except in mount.c */
338 { MS_NOSUB, ",nosub" },
339 #endif
340 { 0, NULL }
343 static struct proc_nfs_info {
344 int flag;
345 char *str;
346 } nfs_info[] = {
347 { NFS_MOUNT_SOFT, ",soft" },
348 { NFS_MOUNT_INTR, ",intr" },
349 { NFS_MOUNT_POSIX, ",posix" },
350 { NFS_MOUNT_NOCTO, ",nocto" },
351 { NFS_MOUNT_NOAC, ",noac" },
352 { 0, NULL }
355 int get_filesystem_info( char *buf )
357 struct vfsmount *tmp;
358 struct proc_fs_info *fs_infop;
359 struct proc_nfs_info *nfs_infop;
360 struct nfs_server *nfss;
361 int len = 0;
362 char *path,*buffer = (char *) __get_free_page(GFP_KERNEL);
364 if (!buffer) return 0;
365 for (tmp = vfsmntlist; tmp && len < PAGE_SIZE - 160;
366 tmp = tmp->mnt_next) {
367 path = d_path(tmp->mnt_sb->s_root, buffer, PAGE_SIZE);
368 if (!path)
369 continue;
370 len += sprintf( buf + len, "%s %s %s %s",
371 tmp->mnt_devname, path,
372 tmp->mnt_sb->s_type->name,
373 tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw" );
374 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
375 if (tmp->mnt_sb->s_flags & fs_infop->flag) {
376 strcpy(buf + len, fs_infop->str);
377 len += strlen(fs_infop->str);
380 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
381 nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
382 if (nfss->rsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
383 len += sprintf(buf+len, ",rsize=%d",
384 nfss->rsize);
386 if (nfss->wsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
387 len += sprintf(buf+len, ",wsize=%d",
388 nfss->wsize);
390 #if 0
391 if (nfss->timeo != 7*HZ/10) {
392 len += sprintf(buf+len, ",timeo=%d",
393 nfss->timeo*10/HZ);
395 if (nfss->retrans != 3) {
396 len += sprintf(buf+len, ",retrans=%d",
397 nfss->retrans);
399 #endif
400 if (nfss->acregmin != 3*HZ) {
401 len += sprintf(buf+len, ",acregmin=%d",
402 nfss->acregmin/HZ);
404 if (nfss->acregmax != 60*HZ) {
405 len += sprintf(buf+len, ",acregmax=%d",
406 nfss->acregmax/HZ);
408 if (nfss->acdirmin != 30*HZ) {
409 len += sprintf(buf+len, ",acdirmin=%d",
410 nfss->acdirmin/HZ);
412 if (nfss->acdirmax != 60*HZ) {
413 len += sprintf(buf+len, ",acdirmax=%d",
414 nfss->acdirmax/HZ);
416 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
417 if (nfss->flags & nfs_infop->flag) {
418 strcpy(buf + len, nfs_infop->str);
419 len += strlen(nfs_infop->str);
422 len += sprintf(buf+len, ",addr=%s",
423 nfss->hostname);
425 len += sprintf( buf + len, " 0 0\n" );
428 free_page((unsigned long) buffer);
429 return len;
432 void __wait_on_super(struct super_block * sb)
434 DECLARE_WAITQUEUE(wait, current);
436 add_wait_queue(&sb->s_wait, &wait);
437 repeat:
438 set_current_state(TASK_UNINTERRUPTIBLE);
439 if (sb->s_lock) {
440 schedule();
441 goto repeat;
443 remove_wait_queue(&sb->s_wait, &wait);
444 current->state = TASK_RUNNING;
448 * Note: check the dirty flag before waiting, so we don't
449 * hold up the sync while mounting a device. (The newly
450 * mounted device won't need syncing.)
452 void sync_supers(kdev_t dev)
454 struct super_block * sb;
456 for (sb = sb_entry(super_blocks.next);
457 sb != sb_entry(&super_blocks);
458 sb = sb_entry(sb->s_list.next)) {
459 if (!sb->s_dev)
460 continue;
461 if (dev && sb->s_dev != dev)
462 continue;
463 if (!sb->s_dirt)
464 continue;
465 /* N.B. Should lock the superblock while writing */
466 wait_on_super(sb);
467 if (!sb->s_dev || !sb->s_dirt)
468 continue;
469 if (dev && (dev != sb->s_dev))
470 continue;
471 if (sb->s_op && sb->s_op->write_super)
472 sb->s_op->write_super(sb);
476 struct super_block * get_super(kdev_t dev)
478 struct super_block * s;
480 if (!dev)
481 return NULL;
482 restart:
483 s = sb_entry(super_blocks.next);
484 while (s != sb_entry(&super_blocks))
485 if (s->s_dev == dev) {
486 wait_on_super(s);
487 if (s->s_dev == dev)
488 return s;
489 goto restart;
490 } else
491 s = sb_entry(s->s_list.next);
492 return NULL;
495 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
497 struct super_block *s;
498 struct ustat tmp;
499 struct statfs sbuf;
500 int err = -EINVAL;
502 lock_kernel();
503 s = get_super(to_kdev_t(dev));
504 if (s == NULL)
505 goto out;
506 err = vfs_statfs(s, &sbuf);
507 if (err)
508 goto out;
510 memset(&tmp,0,sizeof(struct ustat));
511 tmp.f_tfree = sbuf.f_bfree;
512 tmp.f_tinode = sbuf.f_ffree;
514 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
515 out:
516 unlock_kernel();
517 return err;
521 * Find a super_block with no device assigned.
523 struct super_block *get_empty_super(void)
525 struct super_block *s;
527 for (s = sb_entry(super_blocks.next);
528 s != sb_entry(&super_blocks);
529 s = sb_entry(s->s_list.next)) {
530 if (s->s_dev)
531 continue;
532 if (!s->s_lock)
533 return s;
534 printk("VFS: empty superblock %p locked!\n", s);
536 /* Need a new one... */
537 if (nr_super_blocks >= max_super_blocks)
538 return NULL;
539 s = kmalloc(sizeof(struct super_block), GFP_USER);
540 if (s) {
541 nr_super_blocks++;
542 memset(s, 0, sizeof(struct super_block));
543 INIT_LIST_HEAD(&s->s_dirty);
544 list_add (&s->s_list, super_blocks.prev);
545 init_waitqueue_head(&s->s_wait);
546 INIT_LIST_HEAD(&s->s_files);
548 return s;
551 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
552 struct file_system_type *type, int flags,
553 void *data, int silent)
555 struct super_block * s;
556 s = get_empty_super();
557 if (!s)
558 goto out;
559 s->s_dev = dev;
560 s->s_bdev = bdev;
561 s->s_flags = flags;
562 s->s_dirt = 0;
563 sema_init(&s->s_vfs_rename_sem,1);
564 sema_init(&s->s_nfsd_free_path_sem,1);
565 s->s_type = type;
566 sema_init(&s->s_dquot.dqio_sem, 1);
567 sema_init(&s->s_dquot.dqoff_sem, 1);
568 s->s_dquot.flags = 0;
569 lock_super(s);
570 if (!type->read_super(s, data, silent))
571 goto out_fail;
572 unlock_super(s);
573 /* tell bdcache that we are going to keep this one */
574 if (bdev)
575 atomic_inc(&bdev->bd_count);
576 out:
577 return s;
579 out_fail:
580 s->s_dev = 0;
581 s->s_bdev = 0;
582 s->s_type = NULL;
583 unlock_super(s);
584 return NULL;
/*
 * Unnamed block devices are dummy devices used by virtual
 * filesystems which don't use real block-devices.  -- jrs
 *
 * Bitmap with one bit per minor number (256 bits in total); a set bit
 * means the minor has been handed out by get_unnamed_dev().
 */

static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))] = { 0, };
594 kdev_t get_unnamed_dev(void)
596 int i;
598 for (i = 1; i < 256; i++) {
599 if (!test_and_set_bit(i,unnamed_dev_in_use))
600 return MKDEV(UNNAMED_MAJOR, i);
602 return 0;
605 void put_unnamed_dev(kdev_t dev)
607 if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
608 return;
609 if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
610 return;
611 printk("VFS: put_unnamed_dev: freeing unused device %s\n",
612 kdevname(dev));
615 static int d_umount(struct super_block * sb)
617 struct dentry * root = sb->s_root;
618 struct dentry * covered = root->d_covers;
620 if (root->d_count != 1)
621 return -EBUSY;
623 if (root->d_inode->i_state)
624 return -EBUSY;
626 sb->s_root = NULL;
628 if (covered != root) {
629 root->d_covers = root;
630 covered->d_mounts = covered;
631 dput(covered);
633 dput(root);
634 return 0;
637 static void d_mount(struct dentry *covered, struct dentry *dentry)
639 if (covered->d_mounts != covered) {
640 printk("VFS: mount - already mounted\n");
641 return;
643 covered->d_mounts = dentry;
644 dentry->d_covers = covered;
647 static struct block_device *do_umount(kdev_t dev, int unmount_root, int flags)
649 struct super_block * sb;
650 struct block_device *bdev;
651 int retval;
653 retval = -ENOENT;
654 sb = get_super(dev);
655 if (!sb || !sb->s_root)
656 goto out;
659 * Before checking whether the filesystem is still busy,
660 * make sure the kernel doesn't hold any quota files open
661 * on the device. If the umount fails, too bad -- there
662 * are no quotas running any more. Just turn them on again.
664 DQUOT_OFF(sb);
665 acct_auto_close(dev);
668 * If we may have to abort operations to get out of this
669 * mount, and they will themselves hold resources we must
670 * allow the fs to do things. In the Unix tradition of
671 * 'Gee thats tricky lets do it in userspace' the umount_begin
672 * might fail to complete on the first run through as other tasks
673 * must return, and the like. Thats for the mount program to worry
674 * about for the moment.
677 if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
678 sb->s_op->umount_begin(sb);
681 * Shrink dcache, then fsync. This guarantees that if the
682 * filesystem is quiescent at this point, then (a) only the
683 * root entry should be in use and (b) that root entry is
684 * clean.
686 shrink_dcache_sb(sb);
687 fsync_dev(dev);
689 if (sb == current->fs->root->d_sb && !unmount_root) {
691 * Special case for "unmounting" root ...
692 * we just try to remount it readonly.
694 retval = 0;
695 if (!(sb->s_flags & MS_RDONLY))
696 retval = do_remount_sb(sb, MS_RDONLY, 0);
697 return ERR_PTR(retval);
700 retval = d_umount(sb);
701 if (retval)
702 goto out;
704 if (sb->s_op) {
705 if (sb->s_op->write_super && sb->s_dirt)
706 sb->s_op->write_super(sb);
709 lock_super(sb);
710 if (sb->s_op) {
711 if (sb->s_op->put_super)
712 sb->s_op->put_super(sb);
715 /* Forget any remaining inodes */
716 if (invalidate_inodes(sb)) {
717 printk("VFS: Busy inodes after unmount. "
718 "Self-destruct in 5 seconds. Have a nice day...\n");
721 sb->s_dev = 0; /* Free the superblock */
722 bdev = sb->s_bdev;
723 sb->s_bdev = NULL;
724 put_filesystem(sb->s_type);
725 sb->s_type = NULL;
726 unlock_super(sb);
728 remove_vfsmnt(dev);
730 return bdev;
732 out:
733 return ERR_PTR(retval);
736 static int umount_dev(kdev_t dev, int flags)
738 int retval;
739 struct block_device *bdev;
741 retval = -ENXIO;
742 if (MAJOR(dev) >= MAX_BLKDEV)
743 goto out;
745 fsync_dev(dev);
747 down(&mount_sem);
749 bdev = do_umount(dev, 0, flags);
750 if (IS_ERR(bdev))
751 retval = PTR_ERR(bdev);
752 else {
753 retval = 0;
754 if (bdev) {
755 blkdev_put(bdev, BDEV_FS);
756 bdput(bdev);
757 } else {
758 put_unnamed_dev(dev);
761 up(&mount_sem);
762 out:
763 return retval;
767 * Now umount can handle mount points as well as block devices.
768 * This is important for filesystems which use unnamed block devices.
770 * We now support a flag for forced unmount like the other 'big iron'
771 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
774 asmlinkage long sys_umount(char * name, int flags)
776 struct dentry * dentry;
777 int retval;
779 if (!capable(CAP_SYS_ADMIN))
780 return -EPERM;
782 lock_kernel();
783 dentry = namei(name);
784 retval = PTR_ERR(dentry);
785 if (!IS_ERR(dentry)) {
786 struct inode * inode = dentry->d_inode;
787 kdev_t dev = inode->i_rdev;
789 retval = 0;
790 if (S_ISBLK(inode->i_mode)) {
791 if (IS_NODEV(inode))
792 retval = -EACCES;
793 } else {
794 struct super_block *sb = inode->i_sb;
795 retval = -EINVAL;
796 if (sb && inode == sb->s_root->d_inode) {
797 dev = sb->s_dev;
798 retval = 0;
801 dput(dentry);
803 if (!retval)
804 retval = umount_dev(dev, flags);
806 unlock_kernel();
807 return retval;
811 * The 2.0 compatible umount. No flags.
814 asmlinkage long sys_oldumount(char * name)
816 return sys_umount(name,0);
820 * Check whether we can mount the specified device.
822 int fs_may_mount(kdev_t dev)
824 struct super_block * sb = get_super(dev);
825 int busy;
827 busy = sb && sb->s_root &&
828 (sb->s_root->d_count != 1 || sb->s_root->d_covers != sb->s_root);
829 return !busy;
833 * do_mount() does the actual mounting after sys_mount has done the ugly
834 * parameter parsing. When enough time has gone by, and everything uses the
835 * new mount() parameters, sys_mount() can then be cleaned up.
837 * We cannot mount a filesystem if it has active, used, or dirty inodes.
838 * We also have to flush all inode-data for this device, as the new mount
839 * might need new info.
841 * [21-Mar-97] T.Schoebel-Theuer: Now this can be overridden when
842 * supplying a leading "!" before the dir_name, allowing "stacks" of
843 * mounted filesystems. The stacking will only influence any pathname lookups
844 * _after_ the mount, but open file descriptors or working directories that
845 * are now covered remain valid. For example, when you overmount /home, any
846 * process with old cwd /home/joe will continue to use the old versions,
847 * as long as relative paths are used, but absolute paths like /home/joe/xxx
848 * will go to the new "top of stack" version. In general, crossing a
849 * mount point will always go to the top of stack element.
850 * Anyone using this new feature must know what he/she is doing.
853 int do_mount(struct block_device *bdev, const char *dev_name,
854 const char *dir_name, const char * type, int flags, void * data)
856 kdev_t dev;
857 struct dentry * dir_d;
858 struct super_block * sb;
859 struct vfsmount *vfsmnt;
860 struct file_system_type *fs_type;
861 int error;
863 if (bdev) {
864 mode_t mode = FMODE_READ; /* we always need it ;-) */
865 if (!(flags & MS_RDONLY))
866 mode |= FMODE_WRITE;
867 dev = to_kdev_t(bdev->bd_dev);
868 error = blkdev_get(bdev, mode, 0, BDEV_FS);
869 if (error)
870 return error;
871 } else {
872 dev = get_unnamed_dev();
873 if (!dev)
874 return -EMFILE; /* huh? */
877 error = -EACCES;
878 if (!(flags & MS_RDONLY) && dev && is_read_only(dev))
879 goto out;
882 * Do the lookup first to force automounting.
884 dir_d = namei(dir_name);
885 error = PTR_ERR(dir_d);
886 if (IS_ERR(dir_d))
887 goto out;
889 down(&mount_sem);
890 error = -ENOTDIR;
891 if (!S_ISDIR(dir_d->d_inode->i_mode))
892 goto dput_and_out;
894 error = -EBUSY;
895 if (dir_d->d_covers != dir_d)
896 goto dput_and_out;
898 error = -EINVAL;
899 if (!dev)
900 goto dput_and_out;
901 check_disk_change(dev);
902 sb = get_super(dev);
903 if (sb) {
904 /* Already mounted */
905 error = -EBUSY;
906 goto dput_and_out;
909 fs_type = get_fs_type(type);
910 if (!fs_type) {
911 printk("VFS: on device %s: get_fs_type(%s) failed\n",
912 kdevname(dev), type);
913 goto dput_and_out;
916 sb = read_super(dev, bdev, fs_type, flags, data, 0);
917 if (!sb)
918 goto fsput_and_out;
921 * We may have slept while reading the super block,
922 * so we check afterwards whether it's safe to mount.
924 error = -EBUSY;
925 if (!fs_may_mount(dev))
926 goto bdput_and_out;
928 error = -ENOMEM;
929 vfsmnt = add_vfsmnt(sb, dev_name, dir_name);
930 if (vfsmnt) {
931 d_mount(dget(dir_d), sb->s_root);
932 dput(dir_d);
933 up(&mount_sem);
934 return 0;
937 bdput_and_out:
938 /* FIXME: ->put_super() is needed here */
939 sb->s_bdev = NULL;
940 sb->s_dev = 0;
941 sb->s_type = NULL;
942 if (bdev)
943 bdput(bdev);
944 fsput_and_out:
945 put_filesystem(fs_type);
946 dput_and_out:
947 dput(dir_d);
948 up(&mount_sem);
949 out:
950 if (bdev)
951 blkdev_put(bdev, BDEV_FS);
952 else
953 put_unnamed_dev(dev);
954 return error;
959 * Alters the mount flags of a mounted file system. Only the mount point
960 * is used as a reference - file system type and the device are ignored.
963 static int do_remount_sb(struct super_block *sb, int flags, char *data)
965 int retval;
967 if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
968 return -EACCES;
969 /*flags |= MS_RDONLY;*/
970 /* If we are remounting RDONLY, make sure there are no rw files open */
971 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
972 if (!fs_may_remount_ro(sb))
973 return -EBUSY;
974 if (sb->s_op && sb->s_op->remount_fs) {
975 lock_super(sb);
976 retval = sb->s_op->remount_fs(sb, &flags, data);
977 unlock_super(sb);
978 if (retval)
979 return retval;
981 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
984 * Invalidate the inodes, as some mount options may be changed.
985 * N.B. If we are changing media, we should check the return
986 * from invalidate_inodes ... can't allow _any_ open files.
988 invalidate_inodes(sb);
990 return 0;
993 static int do_remount(const char *dir,int flags,char *data)
995 struct dentry *dentry;
996 int retval;
998 dentry = namei(dir);
999 retval = PTR_ERR(dentry);
1000 if (!IS_ERR(dentry)) {
1001 struct super_block * sb = dentry->d_inode->i_sb;
1003 retval = -ENODEV;
1004 if (sb) {
1005 retval = -EINVAL;
1006 if (dentry == sb->s_root) {
1008 * Shrink the dcache and sync the device.
1010 shrink_dcache_sb(sb);
1011 fsync_dev(sb->s_dev);
1012 if (flags & MS_RDONLY)
1013 acct_auto_close(sb->s_dev);
1014 retval = do_remount_sb(sb, flags, data);
1017 dput(dentry);
1019 return retval;
1022 static int copy_mount_options (const void * data, unsigned long *where)
1024 int i;
1025 unsigned long page;
1026 struct vm_area_struct * vma;
1028 *where = 0;
1029 if (!data)
1030 return 0;
1032 vma = find_vma(current->mm, (unsigned long) data);
1033 if (!vma || (unsigned long) data < vma->vm_start)
1034 return -EFAULT;
1035 if (!(vma->vm_flags & VM_READ))
1036 return -EFAULT;
1037 i = vma->vm_end - (unsigned long) data;
1038 if (PAGE_SIZE <= (unsigned long) i)
1039 i = PAGE_SIZE-1;
1040 if (!(page = __get_free_page(GFP_KERNEL))) {
1041 return -ENOMEM;
1043 if (copy_from_user((void *) page,data,i)) {
1044 free_page(page);
1045 return -EFAULT;
1047 *where = page;
1048 return 0;
1052 * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1053 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1055 * data is a (void *) that can point to any structure up to
1056 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1057 * information (or be NULL).
1059 * NOTE! As old versions of mount() didn't use this setup, the flags
1060 * have to have a special 16-bit magic number in the high word:
1061 * 0xC0ED. If this magic word isn't present, the flags and data info
1062 * aren't used, as the syscall assumes we are talking to an older
1063 * version that didn't understand them.
1065 long do_sys_mount(char * dev_name, char * dir_name, unsigned long type_page,
1066 unsigned long new_flags, unsigned long data_page)
1068 struct file_system_type * fstype;
1069 struct dentry * dentry = NULL;
1070 struct inode * inode = NULL;
1071 struct block_device *bdev = NULL;
1072 int retval;
1073 unsigned long flags = 0;
1075 if (!capable(CAP_SYS_ADMIN))
1076 return -EPERM;
1078 if ((new_flags &
1079 (MS_MGC_MSK | MS_REMOUNT)) == (MS_MGC_VAL | MS_REMOUNT)) {
1080 retval = do_remount(dir_name,
1081 new_flags & ~MS_MGC_MSK & ~MS_REMOUNT,
1082 (char *) data_page);
1083 goto out;
1086 fstype = get_fs_type((char *) type_page);
1087 retval = -ENODEV;
1088 if (!fstype)
1089 goto out;
1091 if (fstype->fs_flags & FS_REQUIRES_DEV) {
1092 struct block_device_operations *bdops;
1094 dentry = namei(dev_name);
1095 retval = PTR_ERR(dentry);
1096 if (IS_ERR(dentry))
1097 goto fs_out;
1099 inode = dentry->d_inode;
1100 retval = -ENOTBLK;
1101 if (!S_ISBLK(inode->i_mode))
1102 goto dput_and_out;
1104 retval = -EACCES;
1105 if (IS_NODEV(inode))
1106 goto dput_and_out;
1108 bdev = inode->i_bdev;
1109 bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
1110 if (bdops) bdev->bd_op = bdops;
1113 if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1114 flags = new_flags & ~MS_MGC_MSK;
1116 retval = do_mount(bdev, dev_name, dir_name, fstype->name, flags,
1117 (void *) data_page);
1119 dput_and_out:
1120 dput(dentry);
1121 fs_out:
1122 put_filesystem(fstype);
1123 out:
1124 return retval;
1127 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1128 unsigned long new_flags, void * data)
1130 int retval;
1131 unsigned long data_page = 0;
1132 unsigned long type_page = 0;
1134 lock_kernel();
1135 retval = copy_mount_options (type, &type_page);
1136 if (retval < 0)
1137 goto out;
1139 /* copy_mount_options allows a NULL user pointer,
1140 * and just returns zero in that case. But if we
1141 * allow the type to be NULL we will crash.
1142 * Previously we did not check this case.
1144 if (type_page == 0) {
1145 retval = -EINVAL;
1146 goto out;
1149 retval = copy_mount_options (data, &data_page);
1150 if (retval >= 0) {
1151 retval = do_sys_mount(dev_name, dir_name, type_page,
1152 new_flags, data_page);
1153 free_page(data_page);
1155 free_page(type_page);
1156 out:
1157 unlock_kernel();
1158 return retval;
1161 void __init mount_root(void)
1163 struct file_system_type * fs_type;
1164 struct super_block * sb;
1165 struct vfsmount *vfsmnt;
1166 struct block_device *bdev = NULL;
1167 mode_t mode;
1168 int retval;
1169 void *handle;
1170 char path[64];
1171 int path_start = -1;
1173 #ifdef CONFIG_ROOT_NFS
1174 if (MAJOR(ROOT_DEV) == UNNAMED_MAJOR) {
1175 ROOT_DEV = 0;
1176 if ((fs_type = get_fs_type("nfs"))) {
1177 sb = get_empty_super(); /* "can't fail" */
1178 sb->s_dev = get_unnamed_dev();
1179 sb->s_bdev = NULL;
1180 sb->s_flags = root_mountflags;
1181 sema_init(&sb->s_vfs_rename_sem,1);
1182 sema_init(&sb->s_nfsd_free_path_sem,1);
1183 vfsmnt = add_vfsmnt(sb, "/dev/root", "/");
1184 if (vfsmnt) {
1185 if (nfs_root_mount(sb) >= 0) {
1186 sb->s_dirt = 0;
1187 sb->s_type = fs_type;
1188 current->fs->root = dget(sb->s_root);
1189 current->fs->pwd = dget(sb->s_root);
1190 ROOT_DEV = sb->s_dev;
1191 printk (KERN_NOTICE "VFS: Mounted root (NFS filesystem)%s.\n", (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1192 return;
1194 remove_vfsmnt(sb->s_dev);
1196 put_unnamed_dev(sb->s_dev);
1197 sb->s_dev = 0;
1198 put_filesystem(fs_type);
1200 if (!ROOT_DEV) {
1201 printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1202 ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1205 #endif
1207 #ifdef CONFIG_BLK_DEV_FD
1208 if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1209 #ifdef CONFIG_BLK_DEV_RAM
1210 extern int rd_doload;
1211 extern void rd_load_secondary(void);
1212 #endif
1213 floppy_eject();
1214 #ifndef CONFIG_BLK_DEV_RAM
1215 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1216 #else
1217 /* rd_doload is 2 for a dual initrd/ramload setup */
1218 if(rd_doload==2)
1219 rd_load_secondary();
1220 else
1221 #endif
1223 printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1224 wait_for_keypress();
1227 #endif
1229 devfs_make_root (root_device_name);
1230 handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME, 0,
1231 MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1232 DEVFS_SPECIAL_BLK, 1);
1233 if (handle) /* Sigh: bd*() functions only paper over the cracks */
1235 unsigned major, minor;
1237 devfs_get_maj_min (handle, &major, &minor);
1238 ROOT_DEV = MKDEV (major, minor);
1242 * Probably pure paranoia, but I'm less than happy about delving into
1243 * devfs crap and checking it right now. Later.
1245 if (!ROOT_DEV)
1246 panic("I have no root and I want to scream");
1248 bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1249 if (!bdev)
1250 panic(__FUNCTION__ ": unable to allocate root device");
1251 bdev->bd_op = devfs_get_ops (handle);
1252 path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1253 mode = FMODE_READ;
1254 if (!(root_mountflags & MS_RDONLY))
1255 mode |= FMODE_WRITE;
1256 retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1257 if (retval == -EROFS) {
1258 root_mountflags |= MS_RDONLY;
1259 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1261 if (retval) {
1263 * Allow the user to distinguish between failed open
1264 * and bad superblock on root device.
1266 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1267 root_device_name, kdevname (ROOT_DEV));
1268 printk ("Please append a correct \"root=\" boot option\n");
1269 panic("VFS: Unable to mount root fs on %s",
1270 kdevname(ROOT_DEV));
1273 check_disk_change(ROOT_DEV);
1275 spin_lock(&file_systems_lock);
1276 for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1277 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1278 continue;
1279 if (!try_inc_mod_count(fs_type->owner))
1280 continue;
1281 spin_unlock(&file_systems_lock);
1282 sb = get_super(ROOT_DEV);
1283 if (sb) {
1284 /* Shouldn't we fail here? Oh, well... */
1285 sb->s_bdev = bdev;
1286 goto mount_it;
1288 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1289 if (sb)
1290 goto mount_it;
1291 spin_lock(&file_systems_lock);
1292 put_filesystem(fs_type);
1294 spin_unlock(&file_systems_lock);
1295 panic("VFS: Unable to mount root fs on %s",
1296 kdevname(ROOT_DEV));
1298 mount_it:
1299 sb->s_flags = root_mountflags;
1300 current->fs->root = dget(sb->s_root);
1301 current->fs->pwd = dget(sb->s_root);
1302 printk ("VFS: Mounted root (%s filesystem)%s.\n",
1303 fs_type->name,
1304 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1305 if (path_start >= 0) {
1306 devfs_mk_symlink (NULL,
1307 "root", 0, DEVFS_FL_DEFAULT,
1308 path + 5 + path_start, 0,
1309 NULL, NULL);
1310 memcpy (path + path_start, "/dev/", 5);
1311 vfsmnt = add_vfsmnt (sb, path + path_start,
1312 "/");
1314 else vfsmnt = add_vfsmnt (sb, "/dev/root", "/");
1315 if (vfsmnt) {
1316 bdput(bdev); /* sb holds a reference */
1317 return;
1319 panic("VFS: add_vfsmnt failed for root fs");
1323 static void chroot_fs_refs(struct dentry *old_root,
1324 struct dentry *new_root)
1326 struct task_struct *p;
1328 read_lock(&tasklist_lock);
1329 for_each_task(p) {
1330 if (!p->fs) continue;
1331 if (p->fs->root == old_root) {
1332 dput(old_root);
1333 p->fs->root = dget(new_root);
1334 printk(KERN_DEBUG "chroot_fs_refs: changed root of "
1335 "process %d\n",p->pid);
1337 if (p->fs->pwd == old_root) {
1338 dput(old_root);
1339 p->fs->pwd = dget(new_root);
1340 printk(KERN_DEBUG "chroot_fs_refs: changed cwd of "
1341 "process %d\n",p->pid);
1344 read_unlock(&tasklist_lock);
1349 * Moves the current root to put_root, and sets root/cwd of all processes
1350 * which had them on the old root to new_root.
1352 * Note:
1353 * - we don't move root/cwd if they are not at the root (reason: if something
1354 * cared enough to change them, it's probably wrong to force them elsewhere)
1355 * - it's okay to pick a root that isn't the root of a file system, e.g.
1356 * /nfs/my_root where /nfs is the mount point. Better avoid creating
1357 * unreachable mount points this way, though.
1360 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1362 struct dentry *root = current->fs->root;
1363 struct dentry *d_new_root, *d_put_old, *covered;
1364 struct dentry *root_dev_root, *new_root_dev_root;
1365 struct dentry *walk, *next;
1366 int error;
1368 if (!capable(CAP_SYS_ADMIN))
1369 return -EPERM;
1371 lock_kernel();
1372 d_new_root = namei(new_root);
1373 if (IS_ERR(d_new_root)) {
1374 error = PTR_ERR(d_new_root);
1375 goto out0;
1377 d_put_old = namei(put_old);
1378 if (IS_ERR(d_put_old)) {
1379 error = PTR_ERR(d_put_old);
1380 goto out1;
1382 down(&mount_sem);
1383 if (!d_new_root->d_inode || !d_put_old->d_inode) {
1384 error = -ENOENT;
1385 goto out2;
1387 if (!S_ISDIR(d_new_root->d_inode->i_mode) ||
1388 !S_ISDIR(d_put_old->d_inode->i_mode)) {
1389 error = -ENOTDIR;
1390 goto out2;
1392 error = -EBUSY;
1393 if (d_new_root->d_sb == root->d_sb || d_put_old->d_sb == root->d_sb)
1394 goto out2; /* loop */
1395 if (d_put_old != d_put_old->d_covers)
1396 goto out2; /* mount point is busy */
1397 error = -EINVAL;
1398 walk = d_put_old; /* make sure we can reach put_old from new_root */
1399 for (;;) {
1400 next = walk->d_covers->d_parent;
1401 if (next == walk)
1402 goto out2;
1403 if (next == d_new_root)
1404 break;
1405 walk = next;
1408 new_root_dev_root = d_new_root->d_sb->s_root;
1409 covered = new_root_dev_root->d_covers;
1410 new_root_dev_root->d_covers = new_root_dev_root;
1411 dput(covered);
1412 covered->d_mounts = covered;
1414 root_dev_root = root->d_sb->s_root;
1415 root_dev_root->d_covers = dget(d_put_old);
1416 d_put_old->d_mounts = root_dev_root;
1417 chroot_fs_refs(root,d_new_root);
1418 error = 0;
1419 out2:
1420 up(&mount_sem);
1421 dput(d_put_old);
1422 out1:
1423 dput(d_new_root);
1424 out0:
1425 unlock_kernel();
1426 return error;
1430 #ifdef CONFIG_BLK_DEV_INITRD
1432 int __init change_root(kdev_t new_root_dev,const char *put_old)
1434 kdev_t old_root_dev;
1435 struct vfsmount *vfsmnt;
1436 struct dentry *old_root,*old_pwd,*dir_d = NULL;
1437 int error;
1439 old_root = current->fs->root;
1440 old_pwd = current->fs->pwd;
1441 old_root_dev = ROOT_DEV;
1442 if (!fs_may_mount(new_root_dev)) {
1443 printk(KERN_CRIT "New root is busy. Staying in initrd.\n");
1444 return -EBUSY;
1446 /* First unmount devfs if mounted */
1447 dir_d = lookup_dentry ("/dev", NULL, 1);
1448 if (!IS_ERR(dir_d)) {
1449 struct super_block *sb = dir_d->d_inode->i_sb;
1451 if (sb && (dir_d->d_inode == sb->s_root->d_inode) &&
1452 (sb->s_magic == DEVFS_SUPER_MAGIC)) {
1453 dput (dir_d);
1454 do_umount (sb->s_dev, 0, 0);
1456 else dput (dir_d);
1458 ROOT_DEV = new_root_dev;
1459 mount_root();
1460 dput(old_root);
1461 dput(old_pwd);
1462 #if 1
1463 shrink_dcache();
1464 printk("change_root: old root has d_count=%d\n", old_root->d_count);
1465 #endif
1466 mount_devfs_fs ();
1468 * Get the new mount directory
1470 dir_d = lookup_dentry(put_old, NULL, 1);
1471 if (IS_ERR(dir_d)) {
1472 error = PTR_ERR(dir_d);
1473 } else if (!dir_d->d_inode) {
1474 dput(dir_d);
1475 error = -ENOENT;
1476 } else {
1477 error = 0;
1479 if (!error && dir_d->d_covers != dir_d) {
1480 dput(dir_d);
1481 error = -EBUSY;
1483 if (!error && !S_ISDIR(dir_d->d_inode->i_mode)) {
1484 dput(dir_d);
1485 error = -ENOTDIR;
1487 if (error) {
1488 struct block_device *bdev;
1490 printk(KERN_NOTICE "Trying to unmount old root ... ");
1491 bdev = do_umount(old_root_dev,1, 0);
1492 if (!IS_ERR(bdev)) {
1493 printk("okay\n");
1494 /* special: the old device driver is going to be
1495 a ramdisk and the point of this call is to free its
1496 protected memory (even if dirty). */
1497 destroy_buffers(old_root_dev);
1498 if (bdev) {
1499 blkdev_put(bdev, BDEV_FS);
1500 bdput(bdev);
1502 return 0;
1504 printk(KERN_ERR "error %ld\n",PTR_ERR(bdev));
1505 return error;
1507 remove_vfsmnt(old_root_dev);
1508 vfsmnt = add_vfsmnt(old_root->d_sb, "/dev/root.old", put_old);
1509 if (vfsmnt) {
1510 d_mount(dir_d,old_root);
1511 return 0;
1513 printk(KERN_CRIT "Trouble: add_vfsmnt failed\n");
1514 return -ENOMEM;
1517 #endif