fs/super.c

   1 /*
   2  *  linux/fs/super.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  super.c contains code to handle: - mount structures
   7  *                                   - super-block tables.
   8  *                                   - mount system call
   9  *                                   - umount system call
  10  *
  11  *  Added options to /proc/mounts
  12  *  Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
  13  *
  14  * GK 2/5/95  -  Changed to support mounting the root fs via NFS
  15  *
  16  *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
  17  *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
  18  *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
  19  */
  20
  21 #include <linux/config.h>
  22 #include <linux/string.h>
  23 #include <linux/malloc.h>
  24 #include <linux/locks.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/devfs_fs_kernel.h>
  27 #include <linux/fd.h>
  28 #include <linux/init.h>
  29 #include <linux/quotaops.h>
  30 #include <linux/acct.h>
  31
  32 #include <asm/uaccess.h>
  33
  34 #include <linux/nfs_fs.h>
  35 #include <linux/nfs_fs_sb.h>
  36 #include <linux/nfs_mount.h>
  37
  38 #include <linux/kmod.h>
  39 #define __NO_VERSION__
  40 #include <linux/module.h>
  41
  /*
   * All mount and umount activity is serialized by this semaphore.
   * Without it an unmount could race against a re-mount of the same
   * filesystem (or against some other mount operation).
   */
  static DECLARE_MUTEX(mount_sem);
  49
  50 extern void wait_for_keypress(void);
  51
  52 extern int root_mountflags;
  53
  54 static int do_remount_sb(struct super_block *sb, int flags, char * data);
  55
  56 /* this is initialized in init/main.c */
  57 kdev_t ROOT_DEV;
  58
  59 int nr_super_blocks = 0;
  60 int max_super_blocks = NR_SUPER;
  61 LIST_HEAD(super_blocks);
  62
  63 /*
  64  * Handling of filesystem drivers list.
  65  * Rules:
  66  *      Inclusion to/removals from/scanning of list are protected by spinlock.
  67  *      During the unload module must call unregister_filesystem().
  68  *      We can access the fields of list element if:
  69  *              1) spinlock is held or
  70  *              2) we hold the reference to the module.
  71  *      The latter can be guaranteed by call of try_inc_mod_count(); if it
  72  *      returned 0 we must skip the element, otherwise we got the reference.
  73  *      Once the reference is obtained we can drop the spinlock.
  74  */
  75
  76 static struct file_system_type *file_systems = NULL;
  77 static spinlock_t file_systems_lock = SPIN_LOCK_UNLOCKED;
  78
  79 static void put_filesystem(struct file_system_type *fs)
  80 {
  81         if (fs->owner)
  82                 __MOD_DEC_USE_COUNT(fs->owner);
  83 }
  84
  85 static struct file_system_type **find_filesystem(const char *name)
  86 {
  87         struct file_system_type **p;
  88         for (p=&file_systems; *p; p=&(*p)->next)
  89                 if (strcmp((*p)->name,name) == 0)
  90                         break;
  91         return p;
  92 }
  93
  94 int register_filesystem(struct file_system_type * fs)
  95 {
  96         int res = 0;
  97         struct file_system_type ** p;
  98
  99         if (!fs)
 100                 return -EINVAL;
 101         if (fs->next)
 102                 return -EBUSY;
 103         spin_lock(&file_systems_lock);
 104         p = find_filesystem(fs->name);
 105         if (*p)
 106                 res = -EBUSY;
 107         else
 108                 *p = fs;
 109         spin_unlock(&file_systems_lock);
 110         return res;
 111 }
 112
 113 int unregister_filesystem(struct file_system_type * fs)
 114 {
 115         struct file_system_type ** tmp;
 116
 117         spin_lock(&file_systems_lock);
 118         tmp = &file_systems;
 119         while (*tmp) {
 120                 if (fs == *tmp) {
 121                         *tmp = fs->next;
 122                         fs->next = NULL;
 123                         spin_unlock(&file_systems_lock);
 124                         return 0;
 125                 }
 126                 tmp = &(*tmp)->next;
 127         }
 128         spin_unlock(&file_systems_lock);
 129         return -EINVAL;
 130 }
 131
 132 static int fs_index(const char * __name)
 133 {
 134         struct file_system_type * tmp;
 135         char * name;
 136         int err, index;
 137
 138         name = getname(__name);
 139         err = PTR_ERR(name);
 140         if (IS_ERR(name))
 141                 return err;
 142
 143         err = -EINVAL;
 144         spin_lock(&file_systems_lock);
 145         for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
 146                 if (strcmp(tmp->name,name) == 0) {
 147                         err = index;
 148                         break;
 149                 }
 150         }
 151         spin_unlock(&file_systems_lock);
 152         putname(name);
 153         return err;
 154 }
 155
 156 static int fs_name(unsigned int index, char * buf)
 157 {
 158         struct file_system_type * tmp;
 159         int len, res;
 160
 161         spin_lock(&file_systems_lock);
 162         for (tmp = file_systems; tmp; tmp = tmp->next, index--)
 163                 if (index <= 0 && try_inc_mod_count(tmp->owner))
 164                                 break;
 165         spin_unlock(&file_systems_lock);
 166         if (!tmp)
 167                 return -EINVAL;
 168
 169         /* OK, we got the reference, so we can safely block */
 170         len = strlen(tmp->name) + 1;
 171         res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
 172         put_filesystem(tmp);
 173         return res;
 174 }
 175
 176 static int fs_maxindex(void)
 177 {
 178         struct file_system_type * tmp;
 179         int index;
 180
 181         spin_lock(&file_systems_lock);
 182         for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
 183                 ;
 184         spin_unlock(&file_systems_lock);
 185         return index;
 186 }
 187
 188 /*
 189  * Whee.. Weird sysv syscall.
 190  */
 191 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
 192 {
 193         int retval = -EINVAL;
 194
 195         lock_kernel();
 196         switch (option) {
 197                 case 1:
 198                         retval = fs_index((const char *) arg1);
 199                         break;
 200
 201                 case 2:
 202                         retval = fs_name(arg1, (char *) arg2);
 203                         break;
 204
 205                 case 3:
 206                         retval = fs_maxindex();
 207                         break;
 208         }
 209         unlock_kernel();
 210         return retval;
 211 }
 212
 213 int get_filesystem_list(char * buf)
 214 {
 215         int len = 0;
 216         struct file_system_type * tmp;
 217
 218         spin_lock(&file_systems_lock);
 219         tmp = file_systems;
 220         while (tmp && len < PAGE_SIZE - 80) {
 221                 len += sprintf(buf+len, "%s\t%s\n",
 222                         (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 223                         tmp->name);
 224                 tmp = tmp->next;
 225         }
 226         spin_unlock(&file_systems_lock);
 227         return len;
 228 }
 229
 230 static struct file_system_type *get_fs_type(const char *name)
 231 {
 232         struct file_system_type *fs;
 233
 234         spin_lock(&file_systems_lock);
 235         fs = *(find_filesystem(name));
 236         if (fs && !try_inc_mod_count(fs->owner))
 237                 fs = NULL;
 238         spin_unlock(&file_systems_lock);
 239         if (!fs && (request_module(name) == 0)) {
 240                 spin_lock(&file_systems_lock);
 241                 fs = *(find_filesystem(name));
 242                 if (fs && !try_inc_mod_count(fs->owner))
 243                         fs = NULL;
 244                 spin_unlock(&file_systems_lock);
 245         }
 246         return fs;
 247 }
 248
 /*
  * Singly linked list of mount records: head, tail (for appending in
  * add_vfsmnt()) and mru_vfsmnt, which remove_vfsmnt() clears when the
  * record it points at is freed.
  */
 struct vfsmount *vfsmntlist = NULL;
 static struct vfsmount *vfsmnttail = NULL;
 static struct vfsmount *mru_vfsmnt = NULL;
 251
 252 static struct vfsmount *add_vfsmnt(struct super_block *sb,
 253                         const char *dev_name, const char *dir_name)
 254 {
 255         struct vfsmount *lptr;
 256         char *tmp, *name;
 257
 258         lptr = (struct vfsmount *)kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
 259         if (!lptr)
 260                 goto out;
 261         memset(lptr, 0, sizeof(struct vfsmount));
 262
 263         lptr->mnt_sb = sb;
 264         lptr->mnt_dev = sb->s_dev;
 265
 266         /* N.B. Is it really OK to have a vfsmount without names? */
 267         if (dev_name && !IS_ERR(tmp = getname(dev_name))) {
 268                 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
 269                 if (name) {
 270                         strcpy(name, tmp);
 271                         lptr->mnt_devname = name;
 272                 }
 273                 putname(tmp);
 274         }
 275         if (dir_name && !IS_ERR(tmp = getname(dir_name))) {
 276                 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
 277                 if (name) {
 278                         strcpy(name, tmp);
 279                         lptr->mnt_dirname = name;
 280                 }
 281                 putname(tmp);
 282         }
 283
 284         if (vfsmntlist == (struct vfsmount *)NULL) {
 285                 vfsmntlist = vfsmnttail = lptr;
 286         } else {
 287                 vfsmnttail->mnt_next = lptr;
 288                 vfsmnttail = lptr;
 289         }
 290 out:
 291         return lptr;
 292 }
 293
 294 void remove_vfsmnt(kdev_t dev)
 295 {
 296         struct vfsmount *lptr, *tofree;
 297
 298         if (vfsmntlist == NULL)
 299                 return;
 300         lptr = vfsmntlist;
 301         if (lptr->mnt_dev == dev) {
 302                 tofree = lptr;
 303                 vfsmntlist = lptr->mnt_next;
 304                 if (vfsmnttail->mnt_dev == dev)
 305                         vfsmnttail = vfsmntlist;
 306         } else {
 307                 while (lptr->mnt_next != NULL) {
 308                         if (lptr->mnt_next->mnt_dev == dev)
 309                                 break;
 310                         lptr = lptr->mnt_next;
 311                 }
 312                 tofree = lptr->mnt_next;
 313                 if (tofree == NULL)
 314                         return;
 315                 lptr->mnt_next = lptr->mnt_next->mnt_next;
 316                 if (vfsmnttail->mnt_dev == dev)
 317                         vfsmnttail = lptr;
 318         }
 319         if (tofree == mru_vfsmnt)
 320                 mru_vfsmnt = NULL;
 321         kfree(tofree->mnt_devname);
 322         kfree(tofree->mnt_dirname);
 323         kfree_s(tofree, sizeof(struct vfsmount));
 324 }
 325
 326 static struct proc_fs_info {
 327         int flag;
 328         char *str;
 329 } fs_info[] = {
 330         { MS_NOEXEC, ",noexec" },
 331         { MS_NOSUID, ",nosuid" },
 332         { MS_NODEV, ",nodev" },
 333         { MS_SYNCHRONOUS, ",sync" },
 334         { MS_MANDLOCK, ",mand" },
 335         { MS_NOATIME, ",noatime" },
 336         { MS_NODIRATIME, ",nodiratime" },
 337 #ifdef MS_NOSUB                 /* Can't find this except in mount.c */
 338         { MS_NOSUB, ",nosub" },
 339 #endif
 340         { 0, NULL }
 341 };
 342
 343 static struct proc_nfs_info {
 344         int flag;
 345         char *str;
 346 } nfs_info[] = {
 347         { NFS_MOUNT_SOFT, ",soft" },
 348         { NFS_MOUNT_INTR, ",intr" },
 349         { NFS_MOUNT_POSIX, ",posix" },
 350         { NFS_MOUNT_NOCTO, ",nocto" },
 351         { NFS_MOUNT_NOAC, ",noac" },
 352         { 0, NULL }
 353 };
 354
 355 int get_filesystem_info( char *buf )
 356 {
 357         struct vfsmount *tmp;
 358         struct proc_fs_info *fs_infop;
 359         struct proc_nfs_info *nfs_infop;
 360         struct nfs_server *nfss;
 361         int len = 0;
 362         char *path,*buffer = (char *) __get_free_page(GFP_KERNEL);
 363
 364         if (!buffer) return 0;
 365         for (tmp = vfsmntlist; tmp && len < PAGE_SIZE - 160;
 366             tmp = tmp->mnt_next) {
 367                 path = d_path(tmp->mnt_sb->s_root, buffer, PAGE_SIZE);
 368                 if (!path)
 369                         continue;
 370                 len += sprintf( buf + len, "%s %s %s %s",
 371                         tmp->mnt_devname, path,
 372                         tmp->mnt_sb->s_type->name,
 373                         tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw" );
 374                 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 375                   if (tmp->mnt_sb->s_flags & fs_infop->flag) {
 376                     strcpy(buf + len, fs_infop->str);
 377                     len += strlen(fs_infop->str);
 378                   }
 379                 }
 380                 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
 381                         nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
 382                         if (nfss->rsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
 383                                 len += sprintf(buf+len, ",rsize=%d",
 384                                                nfss->rsize);
 385                         }
 386                         if (nfss->wsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
 387                                 len += sprintf(buf+len, ",wsize=%d",
 388                                                nfss->wsize);
 389                         }
 390 #if 0
 391                         if (nfss->timeo != 7*HZ/10) {
 392                                 len += sprintf(buf+len, ",timeo=%d",
 393                                                nfss->timeo*10/HZ);
 394                         }
 395                         if (nfss->retrans != 3) {
 396                                 len += sprintf(buf+len, ",retrans=%d",
 397                                                nfss->retrans);
 398                         }
 399 #endif
 400                         if (nfss->acregmin != 3*HZ) {
 401                                 len += sprintf(buf+len, ",acregmin=%d",
 402                                                nfss->acregmin/HZ);
 403                         }
 404                         if (nfss->acregmax != 60*HZ) {
 405                                 len += sprintf(buf+len, ",acregmax=%d",
 406                                                nfss->acregmax/HZ);
 407                         }
 408                         if (nfss->acdirmin != 30*HZ) {
 409                                 len += sprintf(buf+len, ",acdirmin=%d",
 410                                                nfss->acdirmin/HZ);
 411                         }
 412                         if (nfss->acdirmax != 60*HZ) {
 413                                 len += sprintf(buf+len, ",acdirmax=%d",
 414                                                nfss->acdirmax/HZ);
 415                         }
 416                         for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 417                                 if (nfss->flags & nfs_infop->flag) {
 418                                         strcpy(buf + len, nfs_infop->str);
 419                                         len += strlen(nfs_infop->str);
 420                                 }
 421                         }
 422                         len += sprintf(buf+len, ",addr=%s",
 423                                        nfss->hostname);
 424                 }
 425                 len += sprintf( buf + len, " 0 0\n" );
 426         }
 427
 428         free_page((unsigned long) buffer);
 429         return len;
 430 }
 431
 432 void __wait_on_super(struct super_block * sb)
 433 {
 434         DECLARE_WAITQUEUE(wait, current);
 435
 436         add_wait_queue(&sb->s_wait, &wait);
 437 repeat:
 438         set_current_state(TASK_UNINTERRUPTIBLE);
 439         if (sb->s_lock) {
 440                 schedule();
 441                 goto repeat;
 442         }
 443         remove_wait_queue(&sb->s_wait, &wait);
 444         current->state = TASK_RUNNING;
 445 }
 446
 447 /*
 448  * Note: check the dirty flag before waiting, so we don't
 449  * hold up the sync while mounting a device. (The newly
 450  * mounted device won't need syncing.)
 451  */
 452 void sync_supers(kdev_t dev)
 453 {
 454         struct super_block * sb;
 455
 456         for (sb = sb_entry(super_blocks.next);
 457              sb != sb_entry(&super_blocks);
 458              sb = sb_entry(sb->s_list.next)) {
 459                 if (!sb->s_dev)
 460                         continue;
 461                 if (dev && sb->s_dev != dev)
 462                         continue;
 463                 if (!sb->s_dirt)
 464                         continue;
 465                 /* N.B. Should lock the superblock while writing */
 466                 wait_on_super(sb);
 467                 if (!sb->s_dev || !sb->s_dirt)
 468                         continue;
 469                 if (dev && (dev != sb->s_dev))
 470                         continue;
 471                 if (sb->s_op && sb->s_op->write_super)
 472                         sb->s_op->write_super(sb);
 473         }
 474 }
 475
 476 struct super_block * get_super(kdev_t dev)
 477 {
 478         struct super_block * s;
 479
 480         if (!dev)
 481                 return NULL;
 482 restart:
 483         s = sb_entry(super_blocks.next);
 484         while (s != sb_entry(&super_blocks))
 485                 if (s->s_dev == dev) {
 486                         wait_on_super(s);
 487                         if (s->s_dev == dev)
 488                                 return s;
 489                         goto restart;
 490                 } else
 491                         s = sb_entry(s->s_list.next);
 492         return NULL;
 493 }
 494
 495 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
 496 {
 497         struct super_block *s;
 498         struct ustat tmp;
 499         struct statfs sbuf;
 500         int err = -EINVAL;
 501
 502         lock_kernel();
 503         s = get_super(to_kdev_t(dev));
 504         if (s == NULL)
 505                 goto out;
 506         err = vfs_statfs(s, &sbuf);
 507         if (err)
 508                 goto out;
 509
 510         memset(&tmp,0,sizeof(struct ustat));
 511         tmp.f_tfree = sbuf.f_bfree;
 512         tmp.f_tinode = sbuf.f_ffree;
 513
 514         err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
 515 out:
 516         unlock_kernel();
 517         return err;
 518 }
 519
 520 /*
 521  * Find a super_block with no device assigned.
 522  */
 523 struct super_block *get_empty_super(void)
 524 {
 525         struct super_block *s;
 526
 527         for (s  = sb_entry(super_blocks.next);
 528              s != sb_entry(&super_blocks);
 529              s  = sb_entry(s->s_list.next)) {
 530                 if (s->s_dev)
 531                         continue;
 532                 if (!s->s_lock)
 533                         return s;
 534                 printk("VFS: empty superblock %p locked!\n", s);
 535         }
 536         /* Need a new one... */
 537         if (nr_super_blocks >= max_super_blocks)
 538                 return NULL;
 539         s = kmalloc(sizeof(struct super_block),  GFP_USER);
 540         if (s) {
 541                 nr_super_blocks++;
 542                 memset(s, 0, sizeof(struct super_block));
 543                 INIT_LIST_HEAD(&s->s_dirty);
 544                 list_add (&s->s_list, super_blocks.prev);
 545                 init_waitqueue_head(&s->s_wait);
 546                 INIT_LIST_HEAD(&s->s_files);
 547         }
 548         return s;
 549 }
 550
 551 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
 552                                        struct file_system_type *type, int flags,
 553                                        void *data, int silent)
 554 {
 555         struct super_block * s;
 556         s = get_empty_super();
 557         if (!s)
 558                 goto out;
 559         s->s_dev = dev;
 560         s->s_bdev = bdev;
 561         s->s_flags = flags;
 562         s->s_dirt = 0;
 563         sema_init(&s->s_vfs_rename_sem,1);
 564         sema_init(&s->s_nfsd_free_path_sem,1);
 565         s->s_type = type;
 566         sema_init(&s->s_dquot.dqio_sem, 1);
 567         sema_init(&s->s_dquot.dqoff_sem, 1);
 568         s->s_dquot.flags = 0;
 569         lock_super(s);
 570         if (!type->read_super(s, data, silent))
 571                 goto out_fail;
 572         unlock_super(s);
 573         /* tell bdcache that we are going to keep this one */
 574         if (bdev)
 575                 atomic_inc(&bdev->bd_count);
 576 out:
 577         return s;
 578
 579 out_fail:
 580         s->s_dev = 0;
 581         s->s_bdev = 0;
 582         s->s_type = NULL;
 583         unlock_super(s);
 584         return NULL;
 585 }
 586
 /*
  * Unnamed block devices are dummy devices used by virtual
  * filesystems which don't use real block-devices.  -- jrs
  *
  * One bit per minor number (256 in total); a set bit marks the minor
  * as handed out by get_unnamed_dev().
  */

 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))] = { 0, };
 593
 594 kdev_t get_unnamed_dev(void)
 595 {
 596         int i;
 597
 598         for (i = 1; i < 256; i++) {
 599                 if (!test_and_set_bit(i,unnamed_dev_in_use))
 600                         return MKDEV(UNNAMED_MAJOR, i);
 601         }
 602         return 0;
 603 }
 604
 605 void put_unnamed_dev(kdev_t dev)
 606 {
 607         if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
 608                 return;
 609         if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
 610                 return;
 611         printk("VFS: put_unnamed_dev: freeing unused device %s\n",
 612                         kdevname(dev));
 613 }
 614
 615 static int d_umount(struct super_block * sb)
 616 {
 617         struct dentry * root = sb->s_root;
 618         struct dentry * covered = root->d_covers;
 619
 620         if (root->d_count != 1)
 621                 return -EBUSY;
 622
 623         if (root->d_inode->i_state)
 624                 return -EBUSY;
 625
 626         sb->s_root = NULL;
 627
 628         if (covered != root) {
 629                 root->d_covers = root;
 630                 covered->d_mounts = covered;
 631                 dput(covered);
 632         }
 633         dput(root);
 634         return 0;
 635 }
 636
 637 static void d_mount(struct dentry *covered, struct dentry *dentry)
 638 {
 639         if (covered->d_mounts != covered) {
 640                 printk("VFS: mount - already mounted\n");
 641                 return;
 642         }
 643         covered->d_mounts = dentry;
 644         dentry->d_covers = covered;
 645 }
 646
 647 static struct block_device *do_umount(kdev_t dev, int unmount_root, int flags)
 648 {
 649         struct super_block * sb;
 650         struct block_device *bdev;
 651         int retval;
 652
 653         retval = -ENOENT;
 654         sb = get_super(dev);
 655         if (!sb || !sb->s_root)
 656                 goto out;
 657
 658         /*
 659          * Before checking whether the filesystem is still busy,
 660          * make sure the kernel doesn't hold any quota files open
 661          * on the device. If the umount fails, too bad -- there
 662          * are no quotas running any more. Just turn them on again.
 663          */
 664         DQUOT_OFF(sb);
 665         acct_auto_close(dev);
 666
 667         /*
 668          * If we may have to abort operations to get out of this
 669          * mount, and they will themselves hold resources we must
 670          * allow the fs to do things. In the Unix tradition of
 671          * 'Gee thats tricky lets do it in userspace' the umount_begin
 672          * might fail to complete on the first run through as other tasks
 673          * must return, and the like. Thats for the mount program to worry
 674          * about for the moment.
 675          */
 676
 677         if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
 678                 sb->s_op->umount_begin(sb);
 679
 680         /*
 681          * Shrink dcache, then fsync. This guarantees that if the
 682          * filesystem is quiescent at this point, then (a) only the
 683          * root entry should be in use and (b) that root entry is
 684          * clean.
 685          */
 686         shrink_dcache_sb(sb);
 687         fsync_dev(dev);
 688
 689         if (sb == current->fs->root->d_sb && !unmount_root) {
 690                 /*
 691                  * Special case for "unmounting" root ...
 692                  * we just try to remount it readonly.
 693                  */
 694                 retval = 0;
 695                 if (!(sb->s_flags & MS_RDONLY))
 696                         retval = do_remount_sb(sb, MS_RDONLY, 0);
 697                 return ERR_PTR(retval);
 698         }
 699
 700         retval = d_umount(sb);
 701         if (retval)
 702                 goto out;
 703
 704         if (sb->s_op) {
 705                 if (sb->s_op->write_super && sb->s_dirt)
 706                         sb->s_op->write_super(sb);
 707         }
 708
 709         lock_super(sb);
 710         if (sb->s_op) {
 711                 if (sb->s_op->put_super)
 712                         sb->s_op->put_super(sb);
 713         }
 714
 715         /* Forget any remaining inodes */
 716         if (invalidate_inodes(sb)) {
 717                 printk("VFS: Busy inodes after unmount. "
 718                         "Self-destruct in 5 seconds.  Have a nice day...\n");
 719         }
 720
 721         sb->s_dev = 0;          /* Free the superblock */
 722         bdev = sb->s_bdev;
 723         sb->s_bdev = NULL;
 724         put_filesystem(sb->s_type);
 725         sb->s_type = NULL;
 726         unlock_super(sb);
 727
 728         remove_vfsmnt(dev);
 729
 730         return bdev;
 731
 732 out:
 733         return ERR_PTR(retval);
 734 }
 735
 736 static int umount_dev(kdev_t dev, int flags)
 737 {
 738         int retval;
 739         struct block_device *bdev;
 740
 741         retval = -ENXIO;
 742         if (MAJOR(dev) >= MAX_BLKDEV)
 743                 goto out;
 744
 745         fsync_dev(dev);
 746
 747         down(&mount_sem);
 748
 749         bdev = do_umount(dev, 0, flags);
 750         if (IS_ERR(bdev))
 751                 retval = PTR_ERR(bdev);
 752         else {
 753                 retval = 0;
 754                 if (bdev) {
 755                         blkdev_put(bdev, BDEV_FS);
 756                         bdput(bdev);
 757                 } else {
 758                         put_unnamed_dev(dev);
 759                 }
 760         }
 761         up(&mount_sem);
 762 out:
 763         return retval;
 764 }
 765
 766 /*
 767  * Now umount can handle mount points as well as block devices.
 768  * This is important for filesystems which use unnamed block devices.
 769  *
 770  * We now support a flag for forced unmount like the other 'big iron'
 771  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 772  */
 773
 774 asmlinkage long sys_umount(char * name, int flags)
 775 {
 776         struct dentry * dentry;
 777         int retval;
 778
 779         if (!capable(CAP_SYS_ADMIN))
 780                 return -EPERM;
 781
 782         lock_kernel();
 783         dentry = namei(name);
 784         retval = PTR_ERR(dentry);
 785         if (!IS_ERR(dentry)) {
 786                 struct inode * inode = dentry->d_inode;
 787                 kdev_t dev = inode->i_rdev;
 788
 789                 retval = 0;
 790                 if (S_ISBLK(inode->i_mode)) {
 791                         if (IS_NODEV(inode))
 792                                 retval = -EACCES;
 793                 } else {
 794                         struct super_block *sb = inode->i_sb;
 795                         retval = -EINVAL;
 796                         if (sb && inode == sb->s_root->d_inode) {
 797                                 dev = sb->s_dev;
 798                                 retval = 0;
 799                         }
 800                 }
 801                 dput(dentry);
 802
 803                 if (!retval)
 804                         retval = umount_dev(dev, flags);
 805         }
 806         unlock_kernel();
 807         return retval;
 808 }
 809
 810 /*
 811  *      The 2.0 compatible umount. No flags.
 812  */
 813
 814 asmlinkage long sys_oldumount(char * name)
 815 {
 816         return sys_umount(name,0);
 817 }
 818
 819 /*
 820  * Check whether we can mount the specified device.
 821  */
 822 int fs_may_mount(kdev_t dev)
 823 {
 824         struct super_block * sb = get_super(dev);
 825         int busy;
 826
 827         busy = sb && sb->s_root &&
 828                (sb->s_root->d_count != 1 || sb->s_root->d_covers != sb->s_root);
 829         return !busy;
 830 }
 831
 832 /*
 833  * do_mount() does the actual mounting after sys_mount has done the ugly
 834  * parameter parsing. When enough time has gone by, and everything uses the
 835  * new mount() parameters, sys_mount() can then be cleaned up.
 836  *
 837  * We cannot mount a filesystem if it has active, used, or dirty inodes.
 838  * We also have to flush all inode-data for this device, as the new mount
 839  * might need new info.
 840  *
 841  * [21-Mar-97] T.Schoebel-Theuer: Now this can be overridden when
 842  * supplying a leading "!" before the dir_name, allowing "stacks" of
 843  * mounted filesystems. The stacking will only influence any pathname lookups
 844  * _after_ the mount, but open file descriptors or working directories that
 845  * are now covered remain valid. For example, when you overmount /home, any
 846  * process with old cwd /home/joe will continue to use the old versions,
 847  * as long as relative paths are used, but absolute paths like /home/joe/xxx
 848  * will go to the new "top of stack" version. In general, crossing a
 849  * mount point will always go to the top of stack element.
 850  * Anyone using this new feature must know what he/she is doing.
 851  */
 852
 853 int do_mount(struct block_device *bdev, const char *dev_name,
 854              const char *dir_name, const char * type, int flags, void * data)
 855 {
 856         kdev_t dev;
 857         struct dentry * dir_d;
 858         struct super_block * sb;
 859         struct vfsmount *vfsmnt;
 860         struct file_system_type *fs_type;
 861         int error;
 862
 863         if (bdev) {
 864                 mode_t mode = FMODE_READ; /* we always need it ;-) */
 865                 if (!(flags & MS_RDONLY))
 866                         mode |= FMODE_WRITE;
 867                 dev = to_kdev_t(bdev->bd_dev);
 868                 error = blkdev_get(bdev, mode, 0, BDEV_FS);
 869                 if (error)
 870                         return error;
 871         } else {
 872                 dev = get_unnamed_dev();
 873                 if (!dev)
 874                         return -EMFILE; /* huh? */
 875         }
 876
 877         error = -EACCES;
 878         if (!(flags & MS_RDONLY) && dev && is_read_only(dev))
 879                 goto out;
 880
 881         /*
 882          * Do the lookup first to force automounting.
 883          */
 884         dir_d = namei(dir_name);
 885         error = PTR_ERR(dir_d);
 886         if (IS_ERR(dir_d))
 887                 goto out;
 888
 889         down(&mount_sem);
 890         error = -ENOTDIR;
 891         if (!S_ISDIR(dir_d->d_inode->i_mode))
 892                 goto dput_and_out;
 893
 894         error = -EBUSY;
 895         if (dir_d->d_covers != dir_d)
 896                 goto dput_and_out;
 897
 898         error = -EINVAL;
 899         if (!dev)
 900                 goto dput_and_out;
 901         check_disk_change(dev);
 902         sb = get_super(dev);
 903         if (sb) {
 904                 /* Already mounted */
 905                 error = -EBUSY;
 906                 goto dput_and_out;
 907         }
 908
 909         fs_type = get_fs_type(type);
 910         if (!fs_type) {
 911                 printk("VFS: on device %s: get_fs_type(%s) failed\n",
 912                        kdevname(dev), type);
 913                 goto dput_and_out;
 914         }
 915
 916         sb = read_super(dev, bdev, fs_type, flags, data, 0);
 917         if (!sb)
 918                 goto fsput_and_out;
 919
 920         /*
 921          * We may have slept while reading the super block,
 922          * so we check afterwards whether it's safe to mount.
 923          */
 924         error = -EBUSY;
 925         if (!fs_may_mount(dev))
 926                 goto bdput_and_out;
 927
 928         error = -ENOMEM;
 929         vfsmnt = add_vfsmnt(sb, dev_name, dir_name);
 930         if (vfsmnt) {
 931                 d_mount(dget(dir_d), sb->s_root);
 932                 dput(dir_d);
 933                 up(&mount_sem);
 934                 return 0;
 935         }
 936
 937 bdput_and_out:
 938         /* FIXME: ->put_super() is needed here */
 939         sb->s_bdev = NULL;
 940         sb->s_dev = 0;
 941         sb->s_type = NULL;
 942         if (bdev)
 943                 bdput(bdev);
 944 fsput_and_out:
 945         put_filesystem(fs_type);
 946 dput_and_out:
 947         dput(dir_d);
 948         up(&mount_sem);
 949 out:
 950         if (bdev)
 951                 blkdev_put(bdev, BDEV_FS);
 952         else
 953                 put_unnamed_dev(dev);
 954         return error;
 955 }
 956
 957
 958 /*
 959  * Alters the mount flags of a mounted file system. Only the mount point
 960  * is used as a reference - file system type and the device are ignored.
 961  */
 962
 963 static int do_remount_sb(struct super_block *sb, int flags, char *data)
 964 {
 965         int retval;
 966
 967         if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
 968                 return -EACCES;
 969                 /*flags |= MS_RDONLY;*/
 970         /* If we are remounting RDONLY, make sure there are no rw files open */
 971         if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
 972                 if (!fs_may_remount_ro(sb))
 973                         return -EBUSY;
 974         if (sb->s_op && sb->s_op->remount_fs) {
 975                 lock_super(sb);
 976                 retval = sb->s_op->remount_fs(sb, &flags, data);
 977                 unlock_super(sb);
 978                 if (retval)
 979                         return retval;
 980         }
 981         sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 982
 983         /*
 984          * Invalidate the inodes, as some mount options may be changed.
 985          * N.B. If we are changing media, we should check the return
 986          * from invalidate_inodes ... can't allow _any_ open files.
 987          */
 988         invalidate_inodes(sb);
 989
 990         return 0;
 991 }
 992
 993 static int do_remount(const char *dir,int flags,char *data)
 994 {
 995         struct dentry *dentry;
 996         int retval;
 997
 998         dentry = namei(dir);
 999         retval = PTR_ERR(dentry);
1000         if (!IS_ERR(dentry)) {
1001                 struct super_block * sb = dentry->d_inode->i_sb;
1002
1003                 retval = -ENODEV;
1004                 if (sb) {
1005                         retval = -EINVAL;
1006                         if (dentry == sb->s_root) {
1007                                 /*
1008                                  * Shrink the dcache and sync the device.
1009                                  */
1010                                 shrink_dcache_sb(sb);
1011                                 fsync_dev(sb->s_dev);
1012                                 if (flags & MS_RDONLY)
1013                                         acct_auto_close(sb->s_dev);
1014                                 retval = do_remount_sb(sb, flags, data);
1015                         }
1016                 }
1017                 dput(dentry);
1018         }
1019         return retval;
1020 }
1021
1022 static int copy_mount_options (const void * data, unsigned long *where)
1023 {
1024         int i;
1025         unsigned long page;
1026         struct vm_area_struct * vma;
1027
1028         *where = 0;
1029         if (!data)
1030                 return 0;
1031
1032         vma = find_vma(current->mm, (unsigned long) data);
1033         if (!vma || (unsigned long) data < vma->vm_start)
1034                 return -EFAULT;
1035         if (!(vma->vm_flags & VM_READ))
1036                 return -EFAULT;
1037         i = vma->vm_end - (unsigned long) data;
1038         if (PAGE_SIZE <= (unsigned long) i)
1039                 i = PAGE_SIZE-1;
1040         if (!(page = __get_free_page(GFP_KERNEL))) {
1041                 return -ENOMEM;
1042         }
1043         if (copy_from_user((void *) page,data,i)) {
1044                 free_page(page);
1045                 return -EFAULT;
1046         }
1047         *where = page;
1048         return 0;
1049 }
1050
1051 /*
1052  * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1053  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1054  *
1055  * data is a (void *) that can point to any structure up to
1056  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1057  * information (or be NULL).
1058  *
1059  * NOTE! As old versions of mount() didn't use this setup, the flags
1060  * have to have a special 16-bit magic number in the high word:
1061  * 0xC0ED. If this magic word isn't present, the flags and data info
1062  * aren't used, as the syscall assumes we are talking to an older
1063  * version that didn't understand them.
1064  */
1065 long do_sys_mount(char * dev_name, char * dir_name, unsigned long type_page,
1066                   unsigned long new_flags, unsigned long data_page)
1067 {
1068         struct file_system_type * fstype;
1069         struct dentry * dentry = NULL;
1070         struct inode * inode = NULL;
1071         struct block_device *bdev = NULL;
1072         int retval;
1073         unsigned long flags = 0;
1074
1075         if (!capable(CAP_SYS_ADMIN))
1076                 return -EPERM;
1077
1078         if ((new_flags &
1079              (MS_MGC_MSK | MS_REMOUNT)) == (MS_MGC_VAL | MS_REMOUNT)) {
1080                 retval = do_remount(dir_name,
1081                                     new_flags & ~MS_MGC_MSK & ~MS_REMOUNT,
1082                                     (char *) data_page);
1083                 goto out;
1084         }
1085
1086         fstype = get_fs_type((char *) type_page);
1087         retval = -ENODEV;
1088         if (!fstype)
1089                 goto out;
1090
1091         if (fstype->fs_flags & FS_REQUIRES_DEV) {
1092                 struct block_device_operations *bdops;
1093
1094                 dentry = namei(dev_name);
1095                 retval = PTR_ERR(dentry);
1096                 if (IS_ERR(dentry))
1097                         goto fs_out;
1098
1099                 inode = dentry->d_inode;
1100                 retval = -ENOTBLK;
1101                 if (!S_ISBLK(inode->i_mode))
1102                         goto dput_and_out;
1103
1104                 retval = -EACCES;
1105                 if (IS_NODEV(inode))
1106                         goto dput_and_out;
1107
1108                 bdev = inode->i_bdev;
1109                 bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
1110                 if (bdops) bdev->bd_op = bdops;
1111         }
1112
1113         if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1114                 flags = new_flags & ~MS_MGC_MSK;
1115
1116         retval = do_mount(bdev, dev_name, dir_name, fstype->name, flags,
1117                                 (void *) data_page);
1118
1119 dput_and_out:
1120         dput(dentry);
1121 fs_out:
1122         put_filesystem(fstype);
1123 out:
1124         return retval;
1125 }
1126
1127 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1128                           unsigned long new_flags, void * data)
1129 {
1130         int retval;
1131         unsigned long data_page = 0;
1132         unsigned long type_page = 0;
1133
1134         lock_kernel();
1135         retval = copy_mount_options (type, &type_page);
1136         if (retval < 0)
1137                 goto out;
1138
1139         /* copy_mount_options allows a NULL user pointer,
1140          * and just returns zero in that case.  But if we
1141          * allow the type to be NULL we will crash.
1142          * Previously we did not check this case.
1143          */
1144         if (type_page == 0) {
1145                 retval = -EINVAL;
1146                 goto out;
1147         }
1148
1149         retval = copy_mount_options (data, &data_page);
1150         if (retval >= 0) {
1151                 retval = do_sys_mount(dev_name, dir_name, type_page,
1152                                       new_flags, data_page);
1153                 free_page(data_page);
1154         }
1155         free_page(type_page);
1156 out:
1157         unlock_kernel();
1158         return retval;
1159 }
1160
1161 void __init mount_root(void)
1162 {
1163         struct file_system_type * fs_type;
1164         struct super_block * sb;
1165         struct vfsmount *vfsmnt;
1166         struct block_device *bdev = NULL;
1167         mode_t mode;
1168         int retval;
1169         void *handle;
1170         char path[64];
1171         int path_start = -1;
1172
1173 #ifdef CONFIG_ROOT_NFS
1174         if (MAJOR(ROOT_DEV) == UNNAMED_MAJOR) {
1175                 ROOT_DEV = 0;
1176                 if ((fs_type = get_fs_type("nfs"))) {
1177                         sb = get_empty_super(); /* "can't fail" */
1178                         sb->s_dev = get_unnamed_dev();
1179                         sb->s_bdev = NULL;
1180                         sb->s_flags = root_mountflags;
1181                         sema_init(&sb->s_vfs_rename_sem,1);
1182                         sema_init(&sb->s_nfsd_free_path_sem,1);
1183                         vfsmnt = add_vfsmnt(sb, "/dev/root", "/");
1184                         if (vfsmnt) {
1185                                 if (nfs_root_mount(sb) >= 0) {
1186                                         sb->s_dirt = 0;
1187                                         sb->s_type = fs_type;
1188                                         current->fs->root = dget(sb->s_root);
1189                                         current->fs->pwd = dget(sb->s_root);
1190                                         ROOT_DEV = sb->s_dev;
1191                                         printk (KERN_NOTICE "VFS: Mounted root (NFS filesystem)%s.\n", (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1192                                         return;
1193                                 }
1194                                 remove_vfsmnt(sb->s_dev);
1195                         }
1196                         put_unnamed_dev(sb->s_dev);
1197                         sb->s_dev = 0;
1198                         put_filesystem(fs_type);
1199                 }
1200                 if (!ROOT_DEV) {
1201                         printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1202                         ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1203                 }
1204         }
1205 #endif
1206
1207 #ifdef CONFIG_BLK_DEV_FD
1208         if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1209 #ifdef CONFIG_BLK_DEV_RAM
1210                 extern int rd_doload;
1211                 extern void rd_load_secondary(void);
1212 #endif
1213                 floppy_eject();
1214 #ifndef CONFIG_BLK_DEV_RAM
1215                 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1216 #else
1217                 /* rd_doload is 2 for a dual initrd/ramload setup */
1218                 if(rd_doload==2)
1219                         rd_load_secondary();
1220                 else
1221 #endif
1222                 {
1223                         printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1224                         wait_for_keypress();
1225                 }
1226         }
1227 #endif
1228
1229         devfs_make_root (root_device_name);
1230         handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME, 0,
1231                                     MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1232                                     DEVFS_SPECIAL_BLK, 1);
1233         if (handle)  /*  Sigh: bd*() functions only paper over the cracks  */
1234         {
1235             unsigned major, minor;
1236
1237             devfs_get_maj_min (handle, &major, &minor);
1238             ROOT_DEV = MKDEV (major, minor);
1239         }
1240
1241         /*
1242          * Probably pure paranoia, but I'm less than happy about delving into
1243          * devfs crap and checking it right now. Later.
1244          */
1245         if (!ROOT_DEV)
1246                 panic("I have no root and I want to scream");
1247
1248         bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1249         if (!bdev)
1250                 panic(__FUNCTION__ ": unable to allocate root device");
1251         bdev->bd_op = devfs_get_ops (handle);
1252         path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1253         mode = FMODE_READ;
1254         if (!(root_mountflags & MS_RDONLY))
1255                 mode |= FMODE_WRITE;
1256         retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1257         if (retval == -EROFS) {
1258                 root_mountflags |= MS_RDONLY;
1259                 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1260         }
1261         if (retval) {
1262                 /*
1263                  * Allow the user to distinguish between failed open
1264                  * and bad superblock on root device.
1265                  */
1266                 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1267                         root_device_name, kdevname (ROOT_DEV));
1268                 printk ("Please append a correct \"root=\" boot option\n");
1269                 panic("VFS: Unable to mount root fs on %s",
1270                         kdevname(ROOT_DEV));
1271         }
1272
1273         check_disk_change(ROOT_DEV);
1274
1275         spin_lock(&file_systems_lock);
1276         for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1277                 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1278                         continue;
1279                 if (!try_inc_mod_count(fs_type->owner))
1280                         continue;
1281                 spin_unlock(&file_systems_lock);
1282                 sb = get_super(ROOT_DEV);
1283                 if (sb) {
1284                         /* Shouldn't we fail here? Oh, well... */
1285                         sb->s_bdev = bdev;
1286                         goto mount_it;
1287                 }
1288                 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1289                 if (sb)
1290                         goto mount_it;
1291                 spin_lock(&file_systems_lock);
1292                 put_filesystem(fs_type);
1293         }
1294         spin_unlock(&file_systems_lock);
1295         panic("VFS: Unable to mount root fs on %s",
1296                 kdevname(ROOT_DEV));
1297
1298 mount_it:
1299         sb->s_flags = root_mountflags;
1300         current->fs->root = dget(sb->s_root);
1301         current->fs->pwd = dget(sb->s_root);
1302         printk ("VFS: Mounted root (%s filesystem)%s.\n",
1303                 fs_type->name,
1304                 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1305         if (path_start >= 0) {
1306                 devfs_mk_symlink (NULL,
1307                                   "root", 0, DEVFS_FL_DEFAULT,
1308                                   path + 5 + path_start, 0,
1309                                   NULL, NULL);
1310                 memcpy (path + path_start, "/dev/", 5);
1311                 vfsmnt = add_vfsmnt (sb, path + path_start,
1312                                      "/");
1313         }
1314         else vfsmnt = add_vfsmnt (sb, "/dev/root", "/");
1315         if (vfsmnt) {
1316                 bdput(bdev); /* sb holds a reference */
1317                 return;
1318         }
1319         panic("VFS: add_vfsmnt failed for root fs");
1320 }
1321
1322
1323 static void chroot_fs_refs(struct dentry *old_root,
1324     struct dentry *new_root)
1325 {
1326         struct task_struct *p;
1327
1328         read_lock(&tasklist_lock);
1329         for_each_task(p) {
1330                 if (!p->fs) continue;
1331                 if (p->fs->root == old_root) {
1332                         dput(old_root);
1333                         p->fs->root = dget(new_root);
1334                         printk(KERN_DEBUG "chroot_fs_refs: changed root of "
1335                             "process %d\n",p->pid);
1336                 }
1337                 if (p->fs->pwd == old_root) {
1338                         dput(old_root);
1339                         p->fs->pwd = dget(new_root);
1340                         printk(KERN_DEBUG "chroot_fs_refs: changed cwd of "
1341                             "process %d\n",p->pid);
1342                 }
1343         }
1344         read_unlock(&tasklist_lock);
1345 }
1346
1347
1348 /*
1349  * Moves the current root to put_root, and sets root/cwd of all processes
1350  * which had them on the old root to new_root.
1351  *
1352  * Note:
1353  *  - we don't move root/cwd if they are not at the root (reason: if something
1354  *    cared enough to change them, it's probably wrong to force them elsewhere)
1355  *  - it's okay to pick a root that isn't the root of a file system, e.g.
1356  *    /nfs/my_root where /nfs is the mount point. Better avoid creating
1357  *    unreachable mount points this way, though.
1358  */
1359
1360 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1361 {
1362         struct dentry *root = current->fs->root;
1363         struct dentry *d_new_root, *d_put_old, *covered;
1364         struct dentry *root_dev_root, *new_root_dev_root;
1365         struct dentry *walk, *next;
1366         int error;
1367
1368         if (!capable(CAP_SYS_ADMIN))
1369                 return -EPERM;
1370
1371         lock_kernel();
1372         d_new_root = namei(new_root);
1373         if (IS_ERR(d_new_root)) {
1374                 error = PTR_ERR(d_new_root);
1375                 goto out0;
1376         }
1377         d_put_old = namei(put_old);
1378         if (IS_ERR(d_put_old)) {
1379                 error = PTR_ERR(d_put_old);
1380                 goto out1;
1381         }
1382         down(&mount_sem);
1383         if (!d_new_root->d_inode || !d_put_old->d_inode) {
1384                 error = -ENOENT;
1385                 goto out2;
1386         }
1387         if (!S_ISDIR(d_new_root->d_inode->i_mode) ||
1388             !S_ISDIR(d_put_old->d_inode->i_mode)) {
1389                 error = -ENOTDIR;
1390                 goto out2;
1391         }
1392         error = -EBUSY;
1393         if (d_new_root->d_sb == root->d_sb || d_put_old->d_sb == root->d_sb)
1394                 goto out2; /* loop */
1395         if (d_put_old != d_put_old->d_covers)
1396                 goto out2; /* mount point is busy */
1397         error = -EINVAL;
1398         walk = d_put_old; /* make sure we can reach put_old from new_root */
1399         for (;;) {
1400                 next = walk->d_covers->d_parent;
1401                 if (next == walk)
1402                         goto out2;
1403                 if (next == d_new_root)
1404                         break;
1405                 walk = next;
1406         }
1407
1408         new_root_dev_root = d_new_root->d_sb->s_root;
1409         covered = new_root_dev_root->d_covers;
1410         new_root_dev_root->d_covers = new_root_dev_root;
1411         dput(covered);
1412         covered->d_mounts = covered;
1413
1414         root_dev_root = root->d_sb->s_root;
1415         root_dev_root->d_covers = dget(d_put_old);
1416         d_put_old->d_mounts = root_dev_root;
1417         chroot_fs_refs(root,d_new_root);
1418         error = 0;
1419 out2:
1420         up(&mount_sem);
1421         dput(d_put_old);
1422 out1:
1423         dput(d_new_root);
1424 out0:
1425         unlock_kernel();
1426         return error;
1427 }
1428
1429
1430 #ifdef CONFIG_BLK_DEV_INITRD
1431
1432 int __init change_root(kdev_t new_root_dev,const char *put_old)
1433 {
1434         kdev_t old_root_dev;
1435         struct vfsmount *vfsmnt;
1436         struct dentry *old_root,*old_pwd,*dir_d = NULL;
1437         int error;
1438
1439         old_root = current->fs->root;
1440         old_pwd = current->fs->pwd;
1441         old_root_dev = ROOT_DEV;
1442         if (!fs_may_mount(new_root_dev)) {
1443                 printk(KERN_CRIT "New root is busy. Staying in initrd.\n");
1444                 return -EBUSY;
1445         }
1446         /*  First unmount devfs if mounted  */
1447         dir_d = lookup_dentry ("/dev", NULL, 1);
1448         if (!IS_ERR(dir_d)) {
1449                 struct super_block *sb = dir_d->d_inode->i_sb;
1450
1451                 if (sb && (dir_d->d_inode == sb->s_root->d_inode) &&
1452                     (sb->s_magic == DEVFS_SUPER_MAGIC)) {
1453                         dput (dir_d);
1454                         do_umount (sb->s_dev, 0, 0);
1455                 }
1456                 else dput (dir_d);
1457         }
1458         ROOT_DEV = new_root_dev;
1459         mount_root();
1460         dput(old_root);
1461         dput(old_pwd);
1462 #if 1
1463         shrink_dcache();
1464         printk("change_root: old root has d_count=%d\n", old_root->d_count);
1465 #endif
1466         mount_devfs_fs ();
1467         /*
1468          * Get the new mount directory
1469          */
1470         dir_d = lookup_dentry(put_old, NULL, 1);
1471         if (IS_ERR(dir_d)) {
1472                 error = PTR_ERR(dir_d);
1473         } else if (!dir_d->d_inode) {
1474                 dput(dir_d);
1475                 error = -ENOENT;
1476         } else {
1477                 error = 0;
1478         }
1479         if (!error && dir_d->d_covers != dir_d) {
1480                 dput(dir_d);
1481                 error = -EBUSY;
1482         }
1483         if (!error && !S_ISDIR(dir_d->d_inode->i_mode)) {
1484                 dput(dir_d);
1485                 error = -ENOTDIR;
1486         }
1487         if (error) {
1488                 struct block_device *bdev;
1489
1490                 printk(KERN_NOTICE "Trying to unmount old root ... ");
1491                 bdev = do_umount(old_root_dev,1, 0);
1492                 if (!IS_ERR(bdev)) {
1493                         printk("okay\n");
1494                         /* special: the old device driver is going to be
1495                            a ramdisk and the point of this call is to free its
1496                            protected memory (even if dirty). */
1497                         destroy_buffers(old_root_dev);
1498                         if (bdev) {
1499                                 blkdev_put(bdev, BDEV_FS);
1500                                 bdput(bdev);
1501                         }
1502                         return 0;
1503                 }
1504                 printk(KERN_ERR "error %ld\n",PTR_ERR(bdev));
1505                 return error;
1506         }
1507         remove_vfsmnt(old_root_dev);
1508         vfsmnt = add_vfsmnt(old_root->d_sb, "/dev/root.old", put_old);
1509         if (vfsmnt) {
1510                 d_mount(dir_d,old_root);
1511                 return 0;
1512         }
1513         printk(KERN_CRIT "Trouble: add_vfsmnt failed\n");
1514         return -ENOMEM;
1515 }
1516
1517 #endif