fs/super.c

   1 /*
   2  *  linux/fs/super.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  super.c contains code to handle: - mount structures
   7  *                                   - super-block tables.
   8  *                                   - mount system call
   9  *                                   - umount system call
  10  *
  11  *  Added options to /proc/mounts
  12  *  Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
  13  *
  14  * GK 2/5/95  -  Changed to support mounting the root fs via NFS
  15  *
  16  *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
  17  *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
  18  *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
  19  */
  20
  21 #include <linux/config.h>
  22 #include <linux/string.h>
  23 #include <linux/malloc.h>
  24 #include <linux/locks.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/devfs_fs_kernel.h>
  27 #include <linux/fd.h>
  28 #include <linux/init.h>
  29 #include <linux/quotaops.h>
  30 #include <linux/acct.h>
  31
  32 #include <asm/uaccess.h>
  33
  34 #include <linux/nfs_fs.h>
  35 #include <linux/nfs_fs_sb.h>
  36 #include <linux/nfs_mount.h>
  37
  38 #include <linux/kmod.h>
  39 #define __NO_VERSION__
  40 #include <linux/module.h>
  41
  42 /*
  43  * We use a semaphore to synchronize all mount/umount
  44  * activity - imagine the mess if we have a race between
  45  * unmounting a filesystem and re-mounting it (or something
  46  * else).
  47  */
  48 static DECLARE_MUTEX(mount_sem);
  49
  50 extern void wait_for_keypress(void);
  51
  52 extern int root_mountflags;
  53
  54 static int do_remount_sb(struct super_block *sb, int flags, char * data);
  55
  56 /* this is initialized in init/main.c */
  57 kdev_t ROOT_DEV;
  58
  59 int nr_super_blocks = 0;
  60 int max_super_blocks = NR_SUPER;
  61 LIST_HEAD(super_blocks);
  62
  /*
   * Handling of the filesystem drivers list.
   * Rules:
   *      Insertion into, removal from and scanning of the list are
   *      protected by the spinlock.
   *      A module must call unregister_filesystem() while it is unloading.
   *      We can access the fields of a list element if:
   *              1) the spinlock is held, or
   *              2) we hold a reference to the module.
   *      The latter is guaranteed by calling try_inc_mod_count(): if it
   *      returns 0 we must skip the element, otherwise we hold the reference.
   *      Once the reference is obtained we can drop the spinlock.
   */
  75
  76 static struct file_system_type *file_systems = NULL;
  77 static spinlock_t file_systems_lock = SPIN_LOCK_UNLOCKED;
  78
  79 static void put_filesystem(struct file_system_type *fs)
  80 {
  81         if (fs->owner)
  82                 __MOD_DEC_USE_COUNT(fs->owner);
  83 }
  84
  85 static struct file_system_type **find_filesystem(const char *name)
  86 {
  87         struct file_system_type **p;
  88         for (p=&file_systems; *p; p=&(*p)->next)
  89                 if (strcmp((*p)->name,name) == 0)
  90                         break;
  91         return p;
  92 }
  93
  94 int register_filesystem(struct file_system_type * fs)
  95 {
  96         int res = 0;
  97         struct file_system_type ** p;
  98
  99         if (!fs)
 100                 return -EINVAL;
 101         if (fs->next)
 102                 return -EBUSY;
 103         spin_lock(&file_systems_lock);
 104         p = find_filesystem(fs->name);
 105         if (*p)
 106                 res = -EBUSY;
 107         else
 108                 *p = fs;
 109         spin_unlock(&file_systems_lock);
 110         return res;
 111 }
 112
 113 int unregister_filesystem(struct file_system_type * fs)
 114 {
 115         struct file_system_type ** tmp;
 116
 117         spin_lock(&file_systems_lock);
 118         tmp = &file_systems;
 119         while (*tmp) {
 120                 if (fs == *tmp) {
 121                         *tmp = fs->next;
 122                         fs->next = NULL;
 123                         spin_unlock(&file_systems_lock);
 124                         return 0;
 125                 }
 126                 tmp = &(*tmp)->next;
 127         }
 128         spin_unlock(&file_systems_lock);
 129         return -EINVAL;
 130 }
 131
 132 static int fs_index(const char * __name)
 133 {
 134         struct file_system_type * tmp;
 135         char * name;
 136         int err, index;
 137
 138         name = getname(__name);
 139         err = PTR_ERR(name);
 140         if (IS_ERR(name))
 141                 return err;
 142
 143         err = -EINVAL;
 144         spin_lock(&file_systems_lock);
 145         for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
 146                 if (strcmp(tmp->name,name) == 0) {
 147                         err = index;
 148                         break;
 149                 }
 150                 index++;
 151         }
 152         spin_unlock(&file_systems_lock);
 153         putname(name);
 154         return err;
 155 }
 156
 157 static int fs_name(unsigned int index, char * buf)
 158 {
 159         struct file_system_type * tmp;
 160         int len, res;
 161
 162         spin_lock(&file_systems_lock);
 163         for (tmp = file_systems; tmp; tmp = tmp->next, index--)
 164                 if (index <= 0 && try_inc_mod_count(tmp->owner))
 165                                 break;
 166         spin_unlock(&file_systems_lock);
 167         if (!tmp)
 168                 return -EINVAL;
 169
 170         /* OK, we got the reference, so we can safely block */
 171         len = strlen(tmp->name) + 1;
 172         res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
 173         put_filesystem(tmp);
 174         return res;
 175 }
 176
 177 static int fs_maxindex(void)
 178 {
 179         struct file_system_type * tmp;
 180         int index;
 181
 182         spin_lock(&file_systems_lock);
 183         for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
 184                 ;
 185         spin_unlock(&file_systems_lock);
 186         return index;
 187 }
 188
 189 /*
 190  * Whee.. Weird sysv syscall.
 191  */
 192 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
 193 {
 194         int retval = -EINVAL;
 195
 196         lock_kernel();
 197         switch (option) {
 198                 case 1:
 199                         retval = fs_index((const char *) arg1);
 200                         break;
 201
 202                 case 2:
 203                         retval = fs_name(arg1, (char *) arg2);
 204                         break;
 205
 206                 case 3:
 207                         retval = fs_maxindex();
 208                         break;
 209         }
 210         unlock_kernel();
 211         return retval;
 212 }
 213
 214 int get_filesystem_list(char * buf)
 215 {
 216         int len = 0;
 217         struct file_system_type * tmp;
 218
 219         spin_lock(&file_systems_lock);
 220         tmp = file_systems;
 221         while (tmp && len < PAGE_SIZE - 80) {
 222                 len += sprintf(buf+len, "%s\t%s\n",
 223                         (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 224                         tmp->name);
 225                 tmp = tmp->next;
 226         }
 227         spin_unlock(&file_systems_lock);
 228         return len;
 229 }
 230
 231 static struct file_system_type *get_fs_type(const char *name)
 232 {
 233         struct file_system_type *fs;
 234
 235         spin_lock(&file_systems_lock);
 236         fs = *(find_filesystem(name));
 237         if (fs && !try_inc_mod_count(fs->owner))
 238                 fs = NULL;
 239         spin_unlock(&file_systems_lock);
 240         if (!fs && (request_module(name) == 0)) {
 241                 spin_lock(&file_systems_lock);
 242                 fs = *(find_filesystem(name));
 243                 if (fs && !try_inc_mod_count(fs->owner))
 244                         fs = NULL;
 245                 spin_unlock(&file_systems_lock);
 246         }
 247         return fs;
 248 }
 249
 250
 251 struct vfsmount *vfsmntlist = NULL;
 252 static struct vfsmount *vfsmnttail = NULL, *mru_vfsmnt = NULL;
 253
 254 /*
 255  * This part handles the management of the list of mounted filesystems.
 256  */
 257 struct vfsmount *lookup_vfsmnt(kdev_t dev)
 258 {
 259         struct vfsmount *lptr;
 260
 261         if (vfsmntlist == NULL)
 262                 return NULL;
 263
 264         if (mru_vfsmnt != NULL && mru_vfsmnt->mnt_dev == dev)
 265                 return (mru_vfsmnt);
 266
 267         for (lptr = vfsmntlist; lptr != NULL; lptr = lptr->mnt_next)
 268                 if (lptr->mnt_dev == dev) {
 269                         mru_vfsmnt = lptr;
 270                         return (lptr);
 271                 }
 272
 273         return NULL;
 274 }
 275
 276 static struct vfsmount *add_vfsmnt(struct super_block *sb,
 277                         const char *dev_name, const char *dir_name)
 278 {
 279         struct vfsmount *lptr;
 280         char *tmp, *name;
 281
 282         lptr = (struct vfsmount *)kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
 283         if (!lptr)
 284                 goto out;
 285         memset(lptr, 0, sizeof(struct vfsmount));
 286
 287         lptr->mnt_sb = sb;
 288         lptr->mnt_dev = sb->s_dev;
 289         lptr->mnt_flags = sb->s_flags;
 290
 291         sema_init(&lptr->mnt_dquot.dqio_sem, 1);
 292         sema_init(&lptr->mnt_dquot.dqoff_sem, 1);
 293         lptr->mnt_dquot.flags = 0;
 294
 295         /* N.B. Is it really OK to have a vfsmount without names? */
 296         if (dev_name && !IS_ERR(tmp = getname(dev_name))) {
 297                 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
 298                 if (name) {
 299                         strcpy(name, tmp);
 300                         lptr->mnt_devname = name;
 301                 }
 302                 putname(tmp);
 303         }
 304         if (dir_name && !IS_ERR(tmp = getname(dir_name))) {
 305                 name = (char *) kmalloc(strlen(tmp)+1, GFP_KERNEL);
 306                 if (name) {
 307                         strcpy(name, tmp);
 308                         lptr->mnt_dirname = name;
 309                 }
 310                 putname(tmp);
 311         }
 312
 313         if (vfsmntlist == (struct vfsmount *)NULL) {
 314                 vfsmntlist = vfsmnttail = lptr;
 315         } else {
 316                 vfsmnttail->mnt_next = lptr;
 317                 vfsmnttail = lptr;
 318         }
 319 out:
 320         return lptr;
 321 }
 322
 323 void remove_vfsmnt(kdev_t dev)
 324 {
 325         struct vfsmount *lptr, *tofree;
 326
 327         if (vfsmntlist == NULL)
 328                 return;
 329         lptr = vfsmntlist;
 330         if (lptr->mnt_dev == dev) {
 331                 tofree = lptr;
 332                 vfsmntlist = lptr->mnt_next;
 333                 if (vfsmnttail->mnt_dev == dev)
 334                         vfsmnttail = vfsmntlist;
 335         } else {
 336                 while (lptr->mnt_next != NULL) {
 337                         if (lptr->mnt_next->mnt_dev == dev)
 338                                 break;
 339                         lptr = lptr->mnt_next;
 340                 }
 341                 tofree = lptr->mnt_next;
 342                 if (tofree == NULL)
 343                         return;
 344                 lptr->mnt_next = lptr->mnt_next->mnt_next;
 345                 if (vfsmnttail->mnt_dev == dev)
 346                         vfsmnttail = lptr;
 347         }
 348         if (tofree == mru_vfsmnt)
 349                 mru_vfsmnt = NULL;
 350         kfree(tofree->mnt_devname);
 351         kfree(tofree->mnt_dirname);
 352         kfree_s(tofree, sizeof(struct vfsmount));
 353 }
 354
 355 static struct proc_fs_info {
 356         int flag;
 357         char *str;
 358 } fs_info[] = {
 359         { MS_NOEXEC, ",noexec" },
 360         { MS_NOSUID, ",nosuid" },
 361         { MS_NODEV, ",nodev" },
 362         { MS_SYNCHRONOUS, ",sync" },
 363         { MS_MANDLOCK, ",mand" },
 364         { MS_NOATIME, ",noatime" },
 365         { MS_NODIRATIME, ",nodiratime" },
 366 #ifdef MS_NOSUB                 /* Can't find this except in mount.c */
 367         { MS_NOSUB, ",nosub" },
 368 #endif
 369         { 0, NULL }
 370 };
 371
 372 static struct proc_nfs_info {
 373         int flag;
 374         char *str;
 375 } nfs_info[] = {
 376         { NFS_MOUNT_SOFT, ",soft" },
 377         { NFS_MOUNT_INTR, ",intr" },
 378         { NFS_MOUNT_POSIX, ",posix" },
 379         { NFS_MOUNT_NOCTO, ",nocto" },
 380         { NFS_MOUNT_NOAC, ",noac" },
 381         { 0, NULL }
 382 };
 383
 384 int get_filesystem_info( char *buf )
 385 {
 386         struct vfsmount *tmp;
 387         struct proc_fs_info *fs_infop;
 388         struct proc_nfs_info *nfs_infop;
 389         struct nfs_server *nfss;
 390         int len = 0;
 391         char *path,*buffer = (char *) __get_free_page(GFP_KERNEL);
 392
 393         if (!buffer) return 0;
 394         for (tmp = vfsmntlist; tmp && len < PAGE_SIZE - 160;
 395             tmp = tmp->mnt_next) {
 396                 path = d_path(tmp->mnt_sb->s_root, buffer, PAGE_SIZE);
 397                 if (!path)
 398                         continue;
 399                 len += sprintf( buf + len, "%s %s %s %s",
 400                         tmp->mnt_devname, path,
 401                         tmp->mnt_sb->s_type->name,
 402                         tmp->mnt_flags & MS_RDONLY ? "ro" : "rw" );
 403                 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 404                   if (tmp->mnt_flags & fs_infop->flag) {
 405                     strcpy(buf + len, fs_infop->str);
 406                     len += strlen(fs_infop->str);
 407                   }
 408                 }
 409                 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
 410                         nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
 411                         if (nfss->rsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
 412                                 len += sprintf(buf+len, ",rsize=%d",
 413                                                nfss->rsize);
 414                         }
 415                         if (nfss->wsize != NFS_DEF_FILE_IO_BUFFER_SIZE) {
 416                                 len += sprintf(buf+len, ",wsize=%d",
 417                                                nfss->wsize);
 418                         }
 419 #if 0
 420                         if (nfss->timeo != 7*HZ/10) {
 421                                 len += sprintf(buf+len, ",timeo=%d",
 422                                                nfss->timeo*10/HZ);
 423                         }
 424                         if (nfss->retrans != 3) {
 425                                 len += sprintf(buf+len, ",retrans=%d",
 426                                                nfss->retrans);
 427                         }
 428 #endif
 429                         if (nfss->acregmin != 3*HZ) {
 430                                 len += sprintf(buf+len, ",acregmin=%d",
 431                                                nfss->acregmin/HZ);
 432                         }
 433                         if (nfss->acregmax != 60*HZ) {
 434                                 len += sprintf(buf+len, ",acregmax=%d",
 435                                                nfss->acregmax/HZ);
 436                         }
 437                         if (nfss->acdirmin != 30*HZ) {
 438                                 len += sprintf(buf+len, ",acdirmin=%d",
 439                                                nfss->acdirmin/HZ);
 440                         }
 441                         if (nfss->acdirmax != 60*HZ) {
 442                                 len += sprintf(buf+len, ",acdirmax=%d",
 443                                                nfss->acdirmax/HZ);
 444                         }
 445                         for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 446                                 if (nfss->flags & nfs_infop->flag) {
 447                                         strcpy(buf + len, nfs_infop->str);
 448                                         len += strlen(nfs_infop->str);
 449                                 }
 450                         }
 451                         len += sprintf(buf+len, ",addr=%s",
 452                                        nfss->hostname);
 453                 }
 454                 len += sprintf( buf + len, " 0 0\n" );
 455         }
 456
 457         free_page((unsigned long) buffer);
 458         return len;
 459 }
 460
 461 void __wait_on_super(struct super_block * sb)
 462 {
 463         DECLARE_WAITQUEUE(wait, current);
 464
 465         add_wait_queue(&sb->s_wait, &wait);
 466 repeat:
 467         set_current_state(TASK_UNINTERRUPTIBLE);
 468         if (sb->s_lock) {
 469                 schedule();
 470                 goto repeat;
 471         }
 472         remove_wait_queue(&sb->s_wait, &wait);
 473         current->state = TASK_RUNNING;
 474 }
 475
 476 /*
 477  * Note: check the dirty flag before waiting, so we don't
 478  * hold up the sync while mounting a device. (The newly
 479  * mounted device won't need syncing.)
 480  */
 481 void sync_supers(kdev_t dev)
 482 {
 483         struct super_block * sb;
 484
 485         for (sb = sb_entry(super_blocks.next);
 486              sb != sb_entry(&super_blocks);
 487              sb = sb_entry(sb->s_list.next)) {
 488                 if (!sb->s_dev)
 489                         continue;
 490                 if (dev && sb->s_dev != dev)
 491                         continue;
 492                 if (!sb->s_dirt)
 493                         continue;
 494                 /* N.B. Should lock the superblock while writing */
 495                 wait_on_super(sb);
 496                 if (!sb->s_dev || !sb->s_dirt)
 497                         continue;
 498                 if (dev && (dev != sb->s_dev))
 499                         continue;
 500                 if (sb->s_op && sb->s_op->write_super)
 501                         sb->s_op->write_super(sb);
 502         }
 503 }
 504
 505 struct super_block * get_super(kdev_t dev)
 506 {
 507         struct super_block * s;
 508
 509         if (!dev)
 510                 return NULL;
 511 restart:
 512         s = sb_entry(super_blocks.next);
 513         while (s != sb_entry(&super_blocks))
 514                 if (s->s_dev == dev) {
 515                         wait_on_super(s);
 516                         if (s->s_dev == dev)
 517                                 return s;
 518                         goto restart;
 519                 } else
 520                         s = sb_entry(s->s_list.next);
 521         return NULL;
 522 }
 523
 524 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
 525 {
 526         struct super_block *s;
 527         struct ustat tmp;
 528         struct statfs sbuf;
 529         int err = -EINVAL;
 530
 531         lock_kernel();
 532         s = get_super(to_kdev_t(dev));
 533         if (s == NULL)
 534                 goto out;
 535         err = vfs_statfs(s, &sbuf);
 536         if (err)
 537                 goto out;
 538
 539         memset(&tmp,0,sizeof(struct ustat));
 540         tmp.f_tfree = sbuf.f_bfree;
 541         tmp.f_tinode = sbuf.f_ffree;
 542
 543         err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
 544 out:
 545         unlock_kernel();
 546         return err;
 547 }
 548
 549 /*
 550  * Find a super_block with no device assigned.
 551  */
 552 struct super_block *get_empty_super(void)
 553 {
 554         struct super_block *s;
 555
 556         for (s  = sb_entry(super_blocks.next);
 557              s != sb_entry(&super_blocks);
 558              s  = sb_entry(s->s_list.next)) {
 559                 if (s->s_dev)
 560                         continue;
 561                 if (!s->s_lock)
 562                         return s;
 563                 printk("VFS: empty superblock %p locked!\n", s);
 564         }
 565         /* Need a new one... */
 566         if (nr_super_blocks >= max_super_blocks)
 567                 return NULL;
 568         s = kmalloc(sizeof(struct super_block),  GFP_USER);
 569         if (s) {
 570                 nr_super_blocks++;
 571                 memset(s, 0, sizeof(struct super_block));
 572                 INIT_LIST_HEAD(&s->s_dirty);
 573                 list_add (&s->s_list, super_blocks.prev);
 574                 init_waitqueue_head(&s->s_wait);
 575                 INIT_LIST_HEAD(&s->s_files);
 576         }
 577         return s;
 578 }
 579
 580 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
 581                                        struct file_system_type *type, int flags,
 582                                        void *data, int silent)
 583 {
 584         struct super_block * s;
 585         s = get_empty_super();
 586         if (!s)
 587                 goto out;
 588         s->s_dev = dev;
 589         s->s_bdev = bdev;
 590         s->s_flags = flags;
 591         s->s_dirt = 0;
 592         sema_init(&s->s_vfs_rename_sem,1);
 593         sema_init(&s->s_nfsd_free_path_sem,1);
 594         s->s_type = type;
 595         lock_super(s);
 596         if (!type->read_super(s, data, silent))
 597                 goto out_fail;
 598         unlock_super(s);
 599         /* tell bdcache that we are going to keep this one */
 600         if (bdev)
 601                 atomic_inc(&bdev->bd_count);
 602 out:
 603         return s;
 604
 605 out_fail:
 606         s->s_dev = 0;
 607         s->s_bdev = 0;
 608         s->s_type = NULL;
 609         put_filesystem(type);
 610         unlock_super(s);
 611         return NULL;
 612 }
 613
 614 /*
 615  * Unnamed block devices are dummy devices used by virtual
 616  * filesystems which don't use real block-devices.  -- jrs
 617  */
 618
 619 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))] = { 0, };
 620
 621 kdev_t get_unnamed_dev(void)
 622 {
 623         int i;
 624
 625         for (i = 1; i < 256; i++) {
 626                 if (!test_and_set_bit(i,unnamed_dev_in_use))
 627                         return MKDEV(UNNAMED_MAJOR, i);
 628         }
 629         return 0;
 630 }
 631
 632 void put_unnamed_dev(kdev_t dev)
 633 {
 634         if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
 635                 return;
 636         if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
 637                 return;
 638         printk("VFS: put_unnamed_dev: freeing unused device %s\n",
 639                         kdevname(dev));
 640 }
 641
 642 static int d_umount(struct super_block * sb)
 643 {
 644         struct dentry * root = sb->s_root;
 645         struct dentry * covered = root->d_covers;
 646
 647         if (root->d_count != 1)
 648                 return -EBUSY;
 649
 650         if (root->d_inode->i_state)
 651                 return -EBUSY;
 652
 653         sb->s_root = NULL;
 654
 655         if (covered != root) {
 656                 root->d_covers = root;
 657                 covered->d_mounts = covered;
 658                 dput(covered);
 659         }
 660         dput(root);
 661         return 0;
 662 }
 663
 664 static void d_mount(struct dentry *covered, struct dentry *dentry)
 665 {
 666         if (covered->d_mounts != covered) {
 667                 printk("VFS: mount - already mounted\n");
 668                 return;
 669         }
 670         covered->d_mounts = dentry;
 671         dentry->d_covers = covered;
 672 }
 673
 674 static struct block_device *do_umount(kdev_t dev, int unmount_root, int flags)
 675 {
 676         struct super_block * sb;
 677         struct block_device *bdev;
 678         int retval;
 679
 680         retval = -ENOENT;
 681         sb = get_super(dev);
 682         if (!sb || !sb->s_root)
 683                 goto out;
 684
 685         /*
 686          * Before checking whether the filesystem is still busy,
 687          * make sure the kernel doesn't hold any quota files open
 688          * on the device. If the umount fails, too bad -- there
 689          * are no quotas running any more. Just turn them on again.
 690          */
 691         DQUOT_OFF(dev);
 692         acct_auto_close(dev);
 693
 694         /*
 695          * If we may have to abort operations to get out of this
 696          * mount, and they will themselves hold resources we must
 697          * allow the fs to do things. In the Unix tradition of
 698          * 'Gee thats tricky lets do it in userspace' the umount_begin
 699          * might fail to complete on the first run through as other tasks
 700          * must return, and the like. Thats for the mount program to worry
 701          * about for the moment.
 702          */
 703
 704         if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
 705                 sb->s_op->umount_begin(sb);
 706
 707         /*
 708          * Shrink dcache, then fsync. This guarantees that if the
 709          * filesystem is quiescent at this point, then (a) only the
 710          * root entry should be in use and (b) that root entry is
 711          * clean.
 712          */
 713         shrink_dcache_sb(sb);
 714         fsync_dev(dev);
 715
 716         if (sb == current->fs->root->d_sb && !unmount_root) {
 717                 /*
 718                  * Special case for "unmounting" root ...
 719                  * we just try to remount it readonly.
 720                  */
 721                 retval = 0;
 722                 if (!(sb->s_flags & MS_RDONLY))
 723                         retval = do_remount_sb(sb, MS_RDONLY, 0);
 724                 return ERR_PTR(retval);
 725         }
 726
 727         retval = d_umount(sb);
 728         if (retval)
 729                 goto out;
 730
 731         if (sb->s_op) {
 732                 if (sb->s_op->write_super && sb->s_dirt)
 733                         sb->s_op->write_super(sb);
 734         }
 735
 736         lock_super(sb);
 737         if (sb->s_op) {
 738                 if (sb->s_op->put_super)
 739                         sb->s_op->put_super(sb);
 740         }
 741
 742         /* Forget any remaining inodes */
 743         if (invalidate_inodes(sb)) {
 744                 printk("VFS: Busy inodes after unmount. "
 745                         "Self-destruct in 5 seconds.  Have a nice day...\n");
 746         }
 747
 748         sb->s_dev = 0;          /* Free the superblock */
 749         bdev = sb->s_bdev;
 750         sb->s_bdev = NULL;
 751         put_filesystem(sb->s_type);
 752         sb->s_type = NULL;
 753         unlock_super(sb);
 754
 755         remove_vfsmnt(dev);
 756
 757         return bdev;
 758
 759 out:
 760         return ERR_PTR(retval);
 761 }
 762
 763 static int umount_dev(kdev_t dev, int flags)
 764 {
 765         int retval;
 766         struct block_device *bdev;
 767
 768         retval = -ENXIO;
 769         if (MAJOR(dev) >= MAX_BLKDEV)
 770                 goto out;
 771
 772         fsync_dev(dev);
 773
 774         down(&mount_sem);
 775
 776         bdev = do_umount(dev, 0, flags);
 777         if (IS_ERR(bdev))
 778                 retval = PTR_ERR(bdev);
 779         else {
 780                 retval = 0;
 781                 if (bdev) {
 782                         blkdev_put(bdev, BDEV_FS);
 783                         bdput(bdev);
 784                 } else {
 785                         put_unnamed_dev(dev);
 786                 }
 787         }
 788         up(&mount_sem);
 789 out:
 790         return retval;
 791 }
 792
 793 /*
 794  * Now umount can handle mount points as well as block devices.
 795  * This is important for filesystems which use unnamed block devices.
 796  *
 797  * We now support a flag for forced unmount like the other 'big iron'
 798  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 799  */
 800
 801 asmlinkage long sys_umount(char * name, int flags)
 802 {
 803         struct dentry * dentry;
 804         int retval;
 805
 806         if (!capable(CAP_SYS_ADMIN))
 807                 return -EPERM;
 808
 809         lock_kernel();
 810         dentry = namei(name);
 811         retval = PTR_ERR(dentry);
 812         if (!IS_ERR(dentry)) {
 813                 struct inode * inode = dentry->d_inode;
 814                 kdev_t dev = inode->i_rdev;
 815
 816                 retval = 0;
 817                 if (S_ISBLK(inode->i_mode)) {
 818                         if (IS_NODEV(inode))
 819                                 retval = -EACCES;
 820                 } else {
 821                         struct super_block *sb = inode->i_sb;
 822                         retval = -EINVAL;
 823                         if (sb && inode == sb->s_root->d_inode) {
 824                                 dev = sb->s_dev;
 825                                 retval = 0;
 826                         }
 827                 }
 828                 dput(dentry);
 829
 830                 if (!retval)
 831                         retval = umount_dev(dev, flags);
 832         }
 833         unlock_kernel();
 834         return retval;
 835 }
 836
 837 /*
 838  *      The 2.0 compatible umount. No flags.
 839  */
 840
 841 asmlinkage long sys_oldumount(char * name)
 842 {
 843         return sys_umount(name,0);
 844 }
 845
 846 /*
 847  * Check whether we can mount the specified device.
 848  */
 849 int fs_may_mount(kdev_t dev)
 850 {
 851         struct super_block * sb = get_super(dev);
 852         int busy;
 853
 854         busy = sb && sb->s_root &&
 855                (sb->s_root->d_count != 1 || sb->s_root->d_covers != sb->s_root);
 856         return !busy;
 857 }
 858
 859 /*
 860  * do_mount() does the actual mounting after sys_mount has done the ugly
 861  * parameter parsing. When enough time has gone by, and everything uses the
 862  * new mount() parameters, sys_mount() can then be cleaned up.
 863  *
 864  * We cannot mount a filesystem if it has active, used, or dirty inodes.
 865  * We also have to flush all inode-data for this device, as the new mount
 866  * might need new info.
 867  *
 868  * [21-Mar-97] T.Schoebel-Theuer: Now this can be overridden when
 869  * supplying a leading "!" before the dir_name, allowing "stacks" of
 870  * mounted filesystems. The stacking will only influence any pathname lookups
 871  * _after_ the mount, but open file descriptors or working directories that
 872  * are now covered remain valid. For example, when you overmount /home, any
 873  * process with old cwd /home/joe will continue to use the old versions,
 874  * as long as relative paths are used, but absolute paths like /home/joe/xxx
 875  * will go to the new "top of stack" version. In general, crossing a
 876  * mount point will always go to the top of stack element.
 877  * Anyone using this new feature must know what he/she is doing.
 878  */
 879
 880 int do_mount(struct block_device *bdev, const char *dev_name,
 881              const char *dir_name, const char * type, int flags, void * data)
 882 {
 883         kdev_t dev;
 884         struct dentry * dir_d;
 885         struct super_block * sb;
 886         struct vfsmount *vfsmnt;
 887         struct file_system_type *fs_type;
 888         int error;
 889
 890         if (bdev) {
 891                 mode_t mode = FMODE_READ; /* we always need it ;-) */
 892                 if (!(flags & MS_RDONLY))
 893                         mode |= FMODE_WRITE;
 894                 dev = to_kdev_t(bdev->bd_dev);
 895                 error = blkdev_get(bdev, mode, 0, BDEV_FS);
 896                 if (error)
 897                         return error;
 898         } else {
 899                 dev = get_unnamed_dev();
 900                 if (!dev)
 901                         return -EMFILE; /* huh? */
 902         }
 903
 904         error = -EACCES;
 905         if (!(flags & MS_RDONLY) && dev && is_read_only(dev))
 906                 goto out;
 907
 908         /*
 909          * Do the lookup first to force automounting.
 910          */
 911         dir_d = namei(dir_name);
 912         error = PTR_ERR(dir_d);
 913         if (IS_ERR(dir_d))
 914                 goto out;
 915
 916         down(&mount_sem);
 917         error = -ENOTDIR;
 918         if (!S_ISDIR(dir_d->d_inode->i_mode))
 919                 goto dput_and_out;
 920
 921         error = -EBUSY;
 922         if (dir_d->d_covers != dir_d)
 923                 goto dput_and_out;
 924
 925         error = -EINVAL;
 926         if (!dev)
 927                 goto dput_and_out;
 928         check_disk_change(dev);
 929         sb = get_super(dev);
 930         if (sb) {
 931                 /* Already mounted */
 932                 error = -EBUSY;
 933                 goto dput_and_out;
 934         }
 935
 936         fs_type = get_fs_type(type);
 937         if (!fs_type) {
 938                 printk("VFS: on device %s: get_fs_type(%s) failed\n",
 939                        kdevname(dev), type);
 940                 goto dput_and_out;
 941         }
 942
 943         sb = read_super(dev, bdev, fs_type, flags, data, 0);
 944         if (!sb)
 945                 goto fsput_and_out;
 946
 947         /*
 948          * We may have slept while reading the super block,
 949          * so we check afterwards whether it's safe to mount.
 950          */
 951         error = -EBUSY;
 952         if (!fs_may_mount(dev))
 953                 goto bdput_and_out;
 954
 955         error = -ENOMEM;
 956         vfsmnt = add_vfsmnt(sb, dev_name, dir_name);
 957         if (vfsmnt) {
 958                 d_mount(dget(dir_d), sb->s_root);
 959                 dput(dir_d);
 960                 up(&mount_sem);
 961                 return 0;
 962         }
 963
 964 bdput_and_out:
 965         /* FIXME: ->put_super() is needed here */
 966         sb->s_bdev = NULL;
 967         sb->s_dev = 0;
 968         sb->s_type = NULL;
 969         if (bdev)
 970                 bdput(bdev);
 971 fsput_and_out:
 972         put_filesystem(fs_type);
 973 dput_and_out:
 974         dput(dir_d);
 975         up(&mount_sem);
 976 out:
 977         if (bdev)
 978                 blkdev_put(bdev, BDEV_FS);
 979         else
 980                 put_unnamed_dev(dev);
 981         return error;
 982 }
 983
 984
 985 /*
 986  * Alters the mount flags of a mounted file system. Only the mount point
 987  * is used as a reference - file system type and the device are ignored.
 988  */
 989
 990 static int do_remount_sb(struct super_block *sb, int flags, char *data)
 991 {
 992         int retval;
 993         struct vfsmount *vfsmnt;
 994
 995         if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
 996                 return -EACCES;
 997                 /*flags |= MS_RDONLY;*/
 998         /* If we are remounting RDONLY, make sure there are no rw files open */
 999         if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
1000                 if (!fs_may_remount_ro(sb))
1001                         return -EBUSY;
1002         if (sb->s_op && sb->s_op->remount_fs) {
1003                 lock_super(sb);
1004                 retval = sb->s_op->remount_fs(sb, &flags, data);
1005                 unlock_super(sb);
1006                 if (retval)
1007                         return retval;
1008         }
1009         sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
1010         vfsmnt = lookup_vfsmnt(sb->s_dev);
1011         if (vfsmnt)
1012                 vfsmnt->mnt_flags = sb->s_flags;
1013
1014         /*
1015          * Invalidate the inodes, as some mount options may be changed.
1016          * N.B. If we are changing media, we should check the return
1017          * from invalidate_inodes ... can't allow _any_ open files.
1018          */
1019         invalidate_inodes(sb);
1020
1021         return 0;
1022 }
1023
1024 static int do_remount(const char *dir,int flags,char *data)
1025 {
1026         struct dentry *dentry;
1027         int retval;
1028
1029         dentry = namei(dir);
1030         retval = PTR_ERR(dentry);
1031         if (!IS_ERR(dentry)) {
1032                 struct super_block * sb = dentry->d_inode->i_sb;
1033
1034                 retval = -ENODEV;
1035                 if (sb) {
1036                         retval = -EINVAL;
1037                         if (dentry == sb->s_root) {
1038                                 /*
1039                                  * Shrink the dcache and sync the device.
1040                                  */
1041                                 shrink_dcache_sb(sb);
1042                                 fsync_dev(sb->s_dev);
1043                                 if (flags & MS_RDONLY)
1044                                         acct_auto_close(sb->s_dev);
1045                                 retval = do_remount_sb(sb, flags, data);
1046                         }
1047                 }
1048                 dput(dentry);
1049         }
1050         return retval;
1051 }
1052
1053 static int copy_mount_options (const void * data, unsigned long *where)
1054 {
1055         int i;
1056         unsigned long page;
1057         struct vm_area_struct * vma;
1058
1059         *where = 0;
1060         if (!data)
1061                 return 0;
1062
1063         vma = find_vma(current->mm, (unsigned long) data);
1064         if (!vma || (unsigned long) data < vma->vm_start)
1065                 return -EFAULT;
1066         if (!(vma->vm_flags & VM_READ))
1067                 return -EFAULT;
1068         i = vma->vm_end - (unsigned long) data;
1069         if (PAGE_SIZE <= (unsigned long) i)
1070                 i = PAGE_SIZE-1;
1071         if (!(page = __get_free_page(GFP_KERNEL))) {
1072                 return -ENOMEM;
1073         }
1074         if (copy_from_user((void *) page,data,i)) {
1075                 free_page(page);
1076                 return -EFAULT;
1077         }
1078         *where = page;
1079         return 0;
1080 }
1081
1082 /*
1083  * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1084  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1085  *
1086  * data is a (void *) that can point to any structure up to
1087  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1088  * information (or be NULL).
1089  *
1090  * NOTE! As old versions of mount() didn't use this setup, the flags
1091  * have to have a special 16-bit magic number in the high word:
1092  * 0xC0ED. If this magic word isn't present, the flags and data info
1093  * aren't used, as the syscall assumes we are talking to an older
1094  * version that didn't understand them.
1095  */
1096 long do_sys_mount(char * dev_name, char * dir_name, unsigned long type_page,
1097                   unsigned long new_flags, unsigned long data_page)
1098 {
1099         struct file_system_type * fstype;
1100         struct dentry * dentry = NULL;
1101         struct inode * inode = NULL;
1102         struct block_device *bdev = NULL;
1103         int retval;
1104         unsigned long flags = 0;
1105
1106         if (!capable(CAP_SYS_ADMIN))
1107                 return -EPERM;
1108
1109         if ((new_flags &
1110              (MS_MGC_MSK | MS_REMOUNT)) == (MS_MGC_VAL | MS_REMOUNT)) {
1111                 retval = do_remount(dir_name,
1112                                     new_flags & ~MS_MGC_MSK & ~MS_REMOUNT,
1113                                     (char *) data_page);
1114                 goto out;
1115         }
1116
1117         fstype = get_fs_type((char *) type_page);
1118         retval = -ENODEV;
1119         if (!fstype)
1120                 goto out;
1121
1122         if (fstype->fs_flags & FS_REQUIRES_DEV) {
1123                 struct block_device_operations *bdops;
1124
1125                 dentry = namei(dev_name);
1126                 retval = PTR_ERR(dentry);
1127                 if (IS_ERR(dentry))
1128                         goto fs_out;
1129
1130                 inode = dentry->d_inode;
1131                 retval = -ENOTBLK;
1132                 if (!S_ISBLK(inode->i_mode))
1133                         goto dput_and_out;
1134
1135                 retval = -EACCES;
1136                 if (IS_NODEV(inode))
1137                         goto dput_and_out;
1138
1139                 bdev = inode->i_bdev;
1140                 bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
1141                 if (bdops) bdev->bd_op = bdops;
1142         }
1143
1144         if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1145                 flags = new_flags & ~MS_MGC_MSK;
1146
1147         retval = do_mount(bdev, dev_name, dir_name, fstype->name, flags,
1148                                 (void *) data_page);
1149
1150 dput_and_out:
1151         dput(dentry);
1152 fs_out:
1153         put_filesystem(fstype);
1154 out:
1155         return retval;
1156 }
1157
1158 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1159                           unsigned long new_flags, void * data)
1160 {
1161         int retval;
1162         unsigned long data_page = 0;
1163         unsigned long type_page = 0;
1164
1165         lock_kernel();
1166         retval = copy_mount_options (type, &type_page);
1167         if (retval < 0)
1168                 goto out;
1169
1170         /* copy_mount_options allows a NULL user pointer,
1171          * and just returns zero in that case.  But if we
1172          * allow the type to be NULL we will crash.
1173          * Previously we did not check this case.
1174          */
1175         if (type_page == 0) {
1176                 retval = -EINVAL;
1177                 goto out;
1178         }
1179
1180         retval = copy_mount_options (data, &data_page);
1181         if (retval >= 0) {
1182                 retval = do_sys_mount(dev_name, dir_name, type_page,
1183                                       new_flags, data_page);
1184                 free_page(data_page);
1185         }
1186         free_page(type_page);
1187 out:
1188         unlock_kernel();
1189         return retval;
1190 }
1191
1192 void __init mount_root(void)
1193 {
1194         struct file_system_type * fs_type;
1195         struct super_block * sb;
1196         struct vfsmount *vfsmnt;
1197         struct block_device *bdev = NULL;
1198         mode_t mode;
1199         int retval;
1200         void *handle;
1201         char path[64];
1202         int path_start = -1;
1203
1204 #ifdef CONFIG_ROOT_NFS
1205         if (MAJOR(ROOT_DEV) == UNNAMED_MAJOR) {
1206                 ROOT_DEV = 0;
1207                 if ((fs_type = get_fs_type("nfs"))) {
1208                         sb = get_empty_super(); /* "can't fail" */
1209                         sb->s_dev = get_unnamed_dev();
1210                         sb->s_bdev = NULL;
1211                         sb->s_flags = root_mountflags;
1212                         sema_init(&sb->s_vfs_rename_sem,1);
1213                         sema_init(&sb->s_nfsd_free_path_sem,1);
1214                         vfsmnt = add_vfsmnt(sb, "/dev/root", "/");
1215                         if (vfsmnt) {
1216                                 if (nfs_root_mount(sb) >= 0) {
1217                                         sb->s_dirt = 0;
1218                                         sb->s_type = fs_type;
1219                                         current->fs->root = dget(sb->s_root);
1220                                         current->fs->pwd = dget(sb->s_root);
1221                                         ROOT_DEV = sb->s_dev;
1222                                         printk (KERN_NOTICE "VFS: Mounted root (NFS filesystem)%s.\n", (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1223                                         return;
1224                                 }
1225                                 remove_vfsmnt(sb->s_dev);
1226                         }
1227                         put_unnamed_dev(sb->s_dev);
1228                         sb->s_dev = 0;
1229                         put_filesystem(fs_type);
1230                 }
1231                 if (!ROOT_DEV) {
1232                         printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1233                         ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1234                 }
1235         }
1236 #endif
1237
1238 #ifdef CONFIG_BLK_DEV_FD
1239         if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1240 #ifdef CONFIG_BLK_DEV_RAM
1241                 extern int rd_doload;
1242                 extern void rd_load_secondary(void);
1243 #endif
1244                 floppy_eject();
1245 #ifndef CONFIG_BLK_DEV_RAM
1246                 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1247 #else
1248                 /* rd_doload is 2 for a dual initrd/ramload setup */
1249                 if(rd_doload==2)
1250                         rd_load_secondary();
1251                 else
1252 #endif
1253                 {
1254                         printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1255                         wait_for_keypress();
1256                 }
1257         }
1258 #endif
1259
1260         devfs_make_root (root_device_name);
1261         handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME, 0,
1262                                     MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1263                                     DEVFS_SPECIAL_BLK, 1);
1264         if (handle)  /*  Sigh: bd*() functions only paper over the cracks  */
1265         {
1266             unsigned major, minor;
1267
1268             devfs_get_maj_min (handle, &major, &minor);
1269             ROOT_DEV = MKDEV (major, minor);
1270         }
1271
1272         /*
1273          * Probably pure paranoia, but I'm less than happy about delving into
1274          * devfs crap and checking it right now. Later.
1275          */
1276         if (!ROOT_DEV)
1277                 panic("I have no root and I want to sream");
1278
1279         bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1280         if (!bdev)
1281                 panic(__FUNCTION__ ": unable to allocate root device");
1282         bdev->bd_op = devfs_get_ops (handle);
1283         path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1284         mode = FMODE_READ;
1285         if (!(root_mountflags & MS_RDONLY))
1286                 mode |= FMODE_WRITE;
1287         retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1288         if (retval == -EROFS) {
1289                 root_mountflags |= MS_RDONLY;
1290                 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1291         }
1292         if (retval) {
1293                 /*
1294                  * Allow the user to distinguish between failed open
1295                  * and bad superblock on root device.
1296                  */
1297                 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1298                         root_device_name, kdevname (ROOT_DEV));
1299                 printk ("Please append a correct \"root=\" boot option\n");
1300                 panic("VFS: Unable to mount root fs on %s",
1301                         kdevname(ROOT_DEV));
1302         }
1303
1304         check_disk_change(ROOT_DEV);
1305
1306         spin_lock(&file_systems_lock);
1307         for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1308                 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1309                         continue;
1310                 if (!try_inc_mod_count(fs_type->owner))
1311                         continue;
1312                 spin_unlock(&file_systems_lock);
1313                 sb = get_super(ROOT_DEV);
1314                 if (sb) {
1315                         /* Shouldn't we fail here? Oh, well... */
1316                         sb->s_bdev = bdev;
1317                         goto mount_it;
1318                 }
1319                 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1320                 if (sb)
1321                         goto mount_it;
1322                 spin_lock(&file_systems_lock);
1323                 put_filesystem(fs_type);
1324         }
1325         spin_unlock(&file_systems_lock);
1326         panic("VFS: Unable to mount root fs on %s",
1327                 kdevname(ROOT_DEV));
1328
1329 mount_it:
1330         sb->s_flags = root_mountflags;
1331         current->fs->root = dget(sb->s_root);
1332         current->fs->pwd = dget(sb->s_root);
1333         printk ("VFS: Mounted root (%s filesystem)%s.\n",
1334                 fs_type->name,
1335                 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1336         if (path_start >= 0) {
1337                 devfs_mk_symlink (NULL,
1338                                   "root", 0, DEVFS_FL_DEFAULT,
1339                                   path + 5 + path_start, 0,
1340                                   NULL, NULL);
1341                 memcpy (path + path_start, "/dev/", 5);
1342                 vfsmnt = add_vfsmnt (sb, path + path_start,
1343                                      "/");
1344         }
1345         else vfsmnt = add_vfsmnt (sb, "/dev/root", "/");
1346         if (vfsmnt) {
1347                 bdput(bdev); /* sb holds a reference */
1348                 return;
1349         }
1350         panic("VFS: add_vfsmnt failed for root fs");
1351 }
1352
1353
1354 static void chroot_fs_refs(struct dentry *old_root,
1355     struct dentry *new_root)
1356 {
1357         struct task_struct *p;
1358
1359         read_lock(&tasklist_lock);
1360         for_each_task(p) {
1361                 if (!p->fs) continue;
1362                 if (p->fs->root == old_root) {
1363                         dput(old_root);
1364                         p->fs->root = dget(new_root);
1365                         printk(KERN_DEBUG "chroot_fs_refs: changed root of "
1366                             "process %d\n",p->pid);
1367                 }
1368                 if (p->fs->pwd == old_root) {
1369                         dput(old_root);
1370                         p->fs->pwd = dget(new_root);
1371                         printk(KERN_DEBUG "chroot_fs_refs: changed cwd of "
1372                             "process %d\n",p->pid);
1373                 }
1374         }
1375         read_unlock(&tasklist_lock);
1376 }
1377
1378
1379 /*
1380  * Moves the current root to put_root, and sets root/cwd of all processes
1381  * which had them on the old root to new_root.
1382  *
1383  * Note:
1384  *  - we don't move root/cwd if they are not at the root (reason: if something
1385  *    cared enough to change them, it's probably wrong to force them elsewhere)
1386  *  - it's okay to pick a root that isn't the root of a file system, e.g.
1387  *    /nfs/my_root where /nfs is the mount point. Better avoid creating
1388  *    unreachable mount points this way, though.
1389  */
1390
1391 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1392 {
1393         struct dentry *root = current->fs->root;
1394         struct dentry *d_new_root, *d_put_old, *covered;
1395         struct dentry *root_dev_root, *new_root_dev_root;
1396         struct dentry *walk, *next;
1397         int error;
1398
1399         if (!capable(CAP_SYS_ADMIN))
1400                 return -EPERM;
1401
1402         lock_kernel();
1403         d_new_root = namei(new_root);
1404         if (IS_ERR(d_new_root)) {
1405                 error = PTR_ERR(d_new_root);
1406                 goto out0;
1407         }
1408         d_put_old = namei(put_old);
1409         if (IS_ERR(d_put_old)) {
1410                 error = PTR_ERR(d_put_old);
1411                 goto out1;
1412         }
1413         down(&mount_sem);
1414         if (!d_new_root->d_inode || !d_put_old->d_inode) {
1415                 error = -ENOENT;
1416                 goto out2;
1417         }
1418         if (!S_ISDIR(d_new_root->d_inode->i_mode) ||
1419             !S_ISDIR(d_put_old->d_inode->i_mode)) {
1420                 error = -ENOTDIR;
1421                 goto out2;
1422         }
1423         error = -EBUSY;
1424         if (d_new_root->d_sb == root->d_sb || d_put_old->d_sb == root->d_sb)
1425                 goto out2; /* loop */
1426         if (d_put_old != d_put_old->d_covers)
1427                 goto out2; /* mount point is busy */
1428         error = -EINVAL;
1429         walk = d_put_old; /* make sure we can reach put_old from new_root */
1430         for (;;) {
1431                 next = walk->d_covers->d_parent;
1432                 if (next == walk)
1433                         goto out2;
1434                 if (next == d_new_root)
1435                         break;
1436                 walk = next;
1437         }
1438
1439         new_root_dev_root = d_new_root->d_sb->s_root;
1440         covered = new_root_dev_root->d_covers;
1441         new_root_dev_root->d_covers = new_root_dev_root;
1442         dput(covered);
1443         covered->d_mounts = covered;
1444
1445         root_dev_root = root->d_sb->s_root;
1446         root_dev_root->d_covers = dget(d_put_old);
1447         d_put_old->d_mounts = root_dev_root;
1448         chroot_fs_refs(root,d_new_root);
1449         error = 0;
1450 out2:
1451         up(&mount_sem);
1452         dput(d_put_old);
1453 out1:
1454         dput(d_new_root);
1455 out0:
1456         unlock_kernel();
1457         return error;
1458 }
1459
1460
1461 #ifdef CONFIG_BLK_DEV_INITRD
1462
1463 int __init change_root(kdev_t new_root_dev,const char *put_old)
1464 {
1465         kdev_t old_root_dev;
1466         struct vfsmount *vfsmnt;
1467         struct dentry *old_root,*old_pwd,*dir_d = NULL;
1468         int error;
1469
1470         old_root = current->fs->root;
1471         old_pwd = current->fs->pwd;
1472         old_root_dev = ROOT_DEV;
1473         if (!fs_may_mount(new_root_dev)) {
1474                 printk(KERN_CRIT "New root is busy. Staying in initrd.\n");
1475                 return -EBUSY;
1476         }
1477         /*  First unmount devfs if mounted  */
1478         dir_d = lookup_dentry ("/dev", NULL, 1);
1479         if (!IS_ERR(dir_d)) {
1480                 struct super_block *sb = dir_d->d_inode->i_sb;
1481
1482                 if (sb && (dir_d->d_inode == sb->s_root->d_inode) &&
1483                     (sb->s_magic == DEVFS_SUPER_MAGIC)) {
1484                         dput (dir_d);
1485                         do_umount (sb->s_dev, 0, 0);
1486                 }
1487                 else dput (dir_d);
1488         }
1489         ROOT_DEV = new_root_dev;
1490         mount_root();
1491         dput(old_root);
1492         dput(old_pwd);
1493 #if 1
1494         shrink_dcache();
1495         printk("change_root: old root has d_count=%d\n", old_root->d_count);
1496 #endif
1497         mount_devfs_fs ();
1498         /*
1499          * Get the new mount directory
1500          */
1501         dir_d = lookup_dentry(put_old, NULL, 1);
1502         if (IS_ERR(dir_d)) {
1503                 error = PTR_ERR(dir_d);
1504         } else if (!dir_d->d_inode) {
1505                 dput(dir_d);
1506                 error = -ENOENT;
1507         } else {
1508                 error = 0;
1509         }
1510         if (!error && dir_d->d_covers != dir_d) {
1511                 dput(dir_d);
1512                 error = -EBUSY;
1513         }
1514         if (!error && !S_ISDIR(dir_d->d_inode->i_mode)) {
1515                 dput(dir_d);
1516                 error = -ENOTDIR;
1517         }
1518         if (error) {
1519                 struct block_device *bdev;
1520
1521                 printk(KERN_NOTICE "Trying to unmount old root ... ");
1522                 bdev = do_umount(old_root_dev,1, 0);
1523                 if (!IS_ERR(bdev)) {
1524                         printk("okay\n");
1525                         /* special: the old device driver is going to be
1526                            a ramdisk and the point of this call is to free its
1527                            protected memory (even if dirty). */
1528                         destroy_buffers(old_root_dev);
1529                         if (bdev) {
1530                                 blkdev_put(bdev, BDEV_FS);
1531                                 bdput(bdev);
1532                         }
1533                         return 0;
1534                 }
1535                 printk(KERN_ERR "error %ld\n",PTR_ERR(bdev));
1536                 return error;
1537         }
1538         remove_vfsmnt(old_root_dev);
1539         vfsmnt = add_vfsmnt(old_root->d_sb, "/dev/root.old", put_old);
1540         if (vfsmnt) {
1541                 d_mount(dir_d,old_root);
1542                 return 0;
1543         }
1544         printk(KERN_CRIT "Trouble: add_vfsmnt failed\n");
1545         return -ENOMEM;
1546 }
1547
1548 #endif