fs/super.c

   1 /*
   2  *  linux/fs/super.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  super.c contains code to handle: - mount structures
   7  *                                   - super-block tables
   8  *                                   - filesystem drivers list
   9  *                                   - mount system call
  10  *                                   - umount system call
  11  *                                   - ustat system call
  12  *
  13  *  Added options to /proc/mounts
  14  *  Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
  15  *
  16  * GK 2/5/95  -  Changed to support mounting the root fs via NFS
  17  *
  18  *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
  19  *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
  20  *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
  21  */
  22
  23 #include <linux/config.h>
  24 #include <linux/string.h>
  25 #include <linux/malloc.h>
  26 #include <linux/locks.h>
  27 #include <linux/smp_lock.h>
  28 #include <linux/devfs_fs_kernel.h>
  29 #include <linux/fd.h>
  30 #include <linux/init.h>
  31 #include <linux/quotaops.h>
  32 #include <linux/acct.h>
  33
  34 #include <asm/uaccess.h>
  35
  36 #include <linux/nfs_fs.h>
  37 #include <linux/nfs_fs_sb.h>
  38 #include <linux/nfs_mount.h>
  39
  40 #include <linux/kmod.h>
  41 #define __NO_VERSION__
  42 #include <linux/module.h>
  43
  44 /*
  45  * We use a semaphore to synchronize all mount/umount
  46  * activity - imagine the mess if we have a race between
  47  * unmounting a filesystem and re-mounting it (or something
  48  * else).
  49  */
  50 static DECLARE_MUTEX(mount_sem);
  51
  52 extern void wait_for_keypress(void);
  53
  54 extern int root_mountflags;
  55
  56 static int do_remount_sb(struct super_block *sb, int flags, char * data);
  57
  58 /* this is initialized in init/main.c */
  59 kdev_t ROOT_DEV;
  60
  61 int nr_super_blocks = 0;
  62 int max_super_blocks = NR_SUPER;
  63 LIST_HEAD(super_blocks);
  64
  65 /*
  66  * Handling of filesystem drivers list.
  67  * Rules:
  68  *      Inclusion to/removals from/scanning of list are protected by spinlock.
  69  *      During the unload module must call unregister_filesystem().
  70  *      We can access the fields of list element if:
  71  *              1) spinlock is held or
  72  *              2) we hold the reference to the module.
  73  *      The latter can be guaranteed by call of try_inc_mod_count(); if it
  74  *      returned 0 we must skip the element, otherwise we got the reference.
  75  *      Once the reference is obtained we can drop the spinlock.
  76  */
  77
  78 static struct file_system_type *file_systems = NULL;
  79 static rwlock_t file_systems_lock = RW_LOCK_UNLOCKED;
  80
  81 /* WARNING: This can be used only if we _already_ own a reference */
  82 static void get_filesystem(struct file_system_type *fs)
  83 {
  84         if (fs->owner)
  85                 __MOD_INC_USE_COUNT(fs->owner);
  86 }
  87
  88 static void put_filesystem(struct file_system_type *fs)
  89 {
  90         if (fs->owner)
  91                 __MOD_DEC_USE_COUNT(fs->owner);
  92 }
  93
  94 static struct file_system_type **find_filesystem(const char *name)
  95 {
  96         struct file_system_type **p;
  97         for (p=&file_systems; *p; p=&(*p)->next)
  98                 if (strcmp((*p)->name,name) == 0)
  99                         break;
 100         return p;
 101 }
 102
 103 /**
 104  *      register_filesystem - register a new filesystem
 105  *      @fs: the file system structure
 106  *
 107  *      Adds the file system passed to the list of file systems the kernel
 108  *      is aware of for mount and other syscalls. Returns 0 on success,
 109  *      or a negative errno code on an error.
 110  *
 111  *      The &struct file_system_type that is passed is linked into the kernel
 112  *      structures and must not be freed until the file system has been
 113  *      unregistered.
 114  */
 115
 116 int register_filesystem(struct file_system_type * fs)
 117 {
 118         int res = 0;
 119         struct file_system_type ** p;
 120
 121         if (!fs)
 122                 return -EINVAL;
 123         if (fs->next)
 124                 return -EBUSY;
 125         write_lock(&file_systems_lock);
 126         p = find_filesystem(fs->name);
 127         if (*p)
 128                 res = -EBUSY;
 129         else
 130                 *p = fs;
 131         write_unlock(&file_systems_lock);
 132         return res;
 133 }
 134
 135 /**
 136  *      unregister_filesystem - unregister a file system
 137  *      @fs: filesystem to unregister
 138  *
 139  *      Remove a file system that was previously successfully registered
 140  *      with the kernel. An error is returned if the file system is not found.
 141  *      Zero is returned on a success.
 142  *
 143  *      Once this function has returned the &struct file_system_type structure
 144  *      may be freed or reused.
 145  */
 146
 147 int unregister_filesystem(struct file_system_type * fs)
 148 {
 149         struct file_system_type ** tmp;
 150
 151         write_lock(&file_systems_lock);
 152         tmp = &file_systems;
 153         while (*tmp) {
 154                 if (fs == *tmp) {
 155                         *tmp = fs->next;
 156                         fs->next = NULL;
 157                         write_unlock(&file_systems_lock);
 158                         return 0;
 159                 }
 160                 tmp = &(*tmp)->next;
 161         }
 162         write_unlock(&file_systems_lock);
 163         return -EINVAL;
 164 }
 165
 166 static int fs_index(const char * __name)
 167 {
 168         struct file_system_type * tmp;
 169         char * name;
 170         int err, index;
 171
 172         name = getname(__name);
 173         err = PTR_ERR(name);
 174         if (IS_ERR(name))
 175                 return err;
 176
 177         err = -EINVAL;
 178         read_lock(&file_systems_lock);
 179         for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
 180                 if (strcmp(tmp->name,name) == 0) {
 181                         err = index;
 182                         break;
 183                 }
 184         }
 185         read_unlock(&file_systems_lock);
 186         putname(name);
 187         return err;
 188 }
 189
 190 static int fs_name(unsigned int index, char * buf)
 191 {
 192         struct file_system_type * tmp;
 193         int len, res;
 194
 195         read_lock(&file_systems_lock);
 196         for (tmp = file_systems; tmp; tmp = tmp->next, index--)
 197                 if (index <= 0 && try_inc_mod_count(tmp->owner))
 198                                 break;
 199         read_unlock(&file_systems_lock);
 200         if (!tmp)
 201                 return -EINVAL;
 202
 203         /* OK, we got the reference, so we can safely block */
 204         len = strlen(tmp->name) + 1;
 205         res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
 206         put_filesystem(tmp);
 207         return res;
 208 }
 209
 210 static int fs_maxindex(void)
 211 {
 212         struct file_system_type * tmp;
 213         int index;
 214
 215         read_lock(&file_systems_lock);
 216         for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
 217                 ;
 218         read_unlock(&file_systems_lock);
 219         return index;
 220 }
 221
 222 /*
 223  * Whee.. Weird sysv syscall.
 224  */
 225 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
 226 {
 227         int retval = -EINVAL;
 228
 229         switch (option) {
 230                 case 1:
 231                         retval = fs_index((const char *) arg1);
 232                         break;
 233
 234                 case 2:
 235                         retval = fs_name(arg1, (char *) arg2);
 236                         break;
 237
 238                 case 3:
 239                         retval = fs_maxindex();
 240                         break;
 241         }
 242         return retval;
 243 }
 244
 245 int get_filesystem_list(char * buf)
 246 {
 247         int len = 0;
 248         struct file_system_type * tmp;
 249
 250         read_lock(&file_systems_lock);
 251         tmp = file_systems;
 252         while (tmp && len < PAGE_SIZE - 80) {
 253                 len += sprintf(buf+len, "%s\t%s\n",
 254                         (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 255                         tmp->name);
 256                 tmp = tmp->next;
 257         }
 258         read_unlock(&file_systems_lock);
 259         return len;
 260 }
 261
 262 static struct file_system_type *get_fs_type(const char *name)
 263 {
 264         struct file_system_type *fs;
 265
 266         read_lock(&file_systems_lock);
 267         fs = *(find_filesystem(name));
 268         if (fs && !try_inc_mod_count(fs->owner))
 269                 fs = NULL;
 270         read_unlock(&file_systems_lock);
 271         if (!fs && (request_module(name) == 0)) {
 272                 read_lock(&file_systems_lock);
 273                 fs = *(find_filesystem(name));
 274                 if (fs && !try_inc_mod_count(fs->owner))
 275                         fs = NULL;
 276                 read_unlock(&file_systems_lock);
 277         }
 278         return fs;
 279 }
 280
 281 static LIST_HEAD(vfsmntlist);
 282
 283 static struct vfsmount *add_vfsmnt(struct super_block *sb,
 284                                 struct dentry *mountpoint,
 285                                 struct dentry *root,
 286                                 struct vfsmount *parent,
 287                                 const char *dev_name,
 288                                 const char *dir_name)
 289 {
 290         struct vfsmount *mnt;
 291         char *name;
 292
 293         mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
 294         if (!mnt)
 295                 goto out;
 296         memset(mnt, 0, sizeof(struct vfsmount));
 297
 298         atomic_set(&mnt->mnt_count,1);
 299         mnt->mnt_sb = sb;
 300         mnt->mnt_dev = sb->s_dev;
 301         mnt->mnt_mountpoint = dget(mountpoint);
 302         mnt->mnt_root = dget(root);
 303         mnt->mnt_parent = parent ? mntget(parent) : mnt;
 304
 305         /* N.B. Is it really OK to have a vfsmount without names? */
 306         if (dev_name) {
 307                 name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
 308                 if (name) {
 309                         strcpy(name, dev_name);
 310                         mnt->mnt_devname = name;
 311                 }
 312         }
 313         name = kmalloc(strlen(dir_name)+1, GFP_KERNEL);
 314         if (name) {
 315                 strcpy(name, dir_name);
 316                 mnt->mnt_dirname = name;
 317         }
 318
 319         list_add(&mnt->mnt_instances, &sb->s_mounts);
 320         list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
 321         list_add(&mnt->mnt_list, vfsmntlist.prev);
 322         mountpoint->d_mounts = root;
 323         root->d_covers = mountpoint;
 324 out:
 325         return mnt;
 326 }
 327
 328 static void move_vfsmnt(struct vfsmount *mnt,
 329                         struct dentry *mountpoint,
 330                         struct vfsmount *parent,
 331                         const char *dev_name,
 332                         const char *dir_name)
 333 {
 334         struct dentry *old_mountpoint = mnt->mnt_mountpoint;
 335         struct vfsmount *old_parent = mnt->mnt_parent;
 336         char *new_devname = NULL, *new_dirname = NULL;
 337
 338         if (dev_name) {
 339                 new_devname = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
 340                 if (new_devname)
 341                         strcpy(new_devname, dev_name);
 342         }
 343         if (dir_name) {
 344                 new_dirname = kmalloc(strlen(dir_name)+1, GFP_KERNEL);
 345                 if (new_dirname)
 346                         strcpy(new_dirname, dir_name);
 347         }
 348
 349         /* flip names */
 350         if (new_dirname) {
 351                 kfree(mnt->mnt_dirname);
 352                 mnt->mnt_dirname = new_dirname;
 353         }
 354         if (new_devname) {
 355                 kfree(mnt->mnt_devname);
 356                 mnt->mnt_devname = new_devname;
 357         }
 358
 359         /* flip the linkage */
 360         mnt->mnt_mountpoint = dget(mountpoint);
 361         mnt->mnt_parent = parent ? mntget(parent) : mnt;
 362         list_del(&mnt->mnt_clash);
 363         list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
 364
 365         /* put the old stuff */
 366         old_mountpoint->d_mounts = old_mountpoint;
 367         mountpoint->d_mounts = mnt->mnt_sb->s_root;
 368         mnt->mnt_sb->s_root->d_covers = mountpoint;
 369         dput(old_mountpoint);
 370         if (old_parent != mnt)
 371                 mntput(old_parent);
 372 }
 373
 374 static void remove_vfsmnt(struct vfsmount *mnt)
 375 {
 376         struct dentry * root = mnt->mnt_sb->s_root;
 377         struct dentry * covered = mnt->mnt_mountpoint;
 378         /* First of all, remove it from all lists */
 379         list_del(&mnt->mnt_instances);
 380         list_del(&mnt->mnt_clash);
 381         list_del(&mnt->mnt_list);
 382         /* Now we can work safely */
 383         if (mnt->mnt_parent != mnt)
 384                 mntput(mnt->mnt_parent);
 385
 386         root->d_covers = root;
 387         covered->d_mounts = covered;
 388
 389         dput(mnt->mnt_mountpoint);
 390         dput(mnt->mnt_root);
 391         kfree(mnt->mnt_devname);
 392         kfree(mnt->mnt_dirname);
 393         kfree(mnt);
 394 }
 395
 396 static struct proc_fs_info {
 397         int flag;
 398         char *str;
 399 } fs_info[] = {
 400         { MS_NOEXEC, ",noexec" },
 401         { MS_NOSUID, ",nosuid" },
 402         { MS_NODEV, ",nodev" },
 403         { MS_SYNCHRONOUS, ",sync" },
 404         { MS_MANDLOCK, ",mand" },
 405         { MS_NOATIME, ",noatime" },
 406         { MS_NODIRATIME, ",nodiratime" },
 407 #ifdef MS_NOSUB                 /* Can't find this except in mount.c */
 408         { MS_NOSUB, ",nosub" },
 409 #endif
 410         { 0, NULL }
 411 };
 412
 413 static struct proc_nfs_info {
 414         int flag;
 415         char *str;
 416         char *nostr;
 417 } nfs_info[] = {
 418         { NFS_MOUNT_SOFT, ",soft", ",hard" },
 419         { NFS_MOUNT_INTR, ",intr", "" },
 420         { NFS_MOUNT_POSIX, ",posix", "" },
 421         { NFS_MOUNT_TCP, ",tcp", ",udp" },
 422         { NFS_MOUNT_NOCTO, ",nocto", "" },
 423         { NFS_MOUNT_NOAC, ",noac", "" },
 424         { NFS_MOUNT_NONLM, ",nolock", ",lock" },
 425         { 0, NULL, NULL }
 426 };
 427
 428 int get_filesystem_info( char *buf )
 429 {
 430         struct list_head *p;
 431         struct proc_fs_info *fs_infop;
 432         struct proc_nfs_info *nfs_infop;
 433         struct nfs_server *nfss;
 434         int len = 0;
 435         char *path,*buffer = (char *) __get_free_page(GFP_KERNEL);
 436
 437         if (!buffer) return 0;
 438         for (p = vfsmntlist.next; p!=&vfsmntlist && len < PAGE_SIZE - 160;
 439             p = p->next) {
 440                 struct vfsmount *tmp = list_entry(p, struct vfsmount, mnt_list);
 441                 path = d_path(tmp->mnt_root, tmp, buffer, PAGE_SIZE);
 442                 if (!path)
 443                         continue;
 444                 len += sprintf( buf + len, "%s %s %s %s",
 445                         tmp->mnt_devname, path,
 446                         tmp->mnt_sb->s_type->name,
 447                         tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw" );
 448                 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 449                   if (tmp->mnt_sb->s_flags & fs_infop->flag) {
 450                     strcpy(buf + len, fs_infop->str);
 451                     len += strlen(fs_infop->str);
 452                   }
 453                 }
 454                 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
 455                         nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
 456                         len += sprintf(buf+len, ",v%d", nfss->rpc_ops->version);
 457
 458                         len += sprintf(buf+len, ",rsize=%d", nfss->rsize);
 459
 460                         len += sprintf(buf+len, ",wsize=%d", nfss->wsize);
 461 #if 0
 462                         if (nfss->timeo != 7*HZ/10) {
 463                                 len += sprintf(buf+len, ",timeo=%d",
 464                                                nfss->timeo*10/HZ);
 465                         }
 466                         if (nfss->retrans != 3) {
 467                                 len += sprintf(buf+len, ",retrans=%d",
 468                                                nfss->retrans);
 469                         }
 470 #endif
 471                         if (nfss->acregmin != 3*HZ) {
 472                                 len += sprintf(buf+len, ",acregmin=%d",
 473                                                nfss->acregmin/HZ);
 474                         }
 475                         if (nfss->acregmax != 60*HZ) {
 476                                 len += sprintf(buf+len, ",acregmax=%d",
 477                                                nfss->acregmax/HZ);
 478                         }
 479                         if (nfss->acdirmin != 30*HZ) {
 480                                 len += sprintf(buf+len, ",acdirmin=%d",
 481                                                nfss->acdirmin/HZ);
 482                         }
 483                         if (nfss->acdirmax != 60*HZ) {
 484                                 len += sprintf(buf+len, ",acdirmax=%d",
 485                                                nfss->acdirmax/HZ);
 486                         }
 487                         for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 488                                 char *str;
 489                                 if (nfss->flags & nfs_infop->flag)
 490                                         str = nfs_infop->str;
 491                                 else
 492                                         str = nfs_infop->nostr;
 493                                 strcpy(buf + len, str);
 494                                 len += strlen(str);
 495                         }
 496                         len += sprintf(buf+len, ",addr=%s",
 497                                        nfss->hostname);
 498                 }
 499                 len += sprintf( buf + len, " 0 0\n" );
 500         }
 501
 502         free_page((unsigned long) buffer);
 503         return len;
 504 }
 505
 506 /**
 507  *      __wait_on_super - wait on a superblock
 508  *      @sb: superblock to wait on
 509  *
 510  *      Waits for a superblock to become unlocked and then returns. It does
 511  *      not take the lock. This is an internal function. See wait_on_super().
 512  */
 513
 514 void __wait_on_super(struct super_block * sb)
 515 {
 516         DECLARE_WAITQUEUE(wait, current);
 517
 518         add_wait_queue(&sb->s_wait, &wait);
 519 repeat:
 520         set_current_state(TASK_UNINTERRUPTIBLE);
 521         if (sb->s_lock) {
 522                 schedule();
 523                 goto repeat;
 524         }
 525         remove_wait_queue(&sb->s_wait, &wait);
 526         current->state = TASK_RUNNING;
 527 }
 528
 529 /*
 530  * Note: check the dirty flag before waiting, so we don't
 531  * hold up the sync while mounting a device. (The newly
 532  * mounted device won't need syncing.)
 533  */
 534 void sync_supers(kdev_t dev)
 535 {
 536         struct super_block * sb;
 537
 538         for (sb = sb_entry(super_blocks.next);
 539              sb != sb_entry(&super_blocks);
 540              sb = sb_entry(sb->s_list.next)) {
 541                 if (!sb->s_dev)
 542                         continue;
 543                 if (dev && sb->s_dev != dev)
 544                         continue;
 545                 if (!sb->s_dirt)
 546                         continue;
 547                 lock_super(sb);
 548                 if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev))
 549                         if (sb->s_op && sb->s_op->write_super)
 550                                 sb->s_op->write_super(sb);
 551                 unlock_super(sb);
 552         }
 553 }
 554
 555 /**
 556  *      get_super       -       get the superblock of a device
 557  *      @dev: device to get the superblock for
 558  *
 559  *      Scans the superblock list and finds the superblock of the file system
 560  *      mounted on the device given. %NULL is returned if no match is found.
 561  */
 562
 563 struct super_block * get_super(kdev_t dev)
 564 {
 565         struct super_block * s;
 566
 567         if (!dev)
 568                 return NULL;
 569 restart:
 570         s = sb_entry(super_blocks.next);
 571         while (s != sb_entry(&super_blocks))
 572                 if (s->s_dev == dev) {
 573                         wait_on_super(s);
 574                         if (s->s_dev == dev)
 575                                 return s;
 576                         goto restart;
 577                 } else
 578                         s = sb_entry(s->s_list.next);
 579         return NULL;
 580 }
 581
 582 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
 583 {
 584         struct super_block *s;
 585         struct ustat tmp;
 586         struct statfs sbuf;
 587         int err = -EINVAL;
 588
 589         lock_kernel();
 590         s = get_super(to_kdev_t(dev));
 591         if (s == NULL)
 592                 goto out;
 593         err = vfs_statfs(s, &sbuf);
 594         if (err)
 595                 goto out;
 596
 597         memset(&tmp,0,sizeof(struct ustat));
 598         tmp.f_tfree = sbuf.f_bfree;
 599         tmp.f_tinode = sbuf.f_ffree;
 600
 601         err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
 602 out:
 603         unlock_kernel();
 604         return err;
 605 }
 606
 607 /**
 608  *      get_empty_super -       find empty superblocks
 609  *
 610  *      Find a superblock with no device assigned. A free superblock is
 611  *      found and returned. If neccessary new superblocks are allocated.
 612  *      %NULL is returned if there are insufficient resources to complete
 613  *      the request.
 614  */
 615
 616 struct super_block *get_empty_super(void)
 617 {
 618         struct super_block *s;
 619
 620         for (s  = sb_entry(super_blocks.next);
 621              s != sb_entry(&super_blocks);
 622              s  = sb_entry(s->s_list.next)) {
 623                 if (s->s_dev)
 624                         continue;
 625                 if (!s->s_lock)
 626                         return s;
 627                 printk("VFS: empty superblock %p locked!\n", s);
 628         }
 629         /* Need a new one... */
 630         if (nr_super_blocks >= max_super_blocks)
 631                 return NULL;
 632         s = kmalloc(sizeof(struct super_block),  GFP_USER);
 633         if (s) {
 634                 nr_super_blocks++;
 635                 memset(s, 0, sizeof(struct super_block));
 636                 INIT_LIST_HEAD(&s->s_dirty);
 637                 list_add (&s->s_list, super_blocks.prev);
 638                 init_waitqueue_head(&s->s_wait);
 639                 INIT_LIST_HEAD(&s->s_files);
 640                 INIT_LIST_HEAD(&s->s_mounts);
 641         }
 642         return s;
 643 }
 644
 645 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
 646                                        struct file_system_type *type, int flags,
 647                                        void *data, int silent)
 648 {
 649         struct super_block * s;
 650         s = get_empty_super();
 651         if (!s)
 652                 goto out;
 653         s->s_dev = dev;
 654         s->s_bdev = bdev;
 655         s->s_flags = flags;
 656         s->s_dirt = 0;
 657         sema_init(&s->s_vfs_rename_sem,1);
 658         sema_init(&s->s_nfsd_free_path_sem,1);
 659         s->s_type = type;
 660         sema_init(&s->s_dquot.dqio_sem, 1);
 661         sema_init(&s->s_dquot.dqoff_sem, 1);
 662         s->s_dquot.flags = 0;
 663         lock_super(s);
 664         if (!type->read_super(s, data, silent))
 665                 goto out_fail;
 666         unlock_super(s);
 667         /* tell bdcache that we are going to keep this one */
 668         if (bdev)
 669                 atomic_inc(&bdev->bd_count);
 670 out:
 671         return s;
 672
 673 out_fail:
 674         s->s_dev = 0;
 675         s->s_bdev = 0;
 676         s->s_type = NULL;
 677         unlock_super(s);
 678         return NULL;
 679 }
 680
 681 /*
 682  * Unnamed block devices are dummy devices used by virtual
 683  * filesystems which don't use real block-devices.  -- jrs
 684  */
 685
 686 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))] = { 0, };
 687
 688 kdev_t get_unnamed_dev(void)
 689 {
 690         int i;
 691
 692         for (i = 1; i < 256; i++) {
 693                 if (!test_and_set_bit(i,unnamed_dev_in_use))
 694                         return MKDEV(UNNAMED_MAJOR, i);
 695         }
 696         return 0;
 697 }
 698
 699 void put_unnamed_dev(kdev_t dev)
 700 {
 701         if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
 702                 return;
 703         if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
 704                 return;
 705         printk("VFS: put_unnamed_dev: freeing unused device %s\n",
 706                         kdevname(dev));
 707 }
 708
 709 static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 710         char *dev_name, int flags, void * data)
 711 {
 712         struct dentry *dentry;
 713         struct inode *inode;
 714         struct block_device *bdev;
 715         struct block_device_operations *bdops;
 716         struct super_block * sb;
 717         kdev_t dev;
 718         int error;
 719         /* What device it is? */
 720         if (!dev_name || !*dev_name)
 721                 return ERR_PTR(-EINVAL);
 722         dentry = lookup_dentry(dev_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE);
 723         if (IS_ERR(dentry))
 724                 return (struct super_block *)dentry;
 725         inode = dentry->d_inode;
 726         error = -ENOTBLK;
 727         if (!S_ISBLK(inode->i_mode))
 728                 goto out;
 729         error = -EACCES;
 730         if (IS_NODEV(inode))
 731                 goto out;
 732         bdev = inode->i_bdev;
 733         bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
 734         if (bdops) bdev->bd_op = bdops;
 735         /* Done with lookups, semaphore down */
 736         down(&mount_sem);
 737         dev = to_kdev_t(bdev->bd_dev);
 738         check_disk_change(dev);
 739         error = -EACCES;
 740         if (!(flags & MS_RDONLY) && is_read_only(dev))
 741                 goto out;
 742         sb = get_super(dev);
 743         if (sb) {
 744                 error = -EBUSY;
 745                 goto out;
 746                 /* MOUNT_REWRITE: the following should be used
 747                 if (fs_type == sb->s_type) {
 748                         dput(dentry);
 749                         return sb;
 750                 }
 751                 */
 752         } else {
 753                 mode_t mode = FMODE_READ; /* we always need it ;-) */
 754                 if (!(flags & MS_RDONLY))
 755                         mode |= FMODE_WRITE;
 756                 error = blkdev_get(bdev, mode, 0, BDEV_FS);
 757                 if (error)
 758                         goto out;
 759                 error = -EINVAL;
 760                 sb = read_super(dev, bdev, fs_type, flags, data, 0);
 761                 if (sb) {
 762                         get_filesystem(fs_type);
 763                         dput(dentry);
 764                         return sb;
 765                 }
 766                 blkdev_put(bdev, BDEV_FS);
 767         }
 768 out:
 769         dput(dentry);
 770         up(&mount_sem);
 771         return ERR_PTR(error);
 772 }
 773
 774 static struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 775         int flags, void * data)
 776 {
 777         kdev_t dev;
 778         int error = -EMFILE;
 779         down(&mount_sem);
 780         dev = get_unnamed_dev();
 781         if (dev) {
 782                 struct super_block * sb;
 783                 error = -EINVAL;
 784                 sb = read_super(dev, NULL, fs_type, flags, data, 0);
 785                 if (sb) {
 786                         get_filesystem(fs_type);
 787                         return sb;
 788                 }
 789                 put_unnamed_dev(dev);
 790         }
 791         up(&mount_sem);
 792         return ERR_PTR(error);
 793 }
 794
 795 static struct block_device *kill_super(struct super_block *sb, int umount_root)
 796 {
 797         struct block_device *bdev;
 798         kdev_t dev;
 799         dput(sb->s_root);
 800         sb->s_root = NULL;
 801         lock_super(sb);
 802         if (sb->s_op) {
 803                 if (sb->s_op->write_super && sb->s_dirt)
 804                         sb->s_op->write_super(sb);
 805                 if (sb->s_op->put_super)
 806                         sb->s_op->put_super(sb);
 807         }
 808
 809         /* Forget any remaining inodes */
 810         if (invalidate_inodes(sb)) {
 811                 printk("VFS: Busy inodes after unmount. "
 812                         "Self-destruct in 5 seconds.  Have a nice day...\n");
 813         }
 814
 815         dev = sb->s_dev;
 816         sb->s_dev = 0;          /* Free the superblock */
 817         bdev = sb->s_bdev;
 818         sb->s_bdev = NULL;
 819         put_filesystem(sb->s_type);
 820         sb->s_type = NULL;
 821         unlock_super(sb);
 822         if (umount_root) {
 823                 /* special: the old device driver is going to be
 824                    a ramdisk and the point of this call is to free its
 825                    protected memory (even if dirty). */
 826                 destroy_buffers(dev);
 827         }
 828         if (bdev) {
 829                 blkdev_put(bdev, BDEV_FS);
 830                 bdput(bdev);
 831         } else
 832                 put_unnamed_dev(dev);
 833         return bdev;
 834 }
 835
 836 /*
 837  * Alters the mount flags of a mounted file system. Only the mount point
 838  * is used as a reference - file system type and the device are ignored.
 839  */
 840
 841 static int do_remount_sb(struct super_block *sb, int flags, char *data)
 842 {
 843         int retval;
 844
 845         if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
 846                 return -EACCES;
 847                 /*flags |= MS_RDONLY;*/
 848         /* If we are remounting RDONLY, make sure there are no rw files open */
 849         if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
 850                 if (!fs_may_remount_ro(sb))
 851                         return -EBUSY;
 852         if (sb->s_op && sb->s_op->remount_fs) {
 853                 lock_super(sb);
 854                 retval = sb->s_op->remount_fs(sb, &flags, data);
 855                 unlock_super(sb);
 856                 if (retval)
 857                         return retval;
 858         }
 859         sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 860
 861         /*
 862          * We can't invalidate inodes as we can loose data when remounting
 863          * (someone might manage to alter data while we are waiting in lock_super()
 864          * or in foo_remount_fs()))
 865          */
 866
 867         return 0;
 868 }
 869
 870 /*
 871  * Doesn't take quota and stuff into account. IOW, in some cases it will
 872  * give false negatives. The main reason why it's here is that we need
 873  * a non-destructive way to look for easily umountable filesystems.
 874  */
 875 int may_umount(struct vfsmount *mnt)
 876 {
 877         struct super_block * sb = mnt->mnt_sb;
 878         struct dentry * root;
 879         int count;
 880
 881         if (atomic_read(&mnt->mnt_count) > 2)
 882                 return -EBUSY;
 883
 884         if (mnt->mnt_instances.next != mnt->mnt_instances.prev)
 885                 return 0;
 886
 887         /*
 888          * OK, at that point we have only one instance. We should have
 889          * one active reference from ->s_root, one active reference
 890          * from ->mnt_root (which may be different) and possibly one
 891          * active reference from ->mnt_mountpoint (if mnt->mnt_parent == mnt).
 892          * Anything above that means that tree is busy.
 893          */
 894
 895         root = sb->s_root;
 896
 897         count = d_active_refs(root);
 898         if (mnt->mnt_parent == mnt)
 899                 count--;
 900         if (count != 2)
 901                 return -EBUSY;
 902
 903         return 0;
 904 }
 905
 906 static int do_umount(struct vfsmount *mnt, int umount_root, int flags)
 907 {
 908         struct super_block * sb = mnt->mnt_sb;
 909         int count;
 910
 911         if (mnt == current->fs->rootmnt && !umount_root) {
 912                 int retval = 0;
 913                 /*
 914                  * Special case for "unmounting" root ...
 915                  * we just try to remount it readonly.
 916                  */
 917                 mntput(mnt);
 918                 if (!(sb->s_flags & MS_RDONLY))
 919                         retval = do_remount_sb(sb, MS_RDONLY, 0);
 920                 return retval;
 921         }
 922
 923         if (atomic_read(&mnt->mnt_count) > 2) {
 924                 mntput(mnt);
 925                 return -EBUSY;
 926         }
 927
 928         if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {
 929                 mntput(mnt);
 930                 remove_vfsmnt(mnt);
 931                 return 0;
 932         }
 933
 934         /*
 935          * Before checking whether the filesystem is still busy,
 936          * make sure the kernel doesn't hold any quota files open
 937          * on the device. If the umount fails, too bad -- there
 938          * are no quotas running any more. Just turn them on again.
 939          */
 940         DQUOT_OFF(sb);
 941         acct_auto_close(sb->s_dev);
 942
 943         /*
 944          * If we may have to abort operations to get out of this
 945          * mount, and they will themselves hold resources we must
 946          * allow the fs to do things. In the Unix tradition of
 947          * 'Gee thats tricky lets do it in userspace' the umount_begin
 948          * might fail to complete on the first run through as other tasks
 949          * must return, and the like. Thats for the mount program to worry
 950          * about for the moment.
 951          */
 952
 953         if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
 954                 sb->s_op->umount_begin(sb);
 955
 956         /*
 957          * Shrink dcache, then fsync. This guarantees that if the
 958          * filesystem is quiescent at this point, then (a) only the
 959          * root entry should be in use and (b) that root entry is
 960          * clean.
 961          */
 962         shrink_dcache_sb(sb);
 963         fsync_dev(sb->s_dev);
 964
 965         /* Something might grab it again - redo checks */
 966
 967         if (atomic_read(&mnt->mnt_count) > 2) {
 968                 mntput(mnt);
 969                 return -EBUSY;
 970         }
 971
 972         /*
 973          * OK, at that point we have only one instance. We should have
 974          * one active reference from ->s_root, one active reference
 975          * from ->mnt_root (which may be different) and possibly one
 976          * active reference from ->mnt_mountpoint (if mnt->mnt_parent == mnt).
 977          * Anything above that means that tree is busy.
 978          */
 979
 980         count = d_active_refs(sb->s_root);
 981         if (mnt->mnt_parent == mnt)
 982                 count--;
 983         if (count != 2)
 984                 return -EBUSY;
 985
 986         if (sb->s_root->d_inode->i_state)
 987                 return -EBUSY;
 988
 989         /* OK, that's the point of no return */
 990         mntput(mnt);
 991         remove_vfsmnt(mnt);
 992
 993         kill_super(sb, umount_root);
 994         return 0;
 995 }
 996
 997 /*
 998  * Now umount can handle mount points as well as block devices.
 999  * This is important for filesystems which use unnamed block devices.
1000  *
1001  * We now support a flag for forced unmount like the other 'big iron'
1002  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1003  */
1004
1005 asmlinkage long sys_umount(char * name, int flags)
1006 {
1007         struct nameidata nd;
1008         char *kname;
1009         int retval;
1010         struct super_block *sb;
1011
1012         if (!capable(CAP_SYS_ADMIN))
1013                 return -EPERM;
1014
1015         lock_kernel();
1016         kname = getname(name);
1017         retval = PTR_ERR(kname);
1018         if (IS_ERR(kname))
1019                 goto out;
1020         retval = 0;
1021         if (walk_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
1022                 retval = walk_name(kname, &nd);
1023         putname(kname);
1024         if (retval)
1025                 goto out;
1026         sb = nd.dentry->d_inode->i_sb;
1027         retval = -EINVAL;
1028         if (nd.dentry!=nd.mnt->mnt_root)
1029                 goto dput_and_out;
1030         dput(nd.dentry);
1031         /* puts nd.mnt */
1032         down(&mount_sem);
1033         retval = do_umount(nd.mnt, 0, flags);
1034         up(&mount_sem);
1035         goto out;
1036 dput_and_out:
1037         dput(nd.dentry);
1038         mntput(nd.mnt);
1039 out:
1040         unlock_kernel();
1041         return retval;
1042 }
1043
1044 /*
1045  *      The 2.0 compatible umount. No flags.
1046  */
1047
1048 asmlinkage long sys_oldumount(char * name)
1049 {
1050         return sys_umount(name,0);
1051 }
1052
1053 /*
1054  * change filesystem flags. dir should be a physical root of filesystem.
1055  * If you've mounted a non-root directory somewhere and want to do remount
1056  * on it - tough luck.
1057  */
1058
1059 static int do_remount(const char *dir,int flags,char *data)
1060 {
1061         struct dentry *dentry;
1062         int retval;
1063
1064         if (!capable(CAP_SYS_ADMIN))
1065                 return -EPERM;
1066
1067         dentry = lookup_dentry(dir, LOOKUP_FOLLOW|LOOKUP_POSITIVE);
1068         retval = PTR_ERR(dentry);
1069         if (!IS_ERR(dentry)) {
1070                 struct super_block * sb = dentry->d_inode->i_sb;
1071                 retval = -ENODEV;
1072                 if (sb) {
1073                         retval = -EINVAL;
1074                         if (dentry == sb->s_root) {
1075                                 /*
1076                                  * Shrink the dcache and sync the device.
1077                                  */
1078                                 shrink_dcache_sb(sb);
1079                                 fsync_dev(sb->s_dev);
1080                                 if (flags & MS_RDONLY)
1081                                         acct_auto_close(sb->s_dev);
1082                                 retval = do_remount_sb(sb, flags, data);
1083                         }
1084                 }
1085                 dput(dentry);
1086         }
1087         return retval;
1088 }
1089
1090 static int copy_mount_options (const void * data, unsigned long *where)
1091 {
1092         int i;
1093         unsigned long page;
1094         struct vm_area_struct * vma;
1095
1096         *where = 0;
1097         if (!data)
1098                 return 0;
1099
1100         vma = find_vma(current->mm, (unsigned long) data);
1101         if (!vma || (unsigned long) data < vma->vm_start)
1102                 return -EFAULT;
1103         if (!(vma->vm_flags & VM_READ))
1104                 return -EFAULT;
1105         i = vma->vm_end - (unsigned long) data;
1106         if (PAGE_SIZE <= (unsigned long) i)
1107                 i = PAGE_SIZE-1;
1108         if (!(page = __get_free_page(GFP_KERNEL))) {
1109                 return -ENOMEM;
1110         }
1111         if (copy_from_user((void *) page,data,i)) {
1112                 free_page(page);
1113                 return -EFAULT;
1114         }
1115         *where = page;
1116         return 0;
1117 }
1118
1119 /*
1120  * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1121  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1122  *
1123  * data is a (void *) that can point to any structure up to
1124  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1125  * information (or be NULL).
1126  *
1127  * NOTE! As old versions of mount() didn't use this setup, the flags
1128  * have to have a special 16-bit magic number in the high word:
1129  * 0xC0ED. If this magic word isn't present, the flags and data info
1130  * aren't used, as the syscall assumes we are talking to an older
1131  * version that didn't understand them.
1132  */
1133 long do_sys_mount(char * dev_name, char * dir_name, char *type_page,
1134                   unsigned long new_flags, void *data_page)
1135 {
1136         struct file_system_type * fstype;
1137         struct nameidata nd;
1138         struct vfsmount *mnt;
1139         struct super_block *sb;
1140         int retval = 0;
1141         unsigned long flags = 0;
1142
1143         /* Basic sanity checks */
1144
1145         if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1146                 return -EINVAL;
1147         if (!type_page || !memchr(type_page, 0, PAGE_SIZE))
1148                 return -EINVAL;
1149         if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1150                 return -EINVAL;
1151
1152         /* OK, looks good, now let's see what do they want */
1153
1154         /* just change the flags? - capabilities are checked in do_remount() */
1155         if ((new_flags & (MS_MGC_MSK|MS_REMOUNT)) == (MS_MGC_VAL|MS_REMOUNT))
1156                 return do_remount(dir_name, new_flags&~(MS_MGC_MSK|MS_REMOUNT),
1157                                     (char *) data_page);
1158
1159         if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1160                 flags = new_flags & ~MS_MGC_MSK;
1161
1162         /* loopback mount? This is special - requires fewer capabilities */
1163         /* MOUNT_REWRITE: ... and is yet to be merged */
1164
1165         /* for the rest we _really_ need capabilities... */
1166         if (!capable(CAP_SYS_ADMIN))
1167                 return -EPERM;
1168
1169         /* ... filesystem driver... */
1170         fstype = get_fs_type(type_page);
1171         if (!fstype)
1172                 return -ENODEV;
1173
1174         /* ... and mountpoint. Do the lookup first to force automounting. */
1175         if (walk_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1176                 retval = walk_name(dir_name, &nd);
1177         if (retval)
1178                 goto fs_out;
1179
1180         /* get superblock, locks mount_sem on success */
1181         if (fstype->fs_flags & FS_REQUIRES_DEV)
1182                 sb = get_sb_bdev(fstype, dev_name,flags, data_page);
1183         else
1184                 sb = get_sb_nodev(fstype, flags, data_page);
1185
1186         retval = PTR_ERR(sb);
1187         if (IS_ERR(sb))
1188                 goto dput_out;
1189
1190         retval = -ENOENT;
1191         if (d_unhashed(nd.dentry))
1192                 goto fail;
1193
1194         /* Something was mounted here while we slept */
1195         while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
1196                 ;
1197
1198         retval = -ENOMEM;
1199         mnt = add_vfsmnt(sb, nd.dentry, sb->s_root, nd.mnt, dev_name, dir_name);
1200         if (!mnt)
1201                 goto fail;
1202         retval = 0;
1203 unlock_out:
1204         up(&mount_sem);
1205 dput_out:
1206         dput(nd.dentry);
1207         mntput(nd.mnt);
1208 fs_out:
1209         put_filesystem(fstype);
1210         return retval;
1211
1212 fail:
1213         if (list_empty(&sb->s_mounts))
1214                 kill_super(sb, 0);
1215         goto unlock_out;
1216 }
1217
1218 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1219                           unsigned long new_flags, void * data)
1220 {
1221         int retval;
1222         unsigned long data_page = 0;
1223         unsigned long type_page = 0;
1224         unsigned long dev_page = 0;
1225         char *dir_page;
1226
1227         lock_kernel();
1228         retval = copy_mount_options (type, &type_page);
1229         if (retval < 0)
1230                 goto out;
1231
1232         /* copy_mount_options allows a NULL user pointer,
1233          * and just returns zero in that case.  But if we
1234          * allow the type to be NULL we will crash.
1235          * Previously we did not check this case.
1236          */
1237         if (type_page == 0) {
1238                 retval = -EINVAL;
1239                 goto out;
1240         }
1241
1242         dir_page = getname(dir_name);
1243         retval = PTR_ERR(dir_page);
1244         if (IS_ERR(dir_page))
1245                 goto out1;
1246
1247         retval = copy_mount_options (dev_name, &dev_page);
1248         if (retval < 0)
1249                 goto out2;
1250         retval = copy_mount_options (data, &data_page);
1251         if (retval >= 0) {
1252                 retval = do_sys_mount((char*)dev_page,dir_page,(char*)type_page,
1253                                       new_flags, (void*)data_page);
1254                 free_page(data_page);
1255         }
1256         free_page(dev_page);
1257 out2:
1258         putname(dir_page);
1259 out1:
1260         free_page(type_page);
1261 out:
1262         unlock_kernel();
1263         return retval;
1264 }
1265
1266 void __init mount_root(void)
1267 {
1268         struct file_system_type * fs_type;
1269         struct super_block * sb;
1270         struct vfsmount *vfsmnt;
1271         struct block_device *bdev = NULL;
1272         mode_t mode;
1273         int retval;
1274         void *handle;
1275         char path[64];
1276         int path_start = -1;
1277
1278 #ifdef CONFIG_ROOT_NFS
1279         void *data;
1280         if (MAJOR(ROOT_DEV) != UNNAMED_MAJOR)
1281                 goto skip_nfs;
1282         fs_type = get_fs_type("nfs");
1283         if (!fs_type)
1284                 goto no_nfs;
1285         ROOT_DEV = get_unnamed_dev();
1286         if (!ROOT_DEV)
1287                 /*
1288                  * Your /linuxrc sucks worse than MSExchange - that's the
1289                  * only way you could run out of anon devices at that point.
1290                  */
1291                 goto no_anon;
1292         data = nfs_root_data();
1293         if (!data)
1294                 goto no_server;
1295         sb = read_super(ROOT_DEV, NULL, fs_type, root_mountflags, data, 1);
1296         if (sb)
1297                 /*
1298                  * We _can_ fail there, but if that will happen we have no
1299                  * chance anyway (no memory for vfsmnt and we _will_ need it,
1300                  * no matter which fs we try to mount).
1301                  */
1302                 goto mount_it;
1303 no_server:
1304         put_unnamed_dev(ROOT_DEV);
1305 no_anon:
1306         put_filesystem(fs_type);
1307 no_nfs:
1308         printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1309         ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1310 skip_nfs:
1311 #endif
1312
1313 #ifdef CONFIG_BLK_DEV_FD
1314         if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1315 #ifdef CONFIG_BLK_DEV_RAM
1316                 extern int rd_doload;
1317                 extern void rd_load_secondary(void);
1318 #endif
1319                 floppy_eject();
1320 #ifndef CONFIG_BLK_DEV_RAM
1321                 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1322 #else
1323                 /* rd_doload is 2 for a dual initrd/ramload setup */
1324                 if(rd_doload==2)
1325                         rd_load_secondary();
1326                 else
1327 #endif
1328                 {
1329                         printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1330                         wait_for_keypress();
1331                 }
1332         }
1333 #endif
1334
1335         devfs_make_root (root_device_name);
1336         handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME, 0,
1337                                     MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1338                                     DEVFS_SPECIAL_BLK, 1);
1339         if (handle)  /*  Sigh: bd*() functions only paper over the cracks  */
1340         {
1341             unsigned major, minor;
1342
1343             devfs_get_maj_min (handle, &major, &minor);
1344             ROOT_DEV = MKDEV (major, minor);
1345         }
1346
1347         /*
1348          * Probably pure paranoia, but I'm less than happy about delving into
1349          * devfs crap and checking it right now. Later.
1350          */
1351         if (!ROOT_DEV)
1352                 panic("I have no root and I want to scream");
1353
1354         bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1355         if (!bdev)
1356                 panic(__FUNCTION__ ": unable to allocate root device");
1357         bdev->bd_op = devfs_get_ops (handle);
1358         path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1359         mode = FMODE_READ;
1360         if (!(root_mountflags & MS_RDONLY))
1361                 mode |= FMODE_WRITE;
1362         retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1363         if (retval == -EROFS) {
1364                 root_mountflags |= MS_RDONLY;
1365                 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1366         }
1367         if (retval) {
1368                 /*
1369                  * Allow the user to distinguish between failed open
1370                  * and bad superblock on root device.
1371                  */
1372                 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1373                         root_device_name, kdevname (ROOT_DEV));
1374                 printk ("Please append a correct \"root=\" boot option\n");
1375                 panic("VFS: Unable to mount root fs on %s",
1376                         kdevname(ROOT_DEV));
1377         }
1378
1379         check_disk_change(ROOT_DEV);
1380         sb = get_super(ROOT_DEV);
1381         if (sb) {
1382                 fs_type = sb->s_type;
1383                 goto mount_it;
1384         }
1385
1386         read_lock(&file_systems_lock);
1387         for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1388                 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1389                         continue;
1390                 if (!try_inc_mod_count(fs_type->owner))
1391                         continue;
1392                 read_unlock(&file_systems_lock);
1393                 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1394                 if (sb)
1395                         goto mount_it;
1396                 read_lock(&file_systems_lock);
1397                 put_filesystem(fs_type);
1398         }
1399         read_unlock(&file_systems_lock);
1400         panic("VFS: Unable to mount root fs on %s", kdevname(ROOT_DEV));
1401
1402 mount_it:
1403         printk ("VFS: Mounted root (%s filesystem)%s.\n",
1404                 fs_type->name,
1405                 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1406         if (path_start >= 0) {
1407                 devfs_mk_symlink (NULL,
1408                                   "root", 0, DEVFS_FL_DEFAULT,
1409                                   path + 5 + path_start, 0,
1410                                   NULL, NULL);
1411                 memcpy (path + path_start, "/dev/", 5);
1412                 vfsmnt = add_vfsmnt (sb, sb->s_root, sb->s_root, NULL,
1413                                         path + path_start, "/");
1414         }
1415         else
1416                 vfsmnt = add_vfsmnt (sb, sb->s_root, sb->s_root, NULL,
1417                                         "/dev/root", "/");
1418         if (vfsmnt) {
1419                 set_fs_root(current->fs, vfsmnt, sb->s_root);
1420                 set_fs_pwd(current->fs, vfsmnt, sb->s_root);
1421                 if (bdev)
1422                         bdput(bdev); /* sb holds a reference */
1423                 return;
1424         }
1425         panic("VFS: add_vfsmnt failed for root fs");
1426 }
1427
1428
1429 static void chroot_fs_refs(struct dentry *old_root,
1430                            struct vfsmount *old_rootmnt,
1431                            struct dentry *new_root,
1432                            struct vfsmount *new_rootmnt)
1433 {
1434         struct task_struct *p;
1435
1436         /* We can't afford dput() blocking under the tasklist_lock */
1437         mntget(old_rootmnt);
1438         dget(old_root);
1439
1440         read_lock(&tasklist_lock);
1441         for_each_task(p) {
1442                 if (!p->fs) continue;
1443                 if (p->fs->root == old_root && p->fs->rootmnt == old_rootmnt)
1444                         set_fs_root(p->fs, new_rootmnt, new_root);
1445                 if (p->fs->pwd == old_root && p->fs->pwdmnt == old_rootmnt)
1446                         set_fs_pwd(p->fs, new_rootmnt, new_root);
1447         }
1448         read_unlock(&tasklist_lock);
1449
1450         dput(old_root);
1451         mntput(old_rootmnt);
1452 }
1453
1454 /*
1455  * Moves the current root to put_root, and sets root/cwd of all processes
1456  * which had them on the old root to new_root.
1457  *
1458  * Note:
1459  *  - we don't move root/cwd if they are not at the root (reason: if something
1460  *    cared enough to change them, it's probably wrong to force them elsewhere)
1461  *  - it's okay to pick a root that isn't the root of a file system, e.g.
1462  *    /nfs/my_root where /nfs is the mount point. Better avoid creating
1463  *    unreachable mount points this way, though.
1464  */
1465
1466 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1467 {
1468         struct dentry *root = current->fs->root;
1469         struct vfsmount *root_mnt = current->fs->rootmnt;
1470         struct vfsmount *tmp;
1471         struct nameidata new_nd, old_nd;
1472         char *name;
1473         int error;
1474
1475         if (!capable(CAP_SYS_ADMIN))
1476                 return -EPERM;
1477
1478         lock_kernel();
1479
1480         name = getname(new_root);
1481         error = PTR_ERR(name);
1482         if (IS_ERR(name))
1483                 goto out0;
1484         error = 0;
1485         if (walk_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
1486                 error = walk_name(name, &new_nd);
1487         putname(name);
1488         if (error)
1489                 goto out0;
1490
1491         name = getname(put_old);
1492         error = PTR_ERR(name);
1493         if (IS_ERR(name))
1494                 goto out0;
1495         error = 0;
1496         if (walk_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
1497                 error = walk_name(name, &old_nd);
1498         putname(name);
1499         if (error)
1500                 goto out1;
1501
1502         down(&mount_sem);
1503         error = -ENOENT;
1504         if (d_unhashed(new_nd.dentry) || d_unhashed(old_nd.dentry))
1505                 goto out2;
1506         error = -EBUSY;
1507         if (new_nd.mnt == root_mnt || old_nd.mnt == root_mnt)
1508                 goto out2; /* loop */
1509         error = -EINVAL;
1510         tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
1511         if (tmp != new_nd.mnt) {
1512                 for (;;) {
1513                         if (tmp->mnt_parent == tmp)
1514                                 goto out2;
1515                         if (tmp->mnt_parent == new_nd.mnt)
1516                                 break;
1517                         tmp = tmp->mnt_parent;
1518                 }
1519                 if (!is_subdir(tmp->mnt_root, new_nd.dentry))
1520                         goto out2;
1521         } else if (!is_subdir(old_nd.dentry, new_nd.dentry))
1522                 goto out2;
1523
1524         error = -ENOMEM;
1525         name = __getname();
1526         if (!name)
1527                 goto out2;
1528
1529         move_vfsmnt(new_nd.mnt, new_nd.dentry, NULL, NULL, "/");
1530         move_vfsmnt(root_mnt, old_nd.dentry, old_nd.mnt, NULL,
1531                         __d_path(old_nd.dentry, old_nd.mnt, new_nd.dentry,
1532                                 new_nd.mnt, name, PAGE_SIZE));
1533         putname(name);
1534         chroot_fs_refs(root,root_mnt,new_nd.dentry,new_nd.mnt);
1535         error = 0;
1536 out2:
1537         up(&mount_sem);
1538         dput(old_nd.dentry);
1539         mntput(old_nd.mnt);
1540 out1:
1541         dput(new_nd.dentry);
1542         mntput(new_nd.mnt);
1543 out0:
1544         unlock_kernel();
1545         return error;
1546 }
1547
1548
1549 #ifdef CONFIG_BLK_DEV_INITRD
1550
1551 int __init change_root(kdev_t new_root_dev,const char *put_old)
1552 {
1553         kdev_t old_root_dev = ROOT_DEV;
1554         struct vfsmount *old_rootmnt = mntget(current->fs->rootmnt);
1555         struct nameidata devfs_nd, nd;
1556         int error = 0;
1557
1558         /*  First unmount devfs if mounted  */
1559         if (walk_init("/dev", LOOKUP_FOLLOW|LOOKUP_POSITIVE, &devfs_nd))
1560                 error = walk_name("/dev", &devfs_nd);
1561         if (!error) {
1562                 struct super_block *sb = devfs_nd.dentry->d_inode->i_sb;
1563
1564                 if (devfs_nd.mnt->mnt_sb->s_magic == DEVFS_SUPER_MAGIC &&
1565                     devfs_nd.dentry == devfs_nd.mnt->mnt_root) {
1566                         dput(devfs_nd.dentry);
1567                         down(&mount_sem);
1568                         /* puts devfs_nd.mnt */
1569                         do_umount(devfs_nd.mnt, 0, 0);
1570                         up(&mount_sem);
1571                 } else {
1572                         dput(devfs_nd.dentry);
1573                         mntput(devfs_nd.mnt);
1574                 }
1575         }
1576         ROOT_DEV = new_root_dev;
1577         mount_root();
1578 #if 1
1579         shrink_dcache();
1580         printk("change_root: old root has d_count=%d\n",
1581                old_rootmnt->mnt_root->d_count);
1582 #endif
1583         mount_devfs_fs ();
1584         /*
1585          * Get the new mount directory
1586          */
1587         error = 0;
1588         if (walk_init(put_old, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1589                 error = walk_name(put_old, &nd);
1590         if (error) {
1591                 int blivet;
1592
1593                 printk(KERN_NOTICE "Trying to unmount old root ... ");
1594                 blivet = do_umount(old_rootmnt, 1, 0);
1595                 if (!blivet) {
1596                         printk("okay\n");
1597                         return 0;
1598                 }
1599                 printk(KERN_ERR "error %ld\n",blivet);
1600                 return error;
1601         }
1602         move_vfsmnt(old_rootmnt, nd.dentry, nd.mnt, "/dev/root.old", put_old);
1603         mntput(old_rootmnt);
1604         dput(nd.dentry);
1605         mntput(nd.mnt);
1606         return 0;
1607 }
1608
1609 #endif