fs/super.c

   1 /*
   2  *  linux/fs/super.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  super.c contains code to handle: - mount structures
   7  *                                   - super-block tables
   8  *                                   - filesystem drivers list
   9  *                                   - mount system call
  10  *                                   - umount system call
  11  *                                   - ustat system call
  12  *
  13  *  Added options to /proc/mounts
  14  *  Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
  15  *
  16  * GK 2/5/95  -  Changed to support mounting the root fs via NFS
  17  *
  18  *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
  19  *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
  20  *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
  21  *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
  22  */
  23
  24 #include <linux/config.h>
  25 #include <linux/string.h>
  26 #include <linux/malloc.h>
  27 #include <linux/locks.h>
  28 #include <linux/smp_lock.h>
  29 #include <linux/devfs_fs_kernel.h>
  30 #include <linux/fd.h>
  31 #include <linux/init.h>
  32 #include <linux/quotaops.h>
  33 #include <linux/acct.h>
  34
  35 #include <asm/uaccess.h>
  36
  37 #include <linux/nfs_fs.h>
  38 #include <linux/nfs_fs_sb.h>
  39 #include <linux/nfs_mount.h>
  40
  41 #include <linux/kmod.h>
  42 #define __NO_VERSION__
  43 #include <linux/module.h>
  44
  45 /*
  46  * We use a semaphore to synchronize all mount/umount
  47  * activity - imagine the mess if we have a race between
  48  * unmounting a filesystem and re-mounting it (or something
  49  * else).
  50  */
  51 static DECLARE_MUTEX(mount_sem);
  52
  53 extern void wait_for_keypress(void);
  54
  55 extern int root_mountflags;
  56
/* Forward declaration: get_sb_single() calls this ahead of its definition. */
  57 static int do_remount_sb(struct super_block *sb, int flags, char * data);
  58
  59 /* this is initialized in init/main.c */
  60 kdev_t ROOT_DEV;
  61
/*
 * Superblock bookkeeping used by get_empty_super():
 *   nr_super_blocks  - number of superblocks allocated so far;
 *   max_super_blocks - allocation limit: no new superblock is allocated
 *                      once nr_super_blocks has reached it;
 *   super_blocks     - list onto which every allocated superblock is linked.
 */
  62 int nr_super_blocks;
  63 int max_super_blocks = NR_SUPER;
  64 LIST_HEAD(super_blocks);
  65
  66 /*
  67  * Handling of the filesystem drivers list.
  68  * Rules:
  69  *      Insertion into/removal from/scanning of the list are protected by
  69  *      file_systems_lock (a read/write lock).
  70  *      During unload, a module must call unregister_filesystem().
  71  *      We can access the fields of a list element if:
  72  *              1) the lock is held or
  73  *              2) we hold a reference to the module.
  74  *      The latter can be guaranteed by a call to try_inc_mod_count(); if it
  75  *      returned 0 we must skip the element, otherwise we got the reference.
  76  *      Once the reference is obtained we can drop the lock.
  77  */
  78
/* Head of the singly linked (via ->next) list of registered filesystem types. */
  79 static struct file_system_type *file_systems = NULL;
/* Writers: register/unregister_filesystem(); readers: all list walkers. */
  80 static rwlock_t file_systems_lock = RW_LOCK_UNLOCKED;
  81
  82 /* WARNING: This can be used only if we _already_ own a reference */
  83 static void get_filesystem(struct file_system_type *fs)
  84 {
  85         if (fs->owner)
  86                 __MOD_INC_USE_COUNT(fs->owner);
  87 }
  88
  89 static void put_filesystem(struct file_system_type *fs)
  90 {
  91         if (fs->owner)
  92                 __MOD_DEC_USE_COUNT(fs->owner);
  93 }
  94
  95 static struct file_system_type **find_filesystem(const char *name)
  96 {
  97         struct file_system_type **p;
  98         for (p=&file_systems; *p; p=&(*p)->next)
  99                 if (strcmp((*p)->name,name) == 0)
 100                         break;
 101         return p;
 102 }
 103
 104 /**
 105  *      register_filesystem - register a new filesystem
 106  *      @fs: the file system structure
 107  *
 108  *      Adds the file system passed to the list of file systems the kernel
 109  *      is aware of for mount and other syscalls. Returns 0 on success,
 110  *      or a negative errno code on an error.
 111  *
 112  *      The &struct file_system_type that is passed is linked into the kernel
 113  *      structures and must not be freed until the file system has been
 114  *      unregistered.
 115  */
 116
 117 int register_filesystem(struct file_system_type * fs)
 118 {
 119         int res = 0;
 120         struct file_system_type ** p;
 121
 122         if (!fs)
 123                 return -EINVAL;
 124         if (fs->next)
 125                 return -EBUSY;
 126         write_lock(&file_systems_lock);
 127         p = find_filesystem(fs->name);
 128         if (*p)
 129                 res = -EBUSY;
 130         else
 131                 *p = fs;
 132         write_unlock(&file_systems_lock);
 133         return res;
 134 }
 135
 136 /**
 137  *      unregister_filesystem - unregister a file system
 138  *      @fs: filesystem to unregister
 139  *
 140  *      Remove a file system that was previously successfully registered
 141  *      with the kernel. An error is returned if the file system is not found.
 142  *      Zero is returned on a success.
 143  *
 144  *      Once this function has returned the &struct file_system_type structure
 145  *      may be freed or reused.
 146  */
 147
 148 int unregister_filesystem(struct file_system_type * fs)
 149 {
 150         struct file_system_type ** tmp;
 151
 152         write_lock(&file_systems_lock);
 153         tmp = &file_systems;
 154         while (*tmp) {
 155                 if (fs == *tmp) {
 156                         *tmp = fs->next;
 157                         fs->next = NULL;
 158                         write_unlock(&file_systems_lock);
 159                         return 0;
 160                 }
 161                 tmp = &(*tmp)->next;
 162         }
 163         write_unlock(&file_systems_lock);
 164         return -EINVAL;
 165 }
 166
 167 static int fs_index(const char * __name)
 168 {
 169         struct file_system_type * tmp;
 170         char * name;
 171         int err, index;
 172
 173         name = getname(__name);
 174         err = PTR_ERR(name);
 175         if (IS_ERR(name))
 176                 return err;
 177
 178         err = -EINVAL;
 179         read_lock(&file_systems_lock);
 180         for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
 181                 if (strcmp(tmp->name,name) == 0) {
 182                         err = index;
 183                         break;
 184                 }
 185         }
 186         read_unlock(&file_systems_lock);
 187         putname(name);
 188         return err;
 189 }
 190
 191 static int fs_name(unsigned int index, char * buf)
 192 {
 193         struct file_system_type * tmp;
 194         int len, res;
 195
 196         read_lock(&file_systems_lock);
 197         for (tmp = file_systems; tmp; tmp = tmp->next, index--)
 198                 if (index <= 0 && try_inc_mod_count(tmp->owner))
 199                                 break;
 200         read_unlock(&file_systems_lock);
 201         if (!tmp)
 202                 return -EINVAL;
 203
 204         /* OK, we got the reference, so we can safely block */
 205         len = strlen(tmp->name) + 1;
 206         res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
 207         put_filesystem(tmp);
 208         return res;
 209 }
 210
 211 static int fs_maxindex(void)
 212 {
 213         struct file_system_type * tmp;
 214         int index;
 215
 216         read_lock(&file_systems_lock);
 217         for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
 218                 ;
 219         read_unlock(&file_systems_lock);
 220         return index;
 221 }
 222
 223 /*
 224  * Whee.. Weird sysv syscall.
 225  */
 226 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
 227 {
 228         int retval = -EINVAL;
 229
 230         switch (option) {
 231                 case 1:
 232                         retval = fs_index((const char *) arg1);
 233                         break;
 234
 235                 case 2:
 236                         retval = fs_name(arg1, (char *) arg2);
 237                         break;
 238
 239                 case 3:
 240                         retval = fs_maxindex();
 241                         break;
 242         }
 243         return retval;
 244 }
 245
 246 int get_filesystem_list(char * buf)
 247 {
 248         int len = 0;
 249         struct file_system_type * tmp;
 250
 251         read_lock(&file_systems_lock);
 252         tmp = file_systems;
 253         while (tmp && len < PAGE_SIZE - 80) {
 254                 len += sprintf(buf+len, "%s\t%s\n",
 255                         (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 256                         tmp->name);
 257                 tmp = tmp->next;
 258         }
 259         read_unlock(&file_systems_lock);
 260         return len;
 261 }
 262
 263 static struct file_system_type *get_fs_type(const char *name)
 264 {
 265         struct file_system_type *fs;
 266
 267         read_lock(&file_systems_lock);
 268         fs = *(find_filesystem(name));
 269         if (fs && !try_inc_mod_count(fs->owner))
 270                 fs = NULL;
 271         read_unlock(&file_systems_lock);
 272         if (!fs && (request_module(name) == 0)) {
 273                 read_lock(&file_systems_lock);
 274                 fs = *(find_filesystem(name));
 275                 if (fs && !try_inc_mod_count(fs->owner))
 276                         fs = NULL;
 277                 read_unlock(&file_systems_lock);
 278         }
 279         return fs;
 280 }
 281
 282 static LIST_HEAD(vfsmntlist);
 283
 284 static struct vfsmount *add_vfsmnt(struct super_block *sb,
 285                                 struct dentry *mountpoint,
 286                                 struct dentry *root,
 287                                 struct vfsmount *parent,
 288                                 const char *dev_name,
 289                                 const char *dir_name)
 290 {
 291         struct vfsmount *mnt;
 292         char *name;
 293
 294         mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
 295         if (!mnt)
 296                 goto out;
 297         memset(mnt, 0, sizeof(struct vfsmount));
 298
 299         atomic_set(&mnt->mnt_count,1);
 300         mnt->mnt_sb = sb;
 301         mnt->mnt_mountpoint = dget(mountpoint);
 302         mnt->mnt_root = dget(root);
 303         mnt->mnt_parent = parent ? mntget(parent) : mnt;
 304
 305         /* N.B. Is it really OK to have a vfsmount without names? */
 306         if (dev_name) {
 307                 name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
 308                 if (name) {
 309                         strcpy(name, dev_name);
 310                         mnt->mnt_devname = name;
 311                 }
 312         }
 313         name = kmalloc(strlen(dir_name)+1, GFP_KERNEL);
 314         if (name) {
 315                 strcpy(name, dir_name);
 316                 mnt->mnt_dirname = name;
 317         }
 318
 319         if (parent)
 320                 list_add(&mnt->mnt_child, &parent->mnt_mounts);
 321         else
 322                 INIT_LIST_HEAD(&mnt->mnt_child);
 323         INIT_LIST_HEAD(&mnt->mnt_mounts);
 324         list_add(&mnt->mnt_instances, &sb->s_mounts);
 325         list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
 326         list_add(&mnt->mnt_list, vfsmntlist.prev);
 327 out:
 328         return mnt;
 329 }
 330
 331 static void move_vfsmnt(struct vfsmount *mnt,
 332                         struct dentry *mountpoint,
 333                         struct vfsmount *parent,
 334                         const char *dev_name,
 335                         const char *dir_name)
 336 {
 337         struct dentry *old_mountpoint = mnt->mnt_mountpoint;
 338         struct vfsmount *old_parent = mnt->mnt_parent;
 339         char *new_devname = NULL, *new_dirname = NULL;
 340
 341         if (dev_name) {
 342                 new_devname = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
 343                 if (new_devname)
 344                         strcpy(new_devname, dev_name);
 345         }
 346         if (dir_name) {
 347                 new_dirname = kmalloc(strlen(dir_name)+1, GFP_KERNEL);
 348                 if (new_dirname)
 349                         strcpy(new_dirname, dir_name);
 350         }
 351
 352         /* flip names */
 353         if (new_dirname) {
 354                 kfree(mnt->mnt_dirname);
 355                 mnt->mnt_dirname = new_dirname;
 356         }
 357         if (new_devname) {
 358                 kfree(mnt->mnt_devname);
 359                 mnt->mnt_devname = new_devname;
 360         }
 361
 362         /* flip the linkage */
 363         mnt->mnt_mountpoint = dget(mountpoint);
 364         mnt->mnt_parent = parent ? mntget(parent) : mnt;
 365         list_del(&mnt->mnt_clash);
 366         list_del(&mnt->mnt_child);
 367         list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
 368         if (parent)
 369                 list_add(&mnt->mnt_child, &parent->mnt_mounts);
 370         else
 371                 INIT_LIST_HEAD(&mnt->mnt_child);
 372
 373         /* put the old stuff */
 374         dput(old_mountpoint);
 375         if (old_parent != mnt)
 376                 mntput(old_parent);
 377 }
 378
 379 static void remove_vfsmnt(struct vfsmount *mnt)
 380 {
 381         /* First of all, remove it from all lists */
 382         list_del(&mnt->mnt_instances);
 383         list_del(&mnt->mnt_clash);
 384         list_del(&mnt->mnt_list);
 385         list_del(&mnt->mnt_child);
 386         /* Now we can work safely */
 387         if (mnt->mnt_parent != mnt)
 388                 mntput(mnt->mnt_parent);
 389
 390         dput(mnt->mnt_mountpoint);
 391         dput(mnt->mnt_root);
 392         kfree(mnt->mnt_devname);
 393         kfree(mnt->mnt_dirname);
 394         kfree(mnt);
 395 }
 396
 397 static struct proc_fs_info {
 398         int flag;
 399         char *str;
 400 } fs_info[] = {
 401         { MS_NOEXEC, ",noexec" },
 402         { MS_NOSUID, ",nosuid" },
 403         { MS_NODEV, ",nodev" },
 404         { MS_SYNCHRONOUS, ",sync" },
 405         { MS_MANDLOCK, ",mand" },
 406         { MS_NOATIME, ",noatime" },
 407         { MS_NODIRATIME, ",nodiratime" },
 408 #ifdef MS_NOSUB                 /* Can't find this except in mount.c */
 409         { MS_NOSUB, ",nosub" },
 410 #endif
 411         { 0, NULL }
 412 };
 413
 414 static struct proc_nfs_info {
 415         int flag;
 416         char *str;
 417         char *nostr;
 418 } nfs_info[] = {
 419         { NFS_MOUNT_SOFT, ",soft", ",hard" },
 420         { NFS_MOUNT_INTR, ",intr", "" },
 421         { NFS_MOUNT_POSIX, ",posix", "" },
 422         { NFS_MOUNT_TCP, ",tcp", ",udp" },
 423         { NFS_MOUNT_NOCTO, ",nocto", "" },
 424         { NFS_MOUNT_NOAC, ",noac", "" },
 425         { NFS_MOUNT_NONLM, ",nolock", ",lock" },
 426         { 0, NULL, NULL }
 427 };
 428
 429 int get_filesystem_info( char *buf )
 430 {
 431         struct list_head *p;
 432         struct proc_fs_info *fs_infop;
 433         struct proc_nfs_info *nfs_infop;
 434         struct nfs_server *nfss;
 435         int len = 0;
 436         char *path,*buffer = (char *) __get_free_page(GFP_KERNEL);
 437
 438         if (!buffer) return 0;
 439         for (p = vfsmntlist.next; p!=&vfsmntlist && len < PAGE_SIZE - 160;
 440             p = p->next) {
 441                 struct vfsmount *tmp = list_entry(p, struct vfsmount, mnt_list);
 442                 path = d_path(tmp->mnt_root, tmp, buffer, PAGE_SIZE);
 443                 if (!path)
 444                         continue;
 445                 len += sprintf( buf + len, "%s %s %s %s",
 446                         tmp->mnt_devname, path,
 447                         tmp->mnt_sb->s_type->name,
 448                         tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw" );
 449                 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 450                   if (tmp->mnt_sb->s_flags & fs_infop->flag) {
 451                     strcpy(buf + len, fs_infop->str);
 452                     len += strlen(fs_infop->str);
 453                   }
 454                 }
 455                 if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
 456                         nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
 457                         len += sprintf(buf+len, ",v%d", nfss->rpc_ops->version);
 458
 459                         len += sprintf(buf+len, ",rsize=%d", nfss->rsize);
 460
 461                         len += sprintf(buf+len, ",wsize=%d", nfss->wsize);
 462 #if 0
 463                         if (nfss->timeo != 7*HZ/10) {
 464                                 len += sprintf(buf+len, ",timeo=%d",
 465                                                nfss->timeo*10/HZ);
 466                         }
 467                         if (nfss->retrans != 3) {
 468                                 len += sprintf(buf+len, ",retrans=%d",
 469                                                nfss->retrans);
 470                         }
 471 #endif
 472                         if (nfss->acregmin != 3*HZ) {
 473                                 len += sprintf(buf+len, ",acregmin=%d",
 474                                                nfss->acregmin/HZ);
 475                         }
 476                         if (nfss->acregmax != 60*HZ) {
 477                                 len += sprintf(buf+len, ",acregmax=%d",
 478                                                nfss->acregmax/HZ);
 479                         }
 480                         if (nfss->acdirmin != 30*HZ) {
 481                                 len += sprintf(buf+len, ",acdirmin=%d",
 482                                                nfss->acdirmin/HZ);
 483                         }
 484                         if (nfss->acdirmax != 60*HZ) {
 485                                 len += sprintf(buf+len, ",acdirmax=%d",
 486                                                nfss->acdirmax/HZ);
 487                         }
 488                         for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 489                                 char *str;
 490                                 if (nfss->flags & nfs_infop->flag)
 491                                         str = nfs_infop->str;
 492                                 else
 493                                         str = nfs_infop->nostr;
 494                                 strcpy(buf + len, str);
 495                                 len += strlen(str);
 496                         }
 497                         len += sprintf(buf+len, ",addr=%s",
 498                                        nfss->hostname);
 499                 }
 500                 len += sprintf( buf + len, " 0 0\n" );
 501         }
 502
 503         free_page((unsigned long) buffer);
 504         return len;
 505 }
 506
 507 /**
 508  *      __wait_on_super - wait on a superblock
 509  *      @sb: superblock to wait on
 510  *
 511  *      Waits for a superblock to become unlocked and then returns. It does
 512  *      not take the lock. This is an internal function. See wait_on_super().
 513  */
 514
 515 void __wait_on_super(struct super_block * sb)
 516 {
 517         DECLARE_WAITQUEUE(wait, current);
 518
 519         add_wait_queue(&sb->s_wait, &wait);
 520 repeat:
 521         set_current_state(TASK_UNINTERRUPTIBLE);
 522         if (sb->s_lock) {
 523                 schedule();
 524                 goto repeat;
 525         }
 526         remove_wait_queue(&sb->s_wait, &wait);
 527         current->state = TASK_RUNNING;
 528 }
 529
 530 /*
 531  * Note: check the dirty flag before waiting, so we don't
 532  * hold up the sync while mounting a device. (The newly
 533  * mounted device won't need syncing.)
 534  */
 535 void sync_supers(kdev_t dev)
 536 {
 537         struct super_block * sb;
 538
 539         for (sb = sb_entry(super_blocks.next);
 540              sb != sb_entry(&super_blocks);
 541              sb = sb_entry(sb->s_list.next)) {
 542                 if (!sb->s_dev)
 543                         continue;
 544                 if (dev && sb->s_dev != dev)
 545                         continue;
 546                 if (!sb->s_dirt)
 547                         continue;
 548                 lock_super(sb);
 549                 if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev))
 550                         if (sb->s_op && sb->s_op->write_super)
 551                                 sb->s_op->write_super(sb);
 552                 unlock_super(sb);
 553         }
 554 }
 555
 556 /**
 557  *      get_super       -       get the superblock of a device
 558  *      @dev: device to get the superblock for
 559  *
 560  *      Scans the superblock list and finds the superblock of the file system
 561  *      mounted on the device given. %NULL is returned if no match is found.
 562  */
 563
 564 struct super_block * get_super(kdev_t dev)
 565 {
 566         struct super_block * s;
 567
 568         if (!dev)
 569                 return NULL;
 570 restart:
 571         s = sb_entry(super_blocks.next);
 572         while (s != sb_entry(&super_blocks))
 573                 if (s->s_dev == dev) {
 574                         wait_on_super(s);
 575                         if (s->s_dev == dev)
 576                                 return s;
 577                         goto restart;
 578                 } else
 579                         s = sb_entry(s->s_list.next);
 580         return NULL;
 581 }
 582
 583 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
 584 {
 585         struct super_block *s;
 586         struct ustat tmp;
 587         struct statfs sbuf;
 588         int err = -EINVAL;
 589
 590         lock_kernel();
 591         s = get_super(to_kdev_t(dev));
 592         if (s == NULL)
 593                 goto out;
 594         err = vfs_statfs(s, &sbuf);
 595         if (err)
 596                 goto out;
 597
 598         memset(&tmp,0,sizeof(struct ustat));
 599         tmp.f_tfree = sbuf.f_bfree;
 600         tmp.f_tinode = sbuf.f_ffree;
 601
 602         err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
 603 out:
 604         unlock_kernel();
 605         return err;
 606 }
 607
 608 /**
 609  *      get_empty_super -       find empty superblocks
 610  *
 611  *      Find a superblock with no device assigned. A free superblock is
 612  *      found and returned. If neccessary new superblocks are allocated.
 613  *      %NULL is returned if there are insufficient resources to complete
 614  *      the request.
 615  */
 616
 617 struct super_block *get_empty_super(void)
 618 {
 619         struct super_block *s;
 620
 621         for (s  = sb_entry(super_blocks.next);
 622              s != sb_entry(&super_blocks);
 623              s  = sb_entry(s->s_list.next)) {
 624                 if (s->s_dev)
 625                         continue;
 626                 if (!s->s_lock)
 627                         return s;
 628                 printk("VFS: empty superblock %p locked!\n", s);
 629         }
 630         /* Need a new one... */
 631         if (nr_super_blocks >= max_super_blocks)
 632                 return NULL;
 633         s = kmalloc(sizeof(struct super_block),  GFP_USER);
 634         if (s) {
 635                 nr_super_blocks++;
 636                 memset(s, 0, sizeof(struct super_block));
 637                 INIT_LIST_HEAD(&s->s_dirty);
 638                 list_add (&s->s_list, super_blocks.prev);
 639                 init_waitqueue_head(&s->s_wait);
 640                 INIT_LIST_HEAD(&s->s_files);
 641                 INIT_LIST_HEAD(&s->s_mounts);
 642         }
 643         return s;
 644 }
 645
 646 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
 647                                        struct file_system_type *type, int flags,
 648                                        void *data, int silent)
 649 {
 650         struct super_block * s;
 651         s = get_empty_super();
 652         if (!s)
 653                 goto out;
 654         s->s_dev = dev;
 655         s->s_bdev = bdev;
 656         s->s_flags = flags;
 657         s->s_dirt = 0;
 658         sema_init(&s->s_vfs_rename_sem,1);
 659         sema_init(&s->s_nfsd_free_path_sem,1);
 660         s->s_type = type;
 661         sema_init(&s->s_dquot.dqio_sem, 1);
 662         sema_init(&s->s_dquot.dqoff_sem, 1);
 663         s->s_dquot.flags = 0;
 664         lock_super(s);
 665         if (!type->read_super(s, data, silent))
 666                 goto out_fail;
 667         unlock_super(s);
 668         /* tell bdcache that we are going to keep this one */
 669         if (bdev)
 670                 atomic_inc(&bdev->bd_count);
 671 out:
 672         return s;
 673
 674 out_fail:
 675         s->s_dev = 0;
 676         s->s_bdev = 0;
 677         s->s_type = NULL;
 678         unlock_super(s);
 679         return NULL;
 680 }
 681
 682 /*
 683  * Unnamed block devices are dummy devices used by virtual
 684  * filesystems which don't use real block-devices.  -- jrs
 685  */
 686
 687 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))];
 688
 689 kdev_t get_unnamed_dev(void)
 690 {
 691         int i;
 692
 693         for (i = 1; i < 256; i++) {
 694                 if (!test_and_set_bit(i,unnamed_dev_in_use))
 695                         return MKDEV(UNNAMED_MAJOR, i);
 696         }
 697         return 0;
 698 }
 699
 700 void put_unnamed_dev(kdev_t dev)
 701 {
 702         if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
 703                 return;
 704         if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
 705                 return;
 706         printk("VFS: put_unnamed_dev: freeing unused device %s\n",
 707                         kdevname(dev));
 708 }
 709
 710 static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 711         char *dev_name, int flags, void * data)
 712 {
 713         struct inode *inode;
 714         struct block_device *bdev;
 715         struct block_device_operations *bdops;
 716         struct super_block * sb;
 717         struct nameidata nd;
 718         kdev_t dev;
 719         int error = 0;
 720         /* What device it is? */
 721         if (!dev_name || !*dev_name)
 722                 return ERR_PTR(-EINVAL);
 723         if (path_init(dev_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
 724                 error = path_walk(dev_name, &nd);
 725         if (error)
 726                 return ERR_PTR(error);
 727         inode = nd.dentry->d_inode;
 728         error = -ENOTBLK;
 729         if (!S_ISBLK(inode->i_mode))
 730                 goto out;
 731         error = -EACCES;
 732         if (IS_NODEV(inode))
 733                 goto out;
 734         bdev = inode->i_bdev;
 735         bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
 736         if (bdops) bdev->bd_op = bdops;
 737         /* Done with lookups, semaphore down */
 738         down(&mount_sem);
 739         dev = to_kdev_t(bdev->bd_dev);
 740         check_disk_change(dev);
 741         error = -EACCES;
 742         if (!(flags & MS_RDONLY) && is_read_only(dev))
 743                 goto out;
 744         sb = get_super(dev);
 745         if (sb) {
 746                 if (fs_type == sb->s_type) {
 747                         path_release(&nd);
 748                         return sb;
 749                 }
 750         } else {
 751                 mode_t mode = FMODE_READ; /* we always need it ;-) */
 752                 if (!(flags & MS_RDONLY))
 753                         mode |= FMODE_WRITE;
 754                 error = blkdev_get(bdev, mode, 0, BDEV_FS);
 755                 if (error)
 756                         goto out;
 757                 error = -EINVAL;
 758                 sb = read_super(dev, bdev, fs_type, flags, data, 0);
 759                 if (sb) {
 760                         get_filesystem(fs_type);
 761                         path_release(&nd);
 762                         return sb;
 763                 }
 764                 blkdev_put(bdev, BDEV_FS);
 765         }
 766 out:
 767         path_release(&nd);
 768         up(&mount_sem);
 769         return ERR_PTR(error);
 770 }
 771
 772 static struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 773         int flags, void * data)
 774 {
 775         kdev_t dev;
 776         int error = -EMFILE;
 777         down(&mount_sem);
 778         dev = get_unnamed_dev();
 779         if (dev) {
 780                 struct super_block * sb;
 781                 error = -EINVAL;
 782                 sb = read_super(dev, NULL, fs_type, flags, data, 0);
 783                 if (sb) {
 784                         get_filesystem(fs_type);
 785                         return sb;
 786                 }
 787                 put_unnamed_dev(dev);
 788         }
 789         up(&mount_sem);
 790         return ERR_PTR(error);
 791 }
 792
 793 static struct super_block *get_sb_single(struct file_system_type *fs_type,
 794         int flags, void *data)
 795 {
 796         struct super_block * sb;
 797         /*
 798          * Get the superblock of kernel-wide instance, but
 799          * keep the reference to fs_type.
 800          */
 801         down(&mount_sem);
 802         sb = fs_type->kern_mnt->mnt_sb;
 803         if (!sb)
 804                 BUG();
 805         get_filesystem(fs_type);
 806         do_remount_sb(sb, flags, data);
 807         return sb;
 808 }
 809
 810 static struct block_device *kill_super(struct super_block *sb, int umount_root)
 811 {
 812         struct block_device *bdev;
 813         kdev_t dev;
 814         dput(sb->s_root);
 815         sb->s_root = NULL;
 816         lock_super(sb);
 817         if (sb->s_op) {
 818                 if (sb->s_op->write_super && sb->s_dirt)
 819                         sb->s_op->write_super(sb);
 820                 if (sb->s_op->put_super)
 821                         sb->s_op->put_super(sb);
 822         }
 823
 824         /* Forget any remaining inodes */
 825         if (invalidate_inodes(sb)) {
 826                 printk("VFS: Busy inodes after unmount. "
 827                         "Self-destruct in 5 seconds.  Have a nice day...\n");
 828         }
 829
 830         dev = sb->s_dev;
 831         sb->s_dev = 0;          /* Free the superblock */
 832         bdev = sb->s_bdev;
 833         sb->s_bdev = NULL;
 834         put_filesystem(sb->s_type);
 835         sb->s_type = NULL;
 836         unlock_super(sb);
 837         if (umount_root) {
 838                 /* special: the old device driver is going to be
 839                    a ramdisk and the point of this call is to free its
 840                    protected memory (even if dirty). */
 841                 destroy_buffers(dev);
 842         }
 843         if (bdev) {
 844                 blkdev_put(bdev, BDEV_FS);
 845                 bdput(bdev);
 846         } else
 847                 put_unnamed_dev(dev);
 848         return bdev;
 849 }
 850
 851 /*
 852  * Alters the mount flags of a mounted file system. Only the mount point
 853  * is used as a reference - file system type and the device are ignored.
 854  */
 855
 856 static int do_remount_sb(struct super_block *sb, int flags, char *data)
 857 {
 858         int retval;
 859
 860         if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
 861                 return -EACCES;
 862                 /*flags |= MS_RDONLY;*/
 863         /* If we are remounting RDONLY, make sure there are no rw files open */
 864         if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
 865                 if (!fs_may_remount_ro(sb))
 866                         return -EBUSY;
 867         if (sb->s_op && sb->s_op->remount_fs) {
 868                 lock_super(sb);
 869                 retval = sb->s_op->remount_fs(sb, &flags, data);
 870                 unlock_super(sb);
 871                 if (retval)
 872                         return retval;
 873         }
 874         sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 875
 876         /*
 877          * We can't invalidate inodes as we can loose data when remounting
 878          * (someone might manage to alter data while we are waiting in lock_super()
 879          * or in foo_remount_fs()))
 880          */
 881
 882         return 0;
 883 }
 884
 885 struct vfsmount *kern_mount(struct file_system_type *type)
 886 {
 887         kdev_t dev = get_unnamed_dev();
 888         struct super_block *sb;
 889         struct vfsmount *mnt;
 890         if (!dev)
 891                 return ERR_PTR(-EMFILE);
 892         sb = read_super(dev, NULL, type, 0, NULL, 0);
 893         if (!sb) {
 894                 put_unnamed_dev(dev);
 895                 return ERR_PTR(-EINVAL);
 896         }
 897         mnt = add_vfsmnt(sb, sb->s_root, sb->s_root, NULL, "none", type->name);
 898         if (!mnt) {
 899                 kill_super(sb, 0);
 900                 return ERR_PTR(-ENOMEM);
 901         }
 902         type->kern_mnt = mnt;
 903         return mnt;
 904 }
 905
 906 /* Call only after unregister_filesystem() - it's a final cleanup */
 907
 908 void kern_umount(struct vfsmount *mnt)
 909 {
 910         struct super_block *sb = mnt->mnt_sb;
 911         struct dentry *root = sb->s_root;
 912         remove_vfsmnt(mnt);
 913         dput(root);
 914         sb->s_root = NULL;
 915         kill_super(sb, 0);
 916 }
 917
 918 /*
 919  * Doesn't take quota and stuff into account. IOW, in some cases it will
 920  * give false negatives. The main reason why it's here is that we need
 921  * a non-destructive way to look for easily umountable filesystems.
 922  */
 923 int may_umount(struct vfsmount *mnt)
 924 {
 925         if (atomic_read(&mnt->mnt_count) > 2)
 926                 return -EBUSY;
 927         return 0;
 928 }
 929
 930 static int do_umount(struct vfsmount *mnt, int umount_root, int flags)
 931 {
 932         struct super_block * sb = mnt->mnt_sb;
 933
 934         if (mnt == current->fs->rootmnt && !umount_root) {
 935                 int retval = 0;
 936                 /*
 937                  * Special case for "unmounting" root ...
 938                  * we just try to remount it readonly.
 939                  */
 940                 mntput(mnt);
 941                 if (!(sb->s_flags & MS_RDONLY))
 942                         retval = do_remount_sb(sb, MS_RDONLY, 0);
 943                 return retval;
 944         }
 945
 946         if (atomic_read(&mnt->mnt_count) > 2) {
 947                 mntput(mnt);
 948                 return -EBUSY;
 949         }
 950
 951         if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {
 952                 if (sb->s_type->fs_flags & FS_SINGLE)
 953                         put_filesystem(sb->s_type);
 954                 mntput(mnt);
 955                 remove_vfsmnt(mnt);
 956                 return 0;
 957         }
 958
 959         /*
 960          * Before checking whether the filesystem is still busy,
 961          * make sure the kernel doesn't hold any quota files open
 962          * on the device. If the umount fails, too bad -- there
 963          * are no quotas running any more. Just turn them on again.
 964          */
 965         DQUOT_OFF(sb);
 966         acct_auto_close(sb->s_dev);
 967
 968         /*
 969          * If we may have to abort operations to get out of this
 970          * mount, and they will themselves hold resources we must
 971          * allow the fs to do things. In the Unix tradition of
 972          * 'Gee thats tricky lets do it in userspace' the umount_begin
 973          * might fail to complete on the first run through as other tasks
 974          * must return, and the like. Thats for the mount program to worry
 975          * about for the moment.
 976          */
 977
 978         if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
 979                 sb->s_op->umount_begin(sb);
 980
 981         /*
 982          * Shrink dcache, then fsync. This guarantees that if the
 983          * filesystem is quiescent at this point, then (a) only the
 984          * root entry should be in use and (b) that root entry is
 985          * clean.
 986          */
 987         shrink_dcache_sb(sb);
 988         fsync_dev(sb->s_dev);
 989
 990         /* Something might grab it again - redo checks */
 991
 992         if (atomic_read(&mnt->mnt_count) > 2) {
 993                 mntput(mnt);
 994                 return -EBUSY;
 995         }
 996
 997         if (sb->s_root->d_inode->i_state) {
 998                 mntput(mnt);
 999                 return -EBUSY;
1000         }
1001
1002         /* OK, that's the point of no return */
1003         mntput(mnt);
1004         remove_vfsmnt(mnt);
1005
1006         kill_super(sb, umount_root);
1007         return 0;
1008 }
1009
1010 /*
1011  * Now umount can handle mount points as well as block devices.
1012  * This is important for filesystems which use unnamed block devices.
1013  *
1014  * We now support a flag for forced unmount like the other 'big iron'
1015  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1016  */
1017
1018 asmlinkage long sys_umount(char * name, int flags)
1019 {
1020         struct nameidata nd;
1021         char *kname;
1022         int retval;
1023         struct super_block *sb;
1024
1025         if (!capable(CAP_SYS_ADMIN))
1026                 return -EPERM;
1027
1028         lock_kernel();
1029         kname = getname(name);
1030         retval = PTR_ERR(kname);
1031         if (IS_ERR(kname))
1032                 goto out;
1033         retval = 0;
1034         if (path_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
1035                 retval = path_walk(kname, &nd);
1036         putname(kname);
1037         if (retval)
1038                 goto out;
1039         sb = nd.dentry->d_inode->i_sb;
1040         retval = -EINVAL;
1041         if (nd.dentry!=nd.mnt->mnt_root)
1042                 goto dput_and_out;
1043         dput(nd.dentry);
1044         /* puts nd.mnt */
1045         down(&mount_sem);
1046         retval = do_umount(nd.mnt, 0, flags);
1047         up(&mount_sem);
1048         goto out;
1049 dput_and_out:
1050         path_release(&nd);
1051 out:
1052         unlock_kernel();
1053         return retval;
1054 }
1055
1056 /*
1057  *      The 2.0 compatible umount. No flags.
1058  */
1059
1060 asmlinkage long sys_oldumount(char * name)
1061 {
1062         return sys_umount(name,0);
1063 }
1064
1065 /*
1066  * do loopback mount.
1067  */
1068 static int do_loopback(char *old_name, char *new_name)
1069 {
1070         struct nameidata old_nd, new_nd;
1071         int err = 0;
1072         if (!old_name || !*old_name)
1073                 return -EINVAL;
1074         if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &old_nd))
1075                 err = path_walk(old_name, &old_nd);
1076         if (err)
1077                 goto out;
1078         if (path_init(new_name, LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &new_nd))
1079                 err = path_walk(new_name, &new_nd);
1080         if (err)
1081                 goto out1;
1082         err = -EPERM;
1083         if (!capable(CAP_SYS_ADMIN) &&
1084              current->uid != new_nd.dentry->d_inode->i_uid)
1085                 goto out2;
1086         down(&mount_sem);
1087         err = -ENOENT;
1088         if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
1089                 goto out3;
1090         if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
1091                 goto out3;
1092         /* there we go */
1093         err = -ENOMEM;
1094         if (old_nd.mnt->mnt_sb->s_type->fs_flags & FS_SINGLE)
1095                 get_filesystem(old_nd.mnt->mnt_sb->s_type);
1096         if (add_vfsmnt(old_nd.mnt->mnt_sb, new_nd.dentry, old_nd.dentry,
1097                        new_nd.mnt, old_nd.mnt->mnt_devname, new_name))
1098                 err = 0;
1099 out3:
1100         up(&mount_sem);
1101 out2:
1102         path_release(&new_nd);
1103 out1:
1104         path_release(&old_nd);
1105 out:
1106         return err;
1107 }
1108
1109 /*
1110  * change filesystem flags. dir should be a physical root of filesystem.
1111  * If you've mounted a non-root directory somewhere and want to do remount
1112  * on it - tough luck.
1113  */
1114
1115 static int do_remount(const char *dir,int flags,char *data)
1116 {
1117         struct nameidata nd;
1118         int retval = 0;
1119
1120         if (!capable(CAP_SYS_ADMIN))
1121                 return -EPERM;
1122
1123         if (path_init(dir, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
1124                 retval = path_walk(dir, &nd);
1125         if (!retval) {
1126                 struct super_block * sb = nd.dentry->d_inode->i_sb;
1127                 retval = -ENODEV;
1128                 if (sb) {
1129                         retval = -EINVAL;
1130                         if (nd.dentry == sb->s_root) {
1131                                 /*
1132                                  * Shrink the dcache and sync the device.
1133                                  */
1134                                 shrink_dcache_sb(sb);
1135                                 fsync_dev(sb->s_dev);
1136                                 if (flags & MS_RDONLY)
1137                                         acct_auto_close(sb->s_dev);
1138                                 retval = do_remount_sb(sb, flags, data);
1139                         }
1140                 }
1141                 path_release(&nd);
1142         }
1143         return retval;
1144 }
1145
1146 static int copy_mount_options (const void * data, unsigned long *where)
1147 {
1148         int i;
1149         unsigned long page;
1150         struct vm_area_struct * vma;
1151
1152         *where = 0;
1153         if (!data)
1154                 return 0;
1155
1156         vma = find_vma(current->mm, (unsigned long) data);
1157         if (!vma || (unsigned long) data < vma->vm_start)
1158                 return -EFAULT;
1159         if (!(vma->vm_flags & VM_READ))
1160                 return -EFAULT;
1161         i = vma->vm_end - (unsigned long) data;
1162         if (PAGE_SIZE <= (unsigned long) i)
1163                 i = PAGE_SIZE-1;
1164         if (!(page = __get_free_page(GFP_KERNEL))) {
1165                 return -ENOMEM;
1166         }
1167         if (copy_from_user((void *) page,data,i)) {
1168                 free_page(page);
1169                 return -EFAULT;
1170         }
1171         *where = page;
1172         return 0;
1173 }
1174
1175 /*
1176  * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1177  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1178  *
1179  * data is a (void *) that can point to any structure up to
1180  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1181  * information (or be NULL).
1182  *
1183  * NOTE! As old versions of mount() didn't use this setup, the flags
1184  * have to have a special 16-bit magic number in the high word:
1185  * 0xC0ED. If this magic word isn't present, the flags and data info
1186  * aren't used, as the syscall assumes we are talking to an older
1187  * version that didn't understand them.
1188  */
1189 long do_sys_mount(char * dev_name, char * dir_name, char *type_page,
1190                   unsigned long new_flags, void *data_page)
1191 {
1192         struct file_system_type * fstype;
1193         struct nameidata nd;
1194         struct vfsmount *mnt;
1195         struct super_block *sb;
1196         int retval = 0;
1197         unsigned long flags = 0;
1198
1199         /* Basic sanity checks */
1200
1201         if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1202                 return -EINVAL;
1203         if (!type_page || !memchr(type_page, 0, PAGE_SIZE))
1204                 return -EINVAL;
1205         if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1206                 return -EINVAL;
1207
1208         /* OK, looks good, now let's see what do they want */
1209
1210         /* just change the flags? - capabilities are checked in do_remount() */
1211         if ((new_flags & (MS_MGC_MSK|MS_REMOUNT)) == (MS_MGC_VAL|MS_REMOUNT))
1212                 return do_remount(dir_name, new_flags&~(MS_MGC_MSK|MS_REMOUNT),
1213                                     (char *) data_page);
1214
1215         if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL)
1216                 flags = new_flags & ~MS_MGC_MSK;
1217
1218         /* loopback mount? This is special - requires fewer capabilities */
1219         if (strcmp(type_page, "bind")==0)
1220                 return do_loopback(dev_name, dir_name);
1221
1222         /* for the rest we _really_ need capabilities... */
1223         if (!capable(CAP_SYS_ADMIN))
1224                 return -EPERM;
1225
1226         /* ... filesystem driver... */
1227         fstype = get_fs_type(type_page);
1228         if (!fstype)
1229                 return -ENODEV;
1230
1231         /* ... and mountpoint. Do the lookup first to force automounting. */
1232         if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1233                 retval = path_walk(dir_name, &nd);
1234         if (retval)
1235                 goto fs_out;
1236
1237         /* get superblock, locks mount_sem on success */
1238         if (fstype->fs_flags & FS_NOMOUNT)
1239                 sb = ERR_PTR(-EINVAL);
1240         else if (fstype->fs_flags & FS_REQUIRES_DEV)
1241                 sb = get_sb_bdev(fstype, dev_name,flags, data_page);
1242         else if (fstype->fs_flags & FS_SINGLE)
1243                 sb = get_sb_single(fstype, flags, data_page);
1244         else
1245                 sb = get_sb_nodev(fstype, flags, data_page);
1246
1247         retval = PTR_ERR(sb);
1248         if (IS_ERR(sb))
1249                 goto dput_out;
1250
1251         retval = -ENOENT;
1252         if (d_unhashed(nd.dentry) && !IS_ROOT(nd.dentry))
1253                 goto fail;
1254
1255         /* Something was mounted here while we slept */
1256         while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
1257                 ;
1258
1259         retval = -ENOMEM;
1260         mnt = add_vfsmnt(sb, nd.dentry, sb->s_root, nd.mnt, dev_name, dir_name);
1261         if (!mnt)
1262                 goto fail;
1263         retval = 0;
1264 unlock_out:
1265         up(&mount_sem);
1266 dput_out:
1267         path_release(&nd);
1268 fs_out:
1269         put_filesystem(fstype);
1270         return retval;
1271
1272 fail:
1273         if (list_empty(&sb->s_mounts))
1274                 kill_super(sb, 0);
1275         goto unlock_out;
1276 }
1277
1278 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1279                           unsigned long new_flags, void * data)
1280 {
1281         int retval;
1282         unsigned long data_page = 0;
1283         unsigned long type_page = 0;
1284         unsigned long dev_page = 0;
1285         char *dir_page;
1286
1287         lock_kernel();
1288         retval = copy_mount_options (type, &type_page);
1289         if (retval < 0)
1290                 goto out;
1291
1292         /* copy_mount_options allows a NULL user pointer,
1293          * and just returns zero in that case.  But if we
1294          * allow the type to be NULL we will crash.
1295          * Previously we did not check this case.
1296          */
1297         if (type_page == 0) {
1298                 retval = -EINVAL;
1299                 goto out;
1300         }
1301
1302         dir_page = getname(dir_name);
1303         retval = PTR_ERR(dir_page);
1304         if (IS_ERR(dir_page))
1305                 goto out1;
1306
1307         retval = copy_mount_options (dev_name, &dev_page);
1308         if (retval < 0)
1309                 goto out2;
1310         retval = copy_mount_options (data, &data_page);
1311         if (retval >= 0) {
1312                 retval = do_sys_mount((char*)dev_page,dir_page,(char*)type_page,
1313                                       new_flags, (void*)data_page);
1314                 free_page(data_page);
1315         }
1316         free_page(dev_page);
1317 out2:
1318         putname(dir_page);
1319 out1:
1320         free_page(type_page);
1321 out:
1322         unlock_kernel();
1323         return retval;
1324 }
1325
1326 void __init mount_root(void)
1327 {
1328         struct file_system_type * fs_type;
1329         struct super_block * sb;
1330         struct vfsmount *vfsmnt;
1331         struct block_device *bdev = NULL;
1332         mode_t mode;
1333         int retval;
1334         void *handle;
1335         char path[64];
1336         int path_start = -1;
1337
1338 #ifdef CONFIG_ROOT_NFS
1339         void *data;
1340         if (MAJOR(ROOT_DEV) != UNNAMED_MAJOR)
1341                 goto skip_nfs;
1342         fs_type = get_fs_type("nfs");
1343         if (!fs_type)
1344                 goto no_nfs;
1345         ROOT_DEV = get_unnamed_dev();
1346         if (!ROOT_DEV)
1347                 /*
1348                  * Your /linuxrc sucks worse than MSExchange - that's the
1349                  * only way you could run out of anon devices at that point.
1350                  */
1351                 goto no_anon;
1352         data = nfs_root_data();
1353         if (!data)
1354                 goto no_server;
1355         sb = read_super(ROOT_DEV, NULL, fs_type, root_mountflags, data, 1);
1356         if (sb)
1357                 /*
1358                  * We _can_ fail there, but if that will happen we have no
1359                  * chance anyway (no memory for vfsmnt and we _will_ need it,
1360                  * no matter which fs we try to mount).
1361                  */
1362                 goto mount_it;
1363 no_server:
1364         put_unnamed_dev(ROOT_DEV);
1365 no_anon:
1366         put_filesystem(fs_type);
1367 no_nfs:
1368         printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
1369         ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
1370 skip_nfs:
1371 #endif
1372
1373 #ifdef CONFIG_BLK_DEV_FD
1374         if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
1375 #ifdef CONFIG_BLK_DEV_RAM
1376                 extern int rd_doload;
1377                 extern void rd_load_secondary(void);
1378 #endif
1379                 floppy_eject();
1380 #ifndef CONFIG_BLK_DEV_RAM
1381                 printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
1382 #else
1383                 /* rd_doload is 2 for a dual initrd/ramload setup */
1384                 if(rd_doload==2)
1385                         rd_load_secondary();
1386                 else
1387 #endif
1388                 {
1389                         printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
1390                         wait_for_keypress();
1391                 }
1392         }
1393 #endif
1394
1395         devfs_make_root (root_device_name);
1396         handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME, 0,
1397                                     MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
1398                                     DEVFS_SPECIAL_BLK, 1);
1399         if (handle)  /*  Sigh: bd*() functions only paper over the cracks  */
1400         {
1401             unsigned major, minor;
1402
1403             devfs_get_maj_min (handle, &major, &minor);
1404             ROOT_DEV = MKDEV (major, minor);
1405         }
1406
1407         /*
1408          * Probably pure paranoia, but I'm less than happy about delving into
1409          * devfs crap and checking it right now. Later.
1410          */
1411         if (!ROOT_DEV)
1412                 panic("I have no root and I want to scream");
1413
1414         bdev = bdget(kdev_t_to_nr(ROOT_DEV));
1415         if (!bdev)
1416                 panic(__FUNCTION__ ": unable to allocate root device");
1417         bdev->bd_op = devfs_get_ops (handle);
1418         path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
1419         mode = FMODE_READ;
1420         if (!(root_mountflags & MS_RDONLY))
1421                 mode |= FMODE_WRITE;
1422         retval = blkdev_get(bdev, mode, 0, BDEV_FS);
1423         if (retval == -EROFS) {
1424                 root_mountflags |= MS_RDONLY;
1425                 retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
1426         }
1427         if (retval) {
1428                 /*
1429                  * Allow the user to distinguish between failed open
1430                  * and bad superblock on root device.
1431                  */
1432                 printk ("VFS: Cannot open root device \"%s\" or %s\n",
1433                         root_device_name, kdevname (ROOT_DEV));
1434                 printk ("Please append a correct \"root=\" boot option\n");
1435                 panic("VFS: Unable to mount root fs on %s",
1436                         kdevname(ROOT_DEV));
1437         }
1438
1439         check_disk_change(ROOT_DEV);
1440         sb = get_super(ROOT_DEV);
1441         if (sb) {
1442                 fs_type = sb->s_type;
1443                 goto mount_it;
1444         }
1445
1446         read_lock(&file_systems_lock);
1447         for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
1448                 if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
1449                         continue;
1450                 if (!try_inc_mod_count(fs_type->owner))
1451                         continue;
1452                 read_unlock(&file_systems_lock);
1453                 sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
1454                 if (sb)
1455                         goto mount_it;
1456                 read_lock(&file_systems_lock);
1457                 put_filesystem(fs_type);
1458         }
1459         read_unlock(&file_systems_lock);
1460         panic("VFS: Unable to mount root fs on %s", kdevname(ROOT_DEV));
1461
1462 mount_it:
1463         printk ("VFS: Mounted root (%s filesystem)%s.\n",
1464                 fs_type->name,
1465                 (sb->s_flags & MS_RDONLY) ? " readonly" : "");
1466         if (path_start >= 0) {
1467                 devfs_mk_symlink (NULL,
1468                                   "root", 0, DEVFS_FL_DEFAULT,
1469                                   path + 5 + path_start, 0,
1470                                   NULL, NULL);
1471                 memcpy (path + path_start, "/dev/", 5);
1472                 vfsmnt = add_vfsmnt (sb, sb->s_root, sb->s_root, NULL,
1473                                         path + path_start, "/");
1474         }
1475         else
1476                 vfsmnt = add_vfsmnt (sb, sb->s_root, sb->s_root, NULL,
1477                                         "/dev/root", "/");
1478         if (vfsmnt) {
1479                 set_fs_root(current->fs, vfsmnt, sb->s_root);
1480                 set_fs_pwd(current->fs, vfsmnt, sb->s_root);
1481                 if (bdev)
1482                         bdput(bdev); /* sb holds a reference */
1483                 return;
1484         }
1485         panic("VFS: add_vfsmnt failed for root fs");
1486 }
1487
1488
1489 static void chroot_fs_refs(struct dentry *old_root,
1490                            struct vfsmount *old_rootmnt,
1491                            struct dentry *new_root,
1492                            struct vfsmount *new_rootmnt)
1493 {
1494         struct task_struct *p;
1495
1496         /* We can't afford dput() blocking under the tasklist_lock */
1497         mntget(old_rootmnt);
1498         dget(old_root);
1499
1500         read_lock(&tasklist_lock);
1501         for_each_task(p) {
1502                 if (!p->fs) continue;
1503                 if (p->fs->root == old_root && p->fs->rootmnt == old_rootmnt)
1504                         set_fs_root(p->fs, new_rootmnt, new_root);
1505                 if (p->fs->pwd == old_root && p->fs->pwdmnt == old_rootmnt)
1506                         set_fs_pwd(p->fs, new_rootmnt, new_root);
1507         }
1508         read_unlock(&tasklist_lock);
1509
1510         dput(old_root);
1511         mntput(old_rootmnt);
1512 }
1513
1514 /*
1515  * Moves the current root to put_root, and sets root/cwd of all processes
1516  * which had them on the old root to new_root.
1517  *
1518  * Note:
1519  *  - we don't move root/cwd if they are not at the root (reason: if something
1520  *    cared enough to change them, it's probably wrong to force them elsewhere)
1521  *  - it's okay to pick a root that isn't the root of a file system, e.g.
1522  *    /nfs/my_root where /nfs is the mount point. Better avoid creating
1523  *    unreachable mount points this way, though.
1524  */
1525
1526 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1527 {
1528         struct dentry *root = current->fs->root;
1529         struct vfsmount *root_mnt = current->fs->rootmnt;
1530         struct vfsmount *tmp;
1531         struct nameidata new_nd, old_nd;
1532         char *name;
1533         int error;
1534
1535         if (!capable(CAP_SYS_ADMIN))
1536                 return -EPERM;
1537
1538         lock_kernel();
1539
1540         name = getname(new_root);
1541         error = PTR_ERR(name);
1542         if (IS_ERR(name))
1543                 goto out0;
1544         error = 0;
1545         if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
1546                 error = path_walk(name, &new_nd);
1547         putname(name);
1548         if (error)
1549                 goto out0;
1550
1551         name = getname(put_old);
1552         error = PTR_ERR(name);
1553         if (IS_ERR(name))
1554                 goto out0;
1555         error = 0;
1556         if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
1557                 error = path_walk(name, &old_nd);
1558         putname(name);
1559         if (error)
1560                 goto out1;
1561
1562         down(&mount_sem);
1563         error = -ENOENT;
1564         if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
1565                 goto out2;
1566         if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
1567                 goto out2;
1568         error = -EBUSY;
1569         if (new_nd.mnt == root_mnt || old_nd.mnt == root_mnt)
1570                 goto out2; /* loop */
1571         error = -EINVAL;
1572         tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
1573         if (tmp != new_nd.mnt) {
1574                 for (;;) {
1575                         if (tmp->mnt_parent == tmp)
1576                                 goto out2;
1577                         if (tmp->mnt_parent == new_nd.mnt)
1578                                 break;
1579                         tmp = tmp->mnt_parent;
1580                 }
1581                 if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry))
1582                         goto out2;
1583         } else if (!is_subdir(old_nd.dentry, new_nd.dentry))
1584                 goto out2;
1585
1586         error = -ENOMEM;
1587         name = __getname();
1588         if (!name)
1589                 goto out2;
1590
1591         move_vfsmnt(new_nd.mnt, new_nd.dentry, NULL, NULL, "/");
1592         move_vfsmnt(root_mnt, old_nd.dentry, old_nd.mnt, NULL,
1593                         __d_path(old_nd.dentry, old_nd.mnt, new_nd.dentry,
1594                                 new_nd.mnt, name, PAGE_SIZE));
1595         putname(name);
1596         chroot_fs_refs(root,root_mnt,new_nd.dentry,new_nd.mnt);
1597         error = 0;
1598 out2:
1599         up(&mount_sem);
1600         path_release(&old_nd);
1601 out1:
1602         path_release(&new_nd);
1603 out0:
1604         unlock_kernel();
1605         return error;
1606 }
1607
1608
1609 #ifdef CONFIG_BLK_DEV_INITRD
1610
1611 int __init change_root(kdev_t new_root_dev,const char *put_old)
1612 {
1613         kdev_t old_root_dev = ROOT_DEV;
1614         struct vfsmount *old_rootmnt = mntget(current->fs->rootmnt);
1615         struct nameidata devfs_nd, nd;
1616         int error = 0;
1617
1618         /*  First unmount devfs if mounted  */
1619         if (path_init("/dev", LOOKUP_FOLLOW|LOOKUP_POSITIVE, &devfs_nd))
1620                 error = path_walk("/dev", &devfs_nd);
1621         if (!error) {
1622                 struct super_block *sb = devfs_nd.dentry->d_inode->i_sb;
1623
1624                 if (devfs_nd.mnt->mnt_sb->s_magic == DEVFS_SUPER_MAGIC &&
1625                     devfs_nd.dentry == devfs_nd.mnt->mnt_root) {
1626                         dput(devfs_nd.dentry);
1627                         down(&mount_sem);
1628                         /* puts devfs_nd.mnt */
1629                         do_umount(devfs_nd.mnt, 0, 0);
1630                         up(&mount_sem);
1631                 } else
1632                         path_release(&devfs_nd);
1633         }
1634         ROOT_DEV = new_root_dev;
1635         mount_root();
1636 #if 1
1637         shrink_dcache();
1638         printk("change_root: old root has d_count=%d\n",
1639                old_rootmnt->mnt_root->d_count);
1640 #endif
1641         mount_devfs_fs ();
1642         /*
1643          * Get the new mount directory
1644          */
1645         error = 0;
1646         if (path_init(put_old, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1647                 error = path_walk(put_old, &nd);
1648         if (error) {
1649                 int blivet;
1650
1651                 printk(KERN_NOTICE "Trying to unmount old root ... ");
1652                 blivet = do_umount(old_rootmnt, 1, 0);
1653                 if (!blivet) {
1654                         printk("okay\n");
1655                         return 0;
1656                 }
1657                 printk(KERN_ERR "error %ld\n",blivet);
1658                 return error;
1659         }
1660         move_vfsmnt(old_rootmnt, nd.dentry, nd.mnt, "/dev/root.old", put_old);
1661         mntput(old_rootmnt);
1662         path_release(&nd);
1663         return 0;
1664 }
1665
1666 #endif