kernel/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016 Joyent, Inc.
  25  * Copyright 2016 Toomas Soome <tsoome@me.com>
  26  * Copyright 2016 Nexenta Systems, Inc.
  27  * Copyright (c) 2016 by Delphix. All rights reserved.
  28  * Copyright 2016 Nexenta Systems, Inc.
  29  * Copyright 2017 RackTop Systems.
  30  */
  31
  32 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  33 /*        All Rights Reserved   */
  34
  35 /*
  36  * University Copyright- Copyright (c) 1982, 1986, 1988
  37  * The Regents of the University of California
  38  * All Rights Reserved
  39  *
  40  * University Acknowledgment- Portions of this document are derived from
  41  * software developed by the University of California, Berkeley, and its
  42  * contributors.
  43  */
  44
  45 #include <sys/types.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/errno.h>
  49 #include <sys/user.h>
  50 #include <sys/fstyp.h>
  51 #include <sys/kmem.h>
  52 #include <sys/systm.h>
  53 #include <sys/proc.h>
  54 #include <sys/mount.h>
  55 #include <sys/vfs.h>
  56 #include <sys/vfs_dispatch.h>
  57 #include <sys/fem.h>
  58 #include <sys/mntent.h>
  59 #include <sys/stat.h>
  60 #include <sys/statvfs.h>
  61 #include <sys/statfs.h>
  62 #include <sys/cred.h>
  63 #include <sys/vnode.h>
  64 #include <sys/rwstlock.h>
  65 #include <sys/dnlc.h>
  66 #include <sys/file.h>
  67 #include <sys/time.h>
  68 #include <sys/atomic.h>
  69 #include <sys/cmn_err.h>
  70 #include <sys/buf.h>
  71 #include <sys/swap.h>
  72 #include <sys/debug.h>
  73 #include <sys/vnode.h>
  74 #include <sys/modctl.h>
  75 #include <sys/ddi.h>
  76 #include <sys/pathname.h>
  77 #include <sys/bootconf.h>
  78 #include <sys/dumphdr.h>
  79 #include <sys/poll.h>
  80 #include <sys/sunddi.h>
  81 #include <sys/sysmacros.h>
  82 #include <sys/zone.h>
  83 #include <sys/policy.h>
  84 #include <sys/ctfs.h>
  85 #include <sys/objfs.h>
  86 #include <sys/console.h>
  87 #include <sys/reboot.h>
  88 #include <sys/attr.h>
  89 #include <sys/zio.h>
  90 #include <sys/spa.h>
  91 #include <sys/lofi.h>
  92 #include <sys/bootprops.h>
  93
  94 #include <vm/page.h>
  95
  96 #include <sys/fs_subr.h>
  97 /* Private interfaces to create vopstats-related data structures */
  98 extern void             initialize_vopstats(vopstats_t *);
  99 extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
 100 extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
 101
 102 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
 103 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
 104     const char *, int, int);
 105 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
 106 static void vfs_freemnttab(struct vfs *);
 107 static void vfs_freeopt(mntopt_t *);
 108 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
 109 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
 110 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
 111 static void vfs_createopttbl_extend(mntopts_t *, const char *,
 112     const mntopts_t *);
 113 static char **vfs_copycancelopt_extend(char **const, int);
 114 static void vfs_freecancelopt(char **);
 115 static void getrootfs(char **, char **);
 116 static int getmacpath(dev_info_t *, void *);
 117 static void vfs_mnttabvp_setup(void);
 118
 119 struct ipmnt {
 120         struct ipmnt    *mip_next;
 121         dev_t           mip_dev;
 122         struct vfs      *mip_vfsp;
 123 };
 124
 125 static kmutex_t         vfs_miplist_mutex;
 126 static struct ipmnt     *vfs_miplist = NULL;
 127 static struct ipmnt     *vfs_miplist_end = NULL;
 128
 129 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
 130
 131 /*
 132  * VFS global data.
 133  */
 134 vnode_t *rootdir;               /* pointer to root inode vnode. */
 135 vnode_t *devicesdir;            /* pointer to inode of devices root */
 136 vnode_t *devdir;                /* pointer to inode of dev root */
 137
 138 char *server_rootpath;          /* root path for diskless clients */
 139 char *server_hostname;          /* hostname of diskless server */
 140
 141 static struct vfs root;
 142 static struct vfs devices;
 143 static struct vfs dev;
 144 struct vfs *rootvfs = &root;    /* pointer to root vfs; head of VFS list. */
 145 rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
 146 int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
 147                                 /* must be power of 2!  */
 148 timespec_t vfs_mnttab_ctime;    /* mnttab created time */
 149 timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
 150 char *vfs_dummyfstype = "\0";
 151 struct pollhead vfs_pollhd;     /* for mnttab pollers */
 152 struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
 153 int     mntfstype;              /* will be set once mnt fs is mounted */
 154
 155 /*
 156  * Table for generic options recognized in the VFS layer and acted
 157  * on at this level before parsing file system specific options.
 158  * The nosuid option is stronger than any of the devices and setuid
 159  * options, so those are canceled when nosuid is seen.
 160  *
 161  * All options which are added here need to be added to the
 162  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 163  */
 164 /*
 165  * VFS Mount options table
 166  */
 167 static char *ro_cancel[] = { MNTOPT_RW, NULL };
 168 static char *rw_cancel[] = { MNTOPT_RO, NULL };
 169 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
 170 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
 171     MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
 172 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
 173 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
 174 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
 175 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
 176 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
 177 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
 178 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
 179 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 180
 181 static const mntopt_t mntopts[] = {
 182 /*
 183  *      option name             cancel options          default arg     flags
 184  */
 185         { MNTOPT_REMOUNT,       NULL,                   NULL,
 186                 MO_NODISPLAY, NULL },
 187         { MNTOPT_RO,            ro_cancel,              NULL,           0,
 188                 NULL },
 189         { MNTOPT_RW,            rw_cancel,              NULL,           0,
 190                 NULL },
 191         { MNTOPT_SUID,          suid_cancel,            NULL,           0,
 192                 NULL },
 193         { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
 194                 NULL },
 195         { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
 196                 NULL },
 197         { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
 198                 NULL },
 199         { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
 200                 NULL },
 201         { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
 202                 NULL },
 203         { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
 204                 NULL },
 205         { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
 206                 NULL },
 207         { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
 208                 NULL },
 209         { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
 210                 NULL },
 211 };
 212
 213 const mntopts_t vfs_mntopts = {
 214         sizeof (mntopts) / sizeof (mntopt_t),
 215         (mntopt_t *)&mntopts[0]
 216 };
 217
 218 /*
 219  * File system operation dispatch functions.
 220  */
 221
 222 int
 223 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 224 {
 225         return fsop_mount_dispatch(vfsp, mvp, uap, cr, true);
 226 }
 227
 228 int
 229 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 230 {
 231         return fsop_unmount_dispatch(vfsp, flag, cr, true);
 232 }
 233
 234 int
 235 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 236 {
 237         refstr_t *mntpt;
 238         int ret;
 239
 240         ret = fsop_root_dispatch(vfsp, vpp, true);
 241
 242         /*
 243          * Make sure this root has a path.  With lofs, it is possible to have
 244          * a NULL mountpoint.
 245          */
 246         if (ret == 0 && vfsp->vfs_mntpt != NULL &&
 247             (*vpp)->v_path == vn_vpath_empty) {
 248                 const char *path;
 249
 250                 mntpt = vfs_getmntpoint(vfsp);
 251                 path = refstr_value(mntpt);
 252                 vn_setpath_str(*vpp, path, strlen(path));
 253                 refstr_rele(mntpt);
 254         }
 255
 256         return (ret);
 257 }
 258
 259 int
 260 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 261 {
 262         return fsop_statfs_dispatch(vfsp, sp, true);
 263 }
 264
 265 int
 266 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 267 {
 268         return fsop_sync_dispatch(vfsp, flag, cr, true);
 269 }
 270
 271 int
 272 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 273 {
 274         /*
 275          * In order to handle system attribute fids in a manner
 276          * transparent to the underlying fs, we embed the fid for
 277          * the sysattr parent object in the sysattr fid and tack on
 278          * some extra bytes that only the sysattr layer knows about.
 279          *
 280          * This guarantees that sysattr fids are larger than other fids
 281          * for this vfs. If the vfs supports the sysattr view interface
 282          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 283          * collision with XATTR_FIDSZ.
 284          */
 285         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 286             fidp->fid_len == XATTR_FIDSZ)
 287                 return (xattr_dir_vget(vfsp, vpp, fidp));
 288
 289         return fsop_vget_dispatch(vfsp, vpp, fidp, true);
 290 }
 291
 292 int
 293 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 294 {
 295         return fsop_mountroot_dispatch(vfsp, reason, true);
 296 }
 297
 298 void
 299 fsop_freefs(vfs_t *vfsp)
 300 {
 301         fsop_freefs_dispatch(vfsp, true);
 302 }
 303
 304 int
 305 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 306 {
 307         return fsop_vnstate_dispatch(vfsp, vp, nstate, true);
 308 }
 309
 310 int
 311 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 312 {
 313         ASSERT((fstype >= 0) && (fstype < nfstype));
 314
 315         if (!ALLOCATED_VFSSW(&vfssw[fstype]) || !VFS_INSTALLED(&vfssw[fstype]))
 316                 return ENOTSUP;
 317
 318         if (vfssw[fstype].vsw_vfsops.vfs_sync == NULL)
 319                 return ENOSYS;
 320
 321         return vfssw[fstype].vsw_vfsops.vfs_sync(NULL, flag, cr);
 322 }
 323
 324 /*
 325  * File system initialization.  vfs_setfsops() must be called from a file
 326  * system's init routine.
 327  */
 328
 329 void
 330 zfs_boot_init(void)
 331 {
 332         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 333                 spa_boot_init();
 334 }
 335
 336 int
 337 vfs_setfsops(int fstype, const struct vfsops *ops)
 338 {
 339         /*
 340          * Verify that fstype refers to a valid fs.  Note that
 341          * 0 is valid since it's used to set "stray" ops.
 342          */
 343         if ((fstype < 0) || (fstype >= nfstype))
 344                 return (EINVAL);
 345
 346         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 347                 return (EINVAL);
 348
 349         vfssw[fstype].vsw_vfsops = *ops;
 350         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 351
 352         return (0);
 353 }
 354
 355 /*
 356  * Since the vfsops structure is part of the vfssw table and wasn't
 357  * really allocated, we're not really freeing anything.  However, we need to
 358  * take care of a little bookkeeping.
 359  */
 360 int
 361 vfs_freevfsops_by_type(int fstype)
 362 {
 363
 364         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 365         if ((fstype <= 0) || (fstype >= nfstype))
 366                 return (EINVAL);
 367
 368         WLOCK_VFSSW();
 369         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 370                 WUNLOCK_VFSSW();
 371                 return (EINVAL);
 372         }
 373
 374         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 375         WUNLOCK_VFSSW();
 376
 377         return (0);
 378 }
 379
 380 /* Support routines used to reference vfs_op */
 381
 382 /* Set the operations vector for a vfs */
 383 void
 384 vfs_setops(struct vfs *vfs, const struct vfsops *ops)
 385 {
 386         vfs->vfs_op = ops;
 387 }
 388
 389 /* Retrieve the operations vector for a vfs */
 390 const struct vfsops *
 391 vfs_getops(struct vfs *vfs)
 392 {
 393         return vfs->vfs_op;
 394 }
 395
 396 /*
 397  * Returns non-zero (1) if the vfsops matches that of the vfs.
 398  * Returns zero (0) if not.
 399  */
 400 int
 401 vfs_matchops(struct vfs *vfs, const struct vfsops *ops)
 402 {
 403         return (vfs_getops(vfs) == ops);
 404 }
 405
 406 /*
 407  * Returns non-zero (1) if the file system has installed a non-default,
 408  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 409  */
 410 int
 411 vfs_can_sync(vfs_t *vfsp)
 412 {
 413         /* vfs_sync() routine is not the default */
 414         return vfs_getops(vfsp)->vfs_sync != NULL;
 415 }
 416
 417 /*
 418  * Initialize a vfs structure.
 419  */
 420 void
 421 vfs_init(struct vfs *vfs, const struct vfsops *ops, void *data)
 422 {
 423         /* Other initialization has been moved to vfs_alloc() */
 424         vfs->vfs_count = 0;
 425         vfs->vfs_next = vfs;
 426         vfs->vfs_prev = vfs;
 427         vfs->vfs_zone_next = vfs;
 428         vfs->vfs_zone_prev = vfs;
 429         vfs->vfs_lofi_id = 0;
 430         sema_init(&vfs->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 431         vfsimpl_setup(vfs);
 432         vfs->vfs_data = data;
 433         vfs_setops(vfs, ops);
 434 }
 435
 436 /*
 437  * Allocate and initialize the vfs implementation private data
 438  * structure, vfs_impl_t.
 439  */
 440 void
 441 vfsimpl_setup(vfs_t *vfsp)
 442 {
 443         int i;
 444
 445         if (vfsp->vfs_implp != NULL) {
 446                 return;
 447         }
 448
 449         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 450         /* Note that these are #define'd in vfs.h */
 451         vfsp->vfs_vskap = NULL;
 452         vfsp->vfs_fstypevsp = NULL;
 453
 454         /* Set size of counted array, then zero the array */
 455         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 456         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 457                 vfsp->vfs_featureset[i] = 0;
 458         }
 459 }
 460
 461 /*
 462  * Release the vfs_impl_t structure, if it exists. Some unbundled
 463  * filesystems may not use the newer version of vfs and thus
 464  * would not contain this implementation private data structure.
 465  */
 466 void
 467 vfsimpl_teardown(vfs_t *vfsp)
 468 {
 469         vfs_impl_t      *vip = vfsp->vfs_implp;
 470
 471         if (vip == NULL)
 472                 return;
 473
 474         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 475         vfsp->vfs_implp = NULL;
 476 }
 477
 478 /*
 479  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 480  * fstatvfs, and sysfs are in kernel/syscall.
 481  */
 482
 483 /*
 484  * Update every mounted file system.  We call the vfs_sync operation of
 485  * each file system type, passing it a NULL vfsp to indicate that all
 486  * mounted file systems of that type should be updated.
 487  */
 488 void
 489 vfs_sync(int flag)
 490 {
 491         struct vfssw *vswp;
 492         RLOCK_VFSSW();
 493         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 494                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 495                         vfs_refvfssw(vswp);
 496                         RUNLOCK_VFSSW();
 497                         if (vswp->vsw_vfsops.vfs_sync != NULL)
 498                                 vswp->vsw_vfsops.vfs_sync(NULL, flag, CRED());
 499                         vfs_unrefvfssw(vswp);
 500                         RLOCK_VFSSW();
 501                 }
 502         }
 503         RUNLOCK_VFSSW();
 504 }
 505
 506 void
 507 sync(void)
 508 {
 509         vfs_sync(0);
 510 }
 511
 512 /*
 513  * External routines.
 514  */
 515
 516 krwlock_t vfssw_lock;   /* lock accesses to vfssw */
 517
 518 /*
 519  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 520  * but otherwise should be accessed only via vfs_list_lock() and
 521  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 522  */
 523 static krwlock_t vfslist;
 524
 525 /*
 526  * Mount devfs on /devices. This is done right after root is mounted
 527  * to provide device access support for the system
 528  */
 529 static void
 530 vfs_mountdevices(void)
 531 {
 532         struct vfssw *vsw;
 533         struct vnode *mvp;
 534         struct mounta mounta = {        /* fake mounta for devfs_mount() */
 535                 NULL,
 536                 NULL,
 537                 MS_SYSSPACE,
 538                 NULL,
 539                 NULL,
 540                 0,
 541                 NULL,
 542                 0
 543         };
 544
 545         /*
 546          * _init devfs module to fill in the vfssw
 547          */
 548         if (modload("fs", "devfs") == -1)
 549                 panic("Cannot _init devfs module");
 550
 551         /*
 552          * Hold vfs
 553          */
 554         RLOCK_VFSSW();
 555         vsw = vfs_getvfsswbyname("devfs");
 556         VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
 557         VFS_HOLD(&devices);
 558
 559         /*
 560          * Locate mount point
 561          */
 562         if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 563                 panic("Cannot find /devices");
 564
 565         /*
 566          * Perform the mount of /devices
 567          */
 568         if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
 569                 panic("Cannot mount /devices");
 570
 571         RUNLOCK_VFSSW();
 572
 573         /*
 574          * Set appropriate members and add to vfs list for mnttab display
 575          */
 576         vfs_setresource(&devices, "/devices", 0);
 577         vfs_setmntpoint(&devices, "/devices", 0);
 578
 579         /*
 580          * Hold the root of /devices so it won't go away
 581          */
 582         if (VFS_ROOT(&devices, &devicesdir))
 583                 panic("vfs_mountdevices: not devices root");
 584
 585         if (vfs_lock(&devices) != 0) {
 586                 VN_RELE(devicesdir);
 587                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
 588                 return;
 589         }
 590
 591         if (vn_vfswlock(mvp) != 0) {
 592                 vfs_unlock(&devices);
 593                 VN_RELE(devicesdir);
 594                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
 595                 return;
 596         }
 597
 598         vfs_add(mvp, &devices, 0);
 599         vn_vfsunlock(mvp);
 600         vfs_unlock(&devices);
 601         VN_RELE(devicesdir);
 602 }
 603
 604 /*
 605  * mount the first instance of /dev  to root and remain mounted
 606  */
 607 static void
 608 vfs_mountdev1(void)
 609 {
 610         struct vfssw *vsw;
 611         struct vnode *mvp;
 612         struct mounta mounta = {        /* fake mounta for sdev_mount() */
 613                 NULL,
 614                 NULL,
 615                 MS_SYSSPACE | MS_OVERLAY,
 616                 NULL,
 617                 NULL,
 618                 0,
 619                 NULL,
 620                 0
 621         };
 622
 623         /*
 624          * _init dev module to fill in the vfssw
 625          */
 626         if (modload("fs", "dev") == -1)
 627                 cmn_err(CE_PANIC, "Cannot _init dev module\n");
 628
 629         /*
 630          * Hold vfs
 631          */
 632         RLOCK_VFSSW();
 633         vsw = vfs_getvfsswbyname("dev");
 634         VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
 635         VFS_HOLD(&dev);
 636
 637         /*
 638          * Locate mount point
 639          */
 640         if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 641                 cmn_err(CE_PANIC, "Cannot find /dev\n");
 642
 643         /*
 644          * Perform the mount of /dev
 645          */
 646         if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
 647                 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
 648
 649         RUNLOCK_VFSSW();
 650
 651         /*
 652          * Set appropriate members and add to vfs list for mnttab display
 653          */
 654         vfs_setresource(&dev, "/dev", 0);
 655         vfs_setmntpoint(&dev, "/dev", 0);
 656
 657         /*
 658          * Hold the root of /dev so it won't go away
 659          */
 660         if (VFS_ROOT(&dev, &devdir))
 661                 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
 662
 663         if (vfs_lock(&dev) != 0) {
 664                 VN_RELE(devdir);
 665                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
 666                 return;
 667         }
 668
 669         if (vn_vfswlock(mvp) != 0) {
 670                 vfs_unlock(&dev);
 671                 VN_RELE(devdir);
 672                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
 673                 return;
 674         }
 675
 676         vfs_add(mvp, &dev, 0);
 677         vn_vfsunlock(mvp);
 678         vfs_unlock(&dev);
 679         VN_RELE(devdir);
 680 }
 681
 682 /*
 683  * Mount required filesystem. This is done right after root is mounted.
 684  */
 685 static void
 686 vfs_mountfs(char *module, char *spec, char *path)
 687 {
 688         struct vnode *mvp;
 689         struct mounta mounta;
 690         vfs_t *vfsp;
 691
 692         bzero(&mounta, sizeof (mounta));
 693         mounta.flags = MS_SYSSPACE | MS_DATA;
 694         mounta.fstype = module;
 695         mounta.spec = spec;
 696         mounta.dir = path;
 697         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 698                 cmn_err(CE_WARN, "Cannot find %s", path);
 699                 return;
 700         }
 701         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 702                 cmn_err(CE_WARN, "Cannot mount %s", path);
 703         else
 704                 VFS_RELE(vfsp);
 705         VN_RELE(mvp);
 706 }
 707
 708 /*
 709  * vfs_mountroot is called by main() to mount the root filesystem.
 710  */
 711 void
 712 vfs_mountroot(void)
 713 {
 714         struct vnode    *rvp = NULL;
 715         char            *path;
 716         size_t          plen;
 717         struct vfssw    *vswp;
 718         proc_t          *p;
 719
 720         rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 721         rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 722
 723         /*
 724          * Alloc the vfs hash bucket array and locks
 725          */
 726         rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 727
 728         /*
 729          * Call machine-dependent routine "rootconf" to choose a root
 730          * file system type.
 731          */
 732         if (rootconf())
 733                 panic("vfs_mountroot: cannot mount root");
 734         /*
 735          * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 736          * to point to it.  These are used by lookuppn() so that it
 737          * knows where to start from ('/' or '.').
 738          */
 739         vfs_setmntpoint(rootvfs, "/", 0);
 740         if (VFS_ROOT(rootvfs, &rootdir))
 741                 panic("vfs_mountroot: no root vnode");
 742
 743         /*
 744          * At this point, the process tree consists of p0 and possibly some
 745          * direct children of p0.  (i.e. there are no grandchildren)
 746          *
 747          * Walk through them all, setting their current directory.
 748          */
 749         mutex_enter(&pidlock);
 750         for (p = practive; p != NULL; p = p->p_next) {
 751                 ASSERT(p == &p0 || p->p_parent == &p0);
 752
 753                 PTOU(p)->u_cdir = rootdir;
 754                 VN_HOLD(PTOU(p)->u_cdir);
 755                 PTOU(p)->u_rdir = NULL;
 756         }
 757         mutex_exit(&pidlock);
 758
 759         /*
 760          * Setup the global zone's rootvp, now that it exists.
 761          */
 762         global_zone->zone_rootvp = rootdir;
 763         VN_HOLD(global_zone->zone_rootvp);
 764
 765         /*
 766          * Notify the module code that it can begin using the
 767          * root filesystem instead of the boot program's services.
 768          */
 769         modrootloaded = 1;
 770
 771         /*
 772          * Special handling for a ZFS root file system.
 773          */
 774         zfs_boot_init();
 775
 776         /*
 777          * Set up mnttab information for root
 778          */
 779         vfs_setresource(rootvfs, rootfs.bo_name, 0);
 780
 781         /* Now that we're all done with the root FS, set up its vopstats */
 782         if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 783                 /* Set flag for statistics collection */
 784                 if (vswp->vsw_flag & VSW_STATS) {
 785                         initialize_vopstats(&rootvfs->vfs_vopstats);
 786                         rootvfs->vfs_flag |= VFS_STATS;
 787                         rootvfs->vfs_fstypevsp =
 788                             get_fstype_vopstats(rootvfs, vswp);
 789                         rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 790                 }
 791                 vfs_unrefvfssw(vswp);
 792         }
 793
 794         /*
 795          * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 796          * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 797          */
 798         vfs_mountdevices();
 799         vfs_mountdev1();
 800
 801         vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 802         vfs_mountfs("proc", "/proc", "/proc");
 803         vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 804         vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 805         vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 806         vfs_mountfs("bootfs", "bootfs", "/system/boot");
 807
 808         if (getzoneid() == GLOBAL_ZONEID) {
 809                 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 810         }
 811
 812         if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
 813                 /*
 814                  * Look up the root device via devfs so that a dv_node is
 815                  * created for it. The vnode is never VN_RELE()ed.
 816                  * We allocate more than MAXPATHLEN so that the
 817                  * buffer passed to i_ddi_prompath_to_devfspath() is
 818                  * exactly MAXPATHLEN (the function expects a buffer
 819                  * of that length).
 820                  */
 821                 plen = strlen("/devices");
 822                 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
 823                 (void) strcpy(path, "/devices");
 824
 825                 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
 826                     != DDI_SUCCESS ||
 827                     lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
 828
 829                         /* NUL terminate in case "path" has garbage */
 830                         path[plen + MAXPATHLEN - 1] = '\0';
 831 #ifdef  DEBUG
 832                         cmn_err(CE_WARN, "!Cannot lookup root device: %s",
 833                             path);
 834 #endif
 835                 }
 836                 kmem_free(path, plen + MAXPATHLEN);
 837         }
 838
 839         vfs_mnttabvp_setup();
 840 }
 841
 842 /*
 843  * Check to see if our "block device" is actually a file.  If so,
 844  * automatically add a lofi device, and keep track of this fact.
 845  */
 846 static int
 847 lofi_add(const char *fsname, struct vfs *vfsp,
 848     mntopts_t *mntopts, struct mounta *uap)
 849 {
 850         int fromspace = (uap->flags & MS_SYSSPACE) ?
 851             UIO_SYSSPACE : UIO_USERSPACE;
 852         struct lofi_ioctl *li = NULL;
 853         struct vnode *vp = NULL;
 854         struct pathname pn = { NULL };
 855         ldi_ident_t ldi_id;
 856         ldi_handle_t ldi_hdl;
 857         vfssw_t *vfssw;
 858         int id;
 859         int err = 0;
 860
 861         if ((vfssw = vfs_getvfssw(fsname)) == NULL)
 862                 return (0);
 863
 864         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
 865                 vfs_unrefvfssw(vfssw);
 866                 return (0);
 867         }
 868
 869         vfs_unrefvfssw(vfssw);
 870         vfssw = NULL;
 871
 872         if (pn_get(uap->spec, fromspace, &pn) != 0)
 873                 return (0);
 874
 875         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
 876                 goto out;
 877
 878         if (vp->v_type != VREG)
 879                 goto out;
 880
 881         /* OK, this is a lofi mount. */
 882
 883         if ((uap->flags & MS_REMOUNT) ||
 884             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
 885             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
 886             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
 887                 err = EINVAL;
 888                 goto out;
 889         }
 890
 891         ldi_id = ldi_ident_from_anon();
 892         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
 893         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
 894
 895         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
 896             &ldi_hdl, ldi_id);
 897
 898         if (err)
 899                 goto out2;
 900
 901         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
 902             FREAD | FWRITE | FKIOCTL, kcred, &id);
 903
 904         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
 905
 906         if (!err)
 907                 vfsp->vfs_lofi_id = id;
 908
 909 out2:
 910         ldi_ident_release(ldi_id);
 911 out:
 912         if (li != NULL)
 913                 kmem_free(li, sizeof (*li));
 914         if (vp != NULL)
 915                 VN_RELE(vp);
 916         pn_free(&pn);
 917         return (err);
 918 }
 919
 920 static void
 921 lofi_remove(struct vfs *vfsp)
 922 {
 923         struct lofi_ioctl *li = NULL;
 924         ldi_ident_t ldi_id;
 925         ldi_handle_t ldi_hdl;
 926         int err;
 927
 928         if (vfsp->vfs_lofi_id == 0)
 929                 return;
 930
 931         ldi_id = ldi_ident_from_anon();
 932
 933         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
 934         li->li_id = vfsp->vfs_lofi_id;
 935         li->li_cleanup = B_TRUE;
 936
 937         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
 938             &ldi_hdl, ldi_id);
 939
 940         if (err)
 941                 goto out;
 942
 943         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
 944             FREAD | FWRITE | FKIOCTL, kcred, NULL);
 945
 946         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
 947
 948         if (!err)
 949                 vfsp->vfs_lofi_id = 0;
 950
 951 out:
 952         ldi_ident_release(ldi_id);
 953         if (li != NULL)
 954                 kmem_free(li, sizeof (*li));
 955 }
 956
 957 /*
 958  * Common mount code.  Called from the system call entry point, from autofs,
 959  * nfsv4 trigger mounts, and from pxfs.
 960  *
 961  * Takes the effective file system type, mount arguments, the mount point
 962  * vnode, flags specifying whether the mount is a remount and whether it
 963  * should be entered into the vfs list, and credentials.  Fills in its vfspp
 964  * parameter with the mounted file system instance's vfs.
 965  *
 966  * Note that the effective file system type is specified as a string.  It may
 967  * be null, in which case it's determined from the mount arguments, and may
 968  * differ from the type specified in the mount arguments; this is a hook to
 969  * allow interposition when instantiating file system instances.
 970  *
 971  * The caller is responsible for releasing its own hold on the mount point
 972  * vp (this routine does its own hold when necessary).
 973  * Also note that for remounts, the mount point vp should be the vnode for
 974  * the root of the file system rather than the vnode that the file system
 975  * is mounted on top of.
 976  */
 977 int
 978 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
 979     struct vfs **vfspp)
 980 {
 981         struct vfssw    *vswp;
 982         vfsops_t        *vfsops;
 983         struct vfs      *vfsp;
 984         struct vnode    *bvp;
 985         dev_t           bdev = 0;
 986         mntopts_t       mnt_mntopts;
 987         int             error = 0;
 988         int             copyout_error = 0;
 989         int             ovflags;
 990         char            *opts = uap->optptr;
 991         char            *inargs = opts;
 992         int             optlen = uap->optlen;
 993         int             remount;
 994         int             rdonly;
 995         int             nbmand = 0;
 996         int             delmip = 0;
 997         int             addmip = 0;
 998         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
 999         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1000             UIO_SYSSPACE : UIO_USERSPACE;
1001         char            *resource = NULL, *mountpt = NULL;
1002         refstr_t        *oldresource, *oldmntpt;
1003         struct pathname pn, rpn;
1004         vsk_anchor_t    *vskap;
1005         char fstname[FSTYPSZ];
1006         zone_t          *zone;
1007
1008         /*
1009          * The v_flag value for the mount point vp is permanently set
1010          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1011          * for mount point locking.
1012          */
1013         mutex_enter(&vp->v_lock);
1014         vp->v_flag |= VVFSLOCK;
1015         mutex_exit(&vp->v_lock);
1016
1017         mnt_mntopts.mo_count = 0;
1018         /*
1019          * Find the ops vector to use to invoke the file system-specific mount
1020          * method.  If the fsname argument is non-NULL, use it directly.
1021          * Otherwise, dig the file system type information out of the mount
1022          * arguments.
1023          *
1024          * A side effect is to hold the vfssw entry.
1025          *
1026          * Mount arguments can be specified in several ways, which are
1027          * distinguished by flag bit settings.  The preferred way is to set
1028          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1029          * type supplied as a character string and the last two arguments
1030          * being a pointer to a character buffer and the size of the buffer.
1031          * On entry, the buffer holds a null terminated list of options; on
1032          * return, the string is the list of options the file system
1033          * recognized. If MS_DATA is set arguments five and six point to a
1034          * block of binary data which the file system interprets.
1035          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1036          * consistently with these conventions.  To handle them, we check to
1037          * see whether the pointer to the file system name has a numeric value
1038          * less than 256.  If so, we treat it as an index.
1039          */
1040         if (fsname != NULL) {
1041                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1042                         return (EINVAL);
1043                 }
1044         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1045                 size_t n;
1046                 uint_t fstype;
1047
1048                 fsname = fstname;
1049
1050                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1051                         RLOCK_VFSSW();
1052                         if (fstype == 0 || fstype >= nfstype ||
1053                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1054                                 RUNLOCK_VFSSW();
1055                                 return (EINVAL);
1056                         }
1057                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1058                         RUNLOCK_VFSSW();
1059                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1060                                 return (EINVAL);
1061                 } else {
1062                         /*
1063                          * Handle either kernel or user address space.
1064                          */
1065                         if (uap->flags & MS_SYSSPACE) {
1066                                 error = copystr(uap->fstype, fsname,
1067                                     FSTYPSZ, &n);
1068                         } else {
1069                                 error = copyinstr(uap->fstype, fsname,
1070                                     FSTYPSZ, &n);
1071                         }
1072                         if (error) {
1073                                 if (error == ENAMETOOLONG)
1074                                         return (EINVAL);
1075                                 return (error);
1076                         }
1077                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1078                                 return (EINVAL);
1079                 }
1080         } else {
1081                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1082                         return (EINVAL);
1083                 fsname = vswp->vsw_name;
1084         }
1085         if (!VFS_INSTALLED(vswp))
1086                 return (EINVAL);
1087
1088         if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1089                 vfs_unrefvfssw(vswp);
1090                 return (error);
1091         }
1092
1093         vfsops = &vswp->vsw_vfsops;
1094
1095         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1096         /*
1097          * Fetch mount options and parse them for generic vfs options
1098          */
1099         if (uap->flags & MS_OPTIONSTR) {
1100                 /*
1101                  * Limit the buffer size
1102                  */
1103                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1104                         error = EINVAL;
1105                         goto errout;
1106                 }
1107                 if ((uap->flags & MS_SYSSPACE) == 0) {
1108                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1109                         inargs[0] = '\0';
1110                         if (optlen) {
1111                                 error = copyinstr(opts, inargs, (size_t)optlen,
1112                                     NULL);
1113                                 if (error) {
1114                                         goto errout;
1115                                 }
1116                         }
1117                 }
1118                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1119         }
1120         /*
1121          * Flag bits override the options string.
1122          */
1123         if (uap->flags & MS_REMOUNT)
1124                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1125         if (uap->flags & MS_RDONLY)
1126                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1127         if (uap->flags & MS_NOSUID)
1128                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1129
1130         /*
1131          * Check if this is a remount; must be set in the option string and
1132          * the file system must support a remount option.
1133          */
1134         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1135             MNTOPT_REMOUNT, NULL)) {
1136                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1137                         error = ENOTSUP;
1138                         goto errout;
1139                 }
1140                 uap->flags |= MS_REMOUNT;
1141         }
1142
1143         /*
1144          * uap->flags and vfs_optionisset() should agree.
1145          */
1146         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1147                 uap->flags |= MS_RDONLY;
1148         }
1149         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1150                 uap->flags |= MS_NOSUID;
1151         }
1152         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1153         ASSERT(splice || !remount);
1154         /*
1155          * If we are splicing the fs into the namespace,
1156          * perform mount point checks.
1157          *
1158          * We want to resolve the path for the mount point to eliminate
1159          * '.' and ".." and symlinks in mount points; we can't do the
1160          * same for the resource string, since it would turn
1161          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1162          * this before grabbing vn_vfswlock(), because otherwise we
1163          * would deadlock with lookuppn().
1164          */
1165         if (splice) {
1166                 ASSERT(vp->v_count > 0);
1167
1168                 /*
1169                  * Pick up mount point and device from appropriate space.
1170                  */
1171                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1172                         resource = kmem_alloc(pn.pn_pathlen + 1,
1173                             KM_SLEEP);
1174                         (void) strcpy(resource, pn.pn_path);
1175                         pn_free(&pn);
1176                 }
1177                 /*
1178                  * Do a lookupname prior to taking the
1179                  * writelock. Mark this as completed if
1180                  * successful for later cleanup and addition to
1181                  * the mount in progress table.
1182                  */
1183                 if (lookupname(uap->spec, fromspace,
1184                     FOLLOW, NULL, &bvp) == 0) {
1185                         addmip = 1;
1186                 }
1187
1188                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1189                         pathname_t *pnp;
1190
1191                         if (*pn.pn_path != '/') {
1192                                 error = EINVAL;
1193                                 pn_free(&pn);
1194                                 goto errout;
1195                         }
1196                         pn_alloc(&rpn);
1197                         /*
1198                          * Kludge to prevent autofs from deadlocking with
1199                          * itself when it calls domount().
1200                          *
1201                          * If autofs is calling, it is because it is doing
1202                          * (autofs) mounts in the process of an NFS mount.  A
1203                          * lookuppn() here would cause us to block waiting for
1204                          * said NFS mount to complete, which can't since this
1205                          * is the thread that was supposed to be doing it.
1206                          */
1207                         if (fromspace == UIO_USERSPACE) {
1208                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1209                                     NULL)) == 0) {
1210                                         pnp = &rpn;
1211                                 } else {
1212                                         /*
1213                                          * The file disappeared or otherwise
1214                                          * became inaccessible since we opened
1215                                          * it; might as well fail the mount
1216                                          * since the mount point is no longer
1217                                          * accessible.
1218                                          */
1219                                         pn_free(&rpn);
1220                                         pn_free(&pn);
1221                                         goto errout;
1222                                 }
1223                         } else {
1224                                 pnp = &pn;
1225                         }
1226                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1227                         (void) strcpy(mountpt, pnp->pn_path);
1228
1229                         /*
1230                          * If the addition of the zone's rootpath
1231                          * would push us over a total path length
1232                          * of MAXPATHLEN, we fail the mount with
1233                          * ENAMETOOLONG, which is what we would have
1234                          * gotten if we were trying to perform the same
1235                          * mount in the global zone.
1236                          *
1237                          * strlen() doesn't count the trailing
1238                          * '\0', but zone_rootpathlen counts both a
1239                          * trailing '/' and the terminating '\0'.
1240                          */
1241                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1242                             strlen(mountpt)) > MAXPATHLEN ||
1243                             (resource != NULL &&
1244                             (curproc->p_zone->zone_rootpathlen - 1 +
1245                             strlen(resource)) > MAXPATHLEN)) {
1246                                 error = ENAMETOOLONG;
1247                         }
1248
1249                         pn_free(&rpn);
1250                         pn_free(&pn);
1251                 }
1252
1253                 if (error)
1254                         goto errout;
1255
1256                 /*
1257                  * Prevent path name resolution from proceeding past
1258                  * the mount point.
1259                  */
1260                 if (vn_vfswlock(vp) != 0) {
1261                         error = EBUSY;
1262                         goto errout;
1263                 }
1264
1265                 /*
1266                  * Verify that it's legitimate to establish a mount on
1267                  * the prospective mount point.
1268                  */
1269                 if (vn_mountedvfs(vp) != NULL) {
1270                         /*
1271                          * The mount point lock was obtained after some
1272                          * other thread raced through and established a mount.
1273                          */
1274                         vn_vfsunlock(vp);
1275                         error = EBUSY;
1276                         goto errout;
1277                 }
1278                 if (vp->v_flag & VNOMOUNT) {
1279                         vn_vfsunlock(vp);
1280                         error = EINVAL;
1281                         goto errout;
1282                 }
1283         }
1284         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1285                 uap->dataptr = NULL;
1286                 uap->datalen = 0;
1287         }
1288
1289         /*
1290          * If this is a remount, we don't want to create a new VFS.
1291          * Instead, we pass the existing one with a remount flag.
1292          */
1293         if (remount) {
1294                 /*
1295                  * Confirm that the mount point is the root vnode of the
1296                  * file system that is being remounted.
1297                  * This can happen if the user specifies a different
1298                  * mount point directory pathname in the (re)mount command.
1299                  *
1300                  * Code below can only be reached if splice is true, so it's
1301                  * safe to do vn_vfsunlock() here.
1302                  */
1303                 if ((vp->v_flag & VROOT) == 0) {
1304                         vn_vfsunlock(vp);
1305                         error = ENOENT;
1306                         goto errout;
1307                 }
1308                 /*
1309                  * Disallow making file systems read-only unless file system
1310                  * explicitly allows it in its vfssw.  Ignore other flags.
1311                  */
1312                 if (rdonly && vn_is_readonly(vp) == 0 &&
1313                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1314                         vn_vfsunlock(vp);
1315                         error = EINVAL;
1316                         goto errout;
1317                 }
1318                 /*
1319                  * Disallow changing the NBMAND disposition of the file
1320                  * system on remounts.
1321                  */
1322                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1323                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1324                         vn_vfsunlock(vp);
1325                         error = EINVAL;
1326                         goto errout;
1327                 }
1328                 vfsp = vp->v_vfsp;
1329                 ovflags = vfsp->vfs_flag;
1330                 vfsp->vfs_flag |= VFS_REMOUNT;
1331                 vfsp->vfs_flag &= ~VFS_RDONLY;
1332         } else {
1333                 vfsp = vfs_alloc(KM_SLEEP);
1334                 VFS_INIT(vfsp, vfsops, NULL);
1335         }
1336
1337         VFS_HOLD(vfsp);
1338
1339         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1340                 if (!remount) {
1341                         if (splice)
1342                                 vn_vfsunlock(vp);
1343                         vfs_free(vfsp);
1344                 } else {
1345                         vn_vfsunlock(vp);
1346                         VFS_RELE(vfsp);
1347                 }
1348                 goto errout;
1349         }
1350
1351         /*
1352          * PRIV_SYS_MOUNT doesn't mean you can become root.
1353          */
1354         if (vfsp->vfs_lofi_id != 0) {
1355                 uap->flags |= MS_NOSUID;
1356                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1357         }
1358
1359         /*
1360          * The vfs_reflock is not used anymore; the code below explicitly
1361          * holds it, preventing others from accessing it directly.
1362          */
1363         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1364             !(vfsp->vfs_flag & VFS_REMOUNT))
1365                 cmn_err(CE_WARN,
1366                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1367
1368         /*
1369          * Lock the vfs. If this is a remount we want to avoid spurious umount
1370          * failures that happen as a side-effect of fsflush() and other mount
1371          * and unmount operations that might be going on simultaneously and
1372          * may have locked the vfs currently. To not return EBUSY immediately
1373          * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1374          */
1375         if (!remount) {
1376                 if (error = vfs_lock(vfsp)) {
1377                         vfsp->vfs_flag = ovflags;
1378
1379                         lofi_remove(vfsp);
1380
1381                         if (splice)
1382                                 vn_vfsunlock(vp);
1383                         vfs_free(vfsp);
1384                         goto errout;
1385                 }
1386         } else {
1387                 vfs_lock_wait(vfsp);
1388         }
1389
1390         /*
1391          * Add device to mount in progress table, global mounts require special
1392          * handling. It is possible that we have already done the lookupname
1393          * on a spliced, non-global fs. If so, we don't want to do it again
1394          * since we cannot do a lookupname after taking the
1395          * wlock above. This case is for a non-spliced, non-global filesystem.
1396          */
1397         if (!addmip) {
1398                 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1399                         addmip = 1;
1400                 }
1401         }
1402
1403         if (addmip) {
1404                 vnode_t *lvp = NULL;
1405
1406                 error = vfs_get_lofi(vfsp, &lvp);
1407                 if (error > 0) {
1408                         lofi_remove(vfsp);
1409
1410                         if (splice)
1411                                 vn_vfsunlock(vp);
1412                         vfs_unlock(vfsp);
1413
1414                         if (remount) {
1415                                 VFS_RELE(vfsp);
1416                         } else {
1417                                 vfs_free(vfsp);
1418                         }
1419
1420                         goto errout;
1421                 } else if (error == -1) {
1422                         bdev = bvp->v_rdev;
1423                         VN_RELE(bvp);
1424                 } else {
1425                         bdev = lvp->v_rdev;
1426                         VN_RELE(lvp);
1427                         VN_RELE(bvp);
1428                 }
1429
1430                 vfs_addmip(bdev, vfsp);
1431                 addmip = 0;
1432                 delmip = 1;
1433         }
1434         /*
1435          * Invalidate cached entry for the mount point.
1436          */
1437         if (splice)
1438                 dnlc_purge_vp(vp);
1439
1440         /*
1441          * If we have an option string but the filesystem doesn't supply a
1442          * prototype options table, create a table with the global
1443          * options and sufficient room to accept all the options in the
1444          * string.  Then parse the passed in option string
1445          * accepting all the options in the string.  This gives us an
1446          * option table with all the proper cancel properties for the
1447          * global options.
1448          *
1449          * Filesystems that supply a prototype options table are handled
1450          * earlier in this function.
1451          */
1452         if (uap->flags & MS_OPTIONSTR) {
1453                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1454                         mntopts_t tmp_mntopts;
1455
1456                         tmp_mntopts.mo_count = 0;
1457                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1458                             &mnt_mntopts);
1459                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1460                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1461                         vfs_freeopttbl(&tmp_mntopts);
1462                 }
1463         }
1464
1465         /*
1466          * Serialize with zone state transitions.
1467          * See vfs_list_add; zone mounted into is:
1468          *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1469          * not the zone doing the mount (curproc->p_zone), but if we're already
1470          * inside a NGZ, then we know what zone we are.
1471          */
1472         if (INGLOBALZONE(curproc)) {
1473                 zone = zone_find_by_path(mountpt);
1474                 ASSERT(zone != NULL);
1475         } else {
1476                 zone = curproc->p_zone;
1477                 /*
1478                  * zone_find_by_path does a hold, so do one here too so that
1479                  * we can do a zone_rele after mount_completed.
1480                  */
1481                 zone_hold(zone);
1482         }
1483         mount_in_progress(zone);
1484         /*
1485          * Instantiate (or reinstantiate) the file system.  If appropriate,
1486          * splice it into the file system name space.
1487          *
1488          * We want VFS_MOUNT() to be able to override the vfs_resource
1489          * string if necessary (ie, mntfs), and also for a remount to
1490          * change the same (necessary when remounting '/' during boot).
1491          * So we set up vfs_mntpt and vfs_resource to what we think they
1492          * should be, then hand off control to VFS_MOUNT() which can
1493          * override this.
1494          *
1495          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1496          * a vfs which is on the vfs list (i.e. during a remount), we must
1497          * never set those fields to NULL. Several bits of code make
1498          * assumptions that the fields are always valid.
1499          */
1500         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1501         if (remount) {
1502                 if ((oldresource = vfsp->vfs_resource) != NULL)
1503                         refstr_hold(oldresource);
1504                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1505                         refstr_hold(oldmntpt);
1506         }
1507         vfs_setresource(vfsp, resource, 0);
1508         vfs_setmntpoint(vfsp, mountpt, 0);
1509
1510         /*
1511          * going to mount on this vnode, so notify.
1512          */
1513         vnevent_mountedover(vp, NULL);
1514         error = VFS_MOUNT(vfsp, vp, uap, credp);
1515
1516         if (uap->flags & MS_RDONLY)
1517                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1518         if (uap->flags & MS_NOSUID)
1519                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1520
1521         if (error) {
1522                 lofi_remove(vfsp);
1523
1524                 if (remount) {
1525                         /* put back pre-remount options */
1526                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1527                         vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1528                             VFSSP_VERBATIM);
1529                         if (oldmntpt)
1530                                 refstr_rele(oldmntpt);
1531                         vfs_setresource(vfsp, refstr_value(oldresource),
1532                             VFSSP_VERBATIM);
1533                         if (oldresource)
1534                                 refstr_rele(oldresource);
1535                         vfsp->vfs_flag = ovflags;
1536                         vfs_unlock(vfsp);
1537                         VFS_RELE(vfsp);
1538                 } else {
1539                         vfs_unlock(vfsp);
1540                         vfs_freemnttab(vfsp);
1541                         vfs_free(vfsp);
1542                 }
1543         } else {
1544                 /*
1545                  * Set the mount time to now
1546                  */
1547                 vfsp->vfs_mtime = ddi_get_time();
1548                 if (remount) {
1549                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1550                         if (oldresource)
1551                                 refstr_rele(oldresource);
1552                         if (oldmntpt)
1553                                 refstr_rele(oldmntpt);
1554                 } else if (splice) {
1555                         /*
1556                          * Link vfsp into the name space at the mount
1557                          * point. Vfs_add() is responsible for
1558                          * holding the mount point which will be
1559                          * released when vfs_remove() is called.
1560                          */
1561                         vfs_add(vp, vfsp, uap->flags);
1562                 } else {
1563                         /*
1564                          * Hold the reference to file system which is
1565                          * not linked into the name space.
1566                          */
1567                         vfsp->vfs_zone = NULL;
1568                         VFS_HOLD(vfsp);
1569                         vfsp->vfs_vnodecovered = NULL;
1570                 }
1571                 /*
1572                  * Set flags for global options encountered
1573                  */
1574                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1575                         vfsp->vfs_flag |= VFS_RDONLY;
1576                 else
1577                         vfsp->vfs_flag &= ~VFS_RDONLY;
1578                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1579                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1580                 } else {
1581                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1582                                 vfsp->vfs_flag |= VFS_NODEVICES;
1583                         else
1584                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1585                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1586                                 vfsp->vfs_flag |= VFS_NOSETUID;
1587                         else
1588                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1589                 }
1590                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1591                         vfsp->vfs_flag |= VFS_NBMAND;
1592                 else
1593                         vfsp->vfs_flag &= ~VFS_NBMAND;
1594
1595                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1596                         vfsp->vfs_flag |= VFS_XATTR;
1597                 else
1598                         vfsp->vfs_flag &= ~VFS_XATTR;
1599
1600                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1601                         vfsp->vfs_flag |= VFS_NOEXEC;
1602                 else
1603                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1604
1605                 /*
1606                  * Now construct the output option string of options
1607                  * we recognized.
1608                  */
1609                 if (uap->flags & MS_OPTIONSTR) {
1610                         vfs_list_read_lock();
1611                         copyout_error = vfs_buildoptionstr(
1612                             &vfsp->vfs_mntopts, inargs, optlen);
1613                         vfs_list_unlock();
1614                         if (copyout_error == 0 &&
1615                             (uap->flags & MS_SYSSPACE) == 0) {
1616                                 copyout_error = copyoutstr(inargs, opts,
1617                                     optlen, NULL);
1618                         }
1619                 }
1620
1621                 /*
1622                  * If this isn't a remount, set up the vopstats before
1623                  * anyone can touch this. We only allow spliced file
1624                  * systems (file systems which are in the namespace) to
1625                  * have the VFS_STATS flag set.
1626                  * NOTE: PxFS mounts the underlying file system with
1627                  * MS_NOSPLICE set and copies those vfs_flags to its private
1628                  * vfs structure. As a result, PxFS should never have
1629                  * the VFS_STATS flag or else we might access the vfs
1630                  * statistics-related fields prior to them being
1631                  * properly initialized.
1632                  */
1633                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1634                         initialize_vopstats(&vfsp->vfs_vopstats);
1635                         /*
1636                          * We need to set vfs_vskap to NULL because there's
1637                          * a chance it won't be set below.  This is checked
1638                          * in teardown_vopstats() so we can't have garbage.
1639                          */
1640                         vfsp->vfs_vskap = NULL;
1641                         vfsp->vfs_flag |= VFS_STATS;
1642                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1643                 }
1644
1645                 if (vswp->vsw_flag & VSW_XID)
1646                         vfsp->vfs_flag |= VFS_XID;
1647
1648                 vfs_unlock(vfsp);
1649         }
1650         mount_completed(zone);
1651         zone_rele(zone);
1652         if (splice)
1653                 vn_vfsunlock(vp);
1654
1655         if ((error == 0) && (copyout_error == 0)) {
1656                 if (!remount) {
1657                         /*
1658                          * Don't call get_vskstat_anchor() while holding
1659                          * locks since it allocates memory and calls
1660                          * VFS_STATVFS().  For NFS, the latter can generate
1661                          * an over-the-wire call.
1662                          */
1663                         vskap = get_vskstat_anchor(vfsp);
1664                         /* Only take the lock if we have something to do */
1665                         if (vskap != NULL) {
1666                                 vfs_lock_wait(vfsp);
1667                                 if (vfsp->vfs_flag & VFS_STATS) {
1668                                         vfsp->vfs_vskap = vskap;
1669                                 }
1670                                 vfs_unlock(vfsp);
1671                         }
1672                 }
1673                 /* Return vfsp to caller. */
1674                 *vfspp = vfsp;
1675         }
1676 errout:
1677         vfs_freeopttbl(&mnt_mntopts);
1678         if (resource != NULL)
1679                 kmem_free(resource, strlen(resource) + 1);
1680         if (mountpt != NULL)
1681                 kmem_free(mountpt, strlen(mountpt) + 1);
1682         /*
1683          * It is possible we errored prior to adding to mount in progress
1684          * table. Must free vnode we acquired with successful lookupname.
1685          */
1686         if (addmip)
1687                 VN_RELE(bvp);
1688         if (delmip)
1689                 vfs_delmip(vfsp);
1690         ASSERT(vswp != NULL);
1691         vfs_unrefvfssw(vswp);
1692         if (inargs != opts)
1693                 kmem_free(inargs, MAX_MNTOPT_STR);
1694         if (copyout_error) {
1695                 lofi_remove(vfsp);
1696                 VFS_RELE(vfsp);
1697                 error = copyout_error;
1698         }
1699         return (error);
1700 }
1701
1702 static void
1703 vfs_setpath(
1704     struct vfs *vfsp,           /* vfs being updated */
1705     refstr_t **refp,            /* Ref-count string to contain the new path */
1706     const char *newpath,        /* Path to add to refp (above) */
1707     uint32_t flag)              /* flag */
1708 {
1709         size_t len;
1710         refstr_t *ref;
1711         zone_t *zone = curproc->p_zone;
1712         char *sp;
1713         int have_list_lock = 0;
1714
1715         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1716
1717         /*
1718          * New path must be less than MAXPATHLEN because mntfs
1719          * will only display up to MAXPATHLEN bytes. This is currently
1720          * safe, because domount() uses pn_get(), and other callers
1721          * similarly cap the size to fewer than MAXPATHLEN bytes.
1722          */
1723
1724         ASSERT(strlen(newpath) < MAXPATHLEN);
1725
1726         /* mntfs requires consistency while vfs list lock is held */
1727
1728         if (VFS_ON_LIST(vfsp)) {
1729                 have_list_lock = 1;
1730                 vfs_list_lock();
1731         }
1732
1733         if (*refp != NULL)
1734                 refstr_rele(*refp);
1735
1736         /*
1737          * If we are in a non-global zone then we prefix the supplied path,
1738          * newpath, with the zone's root path, with two exceptions. The first
1739          * is where we have been explicitly directed to avoid doing so; this
1740          * will be the case following a failed remount, where the path supplied
1741          * will be a saved version which must now be restored. The second
1742          * exception is where newpath is not a pathname but a descriptive name,
1743          * e.g. "procfs".
1744          */
1745         if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1746                 ref = refstr_alloc(newpath);
1747                 goto out;
1748         }
1749
1750         /*
1751          * Truncate the trailing '/' in the zoneroot, and merge
1752          * in the zone's rootpath with the "newpath" (resource
1753          * or mountpoint) passed in.
1754          *
1755          * The size of the required buffer is thus the size of
1756          * the buffer required for the passed-in newpath
1757          * (strlen(newpath) + 1), plus the size of the buffer
1758          * required to hold zone_rootpath (zone_rootpathlen)
1759          * minus one for one of the now-superfluous NUL
1760          * terminations, minus one for the trailing '/'.
1761          *
1762          * That gives us:
1763          *
1764          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1765          *
1766          * Which is what we have below.
1767          */
1768
1769         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1770         sp = kmem_alloc(len, KM_SLEEP);
1771
1772         /*
1773          * Copy everything including the trailing slash, which
1774          * we then overwrite with the NUL character.
1775          */
1776
1777         (void) strcpy(sp, zone->zone_rootpath);
1778         sp[zone->zone_rootpathlen - 2] = '\0';
1779         (void) strcat(sp, newpath);
1780
1781         ref = refstr_alloc(sp);
1782         kmem_free(sp, len);
1783 out:
1784         *refp = ref;
1785
1786         if (have_list_lock) {
1787                 vfs_mnttab_modtimeupd();
1788                 vfs_list_unlock();
1789         }
1790 }
1791
1792 /*
1793  * Record a mounted resource name in a vfs structure.
1794  * If vfsp is already mounted, caller must hold the vfs lock.
1795  */
1796 void
1797 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1798 {
1799         if (resource == NULL || resource[0] == '\0')
1800                 resource = VFS_NORESOURCE;
1801         vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1802 }
1803
1804 /*
1805  * Record a mount point name in a vfs structure.
1806  * If vfsp is already mounted, caller must hold the vfs lock.
1807  */
1808 void
1809 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1810 {
1811         if (mntpt == NULL || mntpt[0] == '\0')
1812                 mntpt = VFS_NOMNTPT;
1813         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1814 }
1815
1816 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1817
1818 refstr_t *
1819 vfs_getresource(const struct vfs *vfsp)
1820 {
1821         refstr_t *resource;
1822
1823         vfs_list_read_lock();
1824         resource = vfsp->vfs_resource;
1825         refstr_hold(resource);
1826         vfs_list_unlock();
1827
1828         return (resource);
1829 }
1830
1831 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1832
1833 refstr_t *
1834 vfs_getmntpoint(const struct vfs *vfsp)
1835 {
1836         refstr_t *mntpt;
1837
1838         vfs_list_read_lock();
1839         mntpt = vfsp->vfs_mntpt;
1840         refstr_hold(mntpt);
1841         vfs_list_unlock();
1842
1843         return (mntpt);
1844 }
1845
1846 /*
1847  * Create an empty options table with enough empty slots to hold all
1848  * The options in the options string passed as an argument.
1849  * Potentially prepend another options table.
1850  *
1851  * Note: caller is responsible for locking the vfs list, if needed,
1852  *       to protect mops.
1853  */
1854 static void
1855 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1856     const mntopts_t *mtmpl)
1857 {
1858         const char *s = opts;
1859         uint_t count;
1860
1861         if (opts == NULL || *opts == '\0') {
1862                 count = 0;
1863         } else {
1864                 count = 1;
1865
1866                 /*
1867                  * Count number of options in the string
1868                  */
1869                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1870                         count++;
1871                         s++;
1872                 }
1873         }
1874         vfs_copyopttbl_extend(mtmpl, mops, count);
1875 }
1876
1877 /*
1878  * Create an empty options table with enough empty slots to hold all
1879  * The options in the options string passed as an argument.
1880  *
1881  * This function is *not* for general use by filesystems.
1882  *
1883  * Note: caller is responsible for locking the vfs list, if needed,
1884  *       to protect mops.
1885  */
1886 void
1887 vfs_createopttbl(mntopts_t *mops, const char *opts)
1888 {
1889         vfs_createopttbl_extend(mops, opts, NULL);
1890 }
1891
1892
1893 /*
1894  * Swap two mount options tables
1895  */
1896 static void
1897 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
1898 {
1899         uint_t tmpcnt;
1900         mntopt_t *tmplist;
1901
1902         tmpcnt = optbl2->mo_count;
1903         tmplist = optbl2->mo_list;
1904         optbl2->mo_count = optbl1->mo_count;
1905         optbl2->mo_list = optbl1->mo_list;
1906         optbl1->mo_count = tmpcnt;
1907         optbl1->mo_list = tmplist;
1908 }
1909
1910 static void
1911 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
1912 {
1913         vfs_list_lock();
1914         vfs_swapopttbl_nolock(optbl1, optbl2);
1915         vfs_mnttab_modtimeupd();
1916         vfs_list_unlock();
1917 }
1918
1919 static char **
1920 vfs_copycancelopt_extend(char **const moc, int extend)
1921 {
1922         int i = 0;
1923         int j;
1924         char **result;
1925
1926         if (moc != NULL) {
1927                 for (; moc[i] != NULL; i++)
1928                         /* count number of options to cancel */;
1929         }
1930
1931         if (i + extend == 0)
1932                 return (NULL);
1933
1934         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
1935
1936         for (j = 0; j < i; j++) {
1937                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
1938                 (void) strcpy(result[j], moc[j]);
1939         }
1940         for (; j <= i + extend; j++)
1941                 result[j] = NULL;
1942
1943         return (result);
1944 }
1945
1946 static void
1947 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
1948 {
1949         char *sp, *dp;
1950
1951         d->mo_flags = s->mo_flags;
1952         d->mo_data = s->mo_data;
1953         sp = s->mo_name;
1954         if (sp != NULL) {
1955                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1956                 (void) strcpy(dp, sp);
1957                 d->mo_name = dp;
1958         } else {
1959                 d->mo_name = NULL; /* should never happen */
1960         }
1961
1962         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
1963
1964         sp = s->mo_arg;
1965         if (sp != NULL) {
1966                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1967                 (void) strcpy(dp, sp);
1968                 d->mo_arg = dp;
1969         } else {
1970                 d->mo_arg = NULL;
1971         }
1972 }
1973
1974 /*
1975  * Copy a mount options table, possibly allocating some spare
1976  * slots at the end.  It is permissible to copy_extend the NULL table.
1977  */
1978 static void
1979 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
1980 {
1981         uint_t i, count;
1982         mntopt_t *motbl;
1983
1984         /*
1985          * Clear out any existing stuff in the options table being initialized
1986          */
1987         vfs_freeopttbl(dmo);
1988         count = (smo == NULL) ? 0 : smo->mo_count;
1989         if ((count + extra) == 0)       /* nothing to do */
1990                 return;
1991         dmo->mo_count = count + extra;
1992         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
1993         dmo->mo_list = motbl;
1994         for (i = 0; i < count; i++) {
1995                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
1996         }
1997         for (i = count; i < count + extra; i++) {
1998                 motbl[i].mo_flags = MO_EMPTY;
1999         }
2000 }
2001
2002 /*
2003  * Copy a mount options table.
2004  *
2005  * This function is *not* for general use by filesystems.
2006  *
2007  * Note: caller is responsible for locking the vfs list, if needed,
2008  *       to protect smo and dmo.
2009  */
2010 void
2011 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2012 {
2013         vfs_copyopttbl_extend(smo, dmo, 0);
2014 }
2015
2016 static char **
2017 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2018 {
2019         int c1 = 0;
2020         int c2 = 0;
2021         char **result;
2022         char **sp1, **sp2, **dp;
2023
2024         /*
2025          * First we count both lists of cancel options.
2026          * If either is NULL or has no elements, we return a copy of
2027          * the other.
2028          */
2029         if (mop1->mo_cancel != NULL) {
2030                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2031                         /* count cancel options in mop1 */;
2032         }
2033
2034         if (c1 == 0)
2035                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2036
2037         if (mop2->mo_cancel != NULL) {
2038                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2039                         /* count cancel options in mop2 */;
2040         }
2041
2042         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2043
2044         if (c2 == 0)
2045                 return (result);
2046
2047         /*
2048          * When we get here, we've got two sets of cancel options;
2049          * we need to merge the two sets.  We know that the result
2050          * array has "c1+c2+1" entries and in the end we might shrink
2051          * it.
2052          * Result now has a copy of the c1 entries from mop1; we'll
2053          * now lookup all the entries of mop2 in mop1 and copy it if
2054          * it is unique.
2055          * This operation is O(n^2) but it's only called once per
2056          * filesystem per duplicate option.  This is a situation
2057          * which doesn't arise with the filesystems in ON and
2058          * n is generally 1.
2059          */
2060
2061         dp = &result[c1];
2062         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2063                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2064                         if (strcmp(*sp1, *sp2) == 0)
2065                                 break;
2066                 }
2067                 if (*sp1 == NULL) {
2068                         /*
2069                          * Option *sp2 not found in mop1, so copy it.
2070                          * The calls to vfs_copycancelopt_extend()
2071                          * guarantee that there's enough room.
2072                          */
2073                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2074                         (void) strcpy(*dp++, *sp2);
2075                 }
2076         }
2077         if (dp != &result[c1+c2]) {
2078                 size_t bytes = (dp - result + 1) * sizeof (char *);
2079                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2080
2081                 bcopy(result, nres, bytes);
2082                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2083                 result = nres;
2084         }
2085         return (result);
2086 }
2087
2088 /*
2089  * Merge two mount option tables (outer and inner) into one.  This is very
2090  * similar to "merging" global variables and automatic variables in C.
2091  *
2092  * This isn't (and doesn't have to be) fast.
2093  *
2094  * This function is *not* for general use by filesystems.
2095  *
2096  * Note: caller is responsible for locking the vfs list, if needed,
2097  *       to protect omo, imo & dmo.
2098  */
2099 void
2100 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2101 {
2102         uint_t i, count;
2103         mntopt_t *mop, *motbl;
2104         uint_t freeidx;
2105
2106         /*
2107          * First determine how much space we need to allocate.
2108          */
2109         count = omo->mo_count;
2110         for (i = 0; i < imo->mo_count; i++) {
2111                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2112                         continue;
2113                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2114                         count++;
2115         }
2116         ASSERT(count >= omo->mo_count &&
2117             count <= omo->mo_count + imo->mo_count);
2118         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2119         for (i = 0; i < omo->mo_count; i++)
2120                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2121         freeidx = omo->mo_count;
2122         for (i = 0; i < imo->mo_count; i++) {
2123                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2124                         continue;
2125                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2126                         char **newcanp;
2127                         uint_t index = mop - omo->mo_list;
2128
2129                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2130
2131                         vfs_freeopt(&motbl[index]);
2132                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2133
2134                         vfs_freecancelopt(motbl[index].mo_cancel);
2135                         motbl[index].mo_cancel = newcanp;
2136                 } else {
2137                         /*
2138                          * If it's a new option, just copy it over to the first
2139                          * free location.
2140                          */
2141                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2142                 }
2143         }
2144         dmo->mo_count = count;
2145         dmo->mo_list = motbl;
2146 }
2147
2148 /*
2149  * Functions to set and clear mount options in a mount options table.
2150  */
2151
2152 /*
2153  * Clear a mount option, if it exists.
2154  *
2155  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2156  * the vfs list.
2157  */
2158 static void
2159 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2160 {
2161         struct mntopt *mop;
2162         uint_t i, count;
2163
2164         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2165
2166         count = mops->mo_count;
2167         for (i = 0; i < count; i++) {
2168                 mop = &mops->mo_list[i];
2169
2170                 if (mop->mo_flags & MO_EMPTY)
2171                         continue;
2172                 if (strcmp(opt, mop->mo_name))
2173                         continue;
2174                 mop->mo_flags &= ~MO_SET;
2175                 if (mop->mo_arg != NULL) {
2176                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2177                 }
2178                 mop->mo_arg = NULL;
2179                 if (update_mnttab)
2180                         vfs_mnttab_modtimeupd();
2181                 break;
2182         }
2183 }
2184
2185 void
2186 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2187 {
2188         int gotlock = 0;
2189
2190         if (VFS_ON_LIST(vfsp)) {
2191                 gotlock = 1;
2192                 vfs_list_lock();
2193         }
2194         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2195         if (gotlock)
2196                 vfs_list_unlock();
2197 }
2198
2199
2200 /*
2201  * Set a mount option on.  If it's not found in the table, it's silently
2202  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2203  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2204  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2205  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2206  * MO_EMPTY set is created as the option passed in.
2207  *
2208  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2209  * the vfs list.
2210  */
2211 static void
2212 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2213     const char *arg, int flags, int update_mnttab)
2214 {
2215         mntopt_t *mop;
2216         uint_t i, count;
2217         char *sp;
2218
2219         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2220
2221         if (flags & VFS_CREATEOPT) {
2222                 if (vfs_hasopt(mops, opt) != NULL) {
2223                         flags &= ~VFS_CREATEOPT;
2224                 }
2225         }
2226         count = mops->mo_count;
2227         for (i = 0; i < count; i++) {
2228                 mop = &mops->mo_list[i];
2229
2230                 if (mop->mo_flags & MO_EMPTY) {
2231                         if ((flags & VFS_CREATEOPT) == 0)
2232                                 continue;
2233                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2234                         (void) strcpy(sp, opt);
2235                         mop->mo_name = sp;
2236                         if (arg != NULL)
2237                                 mop->mo_flags = MO_HASVALUE;
2238                         else
2239                                 mop->mo_flags = 0;
2240                 } else if (strcmp(opt, mop->mo_name)) {
2241                         continue;
2242                 }
2243                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2244                         break;
2245                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2246                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2247                         (void) strcpy(sp, arg);
2248                 } else {
2249                         sp = NULL;
2250                 }
2251                 if (mop->mo_arg != NULL)
2252                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2253                 mop->mo_arg = sp;
2254                 if (flags & VFS_DISPLAY)
2255                         mop->mo_flags &= ~MO_NODISPLAY;
2256                 if (flags & VFS_NODISPLAY)
2257                         mop->mo_flags |= MO_NODISPLAY;
2258                 mop->mo_flags |= MO_SET;
2259                 if (mop->mo_cancel != NULL) {
2260                         char **cp;
2261
2262                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2263                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2264                 }
2265                 if (update_mnttab)
2266                         vfs_mnttab_modtimeupd();
2267                 break;
2268         }
2269 }
2270
2271 void
2272 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2273 {
2274         int gotlock = 0;
2275
2276         if (VFS_ON_LIST(vfsp)) {
2277                 gotlock = 1;
2278                 vfs_list_lock();
2279         }
2280         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2281         if (gotlock)
2282                 vfs_list_unlock();
2283 }
2284
2285
2286 /*
2287  * Add a "tag" option to a mounted file system's options list.
2288  *
2289  * Note: caller is responsible for locking the vfs list, if needed,
2290  *       to protect mops.
2291  */
2292 static mntopt_t *
2293 vfs_addtag(mntopts_t *mops, const char *tag)
2294 {
2295         uint_t count;
2296         mntopt_t *mop, *motbl;
2297
2298         count = mops->mo_count + 1;
2299         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2300         if (mops->mo_count) {
2301                 size_t len = (count - 1) * sizeof (mntopt_t);
2302
2303                 bcopy(mops->mo_list, motbl, len);
2304                 kmem_free(mops->mo_list, len);
2305         }
2306         mops->mo_count = count;
2307         mops->mo_list = motbl;
2308         mop = &motbl[count - 1];
2309         mop->mo_flags = MO_TAG;
2310         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2311         (void) strcpy(mop->mo_name, tag);
2312         return (mop);
2313 }
2314
2315 /*
2316  * Allow users to set arbitrary "tags" in a vfs's mount options.
2317  * Broader use within the kernel is discouraged.
2318  */
2319 int
2320 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2321     cred_t *cr)
2322 {
2323         vfs_t *vfsp;
2324         mntopts_t *mops;
2325         mntopt_t *mop;
2326         int found = 0;
2327         dev_t dev = makedevice(major, minor);
2328         int err = 0;
2329         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2330
2331         /*
2332          * Find the desired mounted file system
2333          */
2334         vfs_list_lock();
2335         vfsp = rootvfs;
2336         do {
2337                 if (vfsp->vfs_dev == dev &&
2338                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2339                         found = 1;
2340                         break;
2341                 }
2342                 vfsp = vfsp->vfs_next;
2343         } while (vfsp != rootvfs);
2344
2345         if (!found) {
2346                 err = EINVAL;
2347                 goto out;
2348         }
2349         err = secpolicy_fs_config(cr, vfsp);
2350         if (err != 0)
2351                 goto out;
2352
2353         mops = &vfsp->vfs_mntopts;
2354         /*
2355          * Add tag if it doesn't already exist
2356          */
2357         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2358                 int len;
2359
2360                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2361                 len = strlen(buf);
2362                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2363                         err = ENAMETOOLONG;
2364                         goto out;
2365                 }
2366                 mop = vfs_addtag(mops, tag);
2367         }
2368         if ((mop->mo_flags & MO_TAG) == 0) {
2369                 err = EINVAL;
2370                 goto out;
2371         }
2372         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2373 out:
2374         vfs_list_unlock();
2375         kmem_free(buf, MAX_MNTOPT_STR);
2376         return (err);
2377 }
2378
2379 /*
2380  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2381  * Broader use within the kernel is discouraged.
2382  */
2383 int
2384 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2385     cred_t *cr)
2386 {
2387         vfs_t *vfsp;
2388         mntopt_t *mop;
2389         int found = 0;
2390         dev_t dev = makedevice(major, minor);
2391         int err = 0;
2392
2393         /*
2394          * Find the desired mounted file system
2395          */
2396         vfs_list_lock();
2397         vfsp = rootvfs;
2398         do {
2399                 if (vfsp->vfs_dev == dev &&
2400                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2401                         found = 1;
2402                         break;
2403                 }
2404                 vfsp = vfsp->vfs_next;
2405         } while (vfsp != rootvfs);
2406
2407         if (!found) {
2408                 err = EINVAL;
2409                 goto out;
2410         }
2411         err = secpolicy_fs_config(cr, vfsp);
2412         if (err != 0)
2413                 goto out;
2414
2415         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2416                 err = EINVAL;
2417                 goto out;
2418         }
2419         if ((mop->mo_flags & MO_TAG) == 0) {
2420                 err = EINVAL;
2421                 goto out;
2422         }
2423         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2424 out:
2425         vfs_list_unlock();
2426         return (err);
2427 }
2428
2429 /*
2430  * Function to parse an option string and fill in a mount options table.
2431  * Unknown options are silently ignored.  The input option string is modified
2432  * by replacing separators with nulls.  If the create flag is set, options
2433  * not found in the table are just added on the fly.  The table must have
2434  * an option slot marked MO_EMPTY to add an option on the fly.
2435  *
2436  * This function is *not* for general use by filesystems.
2437  *
2438  * Note: caller is responsible for locking the vfs list, if needed,
2439  *       to protect mops..
2440  */
2441 void
2442 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2443 {
2444         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2445         int setflg = VFS_NOFORCEOPT;
2446
2447         if (osp == NULL)
2448                 return;
2449         while (*s != '\0') {
2450                 p = strchr(s, ',');     /* find next option */
2451                 if (p == NULL) {
2452                         cp = NULL;
2453                         p = s + strlen(s);
2454                 } else {
2455                         cp = p;         /* save location of comma */
2456                         *p++ = '\0';    /* mark end and point to next option */
2457                 }
2458                 nextop = p;
2459                 p = strchr(s, '=');     /* look for value */
2460                 if (p == NULL) {
2461                         valp = NULL;    /* no value supplied */
2462                 } else {
2463                         ep = p;         /* save location of equals */
2464                         *p++ = '\0';    /* end option and point to value */
2465                         valp = p;
2466                 }
2467                 /*
2468                  * set option into options table
2469                  */
2470                 if (create)
2471                         setflg |= VFS_CREATEOPT;
2472                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2473                 if (cp != NULL)
2474                         *cp = ',';      /* restore the comma */
2475                 if (valp != NULL)
2476                         *ep = '=';      /* restore the equals */
2477                 s = nextop;
2478         }
2479 }
2480
2481 /*
2482  * Function to inquire if an option exists in a mount options table.
2483  * Returns a pointer to the option if it exists, else NULL.
2484  *
2485  * This function is *not* for general use by filesystems.
2486  *
2487  * Note: caller is responsible for locking the vfs list, if needed,
2488  *       to protect mops.
2489  */
2490 struct mntopt *
2491 vfs_hasopt(const mntopts_t *mops, const char *opt)
2492 {
2493         struct mntopt *mop;
2494         uint_t i, count;
2495
2496         count = mops->mo_count;
2497         for (i = 0; i < count; i++) {
2498                 mop = &mops->mo_list[i];
2499
2500                 if (mop->mo_flags & MO_EMPTY)
2501                         continue;
2502                 if (strcmp(opt, mop->mo_name) == 0)
2503                         return (mop);
2504         }
2505         return (NULL);
2506 }
2507
2508 /*
2509  * Function to inquire if an option is set in a mount options table.
2510  * Returns non-zero if set and fills in the arg pointer with a pointer to
2511  * the argument string or NULL if there is no argument string.
2512  */
2513 static int
2514 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2515 {
2516         struct mntopt *mop;
2517         uint_t i, count;
2518
2519         count = mops->mo_count;
2520         for (i = 0; i < count; i++) {
2521                 mop = &mops->mo_list[i];
2522
2523                 if (mop->mo_flags & MO_EMPTY)
2524                         continue;
2525                 if (strcmp(opt, mop->mo_name))
2526                         continue;
2527                 if ((mop->mo_flags & MO_SET) == 0)
2528                         return (0);
2529                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2530                         *argp = mop->mo_arg;
2531                 return (1);
2532         }
2533         return (0);
2534 }
2535
2536
2537 int
2538 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2539 {
2540         int ret;
2541
2542         vfs_list_read_lock();
2543         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2544         vfs_list_unlock();
2545         return (ret);
2546 }
2547
2548
2549 /*
2550  * Construct a comma separated string of the options set in the given
2551  * mount table, return the string in the given buffer.  Return non-zero if
2552  * the buffer would overflow.
2553  *
2554  * This function is *not* for general use by filesystems.
2555  *
2556  * Note: caller is responsible for locking the vfs list, if needed,
2557  *       to protect mp.
2558  */
2559 int
2560 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2561 {
2562         char *cp;
2563         uint_t i;
2564
2565         buf[0] = '\0';
2566         cp = buf;
2567         for (i = 0; i < mp->mo_count; i++) {
2568                 struct mntopt *mop;
2569
2570                 mop = &mp->mo_list[i];
2571                 if (mop->mo_flags & MO_SET) {
2572                         int optlen, comma = 0;
2573
2574                         if (buf[0] != '\0')
2575                                 comma = 1;
2576                         optlen = strlen(mop->mo_name);
2577                         if (strlen(buf) + comma + optlen + 1 > len)
2578                                 goto err;
2579                         if (comma)
2580                                 *cp++ = ',';
2581                         (void) strcpy(cp, mop->mo_name);
2582                         cp += optlen;
2583                         /*
2584                          * Append option value if there is one
2585                          */
2586                         if (mop->mo_arg != NULL) {
2587                                 int arglen;
2588
2589                                 arglen = strlen(mop->mo_arg);
2590                                 if (strlen(buf) + arglen + 2 > len)
2591                                         goto err;
2592                                 *cp++ = '=';
2593                                 (void) strcpy(cp, mop->mo_arg);
2594                                 cp += arglen;
2595                         }
2596                 }
2597         }
2598         return (0);
2599 err:
2600         return (EOVERFLOW);
2601 }
2602
/*
 * Free a NULL-terminated vector of strings: every string first, then the
 * vector itself.  A NULL vector is accepted and ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
        char **slot;
        int nstr;

        if (moc == NULL)
                return;

        nstr = 0;
        for (slot = moc; *slot != NULL; slot++) {
                kmem_free(*slot, strlen(*slot) + 1);
                nstr++;
        }
        /* The vector is freed as nstr string slots plus the NULL terminator. */
        kmem_free(moc, (nstr + 1) * sizeof (char *));
}
2617
2618 static void
2619 vfs_freeopt(mntopt_t *mop)
2620 {
2621         if (mop->mo_name != NULL)
2622                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2623
2624         vfs_freecancelopt(mop->mo_cancel);
2625
2626         if (mop->mo_arg != NULL)
2627                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2628 }
2629
2630 /*
2631  * Free a mount options table
2632  *
2633  * This function is *not* for general use by filesystems.
2634  *
2635  * Note: caller is responsible for locking the vfs list, if needed,
2636  *       to protect mp.
2637  */
2638 void
2639 vfs_freeopttbl(mntopts_t *mp)
2640 {
2641         uint_t i, count;
2642
2643         count = mp->mo_count;
2644         for (i = 0; i < count; i++) {
2645                 vfs_freeopt(&mp->mo_list[i]);
2646         }
2647         if (count) {
2648                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2649                 mp->mo_count = 0;
2650                 mp->mo_list = NULL;
2651         }
2652 }
2653
2654
/*
 * Read entry point installed on the dummy mnttab vnode (see
 * vfs_mnttabvp_setup()).  It ignores all of its arguments, transfers no
 * data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
        return (0);
}
2662
/*
 * Write entry point installed on the dummy mnttab vnode (see
 * vfs_mnttabvp_setup()).  It ignores all of its arguments, transfers no
 * data and always returns 0.
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
        return (0);
}
2670
2671 /*
2672  * The dummy vnode is currently used only by file events notification
2673  * module which is just interested in the timestamps.
2674  */
2675 /* ARGSUSED */
2676 static int
2677 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2678     caller_context_t *ct)
2679 {
2680         bzero(vap, sizeof (vattr_t));
2681         vap->va_type = VREG;
2682         vap->va_nlink = 1;
2683         vap->va_ctime = vfs_mnttab_ctime;
2684         /*
2685          * it is ok to just copy mtime as the time will be monotonically
2686          * increasing.
2687          */
2688         vap->va_mtime = vfs_mnttab_mtime;
2689         vap->va_atime = vap->va_mtime;
2690         return (0);
2691 }
2692
2693 static void
2694 vfs_mnttabvp_setup(void)
2695 {
2696         static const struct vnodeops dummyops = {
2697                 .vnop_name = "mnttab",
2698                 .vop_read = vfs_mntdummyread,
2699                 .vop_write = vfs_mntdummywrite,
2700                 .vop_getattr = vfs_mntdummygetattr,
2701                 .vop_vnevent = fs_vnevent_support,
2702         };
2703         vnode_t *tvp;
2704
2705         /*
2706          * A global dummy vnode is allocated to represent mntfs files.
2707          * The mntfs file (/etc/mnttab) can be monitored for file events
2708          * and receive an event when mnttab changes. Dummy VOP calls
2709          * will be made on this vnode. The file events notification module
2710          * intercepts this vnode and delivers relevant events.
2711          */
2712         tvp = vn_alloc(KM_SLEEP);
2713         tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2714         vn_setops(tvp, &dummyops);
2715         tvp->v_type = VREG;
2716         /*
2717          * The mnt dummy ops do not reference v_data.
2718          * No other module intercepting this vnode should either.
2719          * Just set it to point to itself.
2720          */
2721         tvp->v_data = (caddr_t)tvp;
2722         tvp->v_vfsp = rootvfs;
2723         vfs_mntdummyvp = tvp;
2724 }
2725
2726 /*
2727  * performs fake read/write ops
2728  */
2729 static void
2730 vfs_mnttab_rwop(int rw)
2731 {
2732         struct uio      uio;
2733         struct iovec    iov;
2734         char    buf[1];
2735
2736         if (vfs_mntdummyvp == NULL)
2737                 return;
2738
2739         bzero(&uio, sizeof (uio));
2740         bzero(&iov, sizeof (iov));
2741         iov.iov_base = buf;
2742         iov.iov_len = 0;
2743         uio.uio_iov = &iov;
2744         uio.uio_iovcnt = 1;
2745         uio.uio_loffset = 0;
2746         uio.uio_segflg = UIO_SYSSPACE;
2747         uio.uio_resid = 0;
2748         if (rw) {
2749                 (void) fop_write(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2750         } else {
2751                 (void) fop_read(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2752         }
2753 }
2754
/*
 * Generate a write operation on the dummy mnttab vnode.
 * Thin wrapper that calls vfs_mnttab_rwop() with a non-zero rw argument.
 */
void
vfs_mnttab_writeop(void)
{
        vfs_mnttab_rwop(1);
}
2763
/*
 * Generate a read operation on the dummy mnttab vnode.
 * Thin wrapper that calls vfs_mnttab_rwop() with a zero rw argument.
 */
void
vfs_mnttab_readop(void)
{
        vfs_mnttab_rwop(0);
}
2772
2773 /*
2774  * Free any mnttab information recorded in the vfs struct.
2775  * The vfs must not be on the vfs list.
2776  */
2777 static void
2778 vfs_freemnttab(struct vfs *vfsp)
2779 {
2780         ASSERT(!VFS_ON_LIST(vfsp));
2781
2782         /*
2783          * Free device and mount point information
2784          */
2785         if (vfsp->vfs_mntpt != NULL) {
2786                 refstr_rele(vfsp->vfs_mntpt);
2787                 vfsp->vfs_mntpt = NULL;
2788         }
2789         if (vfsp->vfs_resource != NULL) {
2790                 refstr_rele(vfsp->vfs_resource);
2791                 vfsp->vfs_resource = NULL;
2792         }
2793         /*
2794          * Now free mount options information
2795          */
2796         vfs_freeopttbl(&vfsp->vfs_mntopts);
2797 }
2798
/*
 * Return the last mnttab modification time.
 *
 * The recorded vfs_mnttab_mtime is copied into *ts.  The caller must hold
 * the vfslist lock; this is asserted.
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
        ASSERT(RW_LOCK_HELD(&vfslist));
        *ts = vfs_mnttab_mtime;
}
2808
2809 /*
2810  * See if mnttab is changed
2811  */
2812 void
2813 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2814 {
2815         int changed;
2816
2817         *phpp = NULL;
2818
2819         /*
2820          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2821          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2822          * to not grab the vfs list lock because tv_sec is monotonically
2823          * increasing.
2824          */
2825
2826         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2827             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2828         if (!changed) {
2829                 *phpp = &vfs_pollhd;
2830         }
2831 }
2832
2833 /* Provide a unique and monotonically-increasing timestamp. */
2834 void
2835 vfs_mono_time(timespec_t *ts)
2836 {
2837         static volatile hrtime_t hrt;           /* The saved time. */
2838         hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
2839         timespec_t      newts;
2840
2841         /*
2842          * Try gethrestime() first, but be prepared to fabricate a sensible
2843          * answer at the first sign of any trouble.
2844          */
2845         gethrestime(&newts);
2846         newhrt = ts2hrt(&newts);
2847         for (;;) {
2848                 oldhrt = hrt;
2849                 if (newhrt <= hrt)
2850                         newhrt = hrt + 1;
2851                 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2852                         break;
2853         }
2854         hrt2ts(newhrt, ts);
2855 }
2856
2857 /*
2858  * Update the mnttab modification time and wake up any waiters for
2859  * mnttab changes
2860  */
2861 void
2862 vfs_mnttab_modtimeupd()
2863 {
2864         hrtime_t oldhrt, newhrt;
2865
2866         ASSERT(RW_WRITE_HELD(&vfslist));
2867         oldhrt = ts2hrt(&vfs_mnttab_mtime);
2868         gethrestime(&vfs_mnttab_mtime);
2869         newhrt = ts2hrt(&vfs_mnttab_mtime);
2870         if (oldhrt == (hrtime_t)0)
2871                 vfs_mnttab_ctime = vfs_mnttab_mtime;
2872         /*
2873          * Attempt to provide unique mtime (like uniqtime but not).
2874          */
2875         if (newhrt == oldhrt) {
2876                 newhrt++;
2877                 hrt2ts(newhrt, &vfs_mnttab_mtime);
2878         }
2879         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2880         vfs_mnttab_writeop();
2881 }
2882
2883 int
2884 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2885 {
2886         vnode_t *coveredvp;
2887         int error;
2888         extern void teardown_vopstats(vfs_t *);
2889
2890         /*
2891          * Get covered vnode. This will be NULL if the vfs is not linked
2892          * into the file system name space (i.e., domount() with MNT_NOSPICE).
2893          */
2894         coveredvp = vfsp->vfs_vnodecovered;
2895         ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
2896
2897         /*
2898          * Purge all dnlc entries for this vfs.
2899          */
2900         (void) dnlc_purge_vfsp(vfsp, 0);
2901
2902         /* For forcible umount, skip VFS_SYNC() since it may hang */
2903         if ((flag & MS_FORCE) == 0)
2904                 (void) VFS_SYNC(vfsp, 0, cr);
2905
2906         /*
2907          * Lock the vfs to maintain fs status quo during unmount.  This
2908          * has to be done after the sync because ufs_update tries to acquire
2909          * the vfs_reflock.
2910          */
2911         vfs_lock_wait(vfsp);
2912
2913         if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
2914                 vfs_unlock(vfsp);
2915                 if (coveredvp != NULL)
2916                         vn_vfsunlock(coveredvp);
2917         } else if (coveredvp != NULL) {
2918                 teardown_vopstats(vfsp);
2919                 /*
2920                  * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
2921                  * when it frees vfsp so we do a VN_HOLD() so we can
2922                  * continue to use coveredvp afterwards.
2923                  */
2924                 VN_HOLD(coveredvp);
2925                 vfs_remove(vfsp);
2926                 vn_vfsunlock(coveredvp);
2927                 VN_RELE(coveredvp);
2928         } else {
2929                 teardown_vopstats(vfsp);
2930                 /*
2931                  * Release the reference to vfs that is not linked
2932                  * into the name space.
2933                  */
2934                 vfs_unlock(vfsp);
2935                 VFS_RELE(vfsp);
2936         }
2937         return (error);
2938 }
2939
2940
/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 *
 * The list is walked backwards from the root (via vfs_prev).  Unmount
 * failures are ignored.
 */
void
vfs_unmountall(void)
{
        struct vfs *vfsp;
        struct vfs *prev_vfsp = NULL;
        int error;

        /*
         * Toss all dnlc entries now so that the per-vfs sync
         * and unmount operations don't have to slog through
         * a bunch of uninteresting vnodes over and over again.
         */
        dnlc_purge();

        vfs_list_lock();
        for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
                /* Remember the next candidate before vfsp can go away. */
                prev_vfsp = vfsp->vfs_prev;

                /*
                 * Skip any vfs whose lock, or whose covered vnode's
                 * vfs write lock, cannot be obtained.  The vfs lock
                 * itself is only held while the vnode lock is taken.
                 */
                if (vfs_lock(vfsp) != 0)
                        continue;
                error = vn_vfswlock(vfsp->vfs_vnodecovered);
                vfs_unlock(vfsp);
                if (error)
                        continue;

                /* The list lock is not held across the sync and unmount. */
                vfs_list_unlock();

                (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
                (void) dounmount(vfsp, 0, CRED());

                /*
                 * Since we dropped the vfslist lock above we must
                 * verify that prev_vfsp still exists, else start over
                 * from the end of the list.
                 */
                vfs_list_lock();
                for (vfsp = rootvfs->vfs_prev;
                    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
                        if (vfsp == prev_vfsp)
                                break;
                if (vfsp == rootvfs && prev_vfsp != rootvfs)
                        prev_vfsp = rootvfs->vfs_prev;
        }
        vfs_list_unlock();
}
2993
2994 /*
2995  * Called to add an entry to the end of the vfs mount in progress list
2996  */
2997 void
2998 vfs_addmip(dev_t dev, struct vfs *vfsp)
2999 {
3000         struct ipmnt *mipp;
3001
3002         mipp = kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3003         mipp->mip_next = NULL;
3004         mipp->mip_dev = dev;
3005         mipp->mip_vfsp = vfsp;
3006         mutex_enter(&vfs_miplist_mutex);
3007         if (vfs_miplist_end != NULL)
3008                 vfs_miplist_end->mip_next = mipp;
3009         else
3010                 vfs_miplist = mipp;
3011         vfs_miplist_end = mipp;
3012         mutex_exit(&vfs_miplist_mutex);
3013 }
3014
3015 /*
3016  * Called to remove an entry from the mount in progress list
3017  * Either because the mount completed or it failed.
3018  */
3019 void
3020 vfs_delmip(struct vfs *vfsp)
3021 {
3022         struct ipmnt *mipp, *mipprev;
3023
3024         mutex_enter(&vfs_miplist_mutex);
3025         mipprev = NULL;
3026         for (mipp = vfs_miplist;
3027             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3028                 mipprev = mipp;
3029         }
3030         if (mipp == NULL)
3031                 return; /* shouldn't happen */
3032         if (mipp == vfs_miplist_end)
3033                 vfs_miplist_end = mipprev;
3034         if (mipprev == NULL)
3035                 vfs_miplist = mipp->mip_next;
3036         else
3037                 mipprev->mip_next = mipp->mip_next;
3038         mutex_exit(&vfs_miplist_mutex);
3039         kmem_free(mipp, sizeof (struct ipmnt));
3040 }
3041
3042 /*
3043  * vfs_add is called by a specific filesystem's mount routine to add
3044  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3045  * The vfs should already have been locked by the caller.
3046  *
3047  * coveredvp is NULL if this is the root.
3048  */
3049 void
3050 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3051 {
3052         int newflag;
3053
3054         ASSERT(vfs_lock_held(vfsp));
3055         VFS_HOLD(vfsp);
3056         newflag = vfsp->vfs_flag;
3057         if (mflag & MS_RDONLY)
3058                 newflag |= VFS_RDONLY;
3059         else
3060                 newflag &= ~VFS_RDONLY;
3061         if (mflag & MS_NOSUID)
3062                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3063         else
3064                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3065         if (mflag & MS_NOMNTTAB)
3066                 newflag |= VFS_NOMNTTAB;
3067         else
3068                 newflag &= ~VFS_NOMNTTAB;
3069
3070         if (coveredvp != NULL) {
3071                 ASSERT(vn_vfswlock_held(coveredvp));
3072                 coveredvp->v_vfsmountedhere = vfsp;
3073                 VN_HOLD(coveredvp);
3074         }
3075         vfsp->vfs_vnodecovered = coveredvp;
3076         vfsp->vfs_flag = newflag;
3077
3078         vfs_list_add(vfsp);
3079 }
3080
3081 /*
3082  * Remove a vfs from the vfs list, null out the pointer from the
3083  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3084  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3085  * reference to the vfs and to the covered vnode.
3086  *
3087  * Called from dounmount after it's confirmed with the file system
3088  * that the unmount is legal.
3089  */
3090 void
3091 vfs_remove(struct vfs *vfsp)
3092 {
3093         vnode_t *vp;
3094
3095         ASSERT(vfs_lock_held(vfsp));
3096
3097         /*
3098          * Can't unmount root.  Should never happen because fs will
3099          * be busy.
3100          */
3101         if (vfsp == rootvfs)
3102                 panic("vfs_remove: unmounting root");
3103
3104         vfs_list_remove(vfsp);
3105
3106         /*
3107          * Unhook from the file system name space.
3108          */
3109         vp = vfsp->vfs_vnodecovered;
3110         ASSERT(vn_vfswlock_held(vp));
3111         vp->v_vfsmountedhere = NULL;
3112         vfsp->vfs_vnodecovered = NULL;
3113         VN_RELE(vp);
3114
3115         /*
3116          * Release lock and wakeup anybody waiting.
3117          */
3118         vfs_unlock(vfsp);
3119         VFS_RELE(vfsp);
3120 }
3121
3122 /*
3123  * Lock a filesystem to prevent access to it while mounting,
3124  * unmounting and syncing.  Return EBUSY immediately if lock
3125  * can't be acquired.
3126  */
3127 int
3128 vfs_lock(vfs_t *vfsp)
3129 {
3130         vn_vfslocks_entry_t *vpvfsentry;
3131
3132         vpvfsentry = vn_vfslocks_getlock(vfsp);
3133         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3134                 return (0);
3135
3136         vn_vfslocks_rele(vpvfsentry);
3137         return (EBUSY);
3138 }
3139
3140 int
3141 vfs_rlock(vfs_t *vfsp)
3142 {
3143         vn_vfslocks_entry_t *vpvfsentry;
3144
3145         vpvfsentry = vn_vfslocks_getlock(vfsp);
3146
3147         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3148                 return (0);
3149
3150         vn_vfslocks_rele(vpvfsentry);
3151         return (EBUSY);
3152 }
3153
3154 void
3155 vfs_lock_wait(vfs_t *vfsp)
3156 {
3157         vn_vfslocks_entry_t *vpvfsentry;
3158
3159         vpvfsentry = vn_vfslocks_getlock(vfsp);
3160         rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3161 }
3162
3163 void
3164 vfs_rlock_wait(vfs_t *vfsp)
3165 {
3166         vn_vfslocks_entry_t *vpvfsentry;
3167
3168         vpvfsentry = vn_vfslocks_getlock(vfsp);
3169         rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3170 }
3171
/*
 * Unlock a locked filesystem.
 *
 * Releases the lock taken by one of the vfs locking routines and drops
 * the lock entry reference that routine kept.  Does nothing once the
 * system is panicking.
 */
void
vfs_unlock(vfs_t *vfsp)
{
        vn_vfslocks_entry_t *vpvfsentry;

        /*
         * vfs_unlock will mimic sema_v behaviour to fix 4748018.
         * And these changes should remain for the patch changes as it is.
         */
        if (panicstr)
                return;

        /*
         * ve_refcount needs to be dropped twice here.
         * 1. To release the reference taken by the call to
         *    vn_vfslocks_getlock() just below.
         * 2. To release the reference kept by the locking routines
         *    (vfs_lock(), vfs_rlock(), vfs_lock_wait(), vfs_rlock_wait()).
         */

        vpvfsentry = vn_vfslocks_getlock(vfsp);
        vn_vfslocks_rele(vpvfsentry);

        rwst_exit(&vpvfsentry->ve_lock);
        vn_vfslocks_rele(vpvfsentry);
}
3200
3201 /*
3202  * Utility routine that allows a filesystem to construct its
3203  * fsid in "the usual way" - by munging some underlying dev_t and
3204  * the filesystem type number into the 64-bit fsid.  Note that
3205  * this implicitly relies on dev_t persistence to make filesystem
3206  * id's persistent.
3207  *
3208  * There's nothing to prevent an individual fs from constructing its
3209  * fsid in a different way, and indeed they should.
3210  *
3211  * Since we want fsids to be 32-bit quantities (so that they can be
3212  * exported identically by either 32-bit or 64-bit APIs, as well as
3213  * the fact that fsid's are "known" to NFS), we compress the device
3214  * number given down to 32-bits, and panic if that isn't possible.
3215  */
3216 void
3217 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3218 {
3219         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3220                 panic("device number too big for fsid!");
3221         fsi->val[1] = val;
3222 }
3223
3224 int
3225 vfs_lock_held(vfs_t *vfsp)
3226 {
3227         int held;
3228         vn_vfslocks_entry_t *vpvfsentry;
3229
3230         /*
3231          * vfs_lock_held will mimic sema_held behaviour
3232          * if panicstr is set. And these changes should remain
3233          * for the patch changes as it is.
3234          */
3235         if (panicstr)
3236                 return (1);
3237
3238         vpvfsentry = vn_vfslocks_getlock(vfsp);
3239         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3240
3241         vn_vfslocks_rele(vpvfsentry);
3242         return (held);
3243 }
3244
3245 struct _kthread *
3246 vfs_lock_owner(vfs_t *vfsp)
3247 {
3248         struct _kthread *owner;
3249         vn_vfslocks_entry_t *vpvfsentry;
3250
3251         /*
3252          * vfs_wlock_held will mimic sema_held behaviour
3253          * if panicstr is set. And these changes should remain
3254          * for the patch changes as it is.
3255          */
3256         if (panicstr)
3257                 return (NULL);
3258
3259         vpvfsentry = vn_vfslocks_getlock(vfsp);
3260         owner = rwst_owner(&vpvfsentry->ve_lock);
3261
3262         vn_vfslocks_rele(vpvfsentry);
3263         return (owner);
3264 }
3265
3266 /*
3267  * vfs list locking.
3268  *
3269  * Rather than manipulate the vfslist lock directly, we abstract into lock
3270  * and unlock routines to allow the locking implementation to be changed for
3271  * clustering.
3272  *
3273  * Whenever the vfs list is modified through its hash links, the overall list
3274  * lock must be obtained before locking the relevant hash bucket.  But to see
3275  * whether a given vfs is on the list, it suffices to obtain the lock for the
3276  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3277  */
3278
3279 void
3280 vfs_list_lock()
3281 {
3282         rw_enter(&vfslist, RW_WRITER);
3283 }
3284
3285 void
3286 vfs_list_read_lock()
3287 {
3288         rw_enter(&vfslist, RW_READER);
3289 }
3290
3291 void
3292 vfs_list_unlock()
3293 {
3294         rw_exit(&vfslist);
3295 }
3296
3297 /*
3298  * Low level worker routines for adding entries to and removing entries from
3299  * the vfs list.
3300  */
3301
3302 static void
3303 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3304 {
3305         int vhno;
3306         struct vfs **hp;
3307         dev_t dev;
3308
3309         ASSERT(RW_WRITE_HELD(&vfslist));
3310
3311         dev = expldev(vfsp->vfs_fsid.val[0]);
3312         vhno = VFSHASH(getmajor(dev), getminor(dev));
3313
3314         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3315
3316         /*
3317          * Link into the hash table, inserting it at the end, so that LOFS
3318          * with the same fsid as UFS (or other) file systems will not hide the
3319          * UFS.
3320          */
3321         if (insert_at_head) {
3322                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3323                 rvfs_list[vhno].rvfs_head = vfsp;
3324         } else {
3325                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3326                     hp = &(*hp)->vfs_hash)
3327                         continue;
3328                 /*
3329                  * hp now contains the address of the pointer to update
3330                  * to effect the insertion.
3331                  */
3332                 vfsp->vfs_hash = NULL;
3333                 *hp = vfsp;
3334         }
3335
3336         rvfs_list[vhno].rvfs_len++;
3337         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3338 }
3339
3340
3341 static void
3342 vfs_hash_remove(struct vfs *vfsp)
3343 {
3344         int vhno;
3345         struct vfs *tvfsp;
3346         dev_t dev;
3347
3348         ASSERT(RW_WRITE_HELD(&vfslist));
3349
3350         dev = expldev(vfsp->vfs_fsid.val[0]);
3351         vhno = VFSHASH(getmajor(dev), getminor(dev));
3352
3353         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3354
3355         /*
3356          * Remove from hash.
3357          */
3358         if (rvfs_list[vhno].rvfs_head == vfsp) {
3359                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3360                 rvfs_list[vhno].rvfs_len--;
3361                 goto foundit;
3362         }
3363         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3364             tvfsp = tvfsp->vfs_hash) {
3365                 if (tvfsp->vfs_hash == vfsp) {
3366                         tvfsp->vfs_hash = vfsp->vfs_hash;
3367                         rvfs_list[vhno].rvfs_len--;
3368                         goto foundit;
3369                 }
3370         }
3371         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3372
3373 foundit:
3374
3375         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3376 }
3377
3378
/*
 * Add a vfs to the global vfs list, to the vfs list of the zone it is
 * mounted into, and to the fsid hash.  The vfs is stamped with a creation
 * time, its owning zone is recorded, and the mnttab modification time is
 * updated.  The vfs list lock is taken and released internally.
 */
void
vfs_list_add(struct vfs *vfsp)
{
        zone_t *zone;

        /*
         * Typically, the vfs_t will have been created on behalf of the file
         * system in vfs_init, where it will have been provided with a
         * vfs_impl_t. This, however, might be lacking if the vfs_t was created
         * by an unbundled file system. We therefore check for such an example
         * before stamping the vfs_t with its creation time for the benefit of
         * mntfs.
         */
        if (vfsp->vfs_implp == NULL)
                vfsimpl_setup(vfsp);
        vfs_mono_time(&vfsp->vfs_hrctime);

        /*
         * The zone that owns the mount is the one that performed the mount.
         * Note that this isn't necessarily the same as the zone mounted into.
         * The corresponding zone_rele_ref() will be done when the vfs_t
         * is being free'd.
         */
        vfsp->vfs_zone = curproc->p_zone;
        zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
        zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
            ZONE_REF_VFS);

        /*
         * Find the zone mounted into, and put this mount on its vfs list.
         * The lookup is by mount point path.
         */
        zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
        ASSERT(zone != NULL);
        /*
         * Special casing for the root vfs.  This structure is allocated
         * statically and hooked onto rootvfs at link time.  During the
         * vfs_mountroot call at system startup time, the root file system's
         * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
         * as argument.  The code below must detect and handle this special
         * case.  The only apparent justification for this special casing is
         * to ensure that the root file system appears at the head of the
         * list.
         *
         * XXX: I'm assuming that it's ok to do normal list locking when
         *      adding the entry for the root file system (this used to be
         *      done with no locks held).
         */
        vfs_list_lock();
        /*
         * Link into the vfs list proper.
         */
        if (vfsp == &root) {
                /*
                 * Assert: This vfs is already on the list as its first entry.
                 * Thus, there's nothing to do.
                 */
                ASSERT(rootvfs == vfsp);
                /*
                 * Add it to the head of the global zone's vfslist.
                 */
                ASSERT(zone == global_zone);
                ASSERT(zone->zone_vfslist == NULL);
                zone->zone_vfslist = vfsp;
        } else {
                /*
                 * Link to end of list using vfs_prev (as rootvfs is now a
                 * doubly linked circular list) so list is in mount order for
                 * mnttab use.
                 */
                rootvfs->vfs_prev->vfs_next = vfsp;
                vfsp->vfs_prev = rootvfs->vfs_prev;
                rootvfs->vfs_prev = vfsp;
                vfsp->vfs_next = rootvfs;

                /*
                 * Do it again for the zone-private list (which may be NULL).
                 * An empty zone list simply gets vfsp as its head.
                 * NOTE(review): that case does not touch vfs_zone_next or
                 * vfs_zone_prev, so it presumably relies on them already
                 * pointing at vfsp itself -- confirm against the vfs
                 * initialisation code.
                 */
                if (zone->zone_vfslist == NULL) {
                        ASSERT(zone != global_zone);
                        zone->zone_vfslist = vfsp;
                } else {
                        zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
                        vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
                        zone->zone_vfslist->vfs_zone_prev = vfsp;
                        vfsp->vfs_zone_next = zone->zone_vfslist;
                }
        }

        /*
         * Link into the hash table, inserting it at the end, so that LOFS
         * with the same fsid as UFS (or other) file systems will not hide
         * the UFS.
         */
        vfs_hash_add(vfsp, 0);

        /*
         * update the mnttab modification time
         */
        vfs_mnttab_modtimeupd();
        vfs_list_unlock();
        /*
         * Done with the zone; presumably this drops the hold returned by
         * zone_find_by_path() above -- verify against the zone code.
         */
        zone_rele(zone);
}
3481
3482 void
3483 vfs_list_remove(struct vfs *vfsp)
3484 {
3485         zone_t *zone;
3486
3487         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3488         ASSERT(zone != NULL);
3489         /*
3490          * Callers are responsible for preventing attempts to unmount the
3491          * root.
3492          */
3493         ASSERT(vfsp != rootvfs);
3494
3495         vfs_list_lock();
3496
3497         /*
3498          * Remove from hash.
3499          */
3500         vfs_hash_remove(vfsp);
3501
3502         /*
3503          * Remove from vfs list.
3504          */
3505         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3506         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3507         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3508
3509         /*
3510          * Remove from zone-specific vfs list.
3511          */
3512         if (zone->zone_vfslist == vfsp)
3513                 zone->zone_vfslist = vfsp->vfs_zone_next;
3514
3515         if (vfsp->vfs_zone_next == vfsp) {
3516                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3517                 ASSERT(zone->zone_vfslist == vfsp);
3518                 zone->zone_vfslist = NULL;
3519         }
3520
3521         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3522         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3523         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3524
3525         /*
3526          * update the mnttab modification time
3527          */
3528         vfs_mnttab_modtimeupd();
3529         vfs_list_unlock();
3530         zone_rele(zone);
3531 }
3532
3533 struct vfs *
3534 getvfs(fsid_t *fsid)
3535 {
3536         struct vfs *vfsp;
3537         int val0 = fsid->val[0];
3538         int val1 = fsid->val[1];
3539         dev_t dev = expldev(val0);
3540         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3541         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3542
3543         mutex_enter(hmp);
3544         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3545                 if (vfsp->vfs_fsid.val[0] == val0 &&
3546                     vfsp->vfs_fsid.val[1] == val1) {
3547                         VFS_HOLD(vfsp);
3548                         mutex_exit(hmp);
3549                         return (vfsp);
3550                 }
3551         }
3552         mutex_exit(hmp);
3553         return (NULL);
3554 }
3555
3556 /*
3557  * Search the vfs mount in progress list for a specified device/vfs entry.
3558  * Returns 0 if the first entry in the list that the device matches has the
3559  * given vfs pointer as well.  If the device matches but a different vfs
3560  * pointer is encountered in the list before the given vfs pointer then
3561  * a 1 is returned.
3562  */
3563
3564 int
3565 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3566 {
3567         int retval = 0;
3568         struct ipmnt *mipp;
3569
3570         mutex_enter(&vfs_miplist_mutex);
3571         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3572                 if (mipp->mip_dev == dev) {
3573                         if (mipp->mip_vfsp != vfsp)
3574                                 retval = 1;
3575                         break;
3576                 }
3577         }
3578         mutex_exit(&vfs_miplist_mutex);
3579         return (retval);
3580 }
3581
3582 /*
3583  * Search the vfs list for a specified device.  Returns 1, if entry is found
3584  * or 0 if no suitable entry is found.
3585  */
3586
3587 int
3588 vfs_devismounted(dev_t dev)
3589 {
3590         struct vfs *vfsp;
3591         int found;
3592
3593         vfs_list_read_lock();
3594         vfsp = rootvfs;
3595         found = 0;
3596         do {
3597                 if (vfsp->vfs_dev == dev) {
3598                         found = 1;
3599                         break;
3600                 }
3601                 vfsp = vfsp->vfs_next;
3602         } while (vfsp != rootvfs);
3603
3604         vfs_list_unlock();
3605         return (found);
3606 }
3607
3608 /*
3609  * Search the vfs list for a specified device.  Returns a pointer to it
3610  * or NULL if no suitable entry is found. The caller of this routine
3611  * is responsible for releasing the returned vfs pointer.
3612  */
3613 struct vfs *
3614 vfs_dev2vfsp(dev_t dev)
3615 {
3616         struct vfs *vfsp;
3617         int found;
3618
3619         vfs_list_read_lock();
3620         vfsp = rootvfs;
3621         found = 0;
3622         do {
3623                 /*
3624                  * The following could be made more efficient by making
3625                  * the entire loop use vfs_zone_next if the call is from
3626                  * a zone.  The only callers, however, ustat(2) and
3627                  * umount2(2), don't seem to justify the added
3628                  * complexity at present.
3629                  */
3630                 if (vfsp->vfs_dev == dev &&
3631                     ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3632                     curproc->p_zone)) {
3633                         VFS_HOLD(vfsp);
3634                         found = 1;
3635                         break;
3636                 }
3637                 vfsp = vfsp->vfs_next;
3638         } while (vfsp != rootvfs);
3639         vfs_list_unlock();
3640         return (found ? vfsp: NULL);
3641 }
3642
3643 /*
3644  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3645  * or NULL if no suitable entry is found. The caller of this routine
3646  * is responsible for releasing the returned vfs pointer.
3647  *
3648  * Note that if multiple mntpoints match, the last one matching is
3649  * returned in an attempt to return the "top" mount when overlay
3650  * mounts are covering the same mount point.  This is accomplished by starting
3651  * at the end of the list and working our way backwards, stopping at the first
3652  * matching mount.
3653  */
3654 struct vfs *
3655 vfs_mntpoint2vfsp(const char *mp)
3656 {
3657         struct vfs *vfsp;
3658         struct vfs *retvfsp = NULL;
3659         zone_t *zone = curproc->p_zone;
3660         struct vfs *list;
3661
3662         vfs_list_read_lock();
3663         if (getzoneid() == GLOBAL_ZONEID) {
3664                 /*
3665                  * The global zone may see filesystems in any zone.
3666                  */
3667                 vfsp = rootvfs->vfs_prev;
3668                 do {
3669                         if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3670                                 retvfsp = vfsp;
3671                                 break;
3672                         }
3673                         vfsp = vfsp->vfs_prev;
3674                 } while (vfsp != rootvfs->vfs_prev);
3675         } else if ((list = zone->zone_vfslist) != NULL) {
3676                 const char *mntpt;
3677
3678                 vfsp = list->vfs_zone_prev;
3679                 do {
3680                         mntpt = refstr_value(vfsp->vfs_mntpt);
3681                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3682                         if (strcmp(mntpt, mp) == 0) {
3683                                 retvfsp = vfsp;
3684                                 break;
3685                         }
3686                         vfsp = vfsp->vfs_zone_prev;
3687                 } while (vfsp != list->vfs_zone_prev);
3688         }
3689         if (retvfsp)
3690                 VFS_HOLD(retvfsp);
3691         vfs_list_unlock();
3692         return (retvfsp);
3693 }
3694
3695 /*
3696  * Search the vfs list for a specified vfsops.
3697  * if vfs entry is found then return 1, else 0.
3698  */
3699 int
3700 vfs_opsinuse(const struct vfsops *ops)
3701 {
3702         struct vfs *vfsp;
3703         int found;
3704
3705         vfs_list_read_lock();
3706         vfsp = rootvfs;
3707         found = 0;
3708         do {
3709                 if (vfs_getops(vfsp) == ops) {
3710                         found = 1;
3711                         break;
3712                 }
3713                 vfsp = vfsp->vfs_next;
3714         } while (vfsp != rootvfs);
3715         vfs_list_unlock();
3716         return (found);
3717 }
3718
3719 /*
3720  * Allocate an entry in vfssw for a file system type
3721  */
3722 struct vfssw *
3723 allocate_vfssw(const char *type)
3724 {
3725         struct vfssw *vswp;
3726
3727         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3728                 /*
3729                  * The vfssw table uses the empty string to identify an
3730                  * available entry; we cannot add any type which has
3731                  * a leading NUL. The string length is limited to
3732                  * the size of the st_fstype array in struct stat.
3733                  */
3734                 return (NULL);
3735         }
3736
3737         ASSERT(VFSSW_WRITE_LOCKED());
3738         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3739                 if (!ALLOCATED_VFSSW(vswp)) {
3740                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3741                         (void) strcpy(vswp->vsw_name, type);
3742                         ASSERT(vswp->vsw_count == 0);
3743                         vswp->vsw_count = 1;
3744                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3745                         return (vswp);
3746                 }
3747         return (NULL);
3748 }
3749
/*
 * Translate a vfs type name into the name of the kernel module that
 * implements it:
 *
 *	"proc"			-> "procfs"
 *	"fd"			-> "fdfs"
 *	anything starting "nfs"	-> "nfs"
 *
 * Every other type name is returned unchanged (the same pointer that was
 * passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match: nfs, nfs2, nfs3, nfs4, nfsdyn, ... */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3767
3768 /*
3769  * Find a vfssw entry given a file system type name.
3770  * Try to autoload the filesystem if it's not found.
3771  * If it's installed, return the vfssw locked to prevent unloading.
3772  */
3773 struct vfssw *
3774 vfs_getvfssw(const char *type)
3775 {
3776         struct vfssw *vswp;
3777         const char *modname;
3778
3779         RLOCK_VFSSW();
3780         vswp = vfs_getvfsswbyname(type);
3781         modname = vfs_to_modname(type);
3782
3783         if (rootdir == NULL) {
3784                 /*
3785                  * If we haven't yet loaded the root file system, then our
3786                  * _init won't be called until later. Allocate vfssw entry,
3787                  * because mod_installfs won't be called.
3788                  */
3789                 if (vswp == NULL) {
3790                         RUNLOCK_VFSSW();
3791                         WLOCK_VFSSW();
3792                         if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3793                                 if ((vswp = allocate_vfssw(type)) == NULL) {
3794                                         WUNLOCK_VFSSW();
3795                                         return (NULL);
3796                                 }
3797                         }
3798                         WUNLOCK_VFSSW();
3799                         RLOCK_VFSSW();
3800                 }
3801                 if (!VFS_INSTALLED(vswp)) {
3802                         RUNLOCK_VFSSW();
3803                         (void) modloadonly("fs", modname);
3804                 } else
3805                         RUNLOCK_VFSSW();
3806                 return (vswp);
3807         }
3808
3809         /*
3810          * Try to load the filesystem.  Before calling modload(), we drop
3811          * our lock on the VFS switch table, and pick it up after the
3812          * module is loaded.  However, there is a potential race:  the
3813          * module could be unloaded after the call to modload() completes
3814          * but before we pick up the lock and drive on.  Therefore,
3815          * we keep reloading the module until we've loaded the module
3816          * _and_ we have the lock on the VFS switch table.
3817          */
3818         while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3819                 RUNLOCK_VFSSW();
3820                 if (modload("fs", modname) == -1)
3821                         return (NULL);
3822                 RLOCK_VFSSW();
3823                 if (vswp == NULL)
3824                         if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3825                                 break;
3826         }
3827         RUNLOCK_VFSSW();
3828
3829         return (vswp);
3830 }
3831
3832 /*
3833  * Find a vfssw entry given a file system type name.
3834  */
3835 struct vfssw *
3836 vfs_getvfsswbyname(const char *type)
3837 {
3838         struct vfssw *vswp;
3839
3840         ASSERT(VFSSW_LOCKED());
3841         if (type == NULL || *type == '\0')
3842                 return (NULL);
3843
3844         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3845                 if (strcmp(type, vswp->vsw_name) == 0) {
3846                         vfs_refvfssw(vswp);
3847                         return (vswp);
3848                 }
3849         }
3850
3851         return (NULL);
3852 }
3853
3854 /*
3855  * Find a vfssw entry given a set of vfsops.
3856  */
3857 struct vfssw *
3858 vfs_getvfsswbyvfsops(const struct vfsops *ops)
3859 {
3860         struct vfssw *vswp;
3861
3862         RLOCK_VFSSW();
3863         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3864                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == ops) {
3865                         vfs_refvfssw(vswp);
3866                         RUNLOCK_VFSSW();
3867                         return (vswp);
3868                 }
3869         }
3870         RUNLOCK_VFSSW();
3871
3872         return (NULL);
3873 }
3874
3875 /*
3876  * Reference a vfssw entry.
3877  */
3878 void
3879 vfs_refvfssw(struct vfssw *vswp)
3880 {
3881
3882         mutex_enter(&vswp->vsw_lock);
3883         vswp->vsw_count++;
3884         mutex_exit(&vswp->vsw_lock);
3885 }
3886
3887 /*
3888  * Unreference a vfssw entry.
3889  */
3890 void
3891 vfs_unrefvfssw(struct vfssw *vswp)
3892 {
3893
3894         mutex_enter(&vswp->vsw_lock);
3895         vswp->vsw_count--;
3896         mutex_exit(&vswp->vsw_lock);
3897 }
3898
3899 static int sync_retries = 20;   /* number of retries when not making progress */
3900 static int sync_triesleft;      /* portion of sync_retries remaining */
3901
3902 static pgcnt_t old_pgcnt, new_pgcnt;
3903 static int new_bufcnt, old_bufcnt;
3904
3905 /*
3906  * Sync all of the mounted filesystems, and then wait for the actual i/o to
3907  * complete.  We wait by counting the number of dirty pages and buffers,
3908  * pushing them out using bio_busy() and page_busy(), and then counting again.
3909  * This routine is used during the uadmin A_SHUTDOWN code.  It should only
3910  * be used after some higher-level mechanism has quiesced the system so that
3911  * new writes are not being initiated while we are waiting for completion.
3912  *
3913  * To ensure finite running time, our algorithm uses sync_triesleft (a progress
3914  * counter used by the vfs_syncall() loop below). It is declared above so
3915  * it can be found easily in the debugger.
3916  *
3917  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
3918  * sync_retries consecutive calls to bio_busy() and page_busy() without
3919  * decreasing either the number of dirty buffers or dirty pages below the
3920  * lowest count we have seen so far, we give up and return from vfs_syncall().
3921  *
3922  * Each loop iteration ends with a call to delay() one second to allow time for
3923  * i/o completion and to permit the user time to read our progress messages.
3924  */
3925 void
3926 vfs_syncall(void)
3927 {
3928         if (rootdir == NULL && !modrootloaded)
3929                 return; /* no filesystems have been loaded yet */
3930
3931         printf("syncing file systems...");
3932         sync();
3933
3934         sync_triesleft = sync_retries;
3935
3936         old_bufcnt = new_bufcnt = INT_MAX;
3937         old_pgcnt = new_pgcnt = ULONG_MAX;
3938
3939         while (sync_triesleft > 0) {
3940                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
3941                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
3942
3943                 new_bufcnt = bio_busy(B_TRUE);
3944                 new_pgcnt = page_busy(B_TRUE);
3945
3946                 if (new_bufcnt == 0 && new_pgcnt == 0)
3947                         break;
3948
3949                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
3950                         sync_triesleft = sync_retries;
3951                 else
3952                         sync_triesleft--;
3953
3954                 if (new_bufcnt)
3955                         printf(" [%d]", new_bufcnt);
3956                 if (new_pgcnt)
3957                         printf(" %lu", new_pgcnt);
3958
3959                 ddi_sleep(1);
3960         }
3961
3962         if (new_bufcnt != 0 || new_pgcnt != 0)
3963                 printf(" done (not all i/o completed)\n");
3964         else
3965                 printf(" done\n");
3966
3967         ddi_sleep(1);
3968 }
3969
3970 /*
3971  * Map VFS flags to statvfs flags.  These shouldn't really be separate
3972  * flags at all.
3973  */
3974 uint_t
3975 vf_to_stf(uint_t vf)
3976 {
3977         uint_t stf = 0;
3978
3979         if (vf & VFS_RDONLY)
3980                 stf |= ST_RDONLY;
3981         if (vf & VFS_NOSETUID)
3982                 stf |= ST_NOSUID;
3983         if (vf & VFS_NOTRUNC)
3984                 stf |= ST_NOTRUNC;
3985
3986         return (stf);
3987 }
3988
/*
 * Entries for (illegal) fstype 0.
 *
 * Sync operation of the stray vfsops vector: reaching it means an
 * operation was issued against file system type 0, so it panics via
 * cmn_err(CE_PANIC).  All three arguments are ignored.  The return
 * statement only satisfies the function's int return type.
 */
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
3999
/*
 * Entries for (illegal) fstype 0.
 *
 * Shared handler for every stray vfsops slot other than sync: it panics
 * via cmn_err(CE_PANIC).  The return statement only satisfies the
 * function's int return type.
 */
int
vfsstray(void)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
4009
/*
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS. Could be used by any filesystem.
 * See bug 1203132.
 *
 * Generic vfs operation that does nothing and always returns EIO.  It
 * fills every slot of EIO_vfsops except sync.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
4020
/*
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 *
 * Sync operation of EIO_vfsops: ignores all of its arguments and always
 * returns EIO.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
4032
/*
 * Placeholder vfs whose operations all fail with EIO.  vfsinit() binds
 * it to EIO_vfsops and marks it VFS_UNMOUNTED.
 */
vfs_t EIO_vfs;

/*
 * Operations vector in which every operation returns EIO.  The handlers
 * are stored through (void *) casts because they do not share the
 * per-slot function signatures.
 */
const struct vfsops EIO_vfsops = {
	.vfs_mount = (void *) vfs_EIO,
	.vfs_unmount = (void *) vfs_EIO,
	.vfs_root = (void *) vfs_EIO,
	.vfs_statvfs = (void *) vfs_EIO,
	.vfs_sync = (void *) vfs_EIO_sync,
	.vfs_vget = (void *) vfs_EIO,
	.vfs_mountroot = (void *) vfs_EIO,
	.vfs_freevfs = (void *) vfs_EIO,
	.vfs_vnstate = (void *) vfs_EIO,
};

/*
 * Operations vector for the (illegal) file system type 0, installed by
 * vfsinit().  Every operation panics with "stray vfs operation".
 */
static const struct vfsops stray_vfsops = {
	.vfs_mount = (void *) vfsstray,
	.vfs_unmount = (void *) vfsstray,
	.vfs_root = (void *) vfsstray,
	.vfs_statvfs = (void *) vfsstray,
	.vfs_sync = (void *) vfsstray_sync,
	.vfs_vget = (void *) vfsstray,
	.vfs_mountroot = (void *) vfsstray,
	.vfs_freevfs = (void *) vfsstray,
	.vfs_vnstate = (void *) vfsstray,
};
4058
4059 /*
4060  * Called from startup() to initialize all loaded vfs's
4061  */
4062 void
4063 vfsinit(void)
4064 {
4065         struct vfssw *vswp;
4066         int error;
4067         extern int vopstats_enabled;
4068         extern void vopstats_startup();
4069
4070         /* Create vfs cache */
4071         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4072             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4073
4074         /* Initialize the vnode cache (file systems may use it during init). */
4075         vn_create_cache();
4076
4077         /* Setup event monitor framework */
4078         fem_init();
4079
4080         /* Initialize the dummy stray file system type. */
4081         error = vfs_setfsops(0, &stray_vfsops);
4082
4083         VFS_INIT(&EIO_vfs, &EIO_vfsops, NULL);
4084
4085         /*
4086          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4087          * on this vfs can immediately notice it's invalid.
4088          */
4089         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4090
4091         /*
4092          * Call the init routines of non-loadable filesystems only.
4093          * Filesystems which are loaded as separate modules will be
4094          * initialized by the module loading code instead.
4095          */
4096
4097         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4098                 RLOCK_VFSSW();
4099                 if (vswp->vsw_init != NULL)
4100                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4101                 RUNLOCK_VFSSW();
4102         }
4103
4104         vopstats_startup();
4105
4106         if (vopstats_enabled) {
4107                 /* EIO_vfs can collect stats, but we don't retrieve them */
4108                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4109                 EIO_vfs.vfs_fstypevsp = NULL;
4110                 EIO_vfs.vfs_vskap = NULL;
4111                 EIO_vfs.vfs_flag |= VFS_STATS;
4112         }
4113
4114         xattr_init();
4115
4116         reparse_point_init();
4117 }
4118
4119 vfs_t *
4120 vfs_alloc(int kmflag)
4121 {
4122         vfs_t *vfsp;
4123
4124         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4125
4126         /*
4127          * Do the simplest initialization here.
4128          * Everything else gets done in vfs_init()
4129          */
4130         bzero(vfsp, sizeof (vfs_t));
4131         return (vfsp);
4132 }
4133
4134 void
4135 vfs_free(vfs_t *vfsp)
4136 {
4137         /*
4138          * One would be tempted to assert that "vfsp->vfs_count == 0".
4139          * The problem is that this gets called out of domount() with
4140          * a partially initialized vfs and a vfs_count of 1.  This is
4141          * also called from vfs_rele() with a vfs_count of 0.  We can't
4142          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4143          * returned.  This is because VFS_MOUNT() fully initializes the
4144          * vfs structure and its associated data.  VFS_RELE() will call
4145          * VFS_FREEVFS() which may panic the system if the data structures
4146          * aren't fully initialized from a successful VFS_MOUNT()).
4147          */
4148
4149         /* If FEM was in use, make sure everything gets cleaned up */
4150         if (vfsp->vfs_femhead) {
4151                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4152                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4153                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4154                 vfsp->vfs_femhead = NULL;
4155         }
4156
4157         if (vfsp->vfs_implp)
4158                 vfsimpl_teardown(vfsp);
4159         sema_destroy(&vfsp->vfs_reflock);
4160         kmem_cache_free(vfs_cache, vfsp);
4161 }
4162
/*
 * Increments the vfs reference count by one atomically.
 *
 * The assertion catches the 32-bit count wrapping around to zero.
 */
void
vfs_hold(vfs_t *vfsp)
{
	atomic_inc_32(&vfsp->vfs_count);
	ASSERT(vfsp->vfs_count != 0);
}
4172
4173 /*
4174  * Decrements the vfs reference count by one atomically. When
4175  * vfs reference count becomes zero, it calls the file system
4176  * specific vfs_freevfs() to free up the resources.
4177  */
4178 void
4179 vfs_rele(vfs_t *vfsp)
4180 {
4181         ASSERT(vfsp->vfs_count != 0);
4182         if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4183                 VFS_FREEVFS(vfsp);
4184                 lofi_remove(vfsp);
4185                 if (vfsp->vfs_zone)
4186                         zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4187                             ZONE_REF_VFS);
4188                 vfs_freemnttab(vfsp);
4189                 vfs_free(vfsp);
4190         }
4191 }
4192
4193
4194 #if defined(__x86)
4195 extern int hvmboot_rootconf();
4196 #endif /* __x86 */
4197
4198 extern ib_boot_prop_t *iscsiboot_prop;
4199
4200 int
4201 rootconf()
4202 {
4203         int error;
4204         struct vfssw *vsw;
4205         extern void pm_init();
4206         char *fstyp, *fsmod;
4207         int ret = -1;
4208
4209         getrootfs(&fstyp, &fsmod);
4210
4211 #if defined(__x86)
4212         /*
4213          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4214          * which lives in /platform/i86hvm, and hence is only available when
4215          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4216          * is not available then the modstub for this function will return 0.
4217          * If the hvm_bootstrap misc module is available it will be loaded
4218          * and hvmboot_rootconf() will be invoked.
4219          */
4220         if (error = hvmboot_rootconf())
4221                 return (error);
4222 #endif /* __x86 */
4223
4224         if (modload("fs", fsmod) == -1)
4225                 panic("Cannot _init %s module", fsmod);
4226
4227         RLOCK_VFSSW();
4228         vsw = vfs_getvfsswbyname(fstyp);
4229         RUNLOCK_VFSSW();
4230         if (vsw == NULL) {
4231                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4232                 return (ENXIO);
4233         }
4234         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4235         VFS_HOLD(rootvfs);
4236
4237         /* always mount readonly first */
4238         rootvfs->vfs_flag |= VFS_RDONLY;
4239
4240         pm_init();
4241
4242         if (netboot && iscsiboot_prop) {
4243                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4244                     " shouldn't happen in the same time");
4245                 return (EINVAL);
4246         }
4247
4248         if (netboot || iscsiboot_prop) {
4249                 ret = strplumb();
4250                 if (ret != 0) {
4251                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4252                         return (EFAULT);
4253                 }
4254         }
4255
4256         if ((ret == 0) && iscsiboot_prop) {
4257                 ret = modload("drv", "iscsi");
4258                 /* -1 indicates fail */
4259                 if (ret == -1) {
4260                         cmn_err(CE_WARN, "Failed to load iscsi module");
4261                         iscsi_boot_prop_free();
4262                         return (EINVAL);
4263                 } else {
4264                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4265                                 cmn_err(CE_WARN,
4266                                     "Failed to attach iscsi driver");
4267                                 iscsi_boot_prop_free();
4268                                 return (ENODEV);
4269                         }
4270                 }
4271         }
4272
4273         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4274         vfs_unrefvfssw(vsw);
4275         rootdev = rootvfs->vfs_dev;
4276
4277         if (error)
4278                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4279                     rootfs.bo_name, fstyp);
4280         else
4281                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4282                     rootfs.bo_name, fstyp);
4283         return (error);
4284 }
4285
4286 /*
4287  * XXX this is called by nfs only and should probably be removed
4288  * If booted with ASKNAME, prompt on the console for a filesystem
4289  * name and return it.
4290  */
4291 void
4292 getfsname(char *askfor, char *name, size_t namelen)
4293 {
4294         if (boothowto & RB_ASKNAME) {
4295                 printf("%s name: ", askfor);
4296                 console_gets(name, namelen);
4297         }
4298 }
4299
4300 /*
4301  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4302  * property.
4303  *
4304  * Filesystem types starting with the prefix "nfs" are diskless clients;
4305  * init the root filename name (rootfs.bo_name), too.
4306  *
4307  * If we are booting via NFS we currently have these options:
4308  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4309  *      nfs2 -  force NFS V2
4310  *      nfs3 -  force NFS V3
4311  *      nfs4 -  force NFS V4
4312  * Because we need to maintain backward compatibility with the naming
4313  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4314  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4315  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4316  * This is only for root filesystems, all other uses will expect
4317  * that "nfs" == NFS V2.
4318  */
4319 static void
4320 getrootfs(char **fstypp, char **fsmodp)
4321 {
4322         char *propstr = NULL;
4323
4324         /*
4325          * Check fstype property; for diskless it should be one of "nfs",
4326          * "nfs2", "nfs3" or "nfs4".
4327          */
4328         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4329             DDI_PROP_DONTPASS, "fstype", &propstr)
4330             == DDI_SUCCESS) {
4331                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4332                 ddi_prop_free(propstr);
4333
4334         /*
4335          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4336          * assume the type of this root filesystem is 'zfs'.
4337          */
4338         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4339             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4340             == DDI_SUCCESS) {
4341                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4342                 ddi_prop_free(propstr);
4343         }
4344
4345         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4346                 *fstypp = *fsmodp = rootfs.bo_fstype;
4347                 return;
4348         }
4349
4350         ++netboot;
4351
4352         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4353                 (void) strcpy(rootfs.bo_fstype, "nfs");
4354         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4355                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4356
4357         /*
4358          * check if path to network interface is specified in bootpath
4359          * or by a hypervisor domain configuration file.
4360          * XXPV - enable strlumb_get_netdev_path()
4361          */
4362         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4363             "xpv-nfsroot")) {
4364                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4365         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4366             DDI_PROP_DONTPASS, "bootpath", &propstr)
4367             == DDI_SUCCESS) {
4368                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4369                 ddi_prop_free(propstr);
4370         } else {
4371                 rootfs.bo_name[0] = '\0';
4372         }
4373         *fstypp = rootfs.bo_fstype;
4374         *fsmodp = "nfs";
4375 }
4376
4377 /*
4378  * VFS feature routines
4379  */
4380
/*
 * A feature value is split into two 32-bit halves: the upper half is used
 * as the index into vfs_featureset[], and the lower half is the bit mask
 * applied to the word at that index.
 */
#define	VFTINDEX(ftr)	(((ftr) >> 32) & 0xFFFFFFFF)
#define	VFTBITS(ftr)	((ftr) & 0xFFFFFFFFLL)
4383
4384 /* Register a feature in the vfs */
4385 void
4386 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4387 {
4388         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4389         if (vfsp->vfs_implp == NULL)
4390                 return;
4391
4392         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4393 }
4394
4395 void
4396 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4397 {
4398         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4399         if (vfsp->vfs_implp == NULL)
4400                 return;
4401         vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4402 }
4403
4404 /*
4405  * Query a vfs for a feature.
4406  * Returns 1 if feature is present, 0 if not
4407  */
4408 int
4409 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4410 {
4411         int     ret = 0;
4412
4413         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4414         if (vfsp->vfs_implp == NULL)
4415                 return (ret);
4416
4417         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4418                 ret = 1;
4419
4420         return (ret);
4421 }
4422
4423 /*
4424  * Propagate feature set from one vfs to another
4425  */
4426 void
4427 vfs_propagate_features(vfs_t *from, vfs_t *to)
4428 {
4429         int i;
4430
4431         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4432                 return;
4433
4434         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4435                 to->vfs_featureset[i] = from->vfs_featureset[i];
4436         }
4437 }
4438
4439 #define LOFINODE_PATH "/dev/lofi/%d"
4440
4441 /*
4442  * Return the vnode for the lofi node if there's a lofi mount in place.
4443  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4444  * failure.
4445  */
4446 int
4447 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4448 {
4449         char *path = NULL;
4450         int strsize;
4451         int err;
4452
4453         if (vfsp->vfs_lofi_id == 0) {
4454                 *vpp = NULL;
4455                 return (-1);
4456         }
4457
4458         strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4459         path = kmem_alloc(strsize + 1, KM_SLEEP);
4460         (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4461
4462         /*
4463          * We may be inside a zone, so we need to use the /dev path, but
4464          * it's created asynchronously, so we wait here.
4465          */
4466         for (;;) {
4467                 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4468
4469                 if (err != ENOENT)
4470                         break;
4471
4472                 if ((err = delay_sig(hz / 8)) == EINTR)
4473                         break;
4474         }
4475
4476         if (err)
4477                 *vpp = NULL;
4478
4479         kmem_free(path, strsize + 1);
4480         return (err);
4481 }