usr/src/uts/common/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25  * Copyright 2016 Toomas Soome <tsoome@me.com>
  26  * Copyright (c) 2016 by Delphix. All rights reserved.
  27  * Copyright 2016 Nexenta Systems, Inc.
  28  */
  29
  30 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  31 /*        All Rights Reserved   */
  32
  33 /*
  34  * University Copyright- Copyright (c) 1982, 1986, 1988
  35  * The Regents of the University of California
  36  * All Rights Reserved
  37  *
  38  * University Acknowledgment- Portions of this document are derived from
  39  * software developed by the University of California, Berkeley, and its
  40  * contributors.
  41  */
  42
  43 #include <sys/types.h>
  44 #include <sys/t_lock.h>
  45 #include <sys/param.h>
  46 #include <sys/errno.h>
  47 #include <sys/user.h>
  48 #include <sys/fstyp.h>
  49 #include <sys/kmem.h>
  50 #include <sys/systm.h>
  51 #include <sys/proc.h>
  52 #include <sys/mount.h>
  53 #include <sys/vfs.h>
  54 #include <sys/vfs_opreg.h>
  55 #include <sys/fem.h>
  56 #include <sys/mntent.h>
  57 #include <sys/stat.h>
  58 #include <sys/statvfs.h>
  59 #include <sys/statfs.h>
  60 #include <sys/cred.h>
  61 #include <sys/vnode.h>
  62 #include <sys/rwstlock.h>
  63 #include <sys/dnlc.h>
  64 #include <sys/file.h>
  65 #include <sys/time.h>
  66 #include <sys/atomic.h>
  67 #include <sys/cmn_err.h>
  68 #include <sys/buf.h>
  69 #include <sys/swap.h>
  70 #include <sys/debug.h>
  71 #include <sys/vnode.h>
  72 #include <sys/modctl.h>
  73 #include <sys/ddi.h>
  74 #include <sys/pathname.h>
  75 #include <sys/bootconf.h>
  76 #include <sys/dumphdr.h>
  77 #include <sys/dc_ki.h>
  78 #include <sys/poll.h>
  79 #include <sys/sunddi.h>
  80 #include <sys/sysmacros.h>
  81 #include <sys/zone.h>
  82 #include <sys/policy.h>
  83 #include <sys/ctfs.h>
  84 #include <sys/objfs.h>
  85 #include <sys/console.h>
  86 #include <sys/reboot.h>
  87 #include <sys/attr.h>
  88 #include <sys/zio.h>
  89 #include <sys/spa.h>
  90 #include <sys/lofi.h>
  91 #include <sys/bootprops.h>
  92
  93 #include <vm/page.h>
  94
  95 #include <fs/fs_subr.h>
/*
 * Private interfaces to create vopstats-related data structures.
 * They are declared here by hand rather than through a header.  In the
 * code that follows they are used by vfs_mountroot() to set up statistics
 * for the root file system.
 */
extern void             initialize_vopstats(vopstats_t *);
extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
 100
/*
 * Forward declarations of file-local (static) helpers so that they can be
 * referenced ahead of their definitions.
 * NOTE(review): the *_nolock variants are presumably meant for callers that
 * already hold the relevant lock -- confirm at the definitions.
 */
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
    const char *, int, int);
static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
    const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static void getrootfs(char **, char **);
static int getmacpath(dev_info_t *, void *);
static void vfs_mnttabvp_setup(void);
 117
/*
 * Node of a singly linked list (chained through mip_next) that pairs a
 * device number with a vfs.
 * NOTE(review): the name suggests this tracks in-progress mounts; confirm
 * against the code that manipulates vfs_miplist.
 */
struct ipmnt {
        struct ipmnt    *mip_next;      /* next node in the list */
        dev_t           mip_dev;        /* device number of this entry */
        struct vfs      *mip_vfsp;      /* vfs paired with mip_dev */
};

/*
 * Head and tail pointers of the ipmnt list; both start out NULL.
 * vfs_miplist_mutex presumably serializes access to the list -- verify at
 * the points of use.
 */
static kmutex_t         vfs_miplist_mutex;
static struct ipmnt     *vfs_miplist = NULL;
static struct ipmnt     *vfs_miplist_end = NULL;
 127
static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */

/*
 * VFS global data.
 */
vnode_t *rootdir;               /* pointer to root inode vnode. */
vnode_t *devicesdir;            /* pointer to inode of devices root */
vnode_t *devdir;                /* pointer to inode of dev root */

char *server_rootpath;          /* root path for diskless clients */
char *server_hostname;          /* hostname of diskless server */

/*
 * Statically allocated vfs structures.  "root" is what rootvfs points at;
 * "devices" and "dev" are the instances that vfs_mountdevices() and
 * vfs_mountdev1() initialize and mount on /devices and /dev.
 */
static struct vfs root;
static struct vfs devices;
static struct vfs dev;
struct vfs *rootvfs = &root;    /* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
                                /* must be power of 2!  */
timespec_t vfs_mnttab_ctime;    /* mnttab created time */
timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
/* NOTE(review): placeholder file system type string; confirm its users. */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd;     /* for mnttab pollers */
struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
int     mntfstype;              /* will be set once mnt fs is mounted */
 153
 154 /*
 155  * Table for generic options recognized in the VFS layer and acted
 156  * on at this level before parsing file system specific options.
 157  * The nosuid option is stronger than any of the devices and setuid
 158  * options, so those are canceled when nosuid is seen.
 159  *
 160  * All options which are added here need to be added to the
 161  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 162  */
 163 /*
 164  * VFS Mount options table
 165  */
/*
 * Cancel lists.  Each NULL-terminated array names the options that the
 * like-named option in mntopts[] lists as its cancel options.  Most
 * options simply name their opposite; nosuid additionally names both
 * members of the devices and setuid pairs.
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 178 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 179
/*
 * The generic option table itself, one positional initializer per option.
 * Only "remount" carries a flag (MO_NODISPLAY); no entry has a default
 * argument.
 * NOTE(review): the trailing (void *)0 in every entry is a fifth member
 * that the column header below does not name -- presumably per-option
 * private data; confirm against the mntopt_t definition.
 */
static const mntopt_t mntopts[] = {
/*
 *      option name             cancel options          default arg     flags
 */
        { MNTOPT_REMOUNT,       NULL,                   NULL,
                MO_NODISPLAY, (void *)0 },
        { MNTOPT_RO,            ro_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_RW,            rw_cancel,              NULL,           0,
                (void *)0 },
        { MNTOPT_SUID,          suid_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
                (void *)0 },
        { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
                (void *)0 },
        { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
                (void *)0 },
        { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
                (void *)0 },
        { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
                (void *)0 },
        { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
                (void *)0 },
};
 211
/*
 * Exported descriptor for the generic option table: the number of entries
 * in mntopts[] followed by a pointer to its first element.  The cast drops
 * the const qualifier that mntopts[] is declared with.
 */
const mntopts_t vfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        (mntopt_t *)&mntopts[0]
};
 216
 217 /*
 218  * File system operation dispatch functions.
 219  */
 220
 221 int
 222 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 223 {
 224         return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 225 }
 226
 227 int
 228 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 229 {
 230         return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 231 }
 232
 233 int
 234 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 235 {
 236         refstr_t *mntpt;
 237         int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 238         /*
 239          * Make sure this root has a path.  With lofs, it is possible to have
 240          * a NULL mountpoint.
 241          */
 242         if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
 243                 mntpt = vfs_getmntpoint(vfsp);
 244                 vn_setpath_str(*vpp, refstr_value(mntpt),
 245                     strlen(refstr_value(mntpt)));
 246                 refstr_rele(mntpt);
 247         }
 248
 249         return (ret);
 250 }
 251
 252 int
 253 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 254 {
 255         return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 256 }
 257
 258 int
 259 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 260 {
 261         return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 262 }
 263
 264 int
 265 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 266 {
 267         /*
 268          * In order to handle system attribute fids in a manner
 269          * transparent to the underlying fs, we embed the fid for
 270          * the sysattr parent object in the sysattr fid and tack on
 271          * some extra bytes that only the sysattr layer knows about.
 272          *
 273          * This guarantees that sysattr fids are larger than other fids
 274          * for this vfs. If the vfs supports the sysattr view interface
 275          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 276          * collision with XATTR_FIDSZ.
 277          */
 278         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 279             fidp->fid_len == XATTR_FIDSZ)
 280                 return (xattr_dir_vget(vfsp, vpp, fidp));
 281
 282         return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 283 }
 284
 285 int
 286 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 287 {
 288         return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 289 }
 290
 291 void
 292 fsop_freefs(vfs_t *vfsp)
 293 {
 294         (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 295 }
 296
 297 int
 298 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 299 {
 300         return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 301 }
 302
 303 int
 304 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 305 {
 306         ASSERT((fstype >= 0) && (fstype < nfstype));
 307
 308         if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 309                 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 310         else
 311                 return (ENOTSUP);
 312 }
 313
 314 /*
 315  * File system initialization.  vfs_setfsops() must be called from a file
 316  * system's init routine.
 317  */
 318
/*
 * Build a vfsops vector from a file system's operation template.
 *
 * vfs_ops_table lists, for every known VFS operation name, the offset of
 * the matching slot in vfsops_t followed by two fallback functions; the
 * list ends with an all-NULL/0 entry.
 * NOTE(review): the two fallbacks are presumably the "default" and "error"
 * handlers consumed by fs_build_vector(); confirm against the definition
 * of fs_operation_trans_def_t.
 *
 *      template        operations supplied by the file system
 *      actual          vfsops_t to be filled in
 *      unused_ops      handed through to fs_build_vector(); vfs_setfsops()
 *                      reports it as the number of operations supplied
 *                      but not used
 *
 * Returns whatever fs_build_vector() returns; callers in this file treat
 * zero as success.
 */
static int
fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
    int *unused_ops)
{
        static const fs_operation_trans_def_t vfs_ops_table[] = {
                VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
                        fs_nosys, fs_nosys,

                VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
                        fs_nosys, fs_nosys,

                VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
                        fs_nosys, fs_nosys,

                VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
                        fs_nosys, fs_nosys,

                VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
                        (fs_generic_func_p) fs_sync,
                        (fs_generic_func_p) fs_sync,    /* No errors allowed */

                VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
                        fs_nosys, fs_nosys,

                VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
                        fs_nosys, fs_nosys,

                VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
                        (fs_generic_func_p)fs_freevfs,
                        (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */

                VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
                        (fs_generic_func_p)fs_nosys,
                        (fs_generic_func_p)fs_nosys,

                /* terminating entry */
                NULL, 0, NULL, NULL
        };

        return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
}
 359
 360 void
 361 zfs_boot_init(void)
 362 {
 363         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 364                 spa_boot_init();
 365 }
 366
 367 int
 368 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 369 {
 370         int error;
 371         int unused_ops;
 372
 373         /*
 374          * Verify that fstype refers to a valid fs.  Note that
 375          * 0 is valid since it's used to set "stray" ops.
 376          */
 377         if ((fstype < 0) || (fstype >= nfstype))
 378                 return (EINVAL);
 379
 380         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 381                 return (EINVAL);
 382
 383         /* Set up the operations vector. */
 384
 385         error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 386
 387         if (error != 0)
 388                 return (error);
 389
 390         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 391
 392         if (actual != NULL)
 393                 *actual = &vfssw[fstype].vsw_vfsops;
 394
 395 #if DEBUG
 396         if (unused_ops != 0)
 397                 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 398                     "but not used", vfssw[fstype].vsw_name, unused_ops);
 399 #endif
 400
 401         return (0);
 402 }
 403
 404 int
 405 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 406 {
 407         int error;
 408         int unused_ops;
 409
 410         *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 411
 412         error = fs_copyfsops(template, *actual, &unused_ops);
 413         if (error != 0) {
 414                 kmem_free(*actual, sizeof (vfsops_t));
 415                 *actual = NULL;
 416                 return (error);
 417         }
 418
 419         return (0);
 420 }
 421
 422 /*
 423  * Free a vfsops structure created as a result of vfs_makefsops().
 424  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 425  * vfs_freevfsops_by_type().
 426  */
 427 void
 428 vfs_freevfsops(vfsops_t *vfsops)
 429 {
 430         kmem_free(vfsops, sizeof (vfsops_t));
 431 }
 432
 433 /*
 434  * Since the vfsops structure is part of the vfssw table and wasn't
 435  * really allocated, we're not really freeing anything.  We keep
 436  * the name for consistency with vfs_freevfsops().  We do, however,
 437  * need to take care of a little bookkeeping.
 438  * NOTE: For a vfsops structure created by vfs_setfsops(), use
 439  * vfs_freevfsops_by_type().
 440  */
 441 int
 442 vfs_freevfsops_by_type(int fstype)
 443 {
 444
 445         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 446         if ((fstype <= 0) || (fstype >= nfstype))
 447                 return (EINVAL);
 448
 449         WLOCK_VFSSW();
 450         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 451                 WUNLOCK_VFSSW();
 452                 return (EINVAL);
 453         }
 454
 455         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 456         WUNLOCK_VFSSW();
 457
 458         return (0);
 459 }
 460
 461 /* Support routines used to reference vfs_op */
 462
 463 /* Set the operations vector for a vfs */
 464 void
 465 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
 466 {
 467         vfsops_t        *op;
 468
 469         ASSERT(vfsp != NULL);
 470         ASSERT(vfsops != NULL);
 471
 472         op = vfsp->vfs_op;
 473         membar_consumer();
 474         if (vfsp->vfs_femhead == NULL &&
 475             atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) {
 476                 return;
 477         }
 478         fsem_setvfsops(vfsp, vfsops);
 479 }
 480
 481 /* Retrieve the operations vector for a vfs */
 482 vfsops_t *
 483 vfs_getops(vfs_t *vfsp)
 484 {
 485         vfsops_t        *op;
 486
 487         ASSERT(vfsp != NULL);
 488
 489         op = vfsp->vfs_op;
 490         membar_consumer();
 491         if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
 492                 return (op);
 493         } else {
 494                 return (fsem_getvfsops(vfsp));
 495         }
 496 }
 497
 498 /*
 499  * Returns non-zero (1) if the vfsops matches that of the vfs.
 500  * Returns zero (0) if not.
 501  */
 502 int
 503 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 504 {
 505         return (vfs_getops(vfsp) == vfsops);
 506 }
 507
 508 /*
 509  * Returns non-zero (1) if the file system has installed a non-default,
 510  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 511  */
 512 int
 513 vfs_can_sync(vfs_t *vfsp)
 514 {
 515         /* vfs_sync() routine is not the default/error function */
 516         return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 517 }
 518
 519 /*
 520  * Initialize a vfs structure.
 521  */
 522 void
 523 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 524 {
 525         /* Other initialization has been moved to vfs_alloc() */
 526         vfsp->vfs_count = 0;
 527         vfsp->vfs_next = vfsp;
 528         vfsp->vfs_prev = vfsp;
 529         vfsp->vfs_zone_next = vfsp;
 530         vfsp->vfs_zone_prev = vfsp;
 531         vfsp->vfs_lofi_id = 0;
 532         sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 533         vfsimpl_setup(vfsp);
 534         vfsp->vfs_data = (data);
 535         vfs_setops((vfsp), (op));
 536 }
 537
 538 /*
 539  * Allocate and initialize the vfs implementation private data
 540  * structure, vfs_impl_t.
 541  */
 542 void
 543 vfsimpl_setup(vfs_t *vfsp)
 544 {
 545         int i;
 546
 547         if (vfsp->vfs_implp != NULL) {
 548                 return;
 549         }
 550
 551         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 552         /* Note that these are #define'd in vfs.h */
 553         vfsp->vfs_vskap = NULL;
 554         vfsp->vfs_fstypevsp = NULL;
 555
 556         /* Set size of counted array, then zero the array */
 557         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 558         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 559                 vfsp->vfs_featureset[i] = 0;
 560         }
 561 }
 562
 563 /*
 564  * Release the vfs_impl_t structure, if it exists. Some unbundled
 565  * filesystems may not use the newer version of vfs and thus
 566  * would not contain this implementation private data structure.
 567  */
 568 void
 569 vfsimpl_teardown(vfs_t *vfsp)
 570 {
 571         vfs_impl_t      *vip = vfsp->vfs_implp;
 572
 573         if (vip == NULL)
 574                 return;
 575
 576         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 577         vfsp->vfs_implp = NULL;
 578 }
 579
 580 /*
 581  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 582  * fstatvfs, and sysfs moved to common/syscall.
 583  */
 584
 585 /*
 586  * Update every mounted file system.  We call the vfs_sync operation of
 587  * each file system type, passing it a NULL vfsp to indicate that all
 588  * mounted file systems of that type should be updated.
 589  */
 590 void
 591 vfs_sync(int flag)
 592 {
 593         struct vfssw *vswp;
 594         RLOCK_VFSSW();
 595         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 596                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 597                         vfs_refvfssw(vswp);
 598                         RUNLOCK_VFSSW();
 599                         (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 600                             CRED());
 601                         vfs_unrefvfssw(vswp);
 602                         RLOCK_VFSSW();
 603                 }
 604         }
 605         RUNLOCK_VFSSW();
 606 }
 607
 608 void
 609 sync(void)
 610 {
 611         vfs_sync(0);
 612 }
 613
 614 /*
 615  * External routines.
 616  */
 617
/* Initialized in vfs_mountroot(). */
krwlock_t vfssw_lock;   /* lock accesses to vfssw */

/*
 * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 * but otherwise should be accessed only via vfs_list_lock() and
 * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 */
static krwlock_t vfslist;
 626
/*
 * Mount devfs on /devices. This is done right after root is mounted
 * to provide device access support for the system
 *
 * The statically allocated "devices" vfs is used for the mount.  Failure
 * to load devfs, to find /devices, to mount it or to obtain its root
 * panics the system.  Failure to take either lock needed for vfs_add()
 * only logs a CE_NOTE and returns, leaving the file system mounted but
 * not added to the vfs list.
 */
static void
vfs_mountdevices(void)
{
        struct vfssw *vsw;
        struct vnode *mvp;
        struct mounta mounta = {        /* fake mounta for devfs_mount() */
                NULL,
                NULL,
                MS_SYSSPACE,
                NULL,
                NULL,
                0,
                NULL,
                0
        };

        /*
         * _init devfs module to fill in the vfssw
         */
        if (modload("fs", "devfs") == -1)
                panic("Cannot _init devfs module");

        /*
         * Hold vfs
         *
         * NOTE(review): vsw is used without a NULL check; this relies on
         * the successful modload() above having registered "devfs".
         */
        RLOCK_VFSSW();
        vsw = vfs_getvfsswbyname("devfs");
        VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
        VFS_HOLD(&devices);

        /*
         * Locate mount point
         */
        if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
                panic("Cannot find /devices");

        /*
         * Perform the mount of /devices
         */
        if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
                panic("Cannot mount /devices");

        RUNLOCK_VFSSW();

        /*
         * Set appropriate members and add to vfs list for mnttab display
         */
        vfs_setresource(&devices, "/devices", 0);
        vfs_setmntpoint(&devices, "/devices", 0);

        /*
         * Hold the root of /devices so it won't go away
         */
        if (VFS_ROOT(&devices, &devicesdir))
                panic("vfs_mountdevices: not devices root");

        /* Both locks below are taken before vfs_add() is called. */
        if (vfs_lock(&devices) != 0) {
                VN_RELE(devicesdir);
                cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
                return;
        }

        if (vn_vfswlock(mvp) != 0) {
                vfs_unlock(&devices);
                VN_RELE(devicesdir);
                cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
                return;
        }

        vfs_add(mvp, &devices, 0);
        vn_vfsunlock(mvp);
        vfs_unlock(&devices);
        VN_RELE(devicesdir);
}
 705
/*
 * mount the first instance of /dev  to root and remain mounted
 *
 * This mirrors vfs_mountdevices(), using the "dev" module, the statically
 * allocated "dev" vfs and the /dev mount point, and adding MS_OVERLAY to
 * the mount flags.  Fatal problems are raised with cmn_err(CE_PANIC, ...).
 * Failure to take either lock needed for vfs_add() only logs a CE_NOTE
 * and returns, leaving the file system mounted but not added to the vfs
 * list.
 */
static void
vfs_mountdev1(void)
{
        struct vfssw *vsw;
        struct vnode *mvp;
        struct mounta mounta = {        /* fake mounta for sdev_mount() */
                NULL,
                NULL,
                MS_SYSSPACE | MS_OVERLAY,
                NULL,
                NULL,
                0,
                NULL,
                0
        };

        /*
         * _init dev module to fill in the vfssw
         */
        if (modload("fs", "dev") == -1)
                cmn_err(CE_PANIC, "Cannot _init dev module\n");

        /*
         * Hold vfs
         *
         * NOTE(review): vsw is used without a NULL check; this relies on
         * the successful modload() above having registered "dev".
         */
        RLOCK_VFSSW();
        vsw = vfs_getvfsswbyname("dev");
        VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
        VFS_HOLD(&dev);

        /*
         * Locate mount point
         */
        if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
                cmn_err(CE_PANIC, "Cannot find /dev\n");

        /*
         * Perform the mount of /dev
         */
        if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
                cmn_err(CE_PANIC, "Cannot mount /dev 1\n");

        RUNLOCK_VFSSW();

        /*
         * Set appropriate members and add to vfs list for mnttab display
         */
        vfs_setresource(&dev, "/dev", 0);
        vfs_setmntpoint(&dev, "/dev", 0);

        /*
         * Hold the root of /dev so it won't go away
         */
        if (VFS_ROOT(&dev, &devdir))
                cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");

        /* Both locks below are taken before vfs_add() is called. */
        if (vfs_lock(&dev) != 0) {
                VN_RELE(devdir);
                cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
                return;
        }

        if (vn_vfswlock(mvp) != 0) {
                vfs_unlock(&dev);
                VN_RELE(devdir);
                cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
                return;
        }

        vfs_add(mvp, &dev, 0);
        vn_vfsunlock(mvp);
        vfs_unlock(&dev);
        VN_RELE(devdir);
}
 783
 784 /*
 785  * Mount required filesystem. This is done right after root is mounted.
 786  */
 787 static void
 788 vfs_mountfs(char *module, char *spec, char *path)
 789 {
 790         struct vnode *mvp;
 791         struct mounta mounta;
 792         vfs_t *vfsp;
 793
 794         mounta.flags = MS_SYSSPACE | MS_DATA;
 795         mounta.fstype = module;
 796         mounta.spec = spec;
 797         mounta.dir = path;
 798         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 799                 cmn_err(CE_WARN, "Cannot find %s", path);
 800                 return;
 801         }
 802         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 803                 cmn_err(CE_WARN, "Cannot mount %s", path);
 804         else
 805                 VFS_RELE(vfsp);
 806         VN_RELE(mvp);
 807 }
 808
/*
 * vfs_mountroot is called by main() to mount the root filesystem.
 *
 * In order, it initializes the vfssw and vfs-list locks, allocates the
 * vfs hash array, lets rootconf() select and mount the root, publishes
 * the root vnode (rootdir, the current directory of every existing
 * process, the global zone's root), switches the module code over to the
 * root file system, sets up root's vopstats, mounts the standard pseudo
 * file systems and finally calls vfs_mnttabvp_setup().  Failure to mount
 * root or to obtain its root vnode panics the system.
 */
void
vfs_mountroot(void)
{
        struct vnode    *rvp = NULL;
        char            *path;
        size_t          plen;
        struct vfssw    *vswp;
        proc_t          *p;

        rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&vfslist, NULL, RW_DEFAULT, NULL);

        /*
         * Alloc the vfs hash bucket array and locks
         */
        rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);

        /*
         * Call machine-dependent routine "rootconf" to choose a root
         * file system type.
         */
        if (rootconf())
                panic("vfs_mountroot: cannot mount root");
        /*
         * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
         * to point to it.  These are used by lookuppn() so that it
         * knows where to start from ('/' or '.').
         */
        vfs_setmntpoint(rootvfs, "/", 0);
        if (VFS_ROOT(rootvfs, &rootdir))
                panic("vfs_mountroot: no root vnode");

        /*
         * At this point, the process tree consists of p0 and possibly some
         * direct children of p0.  (i.e. there are no grandchildren)
         *
         * Walk through them all, setting their current directory.
         */
        mutex_enter(&pidlock);
        for (p = practive; p != NULL; p = p->p_next) {
                ASSERT(p == &p0 || p->p_parent == &p0);

                /* Each process takes its own hold on rootdir. */
                PTOU(p)->u_cdir = rootdir;
                VN_HOLD(PTOU(p)->u_cdir);
                PTOU(p)->u_rdir = NULL;
        }
        mutex_exit(&pidlock);

        /*
         * Setup the global zone's rootvp, now that it exists.
         */
        global_zone->zone_rootvp = rootdir;
        VN_HOLD(global_zone->zone_rootvp);

        /*
         * Notify the module code that it can begin using the
         * root filesystem instead of the boot program's services.
         */
        modrootloaded = 1;

        /*
         * Special handling for a ZFS root file system.
         */
        zfs_boot_init();

        /*
         * Set up mnttab information for root
         */
        vfs_setresource(rootvfs, rootfs.bo_name, 0);

        /*
         * Notify cluster software that the root filesystem is available.
         */
        clboot_mountroot();

        /* Now that we're all done with the root FS, set up its vopstats */
        if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
                /* Set flag for statistics collection */
                if (vswp->vsw_flag & VSW_STATS) {
                        initialize_vopstats(&rootvfs->vfs_vopstats);
                        rootvfs->vfs_flag |= VFS_STATS;
                        rootvfs->vfs_fstypevsp =
                            get_fstype_vopstats(rootvfs, vswp);
                        rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
                }
                vfs_unrefvfssw(vswp);
        }

        /*
         * Mount /devices, /dev instance 1, /system/contract, /proc,
         * /etc/mnttab, /etc/svc/volatile, /system/object and /system/boot,
         * plus /etc/dfs/sharetab when running in the global zone.
         */
        vfs_mountdevices();
        vfs_mountdev1();

        vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
        vfs_mountfs("proc", "/proc", "/proc");
        vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
        vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
        vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
        vfs_mountfs("bootfs", "bootfs", "/system/boot");

        if (getzoneid() == GLOBAL_ZONEID) {
                vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
        }

        if (strcmp(rootfs.bo_fstype, "zfs") != 0) {
                /*
                 * Look up the root device via devfs so that a dv_node is
                 * created for it. The vnode is never VN_RELE()ed.
                 * We allocate more than MAXPATHLEN so that the
                 * buffer passed to i_ddi_prompath_to_devfspath() is
                 * exactly MAXPATHLEN (the function expects a buffer
                 * of that length).
                 */
                plen = strlen("/devices");
                path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
                (void) strcpy(path, "/devices");

                if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
                    != DDI_SUCCESS ||
                    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {

                        /* NUL terminate in case "path" has garbage */
                        path[plen + MAXPATHLEN - 1] = '\0';
#ifdef  DEBUG
                        cmn_err(CE_WARN, "!Cannot lookup root device: %s",
                            path);
#endif
                }
                kmem_free(path, plen + MAXPATHLEN);
        }

        vfs_mnttabvp_setup();
}
 947
 948 /*
 949  * Check to see if our "block device" is actually a file.  If so,
 950  * automatically add a lofi device, and keep track of this fact.
 951  */
 952 static int
 953 lofi_add(const char *fsname, struct vfs *vfsp,
 954     mntopts_t *mntopts, struct mounta *uap)
 955 {
 956         int fromspace = (uap->flags & MS_SYSSPACE) ?
 957             UIO_SYSSPACE : UIO_USERSPACE;
 958         struct lofi_ioctl *li = NULL;
 959         struct vnode *vp = NULL;
 960         struct pathname pn = { NULL };
 961         ldi_ident_t ldi_id;
 962         ldi_handle_t ldi_hdl;
 963         vfssw_t *vfssw;
 964         int id;
 965         int err = 0;
 966
 967         if ((vfssw = vfs_getvfssw(fsname)) == NULL)
 968                 return (0);
 969
 970         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
 971                 vfs_unrefvfssw(vfssw);
 972                 return (0);
 973         }
 974
 975         vfs_unrefvfssw(vfssw);
 976         vfssw = NULL;
 977
 978         if (pn_get(uap->spec, fromspace, &pn) != 0)
 979                 return (0);
 980
 981         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
 982                 goto out;
 983
 984         if (vp->v_type != VREG)
 985                 goto out;
 986
 987         /* OK, this is a lofi mount. */
 988
 989         if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
 990             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
 991             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
 992             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
 993                 err = EINVAL;
 994                 goto out;
 995         }
 996
 997         ldi_id = ldi_ident_from_anon();
 998         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
 999         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1000
1001         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1002             &ldi_hdl, ldi_id);
1003
1004         if (err)
1005                 goto out2;
1006
1007         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1008             FREAD | FWRITE | FKIOCTL, kcred, &id);
1009
1010         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1011
1012         if (!err)
1013                 vfsp->vfs_lofi_id = id;
1014
1015 out2:
1016         ldi_ident_release(ldi_id);
1017 out:
1018         if (li != NULL)
1019                 kmem_free(li, sizeof (*li));
1020         if (vp != NULL)
1021                 VN_RELE(vp);
1022         pn_free(&pn);
1023         return (err);
1024 }
1025
1026 static void
1027 lofi_remove(struct vfs *vfsp)
1028 {
1029         struct lofi_ioctl *li = NULL;
1030         ldi_ident_t ldi_id;
1031         ldi_handle_t ldi_hdl;
1032         int err;
1033
1034         if (vfsp->vfs_lofi_id == 0)
1035                 return;
1036
1037         ldi_id = ldi_ident_from_anon();
1038
1039         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1040         li->li_id = vfsp->vfs_lofi_id;
1041         li->li_cleanup = B_TRUE;
1042
1043         err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred,
1044             &ldi_hdl, ldi_id);
1045
1046         if (err)
1047                 goto out;
1048
1049         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1050             FREAD | FWRITE | FKIOCTL, kcred, NULL);
1051
1052         (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred);
1053
1054         if (!err)
1055                 vfsp->vfs_lofi_id = 0;
1056
1057 out:
1058         ldi_ident_release(ldi_id);
1059         if (li != NULL)
1060                 kmem_free(li, sizeof (*li));
1061 }
1062
1063 /*
1064  * Common mount code.  Called from the system call entry point, from autofs,
1065  * nfsv4 trigger mounts, and from pxfs.
1066  *
1067  * Takes the effective file system type, mount arguments, the mount point
1068  * vnode, flags specifying whether the mount is a remount and whether it
1069  * should be entered into the vfs list, and credentials.  Fills in its vfspp
1070  * parameter with the mounted file system instance's vfs.
1071  *
1072  * Note that the effective file system type is specified as a string.  It may
1073  * be null, in which case it's determined from the mount arguments, and may
1074  * differ from the type specified in the mount arguments; this is a hook to
1075  * allow interposition when instantiating file system instances.
1076  *
1077  * The caller is responsible for releasing its own hold on the mount point
1078  * vp (this routine does its own hold when necessary).
1079  * Also note that for remounts, the mount point vp should be the vnode for
1080  * the root of the file system rather than the vnode that the file system
1081  * is mounted on top of.
1082  */
1083 int
1084 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1085     struct vfs **vfspp)
1086 {
1087         struct vfssw    *vswp;
1088         vfsops_t        *vfsops;
1089         struct vfs      *vfsp;
1090         struct vnode    *bvp;
1091         dev_t           bdev = 0;
1092         mntopts_t       mnt_mntopts;
1093         int             error = 0;
1094         int             copyout_error = 0;
1095         int             ovflags;
1096         char            *opts = uap->optptr;
1097         char            *inargs = opts;
1098         int             optlen = uap->optlen;
1099         int             remount;
1100         int             rdonly;
1101         int             nbmand = 0;
1102         int             delmip = 0;
1103         int             addmip = 0;
1104         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1105         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1106             UIO_SYSSPACE : UIO_USERSPACE;
1107         char            *resource = NULL, *mountpt = NULL;
1108         refstr_t        *oldresource, *oldmntpt;
1109         struct pathname pn, rpn;
1110         vsk_anchor_t    *vskap;
1111         char fstname[FSTYPSZ];
1112         zone_t          *zone;
1113
1114         /*
1115          * The v_flag value for the mount point vp is permanently set
1116          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1117          * for mount point locking.
1118          */
1119         mutex_enter(&vp->v_lock);
1120         vp->v_flag |= VVFSLOCK;
1121         mutex_exit(&vp->v_lock);
1122
1123         mnt_mntopts.mo_count = 0;
1124         /*
1125          * Find the ops vector to use to invoke the file system-specific mount
1126          * method.  If the fsname argument is non-NULL, use it directly.
1127          * Otherwise, dig the file system type information out of the mount
1128          * arguments.
1129          *
1130          * A side effect is to hold the vfssw entry.
1131          *
1132          * Mount arguments can be specified in several ways, which are
1133          * distinguished by flag bit settings.  The preferred way is to set
1134          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1135          * type supplied as a character string and the last two arguments
1136          * being a pointer to a character buffer and the size of the buffer.
1137          * On entry, the buffer holds a null terminated list of options; on
1138          * return, the string is the list of options the file system
1139          * recognized. If MS_DATA is set arguments five and six point to a
1140          * block of binary data which the file system interprets.
1141          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1142          * consistently with these conventions.  To handle them, we check to
1143          * see whether the pointer to the file system name has a numeric value
1144          * less than 256.  If so, we treat it as an index.
1145          */
1146         if (fsname != NULL) {
1147                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1148                         return (EINVAL);
1149                 }
1150         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1151                 size_t n;
1152                 uint_t fstype;
1153
1154                 fsname = fstname;
1155
1156                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1157                         RLOCK_VFSSW();
1158                         if (fstype == 0 || fstype >= nfstype ||
1159                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1160                                 RUNLOCK_VFSSW();
1161                                 return (EINVAL);
1162                         }
1163                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1164                         RUNLOCK_VFSSW();
1165                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1166                                 return (EINVAL);
1167                 } else {
1168                         /*
1169                          * Handle either kernel or user address space.
1170                          */
1171                         if (uap->flags & MS_SYSSPACE) {
1172                                 error = copystr(uap->fstype, fsname,
1173                                     FSTYPSZ, &n);
1174                         } else {
1175                                 error = copyinstr(uap->fstype, fsname,
1176                                     FSTYPSZ, &n);
1177                         }
1178                         if (error) {
1179                                 if (error == ENAMETOOLONG)
1180                                         return (EINVAL);
1181                                 return (error);
1182                         }
1183                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1184                                 return (EINVAL);
1185                 }
1186         } else {
1187                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1188                         return (EINVAL);
1189                 fsname = vswp->vsw_name;
1190         }
1191         if (!VFS_INSTALLED(vswp))
1192                 return (EINVAL);
1193
1194         if ((error = secpolicy_fs_allowed_mount(fsname)) != 0)  {
1195                 vfs_unrefvfssw(vswp);
1196                 return (error);
1197         }
1198
1199         vfsops = &vswp->vsw_vfsops;
1200
1201         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1202         /*
1203          * Fetch mount options and parse them for generic vfs options
1204          */
1205         if (uap->flags & MS_OPTIONSTR) {
1206                 /*
1207                  * Limit the buffer size
1208                  */
1209                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1210                         error = EINVAL;
1211                         goto errout;
1212                 }
1213                 if ((uap->flags & MS_SYSSPACE) == 0) {
1214                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1215                         inargs[0] = '\0';
1216                         if (optlen) {
1217                                 error = copyinstr(opts, inargs, (size_t)optlen,
1218                                     NULL);
1219                                 if (error) {
1220                                         goto errout;
1221                                 }
1222                         }
1223                 }
1224                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1225         }
1226         /*
1227          * Flag bits override the options string.
1228          */
1229         if (uap->flags & MS_REMOUNT)
1230                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1231         if (uap->flags & MS_RDONLY)
1232                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1233         if (uap->flags & MS_NOSUID)
1234                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1235
1236         /*
1237          * Check if this is a remount; must be set in the option string and
1238          * the file system must support a remount option.
1239          */
1240         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1241             MNTOPT_REMOUNT, NULL)) {
1242                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1243                         error = ENOTSUP;
1244                         goto errout;
1245                 }
1246                 uap->flags |= MS_REMOUNT;
1247         }
1248
1249         /*
1250          * uap->flags and vfs_optionisset() should agree.
1251          */
1252         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1253                 uap->flags |= MS_RDONLY;
1254         }
1255         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1256                 uap->flags |= MS_NOSUID;
1257         }
1258         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1259         ASSERT(splice || !remount);
1260         /*
1261          * If we are splicing the fs into the namespace,
1262          * perform mount point checks.
1263          *
1264          * We want to resolve the path for the mount point to eliminate
1265          * '.' and ".." and symlinks in mount points; we can't do the
1266          * same for the resource string, since it would turn
1267          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1268          * this before grabbing vn_vfswlock(), because otherwise we
1269          * would deadlock with lookuppn().
1270          */
1271         if (splice) {
1272                 ASSERT(vp->v_count > 0);
1273
1274                 /*
1275                  * Pick up mount point and device from appropriate space.
1276                  */
1277                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1278                         resource = kmem_alloc(pn.pn_pathlen + 1,
1279                             KM_SLEEP);
1280                         (void) strcpy(resource, pn.pn_path);
1281                         pn_free(&pn);
1282                 }
1283                 /*
1284                  * Do a lookupname prior to taking the
1285                  * writelock. Mark this as completed if
1286                  * successful for later cleanup and addition to
1287                  * the mount in progress table.
1288                  */
1289                 if ((uap->flags & MS_GLOBAL) == 0 &&
1290                     lookupname(uap->spec, fromspace,
1291                     FOLLOW, NULL, &bvp) == 0) {
1292                         addmip = 1;
1293                 }
1294
1295                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1296                         pathname_t *pnp;
1297
1298                         if (*pn.pn_path != '/') {
1299                                 error = EINVAL;
1300                                 pn_free(&pn);
1301                                 goto errout;
1302                         }
1303                         pn_alloc(&rpn);
1304                         /*
1305                          * Kludge to prevent autofs from deadlocking with
1306                          * itself when it calls domount().
1307                          *
1308                          * If autofs is calling, it is because it is doing
1309                          * (autofs) mounts in the process of an NFS mount.  A
1310                          * lookuppn() here would cause us to block waiting for
1311                          * said NFS mount to complete, which can't since this
1312                          * is the thread that was supposed to be doing it.
1313                          */
1314                         if (fromspace == UIO_USERSPACE) {
1315                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1316                                     NULL)) == 0) {
1317                                         pnp = &rpn;
1318                                 } else {
1319                                         /*
1320                                          * The file disappeared or otherwise
1321                                          * became inaccessible since we opened
1322                                          * it; might as well fail the mount
1323                                          * since the mount point is no longer
1324                                          * accessible.
1325                                          */
1326                                         pn_free(&rpn);
1327                                         pn_free(&pn);
1328                                         goto errout;
1329                                 }
1330                         } else {
1331                                 pnp = &pn;
1332                         }
1333                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1334                         (void) strcpy(mountpt, pnp->pn_path);
1335
1336                         /*
1337                          * If the addition of the zone's rootpath
1338                          * would push us over a total path length
1339                          * of MAXPATHLEN, we fail the mount with
1340                          * ENAMETOOLONG, which is what we would have
1341                          * gotten if we were trying to perform the same
1342                          * mount in the global zone.
1343                          *
1344                          * strlen() doesn't count the trailing
1345                          * '\0', but zone_rootpathlen counts both a
1346                          * trailing '/' and the terminating '\0'.
1347                          */
1348                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1349                             strlen(mountpt)) > MAXPATHLEN ||
1350                             (resource != NULL &&
1351                             (curproc->p_zone->zone_rootpathlen - 1 +
1352                             strlen(resource)) > MAXPATHLEN)) {
1353                                 error = ENAMETOOLONG;
1354                         }
1355
1356                         pn_free(&rpn);
1357                         pn_free(&pn);
1358                 }
1359
1360                 if (error)
1361                         goto errout;
1362
1363                 /*
1364                  * Prevent path name resolution from proceeding past
1365                  * the mount point.
1366                  */
1367                 if (vn_vfswlock(vp) != 0) {
1368                         error = EBUSY;
1369                         goto errout;
1370                 }
1371
1372                 /*
1373                  * Verify that it's legitimate to establish a mount on
1374                  * the prospective mount point.
1375                  */
1376                 if (vn_mountedvfs(vp) != NULL) {
1377                         /*
1378                          * The mount point lock was obtained after some
1379                          * other thread raced through and established a mount.
1380                          */
1381                         vn_vfsunlock(vp);
1382                         error = EBUSY;
1383                         goto errout;
1384                 }
1385                 if (vp->v_flag & VNOMOUNT) {
1386                         vn_vfsunlock(vp);
1387                         error = EINVAL;
1388                         goto errout;
1389                 }
1390         }
1391         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1392                 uap->dataptr = NULL;
1393                 uap->datalen = 0;
1394         }
1395
1396         /*
1397          * If this is a remount, we don't want to create a new VFS.
1398          * Instead, we pass the existing one with a remount flag.
1399          */
1400         if (remount) {
1401                 /*
1402                  * Confirm that the mount point is the root vnode of the
1403                  * file system that is being remounted.
1404                  * This can happen if the user specifies a different
1405                  * mount point directory pathname in the (re)mount command.
1406                  *
1407                  * Code below can only be reached if splice is true, so it's
1408                  * safe to do vn_vfsunlock() here.
1409                  */
1410                 if ((vp->v_flag & VROOT) == 0) {
1411                         vn_vfsunlock(vp);
1412                         error = ENOENT;
1413                         goto errout;
1414                 }
1415                 /*
1416                  * Disallow making file systems read-only unless file system
1417                  * explicitly allows it in its vfssw.  Ignore other flags.
1418                  */
1419                 if (rdonly && vn_is_readonly(vp) == 0 &&
1420                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1421                         vn_vfsunlock(vp);
1422                         error = EINVAL;
1423                         goto errout;
1424                 }
1425                 /*
1426                  * Disallow changing the NBMAND disposition of the file
1427                  * system on remounts.
1428                  */
1429                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1430                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1431                         vn_vfsunlock(vp);
1432                         error = EINVAL;
1433                         goto errout;
1434                 }
1435                 vfsp = vp->v_vfsp;
1436                 ovflags = vfsp->vfs_flag;
1437                 vfsp->vfs_flag |= VFS_REMOUNT;
1438                 vfsp->vfs_flag &= ~VFS_RDONLY;
1439         } else {
1440                 vfsp = vfs_alloc(KM_SLEEP);
1441                 VFS_INIT(vfsp, vfsops, NULL);
1442         }
1443
1444         VFS_HOLD(vfsp);
1445
1446         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1447                 if (!remount) {
1448                         if (splice)
1449                                 vn_vfsunlock(vp);
1450                         vfs_free(vfsp);
1451                 } else {
1452                         vn_vfsunlock(vp);
1453                         VFS_RELE(vfsp);
1454                 }
1455                 goto errout;
1456         }
1457
1458         /*
1459          * PRIV_SYS_MOUNT doesn't mean you can become root.
1460          */
1461         if (vfsp->vfs_lofi_id != 0) {
1462                 uap->flags |= MS_NOSUID;
1463                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1464         }
1465
1466         /*
1467          * The vfs_reflock is not used anymore; the code below explicitly
1468          * holds it, preventing others from accessing it directly.
1469          */
1470         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1471             !(vfsp->vfs_flag & VFS_REMOUNT))
1472                 cmn_err(CE_WARN,
1473                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1474
1475         /*
1476          * Lock the vfs. If this is a remount we want to avoid spurious umount
1477          * failures that happen as a side-effect of fsflush() and other mount
1478          * and unmount operations that might be going on simultaneously and
1479          * may have locked the vfs currently. To not return EBUSY immediately
1480          * here, we use vfs_lock_wait(), not vfs_lock(), for the remount case.
1481          */
1482         if (!remount) {
1483                 if (error = vfs_lock(vfsp)) {
1484                         vfsp->vfs_flag = ovflags;
1485
1486                         lofi_remove(vfsp);
1487
1488                         if (splice)
1489                                 vn_vfsunlock(vp);
1490                         vfs_free(vfsp);
1491                         goto errout;
1492                 }
1493         } else {
1494                 vfs_lock_wait(vfsp);
1495         }
1496
1497         /*
1498          * Add device to mount in progress table, global mounts require special
1499          * handling. It is possible that we have already done the lookupname
1500          * on a spliced, non-global fs. If so, we don't want to do it again
1501          * since we cannot do a lookupname after taking the
1502          * wlock above. This case is for a non-spliced, non-global filesystem.
1503          */
1504         if (!addmip) {
1505                 if ((uap->flags & MS_GLOBAL) == 0 &&
1506                     lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1507                         addmip = 1;
1508                 }
1509         }
1510
1511         if (addmip) {
1512                 vnode_t *lvp = NULL;
1513
1514                 error = vfs_get_lofi(vfsp, &lvp);
1515                 if (error > 0) {
1516                         lofi_remove(vfsp);
1517
1518                         if (splice)
1519                                 vn_vfsunlock(vp);
1520                         vfs_unlock(vfsp);
1521
1522                         if (remount) {
1523                                 VFS_RELE(vfsp);
1524                         } else {
1525                                 vfs_free(vfsp);
1526                         }
1527
1528                         goto errout;
1529                 } else if (error == -1) {
1530                         bdev = bvp->v_rdev;
1531                         VN_RELE(bvp);
1532                 } else {
1533                         bdev = lvp->v_rdev;
1534                         VN_RELE(lvp);
1535                         VN_RELE(bvp);
1536                 }
1537
1538                 vfs_addmip(bdev, vfsp);
1539                 addmip = 0;
1540                 delmip = 1;
1541         }
1542         /*
1543          * Invalidate cached entry for the mount point.
1544          */
1545         if (splice)
1546                 dnlc_purge_vp(vp);
1547
1548         /*
1549          * If have an option string but the filesystem doesn't supply a
1550          * prototype options table, create a table with the global
1551          * options and sufficient room to accept all the options in the
1552          * string.  Then parse the passed in option string
1553          * accepting all the options in the string.  This gives us an
1554          * option table with all the proper cancel properties for the
1555          * global options.
1556          *
1557          * Filesystems that supply a prototype options table are handled
1558          * earlier in this function.
1559          */
1560         if (uap->flags & MS_OPTIONSTR) {
1561                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1562                         mntopts_t tmp_mntopts;
1563
1564                         tmp_mntopts.mo_count = 0;
1565                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1566                             &mnt_mntopts);
1567                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1568                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1569                         vfs_freeopttbl(&tmp_mntopts);
1570                 }
1571         }
1572
1573         /*
1574          * Serialize with zone state transitions.
1575          * See vfs_list_add; zone mounted into is:
1576          *      zone_find_by_path(refstr_value(vfsp->vfs_mntpt))
1577          * not the zone doing the mount (curproc->p_zone), but if we're already
1578          * inside a NGZ, then we know what zone we are.
1579          */
1580         if (INGLOBALZONE(curproc)) {
1581                 zone = zone_find_by_path(mountpt);
1582                 ASSERT(zone != NULL);
1583         } else {
1584                 zone = curproc->p_zone;
1585                 /*
1586                  * zone_find_by_path does a hold, so do one here too so that
1587                  * we can do a zone_rele after mount_completed.
1588                  */
1589                 zone_hold(zone);
1590         }
1591         mount_in_progress(zone);
1592         /*
1593          * Instantiate (or reinstantiate) the file system.  If appropriate,
1594          * splice it into the file system name space.
1595          *
1596          * We want VFS_MOUNT() to be able to override the vfs_resource
1597          * string if necessary (ie, mntfs), and also for a remount to
1598          * change the same (necessary when remounting '/' during boot).
1599          * So we set up vfs_mntpt and vfs_resource to what we think they
1600          * should be, then hand off control to VFS_MOUNT() which can
1601          * override this.
1602          *
1603          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1604          * a vfs which is on the vfs list (i.e. during a remount), we must
1605          * never set those fields to NULL. Several bits of code make
1606          * assumptions that the fields are always valid.
1607          */
1608         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1609         if (remount) {
1610                 if ((oldresource = vfsp->vfs_resource) != NULL)
1611                         refstr_hold(oldresource);
1612                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1613                         refstr_hold(oldmntpt);
1614         }
1615         vfs_setresource(vfsp, resource, 0);
1616         vfs_setmntpoint(vfsp, mountpt, 0);
1617
1618         /*
1619          * going to mount on this vnode, so notify.
1620          */
1621         vnevent_mountedover(vp, NULL);
1622         error = VFS_MOUNT(vfsp, vp, uap, credp);
1623
1624         if (uap->flags & MS_RDONLY)
1625                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1626         if (uap->flags & MS_NOSUID)
1627                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1628         if (uap->flags & MS_GLOBAL)
1629                 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1630
1631         if (error) {
1632                 lofi_remove(vfsp);
1633
1634                 if (remount) {
1635                         /* put back pre-remount options */
1636                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1637                         vfs_setmntpoint(vfsp, refstr_value(oldmntpt),
1638                             VFSSP_VERBATIM);
1639                         if (oldmntpt)
1640                                 refstr_rele(oldmntpt);
1641                         vfs_setresource(vfsp, refstr_value(oldresource),
1642                             VFSSP_VERBATIM);
1643                         if (oldresource)
1644                                 refstr_rele(oldresource);
1645                         vfsp->vfs_flag = ovflags;
1646                         vfs_unlock(vfsp);
1647                         VFS_RELE(vfsp);
1648                 } else {
1649                         vfs_unlock(vfsp);
1650                         vfs_freemnttab(vfsp);
1651                         vfs_free(vfsp);
1652                 }
1653         } else {
1654                 /*
1655                  * Set the mount time to now
1656                  */
1657                 vfsp->vfs_mtime = ddi_get_time();
1658                 if (remount) {
1659                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1660                         if (oldresource)
1661                                 refstr_rele(oldresource);
1662                         if (oldmntpt)
1663                                 refstr_rele(oldmntpt);
1664                 } else if (splice) {
1665                         /*
1666                          * Link vfsp into the name space at the mount
1667                          * point. Vfs_add() is responsible for
1668                          * holding the mount point which will be
1669                          * released when vfs_remove() is called.
1670                          */
1671                         vfs_add(vp, vfsp, uap->flags);
1672                 } else {
1673                         /*
1674                          * Hold the reference to file system which is
1675                          * not linked into the name space.
1676                          */
1677                         vfsp->vfs_zone = NULL;
1678                         VFS_HOLD(vfsp);
1679                         vfsp->vfs_vnodecovered = NULL;
1680                 }
1681                 /*
1682                  * Set flags for global options encountered
1683                  */
1684                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1685                         vfsp->vfs_flag |= VFS_RDONLY;
1686                 else
1687                         vfsp->vfs_flag &= ~VFS_RDONLY;
1688                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1689                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1690                 } else {
1691                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1692                                 vfsp->vfs_flag |= VFS_NODEVICES;
1693                         else
1694                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1695                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1696                                 vfsp->vfs_flag |= VFS_NOSETUID;
1697                         else
1698                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1699                 }
1700                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1701                         vfsp->vfs_flag |= VFS_NBMAND;
1702                 else
1703                         vfsp->vfs_flag &= ~VFS_NBMAND;
1704
1705                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1706                         vfsp->vfs_flag |= VFS_XATTR;
1707                 else
1708                         vfsp->vfs_flag &= ~VFS_XATTR;
1709
1710                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1711                         vfsp->vfs_flag |= VFS_NOEXEC;
1712                 else
1713                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1714
1715                 /*
1716                  * Now construct the output option string of options
1717                  * we recognized.
1718                  */
1719                 if (uap->flags & MS_OPTIONSTR) {
1720                         vfs_list_read_lock();
1721                         copyout_error = vfs_buildoptionstr(
1722                             &vfsp->vfs_mntopts, inargs, optlen);
1723                         vfs_list_unlock();
1724                         if (copyout_error == 0 &&
1725                             (uap->flags & MS_SYSSPACE) == 0) {
1726                                 copyout_error = copyoutstr(inargs, opts,
1727                                     optlen, NULL);
1728                         }
1729                 }
1730
1731                 /*
1732                  * If this isn't a remount, set up the vopstats before
1733                  * anyone can touch this. We only allow spliced file
1734                  * systems (file systems which are in the namespace) to
1735                  * have the VFS_STATS flag set.
1736                  * NOTE: PxFS mounts the underlying file system with
1737                  * MS_NOSPLICE set and copies those vfs_flags to its private
1738                  * vfs structure. As a result, PxFS should never have
1739                  * the VFS_STATS flag or else we might access the vfs
1740                  * statistics-related fields prior to them being
1741                  * properly initialized.
1742                  */
1743                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1744                         initialize_vopstats(&vfsp->vfs_vopstats);
1745                         /*
1746                          * We need to set vfs_vskap to NULL because there's
1747                          * a chance it won't be set below.  This is checked
1748                          * in teardown_vopstats() so we can't have garbage.
1749                          */
1750                         vfsp->vfs_vskap = NULL;
1751                         vfsp->vfs_flag |= VFS_STATS;
1752                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1753                 }
1754
1755                 if (vswp->vsw_flag & VSW_XID)
1756                         vfsp->vfs_flag |= VFS_XID;
1757
1758                 vfs_unlock(vfsp);
1759         }
1760         mount_completed(zone);
1761         zone_rele(zone);
1762         if (splice)
1763                 vn_vfsunlock(vp);
1764
1765         if ((error == 0) && (copyout_error == 0)) {
1766                 if (!remount) {
1767                         /*
1768                          * Don't call get_vskstat_anchor() while holding
1769                          * locks since it allocates memory and calls
1770                          * VFS_STATVFS().  For NFS, the latter can generate
1771                          * an over-the-wire call.
1772                          */
1773                         vskap = get_vskstat_anchor(vfsp);
1774                         /* Only take the lock if we have something to do */
1775                         if (vskap != NULL) {
1776                                 vfs_lock_wait(vfsp);
1777                                 if (vfsp->vfs_flag & VFS_STATS) {
1778                                         vfsp->vfs_vskap = vskap;
1779                                 }
1780                                 vfs_unlock(vfsp);
1781                         }
1782                 }
1783                 /* Return vfsp to caller. */
1784                 *vfspp = vfsp;
1785         }
1786 errout:
1787         vfs_freeopttbl(&mnt_mntopts);
1788         if (resource != NULL)
1789                 kmem_free(resource, strlen(resource) + 1);
1790         if (mountpt != NULL)
1791                 kmem_free(mountpt, strlen(mountpt) + 1);
1792         /*
1793          * It is possible we errored prior to adding to mount in progress
1794          * table. Must free vnode we acquired with successful lookupname.
1795          */
1796         if (addmip)
1797                 VN_RELE(bvp);
1798         if (delmip)
1799                 vfs_delmip(vfsp);
1800         ASSERT(vswp != NULL);
1801         vfs_unrefvfssw(vswp);
1802         if (inargs != opts)
1803                 kmem_free(inargs, MAX_MNTOPT_STR);
1804         if (copyout_error) {
1805                 lofi_remove(vfsp);
1806                 VFS_RELE(vfsp);
1807                 error = copyout_error;
1808         }
1809         return (error);
1810 }
1811
1812 static void
1813 vfs_setpath(
1814     struct vfs *vfsp,           /* vfs being updated */
1815     refstr_t **refp,            /* Ref-count string to contain the new path */
1816     const char *newpath,        /* Path to add to refp (above) */
1817     uint32_t flag)              /* flag */
1818 {
1819         size_t len;
1820         refstr_t *ref;
1821         zone_t *zone = curproc->p_zone;
1822         char *sp;
1823         int have_list_lock = 0;
1824
1825         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1826
1827         /*
1828          * New path must be less than MAXPATHLEN because mntfs
1829          * will only display up to MAXPATHLEN bytes. This is currently
1830          * safe, because domount() uses pn_get(), and other callers
1831          * similarly cap the size to fewer than MAXPATHLEN bytes.
1832          */
1833
1834         ASSERT(strlen(newpath) < MAXPATHLEN);
1835
1836         /* mntfs requires consistency while vfs list lock is held */
1837
1838         if (VFS_ON_LIST(vfsp)) {
1839                 have_list_lock = 1;
1840                 vfs_list_lock();
1841         }
1842
1843         if (*refp != NULL)
1844                 refstr_rele(*refp);
1845
1846         /*
1847          * If we are in a non-global zone then we prefix the supplied path,
1848          * newpath, with the zone's root path, with two exceptions. The first
1849          * is where we have been explicitly directed to avoid doing so; this
1850          * will be the case following a failed remount, where the path supplied
1851          * will be a saved version which must now be restored. The second
1852          * exception is where newpath is not a pathname but a descriptive name,
1853          * e.g. "procfs".
1854          */
1855         if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') {
1856                 ref = refstr_alloc(newpath);
1857                 goto out;
1858         }
1859
1860         /*
1861          * Truncate the trailing '/' in the zoneroot, and merge
1862          * in the zone's rootpath with the "newpath" (resource
1863          * or mountpoint) passed in.
1864          *
1865          * The size of the required buffer is thus the size of
1866          * the buffer required for the passed-in newpath
1867          * (strlen(newpath) + 1), plus the size of the buffer
1868          * required to hold zone_rootpath (zone_rootpathlen)
1869          * minus one for one of the now-superfluous NUL
1870          * terminations, minus one for the trailing '/'.
1871          *
1872          * That gives us:
1873          *
1874          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1875          *
1876          * Which is what we have below.
1877          */
1878
1879         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1880         sp = kmem_alloc(len, KM_SLEEP);
1881
1882         /*
1883          * Copy everything including the trailing slash, which
1884          * we then overwrite with the NUL character.
1885          */
1886
1887         (void) strcpy(sp, zone->zone_rootpath);
1888         sp[zone->zone_rootpathlen - 2] = '\0';
1889         (void) strcat(sp, newpath);
1890
1891         ref = refstr_alloc(sp);
1892         kmem_free(sp, len);
1893 out:
1894         *refp = ref;
1895
1896         if (have_list_lock) {
1897                 vfs_mnttab_modtimeupd();
1898                 vfs_list_unlock();
1899         }
1900 }
1901
1902 /*
1903  * Record a mounted resource name in a vfs structure.
1904  * If vfsp is already mounted, caller must hold the vfs lock.
1905  */
1906 void
1907 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag)
1908 {
1909         if (resource == NULL || resource[0] == '\0')
1910                 resource = VFS_NORESOURCE;
1911         vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag);
1912 }
1913
1914 /*
1915  * Record a mount point name in a vfs structure.
1916  * If vfsp is already mounted, caller must hold the vfs lock.
1917  */
1918 void
1919 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag)
1920 {
1921         if (mntpt == NULL || mntpt[0] == '\0')
1922                 mntpt = VFS_NOMNTPT;
1923         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag);
1924 }
1925
1926 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1927
1928 refstr_t *
1929 vfs_getresource(const struct vfs *vfsp)
1930 {
1931         refstr_t *resource;
1932
1933         vfs_list_read_lock();
1934         resource = vfsp->vfs_resource;
1935         refstr_hold(resource);
1936         vfs_list_unlock();
1937
1938         return (resource);
1939 }
1940
1941 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1942
1943 refstr_t *
1944 vfs_getmntpoint(const struct vfs *vfsp)
1945 {
1946         refstr_t *mntpt;
1947
1948         vfs_list_read_lock();
1949         mntpt = vfsp->vfs_mntpt;
1950         refstr_hold(mntpt);
1951         vfs_list_unlock();
1952
1953         return (mntpt);
1954 }
1955
1956 /*
1957  * Create an empty options table with enough empty slots to hold all
1958  * The options in the options string passed as an argument.
1959  * Potentially prepend another options table.
1960  *
1961  * Note: caller is responsible for locking the vfs list, if needed,
1962  *       to protect mops.
1963  */
1964 static void
1965 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1966     const mntopts_t *mtmpl)
1967 {
1968         const char *s = opts;
1969         uint_t count;
1970
1971         if (opts == NULL || *opts == '\0') {
1972                 count = 0;
1973         } else {
1974                 count = 1;
1975
1976                 /*
1977                  * Count number of options in the string
1978                  */
1979                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1980                         count++;
1981                         s++;
1982                 }
1983         }
1984         vfs_copyopttbl_extend(mtmpl, mops, count);
1985 }
1986
1987 /*
1988  * Create an empty options table with enough empty slots to hold all
1989  * The options in the options string passed as an argument.
1990  *
1991  * This function is *not* for general use by filesystems.
1992  *
1993  * Note: caller is responsible for locking the vfs list, if needed,
1994  *       to protect mops.
1995  */
1996 void
1997 vfs_createopttbl(mntopts_t *mops, const char *opts)
1998 {
1999         vfs_createopttbl_extend(mops, opts, NULL);
2000 }
2001
2002
2003 /*
2004  * Swap two mount options tables
2005  */
2006 static void
2007 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2008 {
2009         uint_t tmpcnt;
2010         mntopt_t *tmplist;
2011
2012         tmpcnt = optbl2->mo_count;
2013         tmplist = optbl2->mo_list;
2014         optbl2->mo_count = optbl1->mo_count;
2015         optbl2->mo_list = optbl1->mo_list;
2016         optbl1->mo_count = tmpcnt;
2017         optbl1->mo_list = tmplist;
2018 }
2019
2020 static void
2021 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2022 {
2023         vfs_list_lock();
2024         vfs_swapopttbl_nolock(optbl1, optbl2);
2025         vfs_mnttab_modtimeupd();
2026         vfs_list_unlock();
2027 }
2028
2029 static char **
2030 vfs_copycancelopt_extend(char **const moc, int extend)
2031 {
2032         int i = 0;
2033         int j;
2034         char **result;
2035
2036         if (moc != NULL) {
2037                 for (; moc[i] != NULL; i++)
2038                         /* count number of options to cancel */;
2039         }
2040
2041         if (i + extend == 0)
2042                 return (NULL);
2043
2044         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2045
2046         for (j = 0; j < i; j++) {
2047                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2048                 (void) strcpy(result[j], moc[j]);
2049         }
2050         for (; j <= i + extend; j++)
2051                 result[j] = NULL;
2052
2053         return (result);
2054 }
2055
2056 static void
2057 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2058 {
2059         char *sp, *dp;
2060
2061         d->mo_flags = s->mo_flags;
2062         d->mo_data = s->mo_data;
2063         sp = s->mo_name;
2064         if (sp != NULL) {
2065                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2066                 (void) strcpy(dp, sp);
2067                 d->mo_name = dp;
2068         } else {
2069                 d->mo_name = NULL; /* should never happen */
2070         }
2071
2072         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2073
2074         sp = s->mo_arg;
2075         if (sp != NULL) {
2076                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2077                 (void) strcpy(dp, sp);
2078                 d->mo_arg = dp;
2079         } else {
2080                 d->mo_arg = NULL;
2081         }
2082 }
2083
2084 /*
2085  * Copy a mount options table, possibly allocating some spare
2086  * slots at the end.  It is permissible to copy_extend the NULL table.
2087  */
2088 static void
2089 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2090 {
2091         uint_t i, count;
2092         mntopt_t *motbl;
2093
2094         /*
2095          * Clear out any existing stuff in the options table being initialized
2096          */
2097         vfs_freeopttbl(dmo);
2098         count = (smo == NULL) ? 0 : smo->mo_count;
2099         if ((count + extra) == 0)       /* nothing to do */
2100                 return;
2101         dmo->mo_count = count + extra;
2102         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2103         dmo->mo_list = motbl;
2104         for (i = 0; i < count; i++) {
2105                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2106         }
2107         for (i = count; i < count + extra; i++) {
2108                 motbl[i].mo_flags = MO_EMPTY;
2109         }
2110 }
2111
2112 /*
2113  * Copy a mount options table.
2114  *
2115  * This function is *not* for general use by filesystems.
2116  *
2117  * Note: caller is responsible for locking the vfs list, if needed,
2118  *       to protect smo and dmo.
2119  */
2120 void
2121 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2122 {
2123         vfs_copyopttbl_extend(smo, dmo, 0);
2124 }
2125
2126 static char **
2127 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2128 {
2129         int c1 = 0;
2130         int c2 = 0;
2131         char **result;
2132         char **sp1, **sp2, **dp;
2133
2134         /*
2135          * First we count both lists of cancel options.
2136          * If either is NULL or has no elements, we return a copy of
2137          * the other.
2138          */
2139         if (mop1->mo_cancel != NULL) {
2140                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2141                         /* count cancel options in mop1 */;
2142         }
2143
2144         if (c1 == 0)
2145                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2146
2147         if (mop2->mo_cancel != NULL) {
2148                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2149                         /* count cancel options in mop2 */;
2150         }
2151
2152         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2153
2154         if (c2 == 0)
2155                 return (result);
2156
2157         /*
2158          * When we get here, we've got two sets of cancel options;
2159          * we need to merge the two sets.  We know that the result
2160          * array has "c1+c2+1" entries and in the end we might shrink
2161          * it.
2162          * Result now has a copy of the c1 entries from mop1; we'll
2163          * now lookup all the entries of mop2 in mop1 and copy it if
2164          * it is unique.
2165          * This operation is O(n^2) but it's only called once per
2166          * filesystem per duplicate option.  This is a situation
2167          * which doesn't arise with the filesystems in ON and
2168          * n is generally 1.
2169          */
2170
2171         dp = &result[c1];
2172         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2173                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2174                         if (strcmp(*sp1, *sp2) == 0)
2175                                 break;
2176                 }
2177                 if (*sp1 == NULL) {
2178                         /*
2179                          * Option *sp2 not found in mop1, so copy it.
2180                          * The calls to vfs_copycancelopt_extend()
2181                          * guarantee that there's enough room.
2182                          */
2183                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2184                         (void) strcpy(*dp++, *sp2);
2185                 }
2186         }
2187         if (dp != &result[c1+c2]) {
2188                 size_t bytes = (dp - result + 1) * sizeof (char *);
2189                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2190
2191                 bcopy(result, nres, bytes);
2192                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2193                 result = nres;
2194         }
2195         return (result);
2196 }
2197
2198 /*
2199  * Merge two mount option tables (outer and inner) into one.  This is very
2200  * similar to "merging" global variables and automatic variables in C.
2201  *
2202  * This isn't (and doesn't have to be) fast.
2203  *
2204  * This function is *not* for general use by filesystems.
2205  *
2206  * Note: caller is responsible for locking the vfs list, if needed,
2207  *       to protect omo, imo & dmo.
2208  */
2209 void
2210 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2211 {
2212         uint_t i, count;
2213         mntopt_t *mop, *motbl;
2214         uint_t freeidx;
2215
2216         /*
2217          * First determine how much space we need to allocate.
2218          */
2219         count = omo->mo_count;
2220         for (i = 0; i < imo->mo_count; i++) {
2221                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2222                         continue;
2223                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2224                         count++;
2225         }
2226         ASSERT(count >= omo->mo_count &&
2227             count <= omo->mo_count + imo->mo_count);
2228         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2229         for (i = 0; i < omo->mo_count; i++)
2230                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2231         freeidx = omo->mo_count;
2232         for (i = 0; i < imo->mo_count; i++) {
2233                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2234                         continue;
2235                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2236                         char **newcanp;
2237                         uint_t index = mop - omo->mo_list;
2238
2239                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2240
2241                         vfs_freeopt(&motbl[index]);
2242                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2243
2244                         vfs_freecancelopt(motbl[index].mo_cancel);
2245                         motbl[index].mo_cancel = newcanp;
2246                 } else {
2247                         /*
2248                          * If it's a new option, just copy it over to the first
2249                          * free location.
2250                          */
2251                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2252                 }
2253         }
2254         dmo->mo_count = count;
2255         dmo->mo_list = motbl;
2256 }
2257
2258 /*
2259  * Functions to set and clear mount options in a mount options table.
2260  */
2261
2262 /*
2263  * Clear a mount option, if it exists.
2264  *
2265  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2266  * the vfs list.
2267  */
2268 static void
2269 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2270 {
2271         struct mntopt *mop;
2272         uint_t i, count;
2273
2274         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2275
2276         count = mops->mo_count;
2277         for (i = 0; i < count; i++) {
2278                 mop = &mops->mo_list[i];
2279
2280                 if (mop->mo_flags & MO_EMPTY)
2281                         continue;
2282                 if (strcmp(opt, mop->mo_name))
2283                         continue;
2284                 mop->mo_flags &= ~MO_SET;
2285                 if (mop->mo_arg != NULL) {
2286                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2287                 }
2288                 mop->mo_arg = NULL;
2289                 if (update_mnttab)
2290                         vfs_mnttab_modtimeupd();
2291                 break;
2292         }
2293 }
2294
2295 void
2296 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2297 {
2298         int gotlock = 0;
2299
2300         if (VFS_ON_LIST(vfsp)) {
2301                 gotlock = 1;
2302                 vfs_list_lock();
2303         }
2304         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2305         if (gotlock)
2306                 vfs_list_unlock();
2307 }
2308
2309
2310 /*
2311  * Set a mount option on.  If it's not found in the table, it's silently
2312  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2313  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2314  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2315  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2316  * MO_EMPTY set is created as the option passed in.
2317  *
2318  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2319  * the vfs list.
2320  */
2321 static void
2322 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2323     const char *arg, int flags, int update_mnttab)
2324 {
2325         mntopt_t *mop;
2326         uint_t i, count;
2327         char *sp;
2328
2329         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2330
2331         if (flags & VFS_CREATEOPT) {
2332                 if (vfs_hasopt(mops, opt) != NULL) {
2333                         flags &= ~VFS_CREATEOPT;
2334                 }
2335         }
2336         count = mops->mo_count;
2337         for (i = 0; i < count; i++) {
2338                 mop = &mops->mo_list[i];
2339
2340                 if (mop->mo_flags & MO_EMPTY) {
2341                         if ((flags & VFS_CREATEOPT) == 0)
2342                                 continue;
2343                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2344                         (void) strcpy(sp, opt);
2345                         mop->mo_name = sp;
2346                         if (arg != NULL)
2347                                 mop->mo_flags = MO_HASVALUE;
2348                         else
2349                                 mop->mo_flags = 0;
2350                 } else if (strcmp(opt, mop->mo_name)) {
2351                         continue;
2352                 }
2353                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2354                         break;
2355                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2356                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2357                         (void) strcpy(sp, arg);
2358                 } else {
2359                         sp = NULL;
2360                 }
2361                 if (mop->mo_arg != NULL)
2362                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2363                 mop->mo_arg = sp;
2364                 if (flags & VFS_DISPLAY)
2365                         mop->mo_flags &= ~MO_NODISPLAY;
2366                 if (flags & VFS_NODISPLAY)
2367                         mop->mo_flags |= MO_NODISPLAY;
2368                 mop->mo_flags |= MO_SET;
2369                 if (mop->mo_cancel != NULL) {
2370                         char **cp;
2371
2372                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2373                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2374                 }
2375                 if (update_mnttab)
2376                         vfs_mnttab_modtimeupd();
2377                 break;
2378         }
2379 }
2380
2381 void
2382 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2383 {
2384         int gotlock = 0;
2385
2386         if (VFS_ON_LIST(vfsp)) {
2387                 gotlock = 1;
2388                 vfs_list_lock();
2389         }
2390         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2391         if (gotlock)
2392                 vfs_list_unlock();
2393 }
2394
2395
2396 /*
2397  * Add a "tag" option to a mounted file system's options list.
2398  *
2399  * Note: caller is responsible for locking the vfs list, if needed,
2400  *       to protect mops.
2401  */
2402 static mntopt_t *
2403 vfs_addtag(mntopts_t *mops, const char *tag)
2404 {
2405         uint_t count;
2406         mntopt_t *mop, *motbl;
2407
2408         count = mops->mo_count + 1;
2409         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2410         if (mops->mo_count) {
2411                 size_t len = (count - 1) * sizeof (mntopt_t);
2412
2413                 bcopy(mops->mo_list, motbl, len);
2414                 kmem_free(mops->mo_list, len);
2415         }
2416         mops->mo_count = count;
2417         mops->mo_list = motbl;
2418         mop = &motbl[count - 1];
2419         mop->mo_flags = MO_TAG;
2420         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2421         (void) strcpy(mop->mo_name, tag);
2422         return (mop);
2423 }
2424
2425 /*
2426  * Allow users to set arbitrary "tags" in a vfs's mount options.
2427  * Broader use within the kernel is discouraged.
2428  */
2429 int
2430 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2431     cred_t *cr)
2432 {
2433         vfs_t *vfsp;
2434         mntopts_t *mops;
2435         mntopt_t *mop;
2436         int found = 0;
2437         dev_t dev = makedevice(major, minor);
2438         int err = 0;
2439         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2440
2441         /*
2442          * Find the desired mounted file system
2443          */
2444         vfs_list_lock();
2445         vfsp = rootvfs;
2446         do {
2447                 if (vfsp->vfs_dev == dev &&
2448                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2449                         found = 1;
2450                         break;
2451                 }
2452                 vfsp = vfsp->vfs_next;
2453         } while (vfsp != rootvfs);
2454
2455         if (!found) {
2456                 err = EINVAL;
2457                 goto out;
2458         }
2459         err = secpolicy_fs_config(cr, vfsp);
2460         if (err != 0)
2461                 goto out;
2462
2463         mops = &vfsp->vfs_mntopts;
2464         /*
2465          * Add tag if it doesn't already exist
2466          */
2467         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2468                 int len;
2469
2470                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2471                 len = strlen(buf);
2472                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2473                         err = ENAMETOOLONG;
2474                         goto out;
2475                 }
2476                 mop = vfs_addtag(mops, tag);
2477         }
2478         if ((mop->mo_flags & MO_TAG) == 0) {
2479                 err = EINVAL;
2480                 goto out;
2481         }
2482         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2483 out:
2484         vfs_list_unlock();
2485         kmem_free(buf, MAX_MNTOPT_STR);
2486         return (err);
2487 }
2488
2489 /*
2490  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2491  * Broader use within the kernel is discouraged.
2492  */
2493 int
2494 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2495     cred_t *cr)
2496 {
2497         vfs_t *vfsp;
2498         mntopt_t *mop;
2499         int found = 0;
2500         dev_t dev = makedevice(major, minor);
2501         int err = 0;
2502
2503         /*
2504          * Find the desired mounted file system
2505          */
2506         vfs_list_lock();
2507         vfsp = rootvfs;
2508         do {
2509                 if (vfsp->vfs_dev == dev &&
2510                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2511                         found = 1;
2512                         break;
2513                 }
2514                 vfsp = vfsp->vfs_next;
2515         } while (vfsp != rootvfs);
2516
2517         if (!found) {
2518                 err = EINVAL;
2519                 goto out;
2520         }
2521         err = secpolicy_fs_config(cr, vfsp);
2522         if (err != 0)
2523                 goto out;
2524
2525         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2526                 err = EINVAL;
2527                 goto out;
2528         }
2529         if ((mop->mo_flags & MO_TAG) == 0) {
2530                 err = EINVAL;
2531                 goto out;
2532         }
2533         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2534 out:
2535         vfs_list_unlock();
2536         return (err);
2537 }
2538
2539 /*
2540  * Function to parse an option string and fill in a mount options table.
2541  * Unknown options are silently ignored.  The input option string is modified
2542  * by replacing separators with nulls.  If the create flag is set, options
2543  * not found in the table are just added on the fly.  The table must have
2544  * an option slot marked MO_EMPTY to add an option on the fly.
2545  *
2546  * This function is *not* for general use by filesystems.
2547  *
2548  * Note: caller is responsible for locking the vfs list, if needed,
2549  *       to protect mops..
2550  */
2551 void
2552 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2553 {
2554         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2555         int setflg = VFS_NOFORCEOPT;
2556
2557         if (osp == NULL)
2558                 return;
2559         while (*s != '\0') {
2560                 p = strchr(s, ',');     /* find next option */
2561                 if (p == NULL) {
2562                         cp = NULL;
2563                         p = s + strlen(s);
2564                 } else {
2565                         cp = p;         /* save location of comma */
2566                         *p++ = '\0';    /* mark end and point to next option */
2567                 }
2568                 nextop = p;
2569                 p = strchr(s, '=');     /* look for value */
2570                 if (p == NULL) {
2571                         valp = NULL;    /* no value supplied */
2572                 } else {
2573                         ep = p;         /* save location of equals */
2574                         *p++ = '\0';    /* end option and point to value */
2575                         valp = p;
2576                 }
2577                 /*
2578                  * set option into options table
2579                  */
2580                 if (create)
2581                         setflg |= VFS_CREATEOPT;
2582                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2583                 if (cp != NULL)
2584                         *cp = ',';      /* restore the comma */
2585                 if (valp != NULL)
2586                         *ep = '=';      /* restore the equals */
2587                 s = nextop;
2588         }
2589 }
2590
2591 /*
2592  * Function to inquire if an option exists in a mount options table.
2593  * Returns a pointer to the option if it exists, else NULL.
2594  *
2595  * This function is *not* for general use by filesystems.
2596  *
2597  * Note: caller is responsible for locking the vfs list, if needed,
2598  *       to protect mops.
2599  */
2600 struct mntopt *
2601 vfs_hasopt(const mntopts_t *mops, const char *opt)
2602 {
2603         struct mntopt *mop;
2604         uint_t i, count;
2605
2606         count = mops->mo_count;
2607         for (i = 0; i < count; i++) {
2608                 mop = &mops->mo_list[i];
2609
2610                 if (mop->mo_flags & MO_EMPTY)
2611                         continue;
2612                 if (strcmp(opt, mop->mo_name) == 0)
2613                         return (mop);
2614         }
2615         return (NULL);
2616 }
2617
2618 /*
2619  * Function to inquire if an option is set in a mount options table.
2620  * Returns non-zero if set and fills in the arg pointer with a pointer to
2621  * the argument string or NULL if there is no argument string.
2622  */
2623 static int
2624 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2625 {
2626         struct mntopt *mop;
2627         uint_t i, count;
2628
2629         count = mops->mo_count;
2630         for (i = 0; i < count; i++) {
2631                 mop = &mops->mo_list[i];
2632
2633                 if (mop->mo_flags & MO_EMPTY)
2634                         continue;
2635                 if (strcmp(opt, mop->mo_name))
2636                         continue;
2637                 if ((mop->mo_flags & MO_SET) == 0)
2638                         return (0);
2639                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2640                         *argp = mop->mo_arg;
2641                 return (1);
2642         }
2643         return (0);
2644 }
2645
2646
2647 int
2648 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2649 {
2650         int ret;
2651
2652         vfs_list_read_lock();
2653         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2654         vfs_list_unlock();
2655         return (ret);
2656 }
2657
2658
2659 /*
2660  * Construct a comma separated string of the options set in the given
2661  * mount table, return the string in the given buffer.  Return non-zero if
2662  * the buffer would overflow.
2663  *
2664  * This function is *not* for general use by filesystems.
2665  *
2666  * Note: caller is responsible for locking the vfs list, if needed,
2667  *       to protect mp.
2668  */
2669 int
2670 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2671 {
2672         char *cp;
2673         uint_t i;
2674
2675         buf[0] = '\0';
2676         cp = buf;
2677         for (i = 0; i < mp->mo_count; i++) {
2678                 struct mntopt *mop;
2679
2680                 mop = &mp->mo_list[i];
2681                 if (mop->mo_flags & MO_SET) {
2682                         int optlen, comma = 0;
2683
2684                         if (buf[0] != '\0')
2685                                 comma = 1;
2686                         optlen = strlen(mop->mo_name);
2687                         if (strlen(buf) + comma + optlen + 1 > len)
2688                                 goto err;
2689                         if (comma)
2690                                 *cp++ = ',';
2691                         (void) strcpy(cp, mop->mo_name);
2692                         cp += optlen;
2693                         /*
2694                          * Append option value if there is one
2695                          */
2696                         if (mop->mo_arg != NULL) {
2697                                 int arglen;
2698
2699                                 arglen = strlen(mop->mo_arg);
2700                                 if (strlen(buf) + arglen + 2 > len)
2701                                         goto err;
2702                                 *cp++ = '=';
2703                                 (void) strcpy(cp, mop->mo_arg);
2704                                 cp += arglen;
2705                         }
2706                 }
2707         }
2708         return (0);
2709 err:
2710         return (EOVERFLOW);
2711 }
2712
/*
 * Free a NULL-terminated array of cancel-option strings: each string is
 * released with its strlen()+1 size, then the pointer array itself
 * (entries plus the terminating NULL slot).  A NULL array is a no-op.
 */
static void
vfs_freecancelopt(char **moc)
{
        size_t nstr;

        if (moc == NULL)
                return;

        for (nstr = 0; moc[nstr] != NULL; nstr++)
                kmem_free(moc[nstr], strlen(moc[nstr]) + 1);

        /* One extra slot for the NULL terminator. */
        kmem_free(moc, (nstr + 1) * sizeof (char *));
}
2727
2728 static void
2729 vfs_freeopt(mntopt_t *mop)
2730 {
2731         if (mop->mo_name != NULL)
2732                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2733
2734         vfs_freecancelopt(mop->mo_cancel);
2735
2736         if (mop->mo_arg != NULL)
2737                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2738 }
2739
2740 /*
2741  * Free a mount options table
2742  *
2743  * This function is *not* for general use by filesystems.
2744  *
2745  * Note: caller is responsible for locking the vfs list, if needed,
2746  *       to protect mp.
2747  */
2748 void
2749 vfs_freeopttbl(mntopts_t *mp)
2750 {
2751         uint_t i, count;
2752
2753         count = mp->mo_count;
2754         for (i = 0; i < count; i++) {
2755                 vfs_freeopt(&mp->mo_list[i]);
2756         }
2757         if (count) {
2758                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2759                 mp->mo_count = 0;
2760                 mp->mo_list = NULL;
2761         }
2762 }
2763
2764
/*
 * Read entry point of the dummy mnttab vnode: ignores all of its arguments
 * and reports success.  Installed as the vop_read handler by
 * vfs_mnttabvp_setup().
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
        return (0);
}
2772
/*
 * Write entry point of the dummy mnttab vnode: ignores all of its arguments
 * and reports success.  Installed as the vop_write handler by
 * vfs_mnttabvp_setup().
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
    caller_context_t *ct)
{
        return (0);
}
2780
2781 /*
2782  * The dummy vnode is currently used only by file events notification
2783  * module which is just interested in the timestamps.
2784  */
2785 /* ARGSUSED */
2786 static int
2787 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2788     caller_context_t *ct)
2789 {
2790         bzero(vap, sizeof (vattr_t));
2791         vap->va_type = VREG;
2792         vap->va_nlink = 1;
2793         vap->va_ctime = vfs_mnttab_ctime;
2794         /*
2795          * it is ok to just copy mtime as the time will be monotonically
2796          * increasing.
2797          */
2798         vap->va_mtime = vfs_mnttab_mtime;
2799         vap->va_atime = vap->va_mtime;
2800         return (0);
2801 }
2802
2803 static void
2804 vfs_mnttabvp_setup(void)
2805 {
2806         vnode_t *tvp;
2807         vnodeops_t *vfs_mntdummyvnops;
2808         const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2809                 VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
2810                 VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
2811                 VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
2812                 VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
2813                 NULL,                   NULL
2814         };
2815
2816         if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2817             &vfs_mntdummyvnops) != 0) {
2818                 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2819                 /* Shouldn't happen, but not bad enough to panic */
2820                 return;
2821         }
2822
2823         /*
2824          * A global dummy vnode is allocated to represent mntfs files.
2825          * The mntfs file (/etc/mnttab) can be monitored for file events
2826          * and receive an event when mnttab changes. Dummy VOP calls
2827          * will be made on this vnode. The file events notification module
2828          * intercepts this vnode and delivers relevant events.
2829          */
2830         tvp = vn_alloc(KM_SLEEP);
2831         tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2832         vn_setops(tvp, vfs_mntdummyvnops);
2833         tvp->v_type = VREG;
2834         /*
2835          * The mnt dummy ops do not reference v_data.
2836          * No other module intercepting this vnode should either.
2837          * Just set it to point to itself.
2838          */
2839         tvp->v_data = (caddr_t)tvp;
2840         tvp->v_vfsp = rootvfs;
2841         vfs_mntdummyvp = tvp;
2842 }
2843
2844 /*
2845  * performs fake read/write ops
2846  */
2847 static void
2848 vfs_mnttab_rwop(int rw)
2849 {
2850         struct uio      uio;
2851         struct iovec    iov;
2852         char    buf[1];
2853
2854         if (vfs_mntdummyvp == NULL)
2855                 return;
2856
2857         bzero(&uio, sizeof (uio));
2858         bzero(&iov, sizeof (iov));
2859         iov.iov_base = buf;
2860         iov.iov_len = 0;
2861         uio.uio_iov = &iov;
2862         uio.uio_iovcnt = 1;
2863         uio.uio_loffset = 0;
2864         uio.uio_segflg = UIO_SYSSPACE;
2865         uio.uio_resid = 0;
2866         if (rw) {
2867                 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2868         } else {
2869                 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2870         }
2871 }
2872
/*
 * Generate a write operation on the dummy mnttab vnode (a fake, zero-length
 * VOP_WRITE issued through vfs_mnttab_rwop()).
 */
void
vfs_mnttab_writeop(void)
{
        vfs_mnttab_rwop(1);
}
2881
/*
 * Generate a read operation on the dummy mnttab vnode (a fake, zero-length
 * VOP_READ issued through vfs_mnttab_rwop()).
 */
void
vfs_mnttab_readop(void)
{
        vfs_mnttab_rwop(0);
}
2890
/*
 * Free any mnttab information recorded in the vfs struct: the mount point
 * and resource reference-counted strings and the mount options table.
 * The vfs must not be on the vfs list.
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
        ASSERT(!VFS_ON_LIST(vfsp));

        /*
         * Free device and mount point information.  Each pointer is
         * cleared after its reference is dropped so the vfs never keeps a
         * pointer to a string it no longer holds.
         */
        if (vfsp->vfs_mntpt != NULL) {
                refstr_rele(vfsp->vfs_mntpt);
                vfsp->vfs_mntpt = NULL;
        }
        if (vfsp->vfs_resource != NULL) {
                refstr_rele(vfsp->vfs_resource);
                vfsp->vfs_resource = NULL;
        }
        /*
         * Now free mount options information
         */
        vfs_freeopttbl(&vfsp->vfs_mntopts);
}
2916
/*
 * Return the last mnttab modification time by copying vfs_mnttab_mtime
 * into *ts.  The caller must hold the vfs list lock (reader or writer).
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
        ASSERT(RW_LOCK_HELD(&vfslist));
        *ts = vfs_mnttab_mtime;
}
2926
2927 /*
2928  * See if mnttab is changed
2929  */
2930 void
2931 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2932 {
2933         int changed;
2934
2935         *phpp = (struct pollhead *)NULL;
2936
2937         /*
2938          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2939          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2940          * to not grab the vfs list lock because tv_sec is monotonically
2941          * increasing.
2942          */
2943
2944         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2945             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2946         if (!changed) {
2947                 *phpp = &vfs_pollhd;
2948         }
2949 }
2950
2951 /* Provide a unique and monotonically-increasing timestamp. */
2952 void
2953 vfs_mono_time(timespec_t *ts)
2954 {
2955         static volatile hrtime_t hrt;           /* The saved time. */
2956         hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
2957         timespec_t      newts;
2958
2959         /*
2960          * Try gethrestime() first, but be prepared to fabricate a sensible
2961          * answer at the first sign of any trouble.
2962          */
2963         gethrestime(&newts);
2964         newhrt = ts2hrt(&newts);
2965         for (;;) {
2966                 oldhrt = hrt;
2967                 if (newhrt <= hrt)
2968                         newhrt = hrt + 1;
2969                 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
2970                         break;
2971         }
2972         hrt2ts(newhrt, ts);
2973 }
2974
2975 /*
2976  * Update the mnttab modification time and wake up any waiters for
2977  * mnttab changes
2978  */
2979 void
2980 vfs_mnttab_modtimeupd()
2981 {
2982         hrtime_t oldhrt, newhrt;
2983
2984         ASSERT(RW_WRITE_HELD(&vfslist));
2985         oldhrt = ts2hrt(&vfs_mnttab_mtime);
2986         gethrestime(&vfs_mnttab_mtime);
2987         newhrt = ts2hrt(&vfs_mnttab_mtime);
2988         if (oldhrt == (hrtime_t)0)
2989                 vfs_mnttab_ctime = vfs_mnttab_mtime;
2990         /*
2991          * Attempt to provide unique mtime (like uniqtime but not).
2992          */
2993         if (newhrt == oldhrt) {
2994                 newhrt++;
2995                 hrt2ts(newhrt, &vfs_mnttab_mtime);
2996         }
2997         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2998         vfs_mnttab_writeop();
2999 }
3000
3001 int
3002 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3003 {
3004         vnode_t *coveredvp;
3005         int error;
3006         extern void teardown_vopstats(vfs_t *);
3007
3008         /*
3009          * Get covered vnode. This will be NULL if the vfs is not linked
3010          * into the file system name space (i.e., domount() with MNT_NOSPICE).
3011          */
3012         coveredvp = vfsp->vfs_vnodecovered;
3013         ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3014
3015         /*
3016          * Purge all dnlc entries for this vfs.
3017          */
3018         (void) dnlc_purge_vfsp(vfsp, 0);
3019
3020         /* For forcible umount, skip VFS_SYNC() since it may hang */
3021         if ((flag & MS_FORCE) == 0)
3022                 (void) VFS_SYNC(vfsp, 0, cr);
3023
3024         /*
3025          * Lock the vfs to maintain fs status quo during unmount.  This
3026          * has to be done after the sync because ufs_update tries to acquire
3027          * the vfs_reflock.
3028          */
3029         vfs_lock_wait(vfsp);
3030
3031         if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3032                 vfs_unlock(vfsp);
3033                 if (coveredvp != NULL)
3034                         vn_vfsunlock(coveredvp);
3035         } else if (coveredvp != NULL) {
3036                 teardown_vopstats(vfsp);
3037                 /*
3038                  * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3039                  * when it frees vfsp so we do a VN_HOLD() so we can
3040                  * continue to use coveredvp afterwards.
3041                  */
3042                 VN_HOLD(coveredvp);
3043                 vfs_remove(vfsp);
3044                 vn_vfsunlock(coveredvp);
3045                 VN_RELE(coveredvp);
3046         } else {
3047                 teardown_vopstats(vfsp);
3048                 /*
3049                  * Release the reference to vfs that is not linked
3050                  * into the name space.
3051                  */
3052                 vfs_unlock(vfsp);
3053                 VFS_RELE(vfsp);
3054         }
3055         return (error);
3056 }
3057
3058
3059 /*
3060  * Vfs_unmountall() is called by uadmin() to unmount all
3061  * mounted file systems (except the root file system) during shutdown.
3062  * It follows the existing locking protocol when traversing the vfs list
3063  * to sync and unmount vfses. Even though there should be no
3064  * other thread running while the system is shutting down, it is prudent
3065  * to still follow the locking protocol.
3066  */
3067 void
3068 vfs_unmountall(void)
3069 {
3070         struct vfs *vfsp;
3071         struct vfs *prev_vfsp = NULL;
3072         int error;
3073
3074         /*
3075          * Toss all dnlc entries now so that the per-vfs sync
3076          * and unmount operations don't have to slog through
3077          * a bunch of uninteresting vnodes over and over again.
3078          */
3079         dnlc_purge();
3080
3081         vfs_list_lock();
3082         for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3083                 prev_vfsp = vfsp->vfs_prev;
3084
3085                 if (vfs_lock(vfsp) != 0)
3086                         continue;
3087                 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3088                 vfs_unlock(vfsp);
3089                 if (error)
3090                         continue;
3091
3092                 vfs_list_unlock();
3093
3094                 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3095                 (void) dounmount(vfsp, 0, CRED());
3096
3097                 /*
3098                  * Since we dropped the vfslist lock above we must
3099                  * verify that next_vfsp still exists, else start over.
3100                  */
3101                 vfs_list_lock();
3102                 for (vfsp = rootvfs->vfs_prev;
3103                     vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3104                         if (vfsp == prev_vfsp)
3105                                 break;
3106                 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3107                         prev_vfsp = rootvfs->vfs_prev;
3108         }
3109         vfs_list_unlock();
3110 }
3111
3112 /*
3113  * Called to add an entry to the end of the vfs mount in progress list
3114  */
3115 void
3116 vfs_addmip(dev_t dev, struct vfs *vfsp)
3117 {
3118         struct ipmnt *mipp;
3119
3120         mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3121         mipp->mip_next = NULL;
3122         mipp->mip_dev = dev;
3123         mipp->mip_vfsp = vfsp;
3124         mutex_enter(&vfs_miplist_mutex);
3125         if (vfs_miplist_end != NULL)
3126                 vfs_miplist_end->mip_next = mipp;
3127         else
3128                 vfs_miplist = mipp;
3129         vfs_miplist_end = mipp;
3130         mutex_exit(&vfs_miplist_mutex);
3131 }
3132
3133 /*
3134  * Called to remove an entry from the mount in progress list
3135  * Either because the mount completed or it failed.
3136  */
3137 void
3138 vfs_delmip(struct vfs *vfsp)
3139 {
3140         struct ipmnt *mipp, *mipprev;
3141
3142         mutex_enter(&vfs_miplist_mutex);
3143         mipprev = NULL;
3144         for (mipp = vfs_miplist;
3145             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3146                 mipprev = mipp;
3147         }
3148         if (mipp == NULL)
3149                 return; /* shouldn't happen */
3150         if (mipp == vfs_miplist_end)
3151                 vfs_miplist_end = mipprev;
3152         if (mipprev == NULL)
3153                 vfs_miplist = mipp->mip_next;
3154         else
3155                 mipprev->mip_next = mipp->mip_next;
3156         mutex_exit(&vfs_miplist_mutex);
3157         kmem_free(mipp, sizeof (struct ipmnt));
3158 }
3159
3160 /*
3161  * vfs_add is called by a specific filesystem's mount routine to add
3162  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3163  * The vfs should already have been locked by the caller.
3164  *
3165  * coveredvp is NULL if this is the root.
3166  */
3167 void
3168 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3169 {
3170         int newflag;
3171
3172         ASSERT(vfs_lock_held(vfsp));
3173         VFS_HOLD(vfsp);
3174         newflag = vfsp->vfs_flag;
3175         if (mflag & MS_RDONLY)
3176                 newflag |= VFS_RDONLY;
3177         else
3178                 newflag &= ~VFS_RDONLY;
3179         if (mflag & MS_NOSUID)
3180                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3181         else
3182                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3183         if (mflag & MS_NOMNTTAB)
3184                 newflag |= VFS_NOMNTTAB;
3185         else
3186                 newflag &= ~VFS_NOMNTTAB;
3187
3188         if (coveredvp != NULL) {
3189                 ASSERT(vn_vfswlock_held(coveredvp));
3190                 coveredvp->v_vfsmountedhere = vfsp;
3191                 VN_HOLD(coveredvp);
3192         }
3193         vfsp->vfs_vnodecovered = coveredvp;
3194         vfsp->vfs_flag = newflag;
3195
3196         vfs_list_add(vfsp);
3197 }
3198
3199 /*
3200  * Remove a vfs from the vfs list, null out the pointer from the
3201  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3202  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3203  * reference to the vfs and to the covered vnode.
3204  *
3205  * Called from dounmount after it's confirmed with the file system
3206  * that the unmount is legal.
3207  */
3208 void
3209 vfs_remove(struct vfs *vfsp)
3210 {
3211         vnode_t *vp;
3212
3213         ASSERT(vfs_lock_held(vfsp));
3214
3215         /*
3216          * Can't unmount root.  Should never happen because fs will
3217          * be busy.
3218          */
3219         if (vfsp == rootvfs)
3220                 panic("vfs_remove: unmounting root");
3221
3222         vfs_list_remove(vfsp);
3223
3224         /*
3225          * Unhook from the file system name space.
3226          */
3227         vp = vfsp->vfs_vnodecovered;
3228         ASSERT(vn_vfswlock_held(vp));
3229         vp->v_vfsmountedhere = NULL;
3230         vfsp->vfs_vnodecovered = NULL;
3231         VN_RELE(vp);
3232
3233         /*
3234          * Release lock and wakeup anybody waiting.
3235          */
3236         vfs_unlock(vfsp);
3237         VFS_RELE(vfsp);
3238 }
3239
3240 /*
3241  * Lock a filesystem to prevent access to it while mounting,
3242  * unmounting and syncing.  Return EBUSY immediately if lock
3243  * can't be acquired.
3244  */
3245 int
3246 vfs_lock(vfs_t *vfsp)
3247 {
3248         vn_vfslocks_entry_t *vpvfsentry;
3249
3250         vpvfsentry = vn_vfslocks_getlock(vfsp);
3251         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3252                 return (0);
3253
3254         vn_vfslocks_rele(vpvfsentry);
3255         return (EBUSY);
3256 }
3257
3258 int
3259 vfs_rlock(vfs_t *vfsp)
3260 {
3261         vn_vfslocks_entry_t *vpvfsentry;
3262
3263         vpvfsentry = vn_vfslocks_getlock(vfsp);
3264
3265         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3266                 return (0);
3267
3268         vn_vfslocks_rele(vpvfsentry);
3269         return (EBUSY);
3270 }
3271
3272 void
3273 vfs_lock_wait(vfs_t *vfsp)
3274 {
3275         vn_vfslocks_entry_t *vpvfsentry;
3276
3277         vpvfsentry = vn_vfslocks_getlock(vfsp);
3278         rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3279 }
3280
3281 void
3282 vfs_rlock_wait(vfs_t *vfsp)
3283 {
3284         vn_vfslocks_entry_t *vpvfsentry;
3285
3286         vpvfsentry = vn_vfslocks_getlock(vfsp);
3287         rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3288 }
3289
3290 /*
3291  * Unlock a locked filesystem.
3292  */
3293 void
3294 vfs_unlock(vfs_t *vfsp)
3295 {
3296         vn_vfslocks_entry_t *vpvfsentry;
3297
3298         /*
3299          * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3300          * And these changes should remain for the patch changes as it is.
3301          */
3302         if (panicstr)
3303                 return;
3304
3305         /*
3306          * ve_refcount needs to be dropped twice here.
3307          * 1. To release refernce after a call to vfs_locks_getlock()
3308          * 2. To release the reference from the locking routines like
3309          *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3310          */
3311
3312         vpvfsentry = vn_vfslocks_getlock(vfsp);
3313         vn_vfslocks_rele(vpvfsentry);
3314
3315         rwst_exit(&vpvfsentry->ve_lock);
3316         vn_vfslocks_rele(vpvfsentry);
3317 }
3318
3319 /*
3320  * Utility routine that allows a filesystem to construct its
3321  * fsid in "the usual way" - by munging some underlying dev_t and
3322  * the filesystem type number into the 64-bit fsid.  Note that
3323  * this implicitly relies on dev_t persistence to make filesystem
3324  * id's persistent.
3325  *
3326  * There's nothing to prevent an individual fs from constructing its
3327  * fsid in a different way, and indeed they should.
3328  *
3329  * Since we want fsids to be 32-bit quantities (so that they can be
3330  * exported identically by either 32-bit or 64-bit APIs, as well as
3331  * the fact that fsid's are "known" to NFS), we compress the device
3332  * number given down to 32-bits, and panic if that isn't possible.
3333  */
3334 void
3335 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3336 {
3337         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3338                 panic("device number too big for fsid!");
3339         fsi->val[1] = val;
3340 }
3341
3342 int
3343 vfs_lock_held(vfs_t *vfsp)
3344 {
3345         int held;
3346         vn_vfslocks_entry_t *vpvfsentry;
3347
3348         /*
3349          * vfs_lock_held will mimic sema_held behaviour
3350          * if panicstr is set. And these changes should remain
3351          * for the patch changes as it is.
3352          */
3353         if (panicstr)
3354                 return (1);
3355
3356         vpvfsentry = vn_vfslocks_getlock(vfsp);
3357         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3358
3359         vn_vfslocks_rele(vpvfsentry);
3360         return (held);
3361 }
3362
3363 struct _kthread *
3364 vfs_lock_owner(vfs_t *vfsp)
3365 {
3366         struct _kthread *owner;
3367         vn_vfslocks_entry_t *vpvfsentry;
3368
3369         /*
3370          * vfs_wlock_held will mimic sema_held behaviour
3371          * if panicstr is set. And these changes should remain
3372          * for the patch changes as it is.
3373          */
3374         if (panicstr)
3375                 return (NULL);
3376
3377         vpvfsentry = vn_vfslocks_getlock(vfsp);
3378         owner = rwst_owner(&vpvfsentry->ve_lock);
3379
3380         vn_vfslocks_rele(vpvfsentry);
3381         return (owner);
3382 }
3383
3384 /*
3385  * vfs list locking.
3386  *
3387  * Rather than manipulate the vfslist lock directly, we abstract into lock
3388  * and unlock routines to allow the locking implementation to be changed for
3389  * clustering.
3390  *
3391  * Whenever the vfs list is modified through its hash links, the overall list
3392  * lock must be obtained before locking the relevant hash bucket.  But to see
3393  * whether a given vfs is on the list, it suffices to obtain the lock for the
3394  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3395  */
3396
3397 void
3398 vfs_list_lock()
3399 {
3400         rw_enter(&vfslist, RW_WRITER);
3401 }
3402
3403 void
3404 vfs_list_read_lock()
3405 {
3406         rw_enter(&vfslist, RW_READER);
3407 }
3408
3409 void
3410 vfs_list_unlock()
3411 {
3412         rw_exit(&vfslist);
3413 }
3414
3415 /*
3416  * Low level worker routines for adding entries to and removing entries from
3417  * the vfs list.
3418  */
3419
3420 static void
3421 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3422 {
3423         int vhno;
3424         struct vfs **hp;
3425         dev_t dev;
3426
3427         ASSERT(RW_WRITE_HELD(&vfslist));
3428
3429         dev = expldev(vfsp->vfs_fsid.val[0]);
3430         vhno = VFSHASH(getmajor(dev), getminor(dev));
3431
3432         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3433
3434         /*
3435          * Link into the hash table, inserting it at the end, so that LOFS
3436          * with the same fsid as UFS (or other) file systems will not hide the
3437          * UFS.
3438          */
3439         if (insert_at_head) {
3440                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3441                 rvfs_list[vhno].rvfs_head = vfsp;
3442         } else {
3443                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3444                     hp = &(*hp)->vfs_hash)
3445                         continue;
3446                 /*
3447                  * hp now contains the address of the pointer to update
3448                  * to effect the insertion.
3449                  */
3450                 vfsp->vfs_hash = NULL;
3451                 *hp = vfsp;
3452         }
3453
3454         rvfs_list[vhno].rvfs_len++;
3455         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3456 }
3457
3458
3459 static void
3460 vfs_hash_remove(struct vfs *vfsp)
3461 {
3462         int vhno;
3463         struct vfs *tvfsp;
3464         dev_t dev;
3465
3466         ASSERT(RW_WRITE_HELD(&vfslist));
3467
3468         dev = expldev(vfsp->vfs_fsid.val[0]);
3469         vhno = VFSHASH(getmajor(dev), getminor(dev));
3470
3471         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3472
3473         /*
3474          * Remove from hash.
3475          */
3476         if (rvfs_list[vhno].rvfs_head == vfsp) {
3477                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3478                 rvfs_list[vhno].rvfs_len--;
3479                 goto foundit;
3480         }
3481         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3482             tvfsp = tvfsp->vfs_hash) {
3483                 if (tvfsp->vfs_hash == vfsp) {
3484                         tvfsp->vfs_hash = vfsp->vfs_hash;
3485                         rvfs_list[vhno].rvfs_len--;
3486                         goto foundit;
3487                 }
3488         }
3489         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3490
3491 foundit:
3492
3493         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3494 }
3495
3496
3497 void
3498 vfs_list_add(struct vfs *vfsp)
3499 {
3500         zone_t *zone;
3501
3502         /*
3503          * Typically, the vfs_t will have been created on behalf of the file
3504          * system in vfs_init, where it will have been provided with a
3505          * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3506          * by an unbundled file system. We therefore check for such an example
3507          * before stamping the vfs_t with its creation time for the benefit of
3508          * mntfs.
3509          */
3510         if (vfsp->vfs_implp == NULL)
3511                 vfsimpl_setup(vfsp);
3512         vfs_mono_time(&vfsp->vfs_hrctime);
3513
3514         /*
3515          * The zone that owns the mount is the one that performed the mount.
3516          * Note that this isn't necessarily the same as the zone mounted into.
3517          * The corresponding zone_rele_ref() will be done when the vfs_t
3518          * is being free'd.
3519          */
3520         vfsp->vfs_zone = curproc->p_zone;
3521         zone_init_ref(&vfsp->vfs_implp->vi_zone_ref);
3522         zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref,
3523             ZONE_REF_VFS);
3524
3525         /*
3526          * Find the zone mounted into, and put this mount on its vfs list.
3527          */
3528         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3529         ASSERT(zone != NULL);
3530         /*
3531          * Special casing for the root vfs.  This structure is allocated
3532          * statically and hooked onto rootvfs at link time.  During the
3533          * vfs_mountroot call at system startup time, the root file system's
3534          * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3535          * as argument.  The code below must detect and handle this special
3536          * case.  The only apparent justification for this special casing is
3537          * to ensure that the root file system appears at the head of the
3538          * list.
3539          *
3540          * XXX: I'm assuming that it's ok to do normal list locking when
3541          *      adding the entry for the root file system (this used to be
3542          *      done with no locks held).
3543          */
3544         vfs_list_lock();
3545         /*
3546          * Link into the vfs list proper.
3547          */
3548         if (vfsp == &root) {
3549                 /*
3550                  * Assert: This vfs is already on the list as its first entry.
3551                  * Thus, there's nothing to do.
3552                  */
3553                 ASSERT(rootvfs == vfsp);
3554                 /*
3555                  * Add it to the head of the global zone's vfslist.
3556                  */
3557                 ASSERT(zone == global_zone);
3558                 ASSERT(zone->zone_vfslist == NULL);
3559                 zone->zone_vfslist = vfsp;
3560         } else {
3561                 /*
3562                  * Link to end of list using vfs_prev (as rootvfs is now a
3563                  * doubly linked circular list) so list is in mount order for
3564                  * mnttab use.
3565                  */
3566                 rootvfs->vfs_prev->vfs_next = vfsp;
3567                 vfsp->vfs_prev = rootvfs->vfs_prev;
3568                 rootvfs->vfs_prev = vfsp;
3569                 vfsp->vfs_next = rootvfs;
3570
3571                 /*
3572                  * Do it again for the zone-private list (which may be NULL).
3573                  */
3574                 if (zone->zone_vfslist == NULL) {
3575                         ASSERT(zone != global_zone);
3576                         zone->zone_vfslist = vfsp;
3577                 } else {
3578                         zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3579                         vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3580                         zone->zone_vfslist->vfs_zone_prev = vfsp;
3581                         vfsp->vfs_zone_next = zone->zone_vfslist;
3582                 }
3583         }
3584
3585         /*
3586          * Link into the hash table, inserting it at the end, so that LOFS
3587          * with the same fsid as UFS (or other) file systems will not hide
3588          * the UFS.
3589          */
3590         vfs_hash_add(vfsp, 0);
3591
3592         /*
3593          * update the mnttab modification time
3594          */
3595         vfs_mnttab_modtimeupd();
3596         vfs_list_unlock();
3597         zone_rele(zone);
3598 }
3599
3600 void
3601 vfs_list_remove(struct vfs *vfsp)
3602 {
3603         zone_t *zone;
3604
3605         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3606         ASSERT(zone != NULL);
3607         /*
3608          * Callers are responsible for preventing attempts to unmount the
3609          * root.
3610          */
3611         ASSERT(vfsp != rootvfs);
3612
3613         vfs_list_lock();
3614
3615         /*
3616          * Remove from hash.
3617          */
3618         vfs_hash_remove(vfsp);
3619
3620         /*
3621          * Remove from vfs list.
3622          */
3623         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3624         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3625         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3626
3627         /*
3628          * Remove from zone-specific vfs list.
3629          */
3630         if (zone->zone_vfslist == vfsp)
3631                 zone->zone_vfslist = vfsp->vfs_zone_next;
3632
3633         if (vfsp->vfs_zone_next == vfsp) {
3634                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3635                 ASSERT(zone->zone_vfslist == vfsp);
3636                 zone->zone_vfslist = NULL;
3637         }
3638
3639         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3640         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3641         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3642
3643         /*
3644          * update the mnttab modification time
3645          */
3646         vfs_mnttab_modtimeupd();
3647         vfs_list_unlock();
3648         zone_rele(zone);
3649 }
3650
3651 struct vfs *
3652 getvfs(fsid_t *fsid)
3653 {
3654         struct vfs *vfsp;
3655         int val0 = fsid->val[0];
3656         int val1 = fsid->val[1];
3657         dev_t dev = expldev(val0);
3658         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3659         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3660
3661         mutex_enter(hmp);
3662         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3663                 if (vfsp->vfs_fsid.val[0] == val0 &&
3664                     vfsp->vfs_fsid.val[1] == val1) {
3665                         VFS_HOLD(vfsp);
3666                         mutex_exit(hmp);
3667                         return (vfsp);
3668                 }
3669         }
3670         mutex_exit(hmp);
3671         return (NULL);
3672 }
3673
3674 /*
3675  * Search the vfs mount in progress list for a specified device/vfs entry.
3676  * Returns 0 if the first entry in the list that the device matches has the
3677  * given vfs pointer as well.  If the device matches but a different vfs
3678  * pointer is encountered in the list before the given vfs pointer then
3679  * a 1 is returned.
3680  */
3681
3682 int
3683 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3684 {
3685         int retval = 0;
3686         struct ipmnt *mipp;
3687
3688         mutex_enter(&vfs_miplist_mutex);
3689         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3690                 if (mipp->mip_dev == dev) {
3691                         if (mipp->mip_vfsp != vfsp)
3692                                 retval = 1;
3693                         break;
3694                 }
3695         }
3696         mutex_exit(&vfs_miplist_mutex);
3697         return (retval);
3698 }
3699
3700 /*
3701  * Search the vfs list for a specified device.  Returns 1, if entry is found
3702  * or 0 if no suitable entry is found.
3703  */
3704
3705 int
3706 vfs_devismounted(dev_t dev)
3707 {
3708         struct vfs *vfsp;
3709         int found;
3710
3711         vfs_list_read_lock();
3712         vfsp = rootvfs;
3713         found = 0;
3714         do {
3715                 if (vfsp->vfs_dev == dev) {
3716                         found = 1;
3717                         break;
3718                 }
3719                 vfsp = vfsp->vfs_next;
3720         } while (vfsp != rootvfs);
3721
3722         vfs_list_unlock();
3723         return (found);
3724 }
3725
3726 /*
3727  * Search the vfs list for a specified device.  Returns a pointer to it
3728  * or NULL if no suitable entry is found. The caller of this routine
3729  * is responsible for releasing the returned vfs pointer.
3730  */
3731 struct vfs *
3732 vfs_dev2vfsp(dev_t dev)
3733 {
3734         struct vfs *vfsp;
3735         int found;
3736
3737         vfs_list_read_lock();
3738         vfsp = rootvfs;
3739         found = 0;
3740         do {
3741                 /*
3742                  * The following could be made more efficient by making
3743                  * the entire loop use vfs_zone_next if the call is from
3744                  * a zone.  The only callers, however, ustat(2) and
3745                  * umount2(2), don't seem to justify the added
3746                  * complexity at present.
3747                  */
3748                 if (vfsp->vfs_dev == dev &&
3749                     ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3750                     curproc->p_zone)) {
3751                         VFS_HOLD(vfsp);
3752                         found = 1;
3753                         break;
3754                 }
3755                 vfsp = vfsp->vfs_next;
3756         } while (vfsp != rootvfs);
3757         vfs_list_unlock();
3758         return (found ? vfsp: NULL);
3759 }
3760
3761 /*
3762  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3763  * or NULL if no suitable entry is found. The caller of this routine
3764  * is responsible for releasing the returned vfs pointer.
3765  *
3766  * Note that if multiple mntpoints match, the last one matching is
3767  * returned in an attempt to return the "top" mount when overlay
3768  * mounts are covering the same mount point.  This is accomplished by starting
3769  * at the end of the list and working our way backwards, stopping at the first
3770  * matching mount.
3771  */
3772 struct vfs *
3773 vfs_mntpoint2vfsp(const char *mp)
3774 {
3775         struct vfs *vfsp;
3776         struct vfs *retvfsp = NULL;
3777         zone_t *zone = curproc->p_zone;
3778         struct vfs *list;
3779
3780         vfs_list_read_lock();
3781         if (getzoneid() == GLOBAL_ZONEID) {
3782                 /*
3783                  * The global zone may see filesystems in any zone.
3784                  */
3785                 vfsp = rootvfs->vfs_prev;
3786                 do {
3787                         if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3788                                 retvfsp = vfsp;
3789                                 break;
3790                         }
3791                         vfsp = vfsp->vfs_prev;
3792                 } while (vfsp != rootvfs->vfs_prev);
3793         } else if ((list = zone->zone_vfslist) != NULL) {
3794                 const char *mntpt;
3795
3796                 vfsp = list->vfs_zone_prev;
3797                 do {
3798                         mntpt = refstr_value(vfsp->vfs_mntpt);
3799                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3800                         if (strcmp(mntpt, mp) == 0) {
3801                                 retvfsp = vfsp;
3802                                 break;
3803                         }
3804                         vfsp = vfsp->vfs_zone_prev;
3805                 } while (vfsp != list->vfs_zone_prev);
3806         }
3807         if (retvfsp)
3808                 VFS_HOLD(retvfsp);
3809         vfs_list_unlock();
3810         return (retvfsp);
3811 }
3812
3813 /*
3814  * Search the vfs list for a specified vfsops.
3815  * if vfs entry is found then return 1, else 0.
3816  */
3817 int
3818 vfs_opsinuse(vfsops_t *ops)
3819 {
3820         struct vfs *vfsp;
3821         int found;
3822
3823         vfs_list_read_lock();
3824         vfsp = rootvfs;
3825         found = 0;
3826         do {
3827                 if (vfs_getops(vfsp) == ops) {
3828                         found = 1;
3829                         break;
3830                 }
3831                 vfsp = vfsp->vfs_next;
3832         } while (vfsp != rootvfs);
3833         vfs_list_unlock();
3834         return (found);
3835 }
3836
3837 /*
3838  * Allocate an entry in vfssw for a file system type
3839  */
3840 struct vfssw *
3841 allocate_vfssw(const char *type)
3842 {
3843         struct vfssw *vswp;
3844
3845         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3846                 /*
3847                  * The vfssw table uses the empty string to identify an
3848                  * available entry; we cannot add any type which has
3849                  * a leading NUL. The string length is limited to
3850                  * the size of the st_fstype array in struct stat.
3851                  */
3852                 return (NULL);
3853         }
3854
3855         ASSERT(VFSSW_WRITE_LOCKED());
3856         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3857                 if (!ALLOCATED_VFSSW(vswp)) {
3858                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3859                         (void) strcpy(vswp->vsw_name, type);
3860                         ASSERT(vswp->vsw_count == 0);
3861                         vswp->vsw_count = 1;
3862                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3863                         return (vswp);
3864                 }
3865         return (NULL);
3866 }
3867
/*
 * Translate a vfs type name into the name of the module that implements
 * it:
 *
 *	"proc"			-> "procfs"
 *	"fd"			-> "fdfs"
 *	anything starting "nfs"	-> "nfs"
 *
 * Every other name is returned unchanged (the same pointer that was
 * passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3885
3886 /*
3887  * Find a vfssw entry given a file system type name.
3888  * Try to autoload the filesystem if it's not found.
3889  * If it's installed, return the vfssw locked to prevent unloading.
3890  */
3891 struct vfssw *
3892 vfs_getvfssw(const char *type)
3893 {
3894         struct vfssw *vswp;
3895         const char *modname;
3896
3897         RLOCK_VFSSW();
3898         vswp = vfs_getvfsswbyname(type);
3899         modname = vfs_to_modname(type);
3900
3901         if (rootdir == NULL) {
3902                 /*
3903                  * If we haven't yet loaded the root file system, then our
3904                  * _init won't be called until later. Allocate vfssw entry,
3905                  * because mod_installfs won't be called.
3906                  */
3907                 if (vswp == NULL) {
3908                         RUNLOCK_VFSSW();
3909                         WLOCK_VFSSW();
3910                         if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3911                                 if ((vswp = allocate_vfssw(type)) == NULL) {
3912                                         WUNLOCK_VFSSW();
3913                                         return (NULL);
3914                                 }
3915                         }
3916                         WUNLOCK_VFSSW();
3917                         RLOCK_VFSSW();
3918                 }
3919                 if (!VFS_INSTALLED(vswp)) {
3920                         RUNLOCK_VFSSW();
3921                         (void) modloadonly("fs", modname);
3922                 } else
3923                         RUNLOCK_VFSSW();
3924                 return (vswp);
3925         }
3926
3927         /*
3928          * Try to load the filesystem.  Before calling modload(), we drop
3929          * our lock on the VFS switch table, and pick it up after the
3930          * module is loaded.  However, there is a potential race:  the
3931          * module could be unloaded after the call to modload() completes
3932          * but before we pick up the lock and drive on.  Therefore,
3933          * we keep reloading the module until we've loaded the module
3934          * _and_ we have the lock on the VFS switch table.
3935          */
3936         while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3937                 RUNLOCK_VFSSW();
3938                 if (modload("fs", modname) == -1)
3939                         return (NULL);
3940                 RLOCK_VFSSW();
3941                 if (vswp == NULL)
3942                         if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3943                                 break;
3944         }
3945         RUNLOCK_VFSSW();
3946
3947         return (vswp);
3948 }
3949
3950 /*
3951  * Find a vfssw entry given a file system type name.
3952  */
3953 struct vfssw *
3954 vfs_getvfsswbyname(const char *type)
3955 {
3956         struct vfssw *vswp;
3957
3958         ASSERT(VFSSW_LOCKED());
3959         if (type == NULL || *type == '\0')
3960                 return (NULL);
3961
3962         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3963                 if (strcmp(type, vswp->vsw_name) == 0) {
3964                         vfs_refvfssw(vswp);
3965                         return (vswp);
3966                 }
3967         }
3968
3969         return (NULL);
3970 }
3971
3972 /*
3973  * Find a vfssw entry given a set of vfsops.
3974  */
3975 struct vfssw *
3976 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3977 {
3978         struct vfssw *vswp;
3979
3980         RLOCK_VFSSW();
3981         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3982                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3983                         vfs_refvfssw(vswp);
3984                         RUNLOCK_VFSSW();
3985                         return (vswp);
3986                 }
3987         }
3988         RUNLOCK_VFSSW();
3989
3990         return (NULL);
3991 }
3992
3993 /*
3994  * Reference a vfssw entry.
3995  */
3996 void
3997 vfs_refvfssw(struct vfssw *vswp)
3998 {
3999
4000         mutex_enter(&vswp->vsw_lock);
4001         vswp->vsw_count++;
4002         mutex_exit(&vswp->vsw_lock);
4003 }
4004
4005 /*
4006  * Unreference a vfssw entry.
4007  */
4008 void
4009 vfs_unrefvfssw(struct vfssw *vswp)
4010 {
4011
4012         mutex_enter(&vswp->vsw_lock);
4013         vswp->vsw_count--;
4014         mutex_exit(&vswp->vsw_lock);
4015 }
4016
4017 static int sync_retries = 20;   /* number of retries when not making progress */
4018 static int sync_triesleft;      /* portion of sync_retries remaining */
4019
4020 static pgcnt_t old_pgcnt, new_pgcnt;
4021 static int new_bufcnt, old_bufcnt;
4022
4023 /*
4024  * Sync all of the mounted filesystems, and then wait for the actual i/o to
4025  * complete.  We wait by counting the number of dirty pages and buffers,
4026  * pushing them out using bio_busy() and page_busy(), and then counting again.
4027  * This routine is used during the uadmin A_SHUTDOWN code.  It should only
4028  * be used after some higher-level mechanism has quiesced the system so that
4029  * new writes are not being initiated while we are waiting for completion.
4030  *
4031  * To ensure finite running time, our algorithm uses sync_triesleft (a progress
4032  * counter used by the vfs_syncall() loop below). It is declared above so
4033  * it can be found easily in the debugger.
4034  *
4035  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4036  * sync_retries consecutive calls to bio_busy() and page_busy() without
4037  * decreasing either the number of dirty buffers or dirty pages below the
4038  * lowest count we have seen so far, we give up and return from vfs_syncall().
4039  *
4040  * Each loop iteration ends with a call to delay() one second to allow time for
4041  * i/o completion and to permit the user time to read our progress messages.
4042  */
4043 void
4044 vfs_syncall(void)
4045 {
4046         if (rootdir == NULL && !modrootloaded)
4047                 return; /* no filesystems have been loaded yet */
4048
4049         printf("syncing file systems...");
4050         sync();
4051
4052         sync_triesleft = sync_retries;
4053
4054         old_bufcnt = new_bufcnt = INT_MAX;
4055         old_pgcnt = new_pgcnt = ULONG_MAX;
4056
4057         while (sync_triesleft > 0) {
4058                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4059                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4060
4061                 new_bufcnt = bio_busy(B_TRUE);
4062                 new_pgcnt = page_busy(B_TRUE);
4063
4064                 if (new_bufcnt == 0 && new_pgcnt == 0)
4065                         break;
4066
4067                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4068                         sync_triesleft = sync_retries;
4069                 else
4070                         sync_triesleft--;
4071
4072                 if (new_bufcnt)
4073                         printf(" [%d]", new_bufcnt);
4074                 if (new_pgcnt)
4075                         printf(" %lu", new_pgcnt);
4076
4077                 delay(hz);
4078         }
4079
4080         if (new_bufcnt != 0 || new_pgcnt != 0)
4081                 printf(" done (not all i/o completed)\n");
4082         else
4083                 printf(" done\n");
4084
4085         delay(hz);
4086 }
4087
4088 /*
4089  * Map VFS flags to statvfs flags.  These shouldn't really be separate
4090  * flags at all.
4091  */
4092 uint_t
4093 vf_to_stf(uint_t vf)
4094 {
4095         uint_t stf = 0;
4096
4097         if (vf & VFS_RDONLY)
4098                 stf |= ST_RDONLY;
4099         if (vf & VFS_NOSETUID)
4100                 stf |= ST_NOSUID;
4101         if (vf & VFS_NOTRUNC)
4102                 stf |= ST_NOTRUNC;
4103
4104         return (stf);
4105 }
4106
4107 /*
4108  * Entries for (illegal) fstype 0.
4109  */
4110 /* ARGSUSED */
4111 int
4112 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
4113 {
4114         cmn_err(CE_PANIC, "stray vfs operation");
4115         return (0);
4116 }
4117
4118 /*
4119  * Entries for (illegal) fstype 0.
4120  */
4121 int
4122 vfsstray(void)
4123 {
4124         cmn_err(CE_PANIC, "stray vfs operation");
4125         return (0);
4126 }
4127
/*
 * Generic vfs operation that always fails with EIO.
 *
 * Exists to support forced UFS unmount and its interaction with LOFS,
 * but any file system may use it.
 * See bug 1203132.
 */
int
vfs_EIO(void)
{
	const int error = EIO;

	return (error);
}
4138
/*
 * Sync flavour of vfs_EIO(): ignores its arguments and always fails with
 * EIO.
 *
 * The sync op needs its own definition because mixing ANSI and normal
 * style prototypes confuses the compiler when a "short" argument is
 * present, and it spits out a warning.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	const int error = EIO;

	return (error);
}
4150
4151 vfs_t EIO_vfs;
4152 vfsops_t *EIO_vfsops;
4153
4154 /*
4155  * Called from startup() to initialize all loaded vfs's
4156  */
4157 void
4158 vfsinit(void)
4159 {
4160         struct vfssw *vswp;
4161         int error;
4162         extern int vopstats_enabled;
4163         extern void vopstats_startup();
4164
4165         static const fs_operation_def_t EIO_vfsops_template[] = {
4166                 VFSNAME_MOUNT,          { .error = vfs_EIO },
4167                 VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4168                 VFSNAME_ROOT,           { .error = vfs_EIO },
4169                 VFSNAME_STATVFS,        { .error = vfs_EIO },
4170                 VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4171                 VFSNAME_VGET,           { .error = vfs_EIO },
4172                 VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4173                 VFSNAME_FREEVFS,        { .error = vfs_EIO },
4174                 VFSNAME_VNSTATE,        { .error = vfs_EIO },
4175                 NULL, NULL
4176         };
4177
4178         static const fs_operation_def_t stray_vfsops_template[] = {
4179                 VFSNAME_MOUNT,          { .error = vfsstray },
4180                 VFSNAME_UNMOUNT,        { .error = vfsstray },
4181                 VFSNAME_ROOT,           { .error = vfsstray },
4182                 VFSNAME_STATVFS,        { .error = vfsstray },
4183                 VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4184                 VFSNAME_VGET,           { .error = vfsstray },
4185                 VFSNAME_MOUNTROOT,      { .error = vfsstray },
4186                 VFSNAME_FREEVFS,        { .error = vfsstray },
4187                 VFSNAME_VNSTATE,        { .error = vfsstray },
4188                 NULL, NULL
4189         };
4190
4191         /* Create vfs cache */
4192         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4193             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4194
4195         /* Initialize the vnode cache (file systems may use it during init). */
4196         vn_create_cache();
4197
4198         /* Setup event monitor framework */
4199         fem_init();
4200
4201         /* Initialize the dummy stray file system type. */
4202         error = vfs_setfsops(0, stray_vfsops_template, NULL);
4203
4204         /* Initialize the dummy EIO file system. */
4205         error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4206         if (error != 0) {
4207                 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4208                 /* Shouldn't happen, but not bad enough to panic */
4209         }
4210
4211         VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4212
4213         /*
4214          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4215          * on this vfs can immediately notice it's invalid.
4216          */
4217         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4218
4219         /*
4220          * Call the init routines of non-loadable filesystems only.
4221          * Filesystems which are loaded as separate modules will be
4222          * initialized by the module loading code instead.
4223          */
4224
4225         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4226                 RLOCK_VFSSW();
4227                 if (vswp->vsw_init != NULL)
4228                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4229                 RUNLOCK_VFSSW();
4230         }
4231
4232         vopstats_startup();
4233
4234         if (vopstats_enabled) {
4235                 /* EIO_vfs can collect stats, but we don't retrieve them */
4236                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4237                 EIO_vfs.vfs_fstypevsp = NULL;
4238                 EIO_vfs.vfs_vskap = NULL;
4239                 EIO_vfs.vfs_flag |= VFS_STATS;
4240         }
4241
4242         xattr_init();
4243
4244         reparse_point_init();
4245 }
4246
4247 vfs_t *
4248 vfs_alloc(int kmflag)
4249 {
4250         vfs_t *vfsp;
4251
4252         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4253
4254         /*
4255          * Do the simplest initialization here.
4256          * Everything else gets done in vfs_init()
4257          */
4258         bzero(vfsp, sizeof (vfs_t));
4259         return (vfsp);
4260 }
4261
4262 void
4263 vfs_free(vfs_t *vfsp)
4264 {
4265         /*
4266          * One would be tempted to assert that "vfsp->vfs_count == 0".
4267          * The problem is that this gets called out of domount() with
4268          * a partially initialized vfs and a vfs_count of 1.  This is
4269          * also called from vfs_rele() with a vfs_count of 0.  We can't
4270          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4271          * returned.  This is because VFS_MOUNT() fully initializes the
4272          * vfs structure and its associated data.  VFS_RELE() will call
4273          * VFS_FREEVFS() which may panic the system if the data structures
4274          * aren't fully initialized from a successful VFS_MOUNT()).
4275          */
4276
4277         /* If FEM was in use, make sure everything gets cleaned up */
4278         if (vfsp->vfs_femhead) {
4279                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4280                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4281                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4282                 vfsp->vfs_femhead = NULL;
4283         }
4284
4285         if (vfsp->vfs_implp)
4286                 vfsimpl_teardown(vfsp);
4287         sema_destroy(&vfsp->vfs_reflock);
4288         kmem_cache_free(vfs_cache, vfsp);
4289 }
4290
4291 /*
4292  * Increments the vfs reference count by one atomically.
4293  */
4294 void
4295 vfs_hold(vfs_t *vfsp)
4296 {
4297         atomic_inc_32(&vfsp->vfs_count);
4298         ASSERT(vfsp->vfs_count != 0);
4299 }
4300
4301 /*
4302  * Decrements the vfs reference count by one atomically. When
4303  * vfs reference count becomes zero, it calls the file system
4304  * specific vfs_freevfs() to free up the resources.
4305  */
4306 void
4307 vfs_rele(vfs_t *vfsp)
4308 {
4309         ASSERT(vfsp->vfs_count != 0);
4310         if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) {
4311                 VFS_FREEVFS(vfsp);
4312                 lofi_remove(vfsp);
4313                 if (vfsp->vfs_zone)
4314                         zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
4315                             ZONE_REF_VFS);
4316                 vfs_freemnttab(vfsp);
4317                 vfs_free(vfsp);
4318         }
4319 }
4320
4321 /*
4322  * Generic operations vector support.
4323  *
4324  * This is used to build operations vectors for both the vfs and vnode.
4325  * It's normally called only when a file system is loaded.
4326  *
4327  * There are many possible algorithms for this, including the following:
4328  *
4329  *   (1) scan the list of known operations; for each, see if the file system
4330  *       includes an entry for it, and fill it in as appropriate.
4331  *
4332  *   (2) set up defaults for all known operations.  scan the list of ops
4333  *       supplied by the file system; for each which is both supplied and
4334  *       known, fill it in.
4335  *
4336  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4337  *       in entries as we go.
4338  *
4339  * we choose (1) for simplicity, and because performance isn't critical here.
4340  * note that (2) could be sped up using a precomputed hash table on known ops.
4341  * (3) could be faster than either, but only if the lists were very large or
4342  * supplied in sorted order.
4343  *
4344  */
4345
4346 int
4347 fs_build_vector(void *vector, int *unused_ops,
4348     const fs_operation_trans_def_t *translation,
4349     const fs_operation_def_t *operations)
4350 {
4351         int i, num_trans, num_ops, used;
4352
4353         /*
4354          * Count the number of translations and the number of supplied
4355          * operations.
4356          */
4357
4358         {
4359                 const fs_operation_trans_def_t *p;
4360
4361                 for (num_trans = 0, p = translation;
4362                     p->name != NULL;
4363                     num_trans++, p++)
4364                         ;
4365         }
4366
4367         {
4368                 const fs_operation_def_t *p;
4369
4370                 for (num_ops = 0, p = operations;
4371                     p->name != NULL;
4372                     num_ops++, p++)
4373                         ;
4374         }
4375
4376         /* Walk through each operation known to our caller.  There will be */
4377         /* one entry in the supplied "translation table" for each. */
4378
4379         used = 0;
4380
4381         for (i = 0; i < num_trans; i++) {
4382                 int j, found;
4383                 char *curname;
4384                 fs_generic_func_p result;
4385                 fs_generic_func_p *location;
4386
4387                 curname = translation[i].name;
4388
4389                 /* Look for a matching operation in the list supplied by the */
4390                 /* file system. */
4391
4392                 found = 0;
4393
4394                 for (j = 0; j < num_ops; j++) {
4395                         if (strcmp(operations[j].name, curname) == 0) {
4396                                 used++;
4397                                 found = 1;
4398                                 break;
4399                         }
4400                 }
4401
4402                 /*
4403                  * If the file system is using a "placeholder" for default
4404                  * or error functions, grab the appropriate function out of
4405                  * the translation table.  If the file system didn't supply
4406                  * this operation at all, use the default function.
4407                  */
4408
4409                 if (found) {
4410                         result = operations[j].func.fs_generic;
4411                         if (result == fs_default) {
4412                                 result = translation[i].defaultFunc;
4413                         } else if (result == fs_error) {
4414                                 result = translation[i].errorFunc;
4415                         } else if (result == NULL) {
4416                                 /* Null values are PROHIBITED */
4417                                 return (EINVAL);
4418                         }
4419                 } else {
4420                         result = translation[i].defaultFunc;
4421                 }
4422
4423                 /* Now store the function into the operations vector. */
4424
4425                 location = (fs_generic_func_p *)
4426                     (((char *)vector) + translation[i].offset);
4427
4428                 *location = result;
4429         }
4430
4431         *unused_ops = num_ops - used;
4432
4433         return (0);
4434 }
4435
4436 /* Placeholder functions, should never be called. */
4437
4438 int
4439 fs_error(void)
4440 {
4441         cmn_err(CE_PANIC, "fs_error called");
4442         return (0);
4443 }
4444
4445 int
4446 fs_default(void)
4447 {
4448         cmn_err(CE_PANIC, "fs_default called");
4449         return (0);
4450 }
4451
4452 #ifdef __sparc
4453
4454 /*
4455  * Part of the implementation of booting off a mirrored root
4456  * involves a change of dev_t for the root device.  To
4457  * accomplish this, first remove the existing hash table
4458  * entry for the root device, convert to the new dev_t,
4459  * then re-insert in the hash table at the head of the list.
4460  */
4461 void
4462 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4463 {
4464         vfs_list_lock();
4465
4466         vfs_hash_remove(vfsp);
4467
4468         vfsp->vfs_dev = ndev;
4469         vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4470
4471         vfs_hash_add(vfsp, 1);
4472
4473         vfs_list_unlock();
4474 }
4475
4476 #else /* x86 NEWBOOT */
4477
4478 #if defined(__x86)
4479 extern int hvmboot_rootconf();
4480 #endif /* __x86 */
4481
4482 extern ib_boot_prop_t *iscsiboot_prop;
4483
4484 int
4485 rootconf()
4486 {
4487         int error;
4488         struct vfssw *vsw;
4489         extern void pm_init();
4490         char *fstyp, *fsmod;
4491         int ret = -1;
4492
4493         getrootfs(&fstyp, &fsmod);
4494
4495 #if defined(__x86)
4496         /*
4497          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4498          * which lives in /platform/i86hvm, and hence is only available when
4499          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4500          * is not available then the modstub for this function will return 0.
4501          * If the hvm_bootstrap misc module is available it will be loaded
4502          * and hvmboot_rootconf() will be invoked.
4503          */
4504         if (error = hvmboot_rootconf())
4505                 return (error);
4506 #endif /* __x86 */
4507
4508         if (error = clboot_rootconf())
4509                 return (error);
4510
4511         if (modload("fs", fsmod) == -1)
4512                 panic("Cannot _init %s module", fsmod);
4513
4514         RLOCK_VFSSW();
4515         vsw = vfs_getvfsswbyname(fstyp);
4516         RUNLOCK_VFSSW();
4517         if (vsw == NULL) {
4518                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4519                 return (ENXIO);
4520         }
4521         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4522         VFS_HOLD(rootvfs);
4523
4524         /* always mount readonly first */
4525         rootvfs->vfs_flag |= VFS_RDONLY;
4526
4527         pm_init();
4528
4529         if (netboot && iscsiboot_prop) {
4530                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4531                     " shouldn't happen in the same time");
4532                 return (EINVAL);
4533         }
4534
4535         if (netboot || iscsiboot_prop) {
4536                 ret = strplumb();
4537                 if (ret != 0) {
4538                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4539                         return (EFAULT);
4540                 }
4541         }
4542
4543         if ((ret == 0) && iscsiboot_prop) {
4544                 ret = modload("drv", "iscsi");
4545                 /* -1 indicates fail */
4546                 if (ret == -1) {
4547                         cmn_err(CE_WARN, "Failed to load iscsi module");
4548                         iscsi_boot_prop_free();
4549                         return (EINVAL);
4550                 } else {
4551                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4552                                 cmn_err(CE_WARN,
4553                                     "Failed to attach iscsi driver");
4554                                 iscsi_boot_prop_free();
4555                                 return (ENODEV);
4556                         }
4557                 }
4558         }
4559
4560         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4561         vfs_unrefvfssw(vsw);
4562         rootdev = rootvfs->vfs_dev;
4563
4564         if (error)
4565                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4566                     rootfs.bo_name, fstyp);
4567         else
4568                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4569                     rootfs.bo_name, fstyp);
4570         return (error);
4571 }
4572
4573 /*
4574  * XXX this is called by nfs only and should probably be removed
4575  * If booted with ASKNAME, prompt on the console for a filesystem
4576  * name and return it.
4577  */
4578 void
4579 getfsname(char *askfor, char *name, size_t namelen)
4580 {
4581         if (boothowto & RB_ASKNAME) {
4582                 printf("%s name: ", askfor);
4583                 console_gets(name, namelen);
4584         }
4585 }
4586
4587 /*
4588  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4589  * property.
4590  *
4591  * Filesystem types starting with the prefix "nfs" are diskless clients;
4592  * init the root filename name (rootfs.bo_name), too.
4593  *
4594  * If we are booting via NFS we currently have these options:
4595  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4596  *      nfs2 -  force NFS V2
4597  *      nfs3 -  force NFS V3
4598  *      nfs4 -  force NFS V4
4599  * Because we need to maintain backward compatibility with the naming
4600  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4601  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4602  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4603  * This is only for root filesystems, all other uses will expect
4604  * that "nfs" == NFS V2.
4605  */
4606 static void
4607 getrootfs(char **fstypp, char **fsmodp)
4608 {
4609         char *propstr = NULL;
4610
4611         /*
4612          * Check fstype property; for diskless it should be one of "nfs",
4613          * "nfs2", "nfs3" or "nfs4".
4614          */
4615         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4616             DDI_PROP_DONTPASS, "fstype", &propstr)
4617             == DDI_SUCCESS) {
4618                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4619                 ddi_prop_free(propstr);
4620
4621         /*
4622          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4623          * assume the type of this root filesystem is 'zfs'.
4624          */
4625         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4626             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4627             == DDI_SUCCESS) {
4628                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4629                 ddi_prop_free(propstr);
4630         }
4631
4632         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4633                 *fstypp = *fsmodp = rootfs.bo_fstype;
4634                 return;
4635         }
4636
4637         ++netboot;
4638
4639         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4640                 (void) strcpy(rootfs.bo_fstype, "nfs");
4641         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4642                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4643
4644         /*
4645          * check if path to network interface is specified in bootpath
4646          * or by a hypervisor domain configuration file.
4647          * XXPV - enable strlumb_get_netdev_path()
4648          */
4649         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4650             "xpv-nfsroot")) {
4651                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4652         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4653             DDI_PROP_DONTPASS, "bootpath", &propstr)
4654             == DDI_SUCCESS) {
4655                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4656                 ddi_prop_free(propstr);
4657         } else {
4658                 rootfs.bo_name[0] = '\0';
4659         }
4660         *fstypp = rootfs.bo_fstype;
4661         *fsmodp = "nfs";
4662 }
4663 #endif
4664
4665 /*
4666  * VFS feature routines
4667  */
4668
/*
 * A feature value is split in two by these macros: the upper 32 bits
 * select an element of vfs_featureset[], and the lower 32 bits are the
 * bit mask applied to that element.
 */
#define	VFTINDEX(feature)	(((feature) >> 32) & 0xFFFFFFFF)
#define	VFTBITS(feature)	((feature) & 0xFFFFFFFFLL)
4671
4672 /* Register a feature in the vfs */
4673 void
4674 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4675 {
4676         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4677         if (vfsp->vfs_implp == NULL)
4678                 return;
4679
4680         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4681 }
4682
4683 void
4684 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature)
4685 {
4686         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4687         if (vfsp->vfs_implp == NULL)
4688                 return;
4689         vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature);
4690 }
4691
4692 /*
4693  * Query a vfs for a feature.
4694  * Returns 1 if feature is present, 0 if not
4695  */
4696 int
4697 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4698 {
4699         int     ret = 0;
4700
4701         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4702         if (vfsp->vfs_implp == NULL)
4703                 return (ret);
4704
4705         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4706                 ret = 1;
4707
4708         return (ret);
4709 }
4710
4711 /*
4712  * Propagate feature set from one vfs to another
4713  */
4714 void
4715 vfs_propagate_features(vfs_t *from, vfs_t *to)
4716 {
4717         int i;
4718
4719         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4720                 return;
4721
4722         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4723                 to->vfs_featureset[i] = from->vfs_featureset[i];
4724         }
4725 }
4726
4727 #define LOFINODE_PATH "/dev/lofi/%d"
4728
4729 /*
4730  * Return the vnode for the lofi node if there's a lofi mount in place.
4731  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4732  * failure.
4733  */
4734 int
4735 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4736 {
4737         char *path = NULL;
4738         int strsize;
4739         int err;
4740
4741         if (vfsp->vfs_lofi_id == 0) {
4742                 *vpp = NULL;
4743                 return (-1);
4744         }
4745
4746         strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id);
4747         path = kmem_alloc(strsize + 1, KM_SLEEP);
4748         (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id);
4749
4750         /*
4751          * We may be inside a zone, so we need to use the /dev path, but
4752          * it's created asynchronously, so we wait here.
4753          */
4754         for (;;) {
4755                 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4756
4757                 if (err != ENOENT)
4758                         break;
4759
4760                 if ((err = delay_sig(hz / 8)) == EINTR)
4761                         break;
4762         }
4763
4764         if (err)
4765                 *vpp = NULL;
4766
4767         kmem_free(path, strsize + 1);
4768         return (err);
4769 }