usr/src/uts/common/fs/vfs.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/errno.h>
  43 #include <sys/user.h>
  44 #include <sys/fstyp.h>
  45 #include <sys/kmem.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/mount.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/fem.h>
  52 #include <sys/mntent.h>
  53 #include <sys/stat.h>
  54 #include <sys/statvfs.h>
  55 #include <sys/statfs.h>
  56 #include <sys/cred.h>
  57 #include <sys/vnode.h>
  58 #include <sys/rwstlock.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/file.h>
  61 #include <sys/time.h>
  62 #include <sys/atomic.h>
  63 #include <sys/cmn_err.h>
  64 #include <sys/buf.h>
  65 #include <sys/swap.h>
  66 #include <sys/debug.h>
  67 #include <sys/vnode.h>
  68 #include <sys/modctl.h>
  69 #include <sys/ddi.h>
  70 #include <sys/pathname.h>
  71 #include <sys/bootconf.h>
  72 #include <sys/dumphdr.h>
  73 #include <sys/dc_ki.h>
  74 #include <sys/poll.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/zone.h>
  78 #include <sys/policy.h>
  79 #include <sys/ctfs.h>
  80 #include <sys/objfs.h>
  81 #include <sys/console.h>
  82 #include <sys/reboot.h>
  83 #include <sys/attr.h>
  84 #include <sys/zio.h>
  85 #include <sys/spa.h>
  86 #include <sys/lofi.h>
  87 #include <sys/bootprops.h>
  88
  89 #include <vm/page.h>
  90
  91 #include <fs/fs_subr.h>
  92 /* Private interfaces to create vopstats-related data structures */
  93 extern void             initialize_vopstats(vopstats_t *);
  94 extern vopstats_t       *get_fstype_vopstats(struct vfs *, struct vfssw *);
  95 extern vsk_anchor_t     *get_vskstat_anchor(struct vfs *);
  96
  97 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
  98 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
  99     const char *, int, int);
 100 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
 101 static void vfs_freemnttab(struct vfs *);
 102 static void vfs_freeopt(mntopt_t *);
 103 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
 104 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
 105 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
 106 static void vfs_createopttbl_extend(mntopts_t *, const char *,
 107     const mntopts_t *);
 108 static char **vfs_copycancelopt_extend(char **const, int);
 109 static void vfs_freecancelopt(char **);
 110 static void getrootfs(char **, char **);
 111 static int getmacpath(dev_info_t *, void *);
 112 static void vfs_mnttabvp_setup(void);
 113
 114 struct ipmnt {
 115         struct ipmnt    *mip_next;
 116         dev_t           mip_dev;
 117         struct vfs      *mip_vfsp;
 118 };
 119
 120 static kmutex_t         vfs_miplist_mutex;
 121 static struct ipmnt     *vfs_miplist = NULL;
 122 static struct ipmnt     *vfs_miplist_end = NULL;
 123
 124 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */
 125
 126 /*
 127  * VFS global data.
 128  */
 129 vnode_t *rootdir;               /* pointer to root inode vnode. */
 130 vnode_t *devicesdir;            /* pointer to inode of devices root */
 131 vnode_t *devdir;                /* pointer to inode of dev root */
 132
 133 char *server_rootpath;          /* root path for diskless clients */
 134 char *server_hostname;          /* hostname of diskless server */
 135
 136 static struct vfs root;
 137 static struct vfs devices;
 138 static struct vfs dev;
 139 struct vfs *rootvfs = &root;    /* pointer to root vfs; head of VFS list. */
 140 rvfs_t *rvfs_list;              /* array of vfs ptrs for vfs hash list */
 141 int vfshsz = 512;               /* # of heads/locks in vfs hash arrays */
 142                                 /* must be power of 2!  */
 143 timespec_t vfs_mnttab_ctime;    /* mnttab created time */
 144 timespec_t vfs_mnttab_mtime;    /* mnttab last modified time */
 145 char *vfs_dummyfstype = "\0";
 146 struct pollhead vfs_pollhd;     /* for mnttab pollers */
 147 struct vnode *vfs_mntdummyvp;   /* to fake mnttab read/write for file events */
 148 int     mntfstype;              /* will be set once mnt fs is mounted */
 149
 150 /*
 151  * Table for generic options recognized in the VFS layer and acted
 152  * on at this level before parsing file system specific options.
 153  * The nosuid option is stronger than any of the devices and setuid
 154  * options, so those are canceled when nosuid is seen.
 155  *
 156  * All options which are added here need to be added to the
 157  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
 158  */
 159 /*
 160  * VFS Mount options table
 161  */
 162 static char *ro_cancel[] = { MNTOPT_RW, NULL };
 163 static char *rw_cancel[] = { MNTOPT_RO, NULL };
 164 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
 165 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
 166     MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
 167 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
 168 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
 169 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
 170 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
 171 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
 172 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
 173 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
 174 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
 175
 176 static const mntopt_t mntopts[] = {
 177 /*
 178  *      option name             cancel options          default arg     flags
 179  */
 180         { MNTOPT_REMOUNT,       NULL,                   NULL,
 181                 MO_NODISPLAY, (void *)0 },
 182         { MNTOPT_RO,            ro_cancel,              NULL,           0,
 183                 (void *)0 },
 184         { MNTOPT_RW,            rw_cancel,              NULL,           0,
 185                 (void *)0 },
 186         { MNTOPT_SUID,          suid_cancel,            NULL,           0,
 187                 (void *)0 },
 188         { MNTOPT_NOSUID,        nosuid_cancel,          NULL,           0,
 189                 (void *)0 },
 190         { MNTOPT_DEVICES,       devices_cancel,         NULL,           0,
 191                 (void *)0 },
 192         { MNTOPT_NODEVICES,     nodevices_cancel,       NULL,           0,
 193                 (void *)0 },
 194         { MNTOPT_SETUID,        setuid_cancel,          NULL,           0,
 195                 (void *)0 },
 196         { MNTOPT_NOSETUID,      nosetuid_cancel,        NULL,           0,
 197                 (void *)0 },
 198         { MNTOPT_NBMAND,        nbmand_cancel,          NULL,           0,
 199                 (void *)0 },
 200         { MNTOPT_NONBMAND,      nonbmand_cancel,        NULL,           0,
 201                 (void *)0 },
 202         { MNTOPT_EXEC,          exec_cancel,            NULL,           0,
 203                 (void *)0 },
 204         { MNTOPT_NOEXEC,        noexec_cancel,          NULL,           0,
 205                 (void *)0 },
 206 };
 207
 208 const mntopts_t vfs_mntopts = {
 209         sizeof (mntopts) / sizeof (mntopt_t),
 210         (mntopt_t *)&mntopts[0]
 211 };
 212
 213 /*
 214  * File system operation dispatch functions.
 215  */
 216
 217 int
 218 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 219 {
 220         return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 221 }
 222
 223 int
 224 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 225 {
 226         return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
 227 }
 228
 229 int
 230 fsop_root(vfs_t *vfsp, vnode_t **vpp)
 231 {
 232         refstr_t *mntpt;
 233         int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
 234         /*
 235          * Make sure this root has a path.  With lofs, it is possible to have
 236          * a NULL mountpoint.
 237          */
 238         if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
 239                 mntpt = vfs_getmntpoint(vfsp);
 240                 vn_setpath_str(*vpp, refstr_value(mntpt),
 241                     strlen(refstr_value(mntpt)));
 242                 refstr_rele(mntpt);
 243         }
 244
 245         return (ret);
 246 }
 247
 248 int
 249 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
 250 {
 251         return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
 252 }
 253
 254 int
 255 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
 256 {
 257         return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
 258 }
 259
 260 int
 261 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 262 {
 263         /*
 264          * In order to handle system attribute fids in a manner
 265          * transparent to the underlying fs, we embed the fid for
 266          * the sysattr parent object in the sysattr fid and tack on
 267          * some extra bytes that only the sysattr layer knows about.
 268          *
 269          * This guarantees that sysattr fids are larger than other fids
 270          * for this vfs. If the vfs supports the sysattr view interface
 271          * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
 272          * collision with XATTR_FIDSZ.
 273          */
 274         if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
 275             fidp->fid_len == XATTR_FIDSZ)
 276                 return (xattr_dir_vget(vfsp, vpp, fidp));
 277
 278         return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
 279 }
 280
 281 int
 282 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
 283 {
 284         return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
 285 }
 286
 287 void
 288 fsop_freefs(vfs_t *vfsp)
 289 {
 290         (*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
 291 }
 292
 293 int
 294 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
 295 {
 296         return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
 297 }
 298
 299 int
 300 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
 301 {
 302         ASSERT((fstype >= 0) && (fstype < nfstype));
 303
 304         if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
 305                 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
 306         else
 307                 return (ENOTSUP);
 308 }
 309
 310 /*
 311  * File system initialization.  vfs_setfsops() must be called from a file
 312  * system's init routine.
 313  */
 314
 315 static int
 316 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
 317     int *unused_ops)
 318 {
 319         static const fs_operation_trans_def_t vfs_ops_table[] = {
 320                 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
 321                         fs_nosys, fs_nosys,
 322
 323                 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
 324                         fs_nosys, fs_nosys,
 325
 326                 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
 327                         fs_nosys, fs_nosys,
 328
 329                 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
 330                         fs_nosys, fs_nosys,
 331
 332                 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
 333                         (fs_generic_func_p) fs_sync,
 334                         (fs_generic_func_p) fs_sync,    /* No errors allowed */
 335
 336                 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
 337                         fs_nosys, fs_nosys,
 338
 339                 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
 340                         fs_nosys, fs_nosys,
 341
 342                 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
 343                         (fs_generic_func_p)fs_freevfs,
 344                         (fs_generic_func_p)fs_freevfs,  /* Shouldn't fail */
 345
 346                 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
 347                         (fs_generic_func_p)fs_nosys,
 348                         (fs_generic_func_p)fs_nosys,
 349
 350                 NULL, 0, NULL, NULL
 351         };
 352
 353         return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
 354 }
 355
 356 void
 357 zfs_boot_init() {
 358
 359         if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
 360                 spa_boot_init();
 361 }
 362
 363 int
 364 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
 365 {
 366         int error;
 367         int unused_ops;
 368
 369         /*
 370          * Verify that fstype refers to a valid fs.  Note that
 371          * 0 is valid since it's used to set "stray" ops.
 372          */
 373         if ((fstype < 0) || (fstype >= nfstype))
 374                 return (EINVAL);
 375
 376         if (!ALLOCATED_VFSSW(&vfssw[fstype]))
 377                 return (EINVAL);
 378
 379         /* Set up the operations vector. */
 380
 381         error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
 382
 383         if (error != 0)
 384                 return (error);
 385
 386         vfssw[fstype].vsw_flag |= VSW_INSTALLED;
 387
 388         if (actual != NULL)
 389                 *actual = &vfssw[fstype].vsw_vfsops;
 390
 391 #if DEBUG
 392         if (unused_ops != 0)
 393                 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
 394                     "but not used", vfssw[fstype].vsw_name, unused_ops);
 395 #endif
 396
 397         return (0);
 398 }
 399
 400 int
 401 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
 402 {
 403         int error;
 404         int unused_ops;
 405
 406         *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
 407
 408         error = fs_copyfsops(template, *actual, &unused_ops);
 409         if (error != 0) {
 410                 kmem_free(*actual, sizeof (vfsops_t));
 411                 *actual = NULL;
 412                 return (error);
 413         }
 414
 415         return (0);
 416 }
 417
 418 /*
 419  * Free a vfsops structure created as a result of vfs_makefsops().
 420  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
 421  * vfs_freevfsops_by_type().
 422  */
 423 void
 424 vfs_freevfsops(vfsops_t *vfsops)
 425 {
 426         kmem_free(vfsops, sizeof (vfsops_t));
 427 }
 428
 429 /*
 430  * Since the vfsops structure is part of the vfssw table and wasn't
 431  * really allocated, we're not really freeing anything.  We keep
 432  * the name for consistency with vfs_freevfsops().  We do, however,
 433  * need to take care of a little bookkeeping.
  434  * NOTE: For a vfsops structure created by vfs_makefsops(), use
  435  * vfs_freevfsops().
 436  */
 437 int
 438 vfs_freevfsops_by_type(int fstype)
 439 {
 440
 441         /* Verify that fstype refers to a loaded fs (and not fsid 0). */
 442         if ((fstype <= 0) || (fstype >= nfstype))
 443                 return (EINVAL);
 444
 445         WLOCK_VFSSW();
 446         if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
 447                 WUNLOCK_VFSSW();
 448                 return (EINVAL);
 449         }
 450
 451         vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
 452         WUNLOCK_VFSSW();
 453
 454         return (0);
 455 }
 456
 457 /* Support routines used to reference vfs_op */
 458
 459 /* Set the operations vector for a vfs */
 460 void
 461 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
 462 {
 463         vfsops_t        *op;
 464
 465         ASSERT(vfsp != NULL);
 466         ASSERT(vfsops != NULL);
 467
 468         op = vfsp->vfs_op;
 469         membar_consumer();
 470         if (vfsp->vfs_femhead == NULL &&
 471             casptr(&vfsp->vfs_op, op, vfsops) == op) {
 472                 return;
 473         }
 474         fsem_setvfsops(vfsp, vfsops);
 475 }
 476
 477 /* Retrieve the operations vector for a vfs */
 478 vfsops_t *
 479 vfs_getops(vfs_t *vfsp)
 480 {
 481         vfsops_t        *op;
 482
 483         ASSERT(vfsp != NULL);
 484
 485         op = vfsp->vfs_op;
 486         membar_consumer();
 487         if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
 488                 return (op);
 489         } else {
 490                 return (fsem_getvfsops(vfsp));
 491         }
 492 }
 493
 494 /*
 495  * Returns non-zero (1) if the vfsops matches that of the vfs.
 496  * Returns zero (0) if not.
 497  */
 498 int
 499 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
 500 {
 501         return (vfs_getops(vfsp) == vfsops);
 502 }
 503
 504 /*
 505  * Returns non-zero (1) if the file system has installed a non-default,
 506  * non-error vfs_sync routine.  Returns zero (0) otherwise.
 507  */
 508 int
 509 vfs_can_sync(vfs_t *vfsp)
 510 {
 511         /* vfs_sync() routine is not the default/error function */
 512         return (vfs_getops(vfsp)->vfs_sync != fs_sync);
 513 }
 514
 515 /*
 516  * Initialize a vfs structure.
 517  */
 518 void
 519 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
 520 {
 521         /* Other initialization has been moved to vfs_alloc() */
 522         vfsp->vfs_count = 0;
 523         vfsp->vfs_next = vfsp;
 524         vfsp->vfs_prev = vfsp;
 525         vfsp->vfs_zone_next = vfsp;
 526         vfsp->vfs_zone_prev = vfsp;
 527         vfsp->vfs_lofi_minor = 0;
 528         sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
 529         vfsimpl_setup(vfsp);
 530         vfsp->vfs_data = (data);
 531         vfs_setops((vfsp), (op));
 532 }
 533
 534 /*
 535  * Allocate and initialize the vfs implementation private data
 536  * structure, vfs_impl_t.
 537  */
 538 void
 539 vfsimpl_setup(vfs_t *vfsp)
 540 {
 541         int i;
 542
 543         if (vfsp->vfs_implp != NULL) {
 544                 return;
 545         }
 546
 547         vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
 548         /* Note that these are #define'd in vfs.h */
 549         vfsp->vfs_vskap = NULL;
 550         vfsp->vfs_fstypevsp = NULL;
 551
 552         /* Set size of counted array, then zero the array */
 553         vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
 554         for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
 555                 vfsp->vfs_featureset[i] = 0;
 556         }
 557 }
 558
 559 /*
 560  * Release the vfs_impl_t structure, if it exists. Some unbundled
 561  * filesystems may not use the newer version of vfs and thus
 562  * would not contain this implementation private data structure.
 563  */
 564 void
 565 vfsimpl_teardown(vfs_t *vfsp)
 566 {
 567         vfs_impl_t      *vip = vfsp->vfs_implp;
 568
 569         if (vip == NULL)
 570                 return;
 571
 572         kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
 573         vfsp->vfs_implp = NULL;
 574 }
 575
 576 /*
 577  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
 578  * fstatvfs, and sysfs moved to common/syscall.
 579  */
 580
 581 /*
 582  * Update every mounted file system.  We call the vfs_sync operation of
 583  * each file system type, passing it a NULL vfsp to indicate that all
 584  * mounted file systems of that type should be updated.
 585  */
 586 void
 587 vfs_sync(int flag)
 588 {
 589         struct vfssw *vswp;
 590         RLOCK_VFSSW();
 591         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 592                 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 593                         vfs_refvfssw(vswp);
 594                         RUNLOCK_VFSSW();
 595                         (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
 596                             CRED());
 597                         vfs_unrefvfssw(vswp);
 598                         RLOCK_VFSSW();
 599                 }
 600         }
 601         RUNLOCK_VFSSW();
 602 }
 603
 604 void
 605 sync(void)
 606 {
 607         vfs_sync(0);
 608 }
 609
 610 /*
 611  * External routines.
 612  */
 613
 614 krwlock_t vfssw_lock;   /* lock accesses to vfssw */
 615
 616 /*
 617  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 618  * but otherwise should be accessed only via vfs_list_lock() and
 619  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 620  */
 621 static krwlock_t vfslist;
 622
 623 /*
 624  * Mount devfs on /devices. This is done right after root is mounted
 625  * to provide device access support for the system
 626  */
 627 static void
 628 vfs_mountdevices(void)
 629 {
 630         struct vfssw *vsw;
 631         struct vnode *mvp;
 632         struct mounta mounta = {        /* fake mounta for devfs_mount() */
 633                 NULL,
 634                 NULL,
 635                 MS_SYSSPACE,
 636                 NULL,
 637                 NULL,
 638                 0,
 639                 NULL,
 640                 0
 641         };
 642
 643         /*
 644          * _init devfs module to fill in the vfssw
 645          */
 646         if (modload("fs", "devfs") == -1)
 647                 panic("Cannot _init devfs module");
 648
 649         /*
 650          * Hold vfs
 651          */
 652         RLOCK_VFSSW();
 653         vsw = vfs_getvfsswbyname("devfs");
 654         VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
 655         VFS_HOLD(&devices);
 656
 657         /*
 658          * Locate mount point
 659          */
 660         if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 661                 panic("Cannot find /devices");
 662
 663         /*
 664          * Perform the mount of /devices
 665          */
 666         if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
 667                 panic("Cannot mount /devices");
 668
 669         RUNLOCK_VFSSW();
 670
 671         /*
 672          * Set appropriate members and add to vfs list for mnttab display
 673          */
 674         vfs_setresource(&devices, "/devices");
 675         vfs_setmntpoint(&devices, "/devices");
 676
 677         /*
 678          * Hold the root of /devices so it won't go away
 679          */
 680         if (VFS_ROOT(&devices, &devicesdir))
 681                 panic("vfs_mountdevices: not devices root");
 682
 683         if (vfs_lock(&devices) != 0) {
 684                 VN_RELE(devicesdir);
 685                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
 686                 return;
 687         }
 688
 689         if (vn_vfswlock(mvp) != 0) {
 690                 vfs_unlock(&devices);
 691                 VN_RELE(devicesdir);
 692                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
 693                 return;
 694         }
 695
 696         vfs_add(mvp, &devices, 0);
 697         vn_vfsunlock(mvp);
 698         vfs_unlock(&devices);
 699         VN_RELE(devicesdir);
 700 }
 701
 702 /*
 703  * mount the first instance of /dev  to root and remain mounted
 704  */
 705 static void
 706 vfs_mountdev1(void)
 707 {
 708         struct vfssw *vsw;
 709         struct vnode *mvp;
 710         struct mounta mounta = {        /* fake mounta for sdev_mount() */
 711                 NULL,
 712                 NULL,
 713                 MS_SYSSPACE | MS_OVERLAY,
 714                 NULL,
 715                 NULL,
 716                 0,
 717                 NULL,
 718                 0
 719         };
 720
 721         /*
 722          * _init dev module to fill in the vfssw
 723          */
 724         if (modload("fs", "dev") == -1)
 725                 cmn_err(CE_PANIC, "Cannot _init dev module\n");
 726
 727         /*
 728          * Hold vfs
 729          */
 730         RLOCK_VFSSW();
 731         vsw = vfs_getvfsswbyname("dev");
 732         VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
 733         VFS_HOLD(&dev);
 734
 735         /*
 736          * Locate mount point
 737          */
 738         if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
 739                 cmn_err(CE_PANIC, "Cannot find /dev\n");
 740
 741         /*
 742          * Perform the mount of /dev
 743          */
 744         if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
 745                 cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
 746
 747         RUNLOCK_VFSSW();
 748
 749         /*
 750          * Set appropriate members and add to vfs list for mnttab display
 751          */
 752         vfs_setresource(&dev, "/dev");
 753         vfs_setmntpoint(&dev, "/dev");
 754
 755         /*
 756          * Hold the root of /dev so it won't go away
 757          */
 758         if (VFS_ROOT(&dev, &devdir))
 759                 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
 760
 761         if (vfs_lock(&dev) != 0) {
 762                 VN_RELE(devdir);
 763                 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
 764                 return;
 765         }
 766
 767         if (vn_vfswlock(mvp) != 0) {
 768                 vfs_unlock(&dev);
 769                 VN_RELE(devdir);
 770                 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
 771                 return;
 772         }
 773
 774         vfs_add(mvp, &dev, 0);
 775         vn_vfsunlock(mvp);
 776         vfs_unlock(&dev);
 777         VN_RELE(devdir);
 778 }
 779
 780 /*
 781  * Mount required filesystem. This is done right after root is mounted.
 782  */
 783 static void
 784 vfs_mountfs(char *module, char *spec, char *path)
 785 {
 786         struct vnode *mvp;
 787         struct mounta mounta;
 788         vfs_t *vfsp;
 789
 790         mounta.flags = MS_SYSSPACE | MS_DATA;
 791         mounta.fstype = module;
 792         mounta.spec = spec;
 793         mounta.dir = path;
 794         if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
 795                 cmn_err(CE_WARN, "Cannot find %s", path);
 796                 return;
 797         }
 798         if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
 799                 cmn_err(CE_WARN, "Cannot mount %s", path);
 800         else
 801                 VFS_RELE(vfsp);
 802         VN_RELE(mvp);
 803 }
 804
 805 /*
 806  * vfs_mountroot is called by main() to mount the root filesystem.
 807  */
 808 void
 809 vfs_mountroot(void)
 810 {
 811         struct vnode    *rvp = NULL;
 812         char            *path;
 813         size_t          plen;
 814         struct vfssw    *vswp;
 815         proc_t          *p;
 816
 817         rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 818         rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
 819
 820         /*
 821          * Alloc the vfs hash bucket array and locks
 822          */
 823         rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
 824
 825         /*
 826          * Call machine-dependent routine "rootconf" to choose a root
 827          * file system type.
 828          */
 829         if (rootconf())
 830                 panic("vfs_mountroot: cannot mount root");
 831         /*
 832          * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
 833          * to point to it.  These are used by lookuppn() so that it
 834          * knows where to start from ('/' or '.').
 835          */
 836         vfs_setmntpoint(rootvfs, "/");
 837         if (VFS_ROOT(rootvfs, &rootdir))
 838                 panic("vfs_mountroot: no root vnode");
 839
 840         /*
 841          * At this point, the process tree consists of p0 and possibly some
 842          * direct children of p0.  (i.e. there are no grandchildren)
 843          *
 844          * Walk through them all, setting their current directory.
 845          */
 846         mutex_enter(&pidlock);
 847         for (p = practive; p != NULL; p = p->p_next) {
 848                 ASSERT(p == &p0 || p->p_parent == &p0);
 849
 850                 PTOU(p)->u_cdir = rootdir;
 851                 VN_HOLD(PTOU(p)->u_cdir);
 852                 PTOU(p)->u_rdir = NULL;
 853         }
 854         mutex_exit(&pidlock);
 855
 856         /*
 857          * Setup the global zone's rootvp, now that it exists.
 858          */
 859         global_zone->zone_rootvp = rootdir;
 860         VN_HOLD(global_zone->zone_rootvp);
 861
 862         /*
 863          * Notify the module code that it can begin using the
 864          * root filesystem instead of the boot program's services.
 865          */
 866         modrootloaded = 1;
 867
 868         /*
 869          * Special handling for a ZFS root file system.
 870          */
 871         zfs_boot_init();
 872
 873         /*
 874          * Set up mnttab information for root
 875          */
 876         vfs_setresource(rootvfs, rootfs.bo_name);
 877
 878         /*
 879          * Notify cluster software that the root filesystem is available.
 880          */
 881         clboot_mountroot();
 882
 883         /* Now that we're all done with the root FS, set up its vopstats */
 884         if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
 885                 /* Set flag for statistics collection */
 886                 if (vswp->vsw_flag & VSW_STATS) {
 887                         initialize_vopstats(&rootvfs->vfs_vopstats);
 888                         rootvfs->vfs_flag |= VFS_STATS;
 889                         rootvfs->vfs_fstypevsp =
 890                             get_fstype_vopstats(rootvfs, vswp);
 891                         rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
 892                 }
 893                 vfs_unrefvfssw(vswp);
 894         }
 895
 896         /*
 897          * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
 898          * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
 899          */
 900         vfs_mountdevices();
 901         vfs_mountdev1();
 902
 903         vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
 904         vfs_mountfs("proc", "/proc", "/proc");
 905         vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
 906         vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
 907         vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
 908
 909         if (getzoneid() == GLOBAL_ZONEID) {
 910                 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
 911         }
 912
 913 #ifdef __sparc
 914         /*
 915          * This bit of magic can go away when we convert sparc to
 916          * the new boot architecture based on ramdisk.
 917          *
 918          * Booting off a mirrored root volume:
 919          * At this point, we have booted and mounted root on a
 920          * single component of the mirror.  Complete the boot
 921          * by configuring SVM and converting the root to the
 922          * dev_t of the mirrored root device.  This dev_t conversion
 923          * only works because the underlying device doesn't change.
 924          */
 925         if (root_is_svm) {
 926                 if (svm_rootconf()) {
 927                         panic("vfs_mountroot: cannot remount root");
 928                 }
 929
 930                 /*
 931                  * mnttab should reflect the new root device
 932                  */
 933                 vfs_lock_wait(rootvfs);
 934                 vfs_setresource(rootvfs, rootfs.bo_name);
 935                 vfs_unlock(rootvfs);
 936         }
 937 #endif /* __sparc */
 938
 939         /*
 940          * Look up the root device via devfs so that a dv_node is
 941          * created for it. The vnode is never VN_RELE()ed.
 942          * We allocate more than MAXPATHLEN so that the
 943          * buffer passed to i_ddi_prompath_to_devfspath() is
 944          * exactly MAXPATHLEN (the function expects a buffer
 945          * of that length).
 946          */
 947         plen = strlen("/devices");
 948         path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
 949         (void) strcpy(path, "/devices");
 950
 951         if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
 952             != DDI_SUCCESS ||
 953             lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
 954
 955                 /* NUL terminate in case "path" has garbage */
 956                 path[plen + MAXPATHLEN - 1] = '\0';
 957 #ifdef  DEBUG
 958                 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
 959 #endif
 960         }
 961         kmem_free(path, plen + MAXPATHLEN);
 962         vfs_mnttabvp_setup();
 963 }
 964
 965 /*
 966  * If remount failed and we're in a zone we need to check for the zone
 967  * root path and strip it before the call to vfs_setpath().
 968  *
 969  * If strpath doesn't begin with the zone_rootpath the original
 970  * strpath is returned unchanged.
 971  */
 972 static const char *
 973 stripzonepath(const char *strpath)
 974 {
 975         char *str1, *str2;
 976         int i;
 977         zone_t *zonep = curproc->p_zone;
 978
 979         if (zonep->zone_rootpath == NULL || strpath == NULL) {
 980                 return (NULL);
 981         }
 982
 983         /*
 984          * we check for the end of the string at one past the
 985          * current position because the zone_rootpath always
 986          * ends with "/" but we don't want to strip that off.
 987          */
 988         str1 = zonep->zone_rootpath;
 989         str2 = (char *)strpath;
 990         ASSERT(str1[0] != '\0');
 991         for (i = 0; str1[i + 1] != '\0'; i++) {
 992                 if (str1[i] != str2[i])
 993                         return ((char *)strpath);
 994         }
 995         return (&str2[i]);
 996 }
 997
 998 /*
 999  * Check to see if our "block device" is actually a file.  If so,
1000  * automatically add a lofi device, and keep track of this fact.
1001  */
1002 static int
1003 lofi_add(const char *fsname, struct vfs *vfsp,
1004     mntopts_t *mntopts, struct mounta *uap)
1005 {
1006         int fromspace = (uap->flags & MS_SYSSPACE) ?
1007             UIO_SYSSPACE : UIO_USERSPACE;
1008         struct lofi_ioctl *li = NULL;
1009         struct vnode *vp = NULL;
1010         struct pathname pn = { NULL };
1011         ldi_ident_t ldi_id;
1012         ldi_handle_t ldi_hdl;
1013         vfssw_t *vfssw;
1014         int minor;
1015         int err = 0;
1016
1017         if (fsname == NULL ||
1018             (vfssw = vfs_getvfssw(fsname)) == NULL)
1019                 return (0);
1020
1021         if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
1022                 vfs_unrefvfssw(vfssw);
1023                 return (0);
1024         }
1025
1026         vfs_unrefvfssw(vfssw);
1027         vfssw = NULL;
1028
1029         if (pn_get(uap->spec, fromspace, &pn) != 0)
1030                 return (0);
1031
1032         if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
1033                 goto out;
1034
1035         if (vp->v_type != VREG)
1036                 goto out;
1037
1038         /* OK, this is a lofi mount. */
1039
1040         if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
1041             vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
1042             vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
1043             vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
1044                 err = EINVAL;
1045                 goto out;
1046         }
1047
1048         ldi_id = ldi_ident_from_anon();
1049         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1050         (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);
1051
1052         /*
1053          * The lofi control node is currently exclusive-open.  We'd like
1054          * to improve this, but in the meantime, we'll loop waiting for
1055          * access.
1056          */
1057         for (;;) {
1058                 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
1059                     kcred, &ldi_hdl, ldi_id);
1060
1061                 if (err != EBUSY)
1062                         break;
1063
1064                 if ((err = delay_sig(hz / 8)) == EINTR)
1065                         break;
1066         }
1067
1068         if (err)
1069                 goto out2;
1070
1071         err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
1072             FREAD | FWRITE | FEXCL | FKIOCTL, kcred, &minor);
1073
1074         (void) ldi_close(ldi_hdl, FREAD | FWRITE | FEXCL, kcred);
1075
1076         if (!err)
1077                 vfsp->vfs_lofi_minor = minor;
1078
1079 out2:
1080         ldi_ident_release(ldi_id);
1081 out:
1082         if (li != NULL)
1083                 kmem_free(li, sizeof (*li));
1084         if (vp != NULL)
1085                 VN_RELE(vp);
1086         pn_free(&pn);
1087         return (err);
1088 }
1089
1090 static void
1091 lofi_remove(struct vfs *vfsp)
1092 {
1093         struct lofi_ioctl *li = NULL;
1094         ldi_ident_t ldi_id;
1095         ldi_handle_t ldi_hdl;
1096         int err;
1097
1098         if (vfsp->vfs_lofi_minor == 0)
1099                 return;
1100
1101         ldi_id = ldi_ident_from_anon();
1102
1103         li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1104         li->li_minor = vfsp->vfs_lofi_minor;
1105         li->li_cleanup = B_TRUE;
1106
1107         do {
1108                 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
1109                     kcred, &ldi_hdl, ldi_id);
1110         } while (err == EBUSY);
1111
1112         if (err)
1113                 goto out;
1114
1115         err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1116             FREAD | FWRITE | FEXCL | FKIOCTL, kcred, NULL);
1117
1118         (void) ldi_close(ldi_hdl, FREAD | FWRITE | FEXCL, kcred);
1119
1120         if (!err)
1121                 vfsp->vfs_lofi_minor = 0;
1122
1123 out:
1124         ldi_ident_release(ldi_id);
1125         if (li != NULL)
1126                 kmem_free(li, sizeof (*li));
1127 }
1128
1129 /*
1130  * Common mount code.  Called from the system call entry point, from autofs,
1131  * nfsv4 trigger mounts, and from pxfs.
1132  *
1133  * Takes the effective file system type, mount arguments, the mount point
1134  * vnode, flags specifying whether the mount is a remount and whether it
1135  * should be entered into the vfs list, and credentials.  Fills in its vfspp
1136  * parameter with the mounted file system instance's vfs.
1137  *
1138  * Note that the effective file system type is specified as a string.  It may
1139  * be null, in which case it's determined from the mount arguments, and may
1140  * differ from the type specified in the mount arguments; this is a hook to
1141  * allow interposition when instantiating file system instances.
1142  *
1143  * The caller is responsible for releasing its own hold on the mount point
1144  * vp (this routine does its own hold when necessary).
1145  * Also note that for remounts, the mount point vp should be the vnode for
1146  * the root of the file system rather than the vnode that the file system
1147  * is mounted on top of.
1148  */
1149 int
1150 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1151         struct vfs **vfspp)
1152 {
1153         struct vfssw    *vswp;
1154         vfsops_t        *vfsops;
1155         struct vfs      *vfsp;
1156         struct vnode    *bvp;
1157         dev_t           bdev = 0;
1158         mntopts_t       mnt_mntopts;
1159         int             error = 0;
1160         int             copyout_error = 0;
1161         int             ovflags;
1162         char            *opts = uap->optptr;
1163         char            *inargs = opts;
1164         int             optlen = uap->optlen;
1165         int             remount;
1166         int             rdonly;
1167         int             nbmand = 0;
1168         int             delmip = 0;
1169         int             addmip = 0;
1170         int             splice = ((uap->flags & MS_NOSPLICE) == 0);
1171         int             fromspace = (uap->flags & MS_SYSSPACE) ?
1172             UIO_SYSSPACE : UIO_USERSPACE;
1173         char            *resource = NULL, *mountpt = NULL;
1174         refstr_t        *oldresource, *oldmntpt;
1175         struct pathname pn, rpn;
1176         vsk_anchor_t    *vskap;
1177         char fstname[FSTYPSZ];
1178
1179         /*
1180          * The v_flag value for the mount point vp is permanently set
1181          * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1182          * for mount point locking.
1183          */
1184         mutex_enter(&vp->v_lock);
1185         vp->v_flag |= VVFSLOCK;
1186         mutex_exit(&vp->v_lock);
1187
1188         mnt_mntopts.mo_count = 0;
1189         /*
1190          * Find the ops vector to use to invoke the file system-specific mount
1191          * method.  If the fsname argument is non-NULL, use it directly.
1192          * Otherwise, dig the file system type information out of the mount
1193          * arguments.
1194          *
1195          * A side effect is to hold the vfssw entry.
1196          *
1197          * Mount arguments can be specified in several ways, which are
1198          * distinguished by flag bit settings.  The preferred way is to set
1199          * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1200          * type supplied as a character string and the last two arguments
1201          * being a pointer to a character buffer and the size of the buffer.
1202          * On entry, the buffer holds a null terminated list of options; on
1203          * return, the string is the list of options the file system
1204          * recognized. If MS_DATA is set arguments five and six point to a
1205          * block of binary data which the file system interprets.
1206          * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1207          * consistently with these conventions.  To handle them, we check to
1208          * see whether the pointer to the file system name has a numeric value
1209          * less than 256.  If so, we treat it as an index.
1210          */
1211         if (fsname != NULL) {
1212                 if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1213                         return (EINVAL);
1214                 }
1215         } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1216                 size_t n;
1217                 uint_t fstype;
1218
1219                 fsname = fstname;
1220
1221                 if ((fstype = (uintptr_t)uap->fstype) < 256) {
1222                         RLOCK_VFSSW();
1223                         if (fstype == 0 || fstype >= nfstype ||
1224                             !ALLOCATED_VFSSW(&vfssw[fstype])) {
1225                                 RUNLOCK_VFSSW();
1226                                 return (EINVAL);
1227                         }
1228                         (void) strcpy(fsname, vfssw[fstype].vsw_name);
1229                         RUNLOCK_VFSSW();
1230                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1231                                 return (EINVAL);
1232                 } else {
1233                         /*
1234                          * Handle either kernel or user address space.
1235                          */
1236                         if (uap->flags & MS_SYSSPACE) {
1237                                 error = copystr(uap->fstype, fsname,
1238                                     FSTYPSZ, &n);
1239                         } else {
1240                                 error = copyinstr(uap->fstype, fsname,
1241                                     FSTYPSZ, &n);
1242                         }
1243                         if (error) {
1244                                 if (error == ENAMETOOLONG)
1245                                         return (EINVAL);
1246                                 return (error);
1247                         }
1248                         if ((vswp = vfs_getvfssw(fsname)) == NULL)
1249                                 return (EINVAL);
1250                 }
1251         } else {
1252                 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1253                         return (EINVAL);
1254         }
1255         if (!VFS_INSTALLED(vswp))
1256                 return (EINVAL);
1257         vfsops = &vswp->vsw_vfsops;
1258
1259         vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1260         /*
1261          * Fetch mount options and parse them for generic vfs options
1262          */
1263         if (uap->flags & MS_OPTIONSTR) {
1264                 /*
1265                  * Limit the buffer size
1266                  */
1267                 if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1268                         error = EINVAL;
1269                         goto errout;
1270                 }
1271                 if ((uap->flags & MS_SYSSPACE) == 0) {
1272                         inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1273                         inargs[0] = '\0';
1274                         if (optlen) {
1275                                 error = copyinstr(opts, inargs, (size_t)optlen,
1276                                     NULL);
1277                                 if (error) {
1278                                         goto errout;
1279                                 }
1280                         }
1281                 }
1282                 vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1283         }
1284         /*
1285          * Flag bits override the options string.
1286          */
1287         if (uap->flags & MS_REMOUNT)
1288                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1289         if (uap->flags & MS_RDONLY)
1290                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1291         if (uap->flags & MS_NOSUID)
1292                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1293
1294         /*
1295          * Check if this is a remount; must be set in the option string and
1296          * the file system must support a remount option.
1297          */
1298         if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1299             MNTOPT_REMOUNT, NULL)) {
1300                 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1301                         error = ENOTSUP;
1302                         goto errout;
1303                 }
1304                 uap->flags |= MS_REMOUNT;
1305         }
1306
1307         /*
1308          * uap->flags and vfs_optionisset() should agree.
1309          */
1310         if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1311                 uap->flags |= MS_RDONLY;
1312         }
1313         if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1314                 uap->flags |= MS_NOSUID;
1315         }
1316         nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1317         ASSERT(splice || !remount);
1318         /*
1319          * If we are splicing the fs into the namespace,
1320          * perform mount point checks.
1321          *
1322          * We want to resolve the path for the mount point to eliminate
1323          * '.' and ".." and symlinks in mount points; we can't do the
1324          * same for the resource string, since it would turn
1325          * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1326          * this before grabbing vn_vfswlock(), because otherwise we
1327          * would deadlock with lookuppn().
1328          */
1329         if (splice) {
1330                 ASSERT(vp->v_count > 0);
1331
1332                 /*
1333                  * Pick up mount point and device from appropriate space.
1334                  */
1335                 if (pn_get(uap->spec, fromspace, &pn) == 0) {
1336                         resource = kmem_alloc(pn.pn_pathlen + 1,
1337                             KM_SLEEP);
1338                         (void) strcpy(resource, pn.pn_path);
1339                         pn_free(&pn);
1340                 }
1341                 /*
1342                  * Do a lookupname prior to taking the
1343                  * writelock. Mark this as completed if
1344                  * successful for later cleanup and addition to
1345                  * the mount in progress table.
1346                  */
1347                 if ((uap->flags & MS_GLOBAL) == 0 &&
1348                     lookupname(uap->spec, fromspace,
1349                     FOLLOW, NULL, &bvp) == 0) {
1350                         addmip = 1;
1351                 }
1352
1353                 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1354                         pathname_t *pnp;
1355
1356                         if (*pn.pn_path != '/') {
1357                                 error = EINVAL;
1358                                 pn_free(&pn);
1359                                 goto errout;
1360                         }
1361                         pn_alloc(&rpn);
1362                         /*
1363                          * Kludge to prevent autofs from deadlocking with
1364                          * itself when it calls domount().
1365                          *
                         * If autofs is calling, it is because it is doing
                         * (autofs) mounts in the process of an NFS mount.  A
                         * lookuppn() here would cause us to block waiting for
                         * said NFS mount to complete, which it can't since
                         * this is the thread that was supposed to be doing it.
1371                          */
1372                         if (fromspace == UIO_USERSPACE) {
1373                                 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1374                                     NULL)) == 0) {
1375                                         pnp = &rpn;
1376                                 } else {
1377                                         /*
1378                                          * The file disappeared or otherwise
1379                                          * became inaccessible since we opened
1380                                          * it; might as well fail the mount
1381                                          * since the mount point is no longer
1382                                          * accessible.
1383                                          */
1384                                         pn_free(&rpn);
1385                                         pn_free(&pn);
1386                                         goto errout;
1387                                 }
1388                         } else {
1389                                 pnp = &pn;
1390                         }
1391                         mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1392                         (void) strcpy(mountpt, pnp->pn_path);
1393
1394                         /*
1395                          * If the addition of the zone's rootpath
1396                          * would push us over a total path length
1397                          * of MAXPATHLEN, we fail the mount with
1398                          * ENAMETOOLONG, which is what we would have
1399                          * gotten if we were trying to perform the same
1400                          * mount in the global zone.
1401                          *
1402                          * strlen() doesn't count the trailing
1403                          * '\0', but zone_rootpathlen counts both a
1404                          * trailing '/' and the terminating '\0'.
1405                          */
1406                         if ((curproc->p_zone->zone_rootpathlen - 1 +
1407                             strlen(mountpt)) > MAXPATHLEN ||
1408                             (resource != NULL &&
1409                             (curproc->p_zone->zone_rootpathlen - 1 +
1410                             strlen(resource)) > MAXPATHLEN)) {
1411                                 error = ENAMETOOLONG;
1412                         }
1413
1414                         pn_free(&rpn);
1415                         pn_free(&pn);
1416                 }
1417
1418                 if (error)
1419                         goto errout;
1420
1421                 /*
1422                  * Prevent path name resolution from proceeding past
1423                  * the mount point.
1424                  */
1425                 if (vn_vfswlock(vp) != 0) {
1426                         error = EBUSY;
1427                         goto errout;
1428                 }
1429
1430                 /*
1431                  * Verify that it's legitimate to establish a mount on
1432                  * the prospective mount point.
1433                  */
1434                 if (vn_mountedvfs(vp) != NULL) {
1435                         /*
1436                          * The mount point lock was obtained after some
1437                          * other thread raced through and established a mount.
1438                          */
1439                         vn_vfsunlock(vp);
1440                         error = EBUSY;
1441                         goto errout;
1442                 }
1443                 if (vp->v_flag & VNOMOUNT) {
1444                         vn_vfsunlock(vp);
1445                         error = EINVAL;
1446                         goto errout;
1447                 }
1448         }
1449         if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1450                 uap->dataptr = NULL;
1451                 uap->datalen = 0;
1452         }
1453
1454         /*
1455          * If this is a remount, we don't want to create a new VFS.
1456          * Instead, we pass the existing one with a remount flag.
1457          */
1458         if (remount) {
1459                 /*
1460                  * Confirm that the mount point is the root vnode of the
1461                  * file system that is being remounted.
1462                  * This can happen if the user specifies a different
1463                  * mount point directory pathname in the (re)mount command.
1464                  *
1465                  * Code below can only be reached if splice is true, so it's
1466                  * safe to do vn_vfsunlock() here.
1467                  */
1468                 if ((vp->v_flag & VROOT) == 0) {
1469                         vn_vfsunlock(vp);
1470                         error = ENOENT;
1471                         goto errout;
1472                 }
1473                 /*
1474                  * Disallow making file systems read-only unless file system
1475                  * explicitly allows it in its vfssw.  Ignore other flags.
1476                  */
1477                 if (rdonly && vn_is_readonly(vp) == 0 &&
1478                     (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1479                         vn_vfsunlock(vp);
1480                         error = EINVAL;
1481                         goto errout;
1482                 }
1483                 /*
1484                  * Disallow changing the NBMAND disposition of the file
1485                  * system on remounts.
1486                  */
1487                 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1488                     (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1489                         vn_vfsunlock(vp);
1490                         error = EINVAL;
1491                         goto errout;
1492                 }
1493                 vfsp = vp->v_vfsp;
1494                 ovflags = vfsp->vfs_flag;
1495                 vfsp->vfs_flag |= VFS_REMOUNT;
1496                 vfsp->vfs_flag &= ~VFS_RDONLY;
1497         } else {
1498                 vfsp = vfs_alloc(KM_SLEEP);
1499                 VFS_INIT(vfsp, vfsops, NULL);
1500         }
1501
1502         VFS_HOLD(vfsp);
1503
1504         if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1505                 if (!remount) {
1506                         if (splice)
1507                                 vn_vfsunlock(vp);
1508                         vfs_free(vfsp);
1509                 } else {
1510                         vn_vfsunlock(vp);
1511                         VFS_RELE(vfsp);
1512                 }
1513                 goto errout;
1514         }
1515
1516         /*
1517          * PRIV_SYS_MOUNT doesn't mean you can become root.
1518          */
1519         if (vfsp->vfs_lofi_minor != 0) {
1520                 uap->flags |= MS_NOSUID;
1521                 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1522         }
1523
        /*
         * The vfs_reflock is not used anymore; the code below explicitly
         * holds it, preventing others from accessing it directly.
         */
1528         if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1529             !(vfsp->vfs_flag & VFS_REMOUNT))
1530                 cmn_err(CE_WARN,
1531                     "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1532
        /*
         * Lock the vfs. If this is a remount we want to avoid spurious umount
         * failures that happen as a side-effect of fsflush() and other mount
         * and unmount operations that might be going on simultaneously and
         * may have locked the vfs currently. To avoid returning EBUSY
         * immediately here, we use vfs_lock_wait() instead of vfs_lock() for
         * the remount case.
         */
1540         if (!remount) {
1541                 if (error = vfs_lock(vfsp)) {
1542                         vfsp->vfs_flag = ovflags;
1543
1544                         lofi_remove(vfsp);
1545
1546                         if (splice)
1547                                 vn_vfsunlock(vp);
1548                         vfs_free(vfsp);
1549                         goto errout;
1550                 }
1551         } else {
1552                 vfs_lock_wait(vfsp);
1553         }
1554
1555         /*
1556          * Add device to mount in progress table, global mounts require special
1557          * handling. It is possible that we have already done the lookupname
1558          * on a spliced, non-global fs. If so, we don't want to do it again
1559          * since we cannot do a lookupname after taking the
1560          * wlock above. This case is for a non-spliced, non-global filesystem.
1561          */
1562         if (!addmip) {
1563                 if ((uap->flags & MS_GLOBAL) == 0 &&
1564                     lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1565                         addmip = 1;
1566                 }
1567         }
1568
1569         if (addmip) {
1570                 vnode_t *lvp = NULL;
1571
1572                 error = vfs_get_lofi(vfsp, &lvp);
1573                 if (error > 0) {
1574                         lofi_remove(vfsp);
1575
1576                         if (splice)
1577                                 vn_vfsunlock(vp);
1578                         vfs_unlock(vfsp);
1579
1580                         if (remount) {
1581                                 VFS_RELE(vfsp);
1582                         } else {
1583                                 vfs_free(vfsp);
1584                         }
1585
1586                         goto errout;
1587                 } else if (error == -1) {
1588                         bdev = bvp->v_rdev;
1589                         VN_RELE(bvp);
1590                 } else {
1591                         bdev = lvp->v_rdev;
1592                         VN_RELE(lvp);
1593                         VN_RELE(bvp);
1594                 }
1595
1596                 vfs_addmip(bdev, vfsp);
1597                 addmip = 0;
1598                 delmip = 1;
1599         }
1600         /*
1601          * Invalidate cached entry for the mount point.
1602          */
1603         if (splice)
1604                 dnlc_purge_vp(vp);
1605
1606         /*
         * If we have an option string but the filesystem doesn't supply a
1608          * prototype options table, create a table with the global
1609          * options and sufficient room to accept all the options in the
1610          * string.  Then parse the passed in option string
1611          * accepting all the options in the string.  This gives us an
1612          * option table with all the proper cancel properties for the
1613          * global options.
1614          *
1615          * Filesystems that supply a prototype options table are handled
1616          * earlier in this function.
1617          */
1618         if (uap->flags & MS_OPTIONSTR) {
1619                 if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1620                         mntopts_t tmp_mntopts;
1621
1622                         tmp_mntopts.mo_count = 0;
1623                         vfs_createopttbl_extend(&tmp_mntopts, inargs,
1624                             &mnt_mntopts);
1625                         vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1626                         vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1627                         vfs_freeopttbl(&tmp_mntopts);
1628                 }
1629         }
1630
1631         /*
1632          * Serialize with zone creations.
1633          */
1634         mount_in_progress();
1635         /*
1636          * Instantiate (or reinstantiate) the file system.  If appropriate,
1637          * splice it into the file system name space.
1638          *
1639          * We want VFS_MOUNT() to be able to override the vfs_resource
1640          * string if necessary (ie, mntfs), and also for a remount to
1641          * change the same (necessary when remounting '/' during boot).
1642          * So we set up vfs_mntpt and vfs_resource to what we think they
1643          * should be, then hand off control to VFS_MOUNT() which can
1644          * override this.
1645          *
1646          * For safety's sake, when changing vfs_resource or vfs_mntpt of
1647          * a vfs which is on the vfs list (i.e. during a remount), we must
1648          * never set those fields to NULL. Several bits of code make
1649          * assumptions that the fields are always valid.
1650          */
1651         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1652         if (remount) {
1653                 if ((oldresource = vfsp->vfs_resource) != NULL)
1654                         refstr_hold(oldresource);
1655                 if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1656                         refstr_hold(oldmntpt);
1657         }
1658         vfs_setresource(vfsp, resource);
1659         vfs_setmntpoint(vfsp, mountpt);
1660
1661         /*
1662          * going to mount on this vnode, so notify.
1663          */
1664         vnevent_mountedover(vp, NULL);
1665         error = VFS_MOUNT(vfsp, vp, uap, credp);
1666
1667         if (uap->flags & MS_RDONLY)
1668                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1669         if (uap->flags & MS_NOSUID)
1670                 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1671         if (uap->flags & MS_GLOBAL)
1672                 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1673
1674         if (error) {
1675                 lofi_remove(vfsp);
1676
1677                 if (remount) {
1678                         /* put back pre-remount options */
1679                         vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1680                         vfs_setmntpoint(vfsp, (stripzonepath(
1681                             refstr_value(oldmntpt))));
1682                         if (oldmntpt)
1683                                 refstr_rele(oldmntpt);
1684                         vfs_setresource(vfsp, (stripzonepath(
1685                             refstr_value(oldresource))));
1686                         if (oldresource)
1687                                 refstr_rele(oldresource);
1688                         vfsp->vfs_flag = ovflags;
1689                         vfs_unlock(vfsp);
1690                         VFS_RELE(vfsp);
1691                 } else {
1692                         vfs_unlock(vfsp);
1693                         vfs_freemnttab(vfsp);
1694                         vfs_free(vfsp);
1695                 }
1696         } else {
1697                 /*
1698                  * Set the mount time to now
1699                  */
1700                 vfsp->vfs_mtime = ddi_get_time();
1701                 if (remount) {
1702                         vfsp->vfs_flag &= ~VFS_REMOUNT;
1703                         if (oldresource)
1704                                 refstr_rele(oldresource);
1705                         if (oldmntpt)
1706                                 refstr_rele(oldmntpt);
1707                 } else if (splice) {
1708                         /*
1709                          * Link vfsp into the name space at the mount
1710                          * point. Vfs_add() is responsible for
1711                          * holding the mount point which will be
1712                          * released when vfs_remove() is called.
1713                          */
1714                         vfs_add(vp, vfsp, uap->flags);
1715                 } else {
1716                         /*
1717                          * Hold the reference to file system which is
1718                          * not linked into the name space.
1719                          */
1720                         vfsp->vfs_zone = NULL;
1721                         VFS_HOLD(vfsp);
1722                         vfsp->vfs_vnodecovered = NULL;
1723                 }
1724                 /*
1725                  * Set flags for global options encountered
1726                  */
1727                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1728                         vfsp->vfs_flag |= VFS_RDONLY;
1729                 else
1730                         vfsp->vfs_flag &= ~VFS_RDONLY;
1731                 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1732                         vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1733                 } else {
1734                         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1735                                 vfsp->vfs_flag |= VFS_NODEVICES;
1736                         else
1737                                 vfsp->vfs_flag &= ~VFS_NODEVICES;
1738                         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1739                                 vfsp->vfs_flag |= VFS_NOSETUID;
1740                         else
1741                                 vfsp->vfs_flag &= ~VFS_NOSETUID;
1742                 }
1743                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1744                         vfsp->vfs_flag |= VFS_NBMAND;
1745                 else
1746                         vfsp->vfs_flag &= ~VFS_NBMAND;
1747
1748                 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1749                         vfsp->vfs_flag |= VFS_XATTR;
1750                 else
1751                         vfsp->vfs_flag &= ~VFS_XATTR;
1752
1753                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1754                         vfsp->vfs_flag |= VFS_NOEXEC;
1755                 else
1756                         vfsp->vfs_flag &= ~VFS_NOEXEC;
1757
1758                 /*
1759                  * Now construct the output option string of options
1760                  * we recognized.
1761                  */
1762                 if (uap->flags & MS_OPTIONSTR) {
1763                         vfs_list_read_lock();
1764                         copyout_error = vfs_buildoptionstr(
1765                             &vfsp->vfs_mntopts, inargs, optlen);
1766                         vfs_list_unlock();
1767                         if (copyout_error == 0 &&
1768                             (uap->flags & MS_SYSSPACE) == 0) {
1769                                 copyout_error = copyoutstr(inargs, opts,
1770                                     optlen, NULL);
1771                         }
1772                 }
1773
1774                 /*
1775                  * If this isn't a remount, set up the vopstats before
1776                  * anyone can touch this. We only allow spliced file
1777                  * systems (file systems which are in the namespace) to
1778                  * have the VFS_STATS flag set.
1779                  * NOTE: PxFS mounts the underlying file system with
1780                  * MS_NOSPLICE set and copies those vfs_flags to its private
1781                  * vfs structure. As a result, PxFS should never have
1782                  * the VFS_STATS flag or else we might access the vfs
1783                  * statistics-related fields prior to them being
1784                  * properly initialized.
1785                  */
1786                 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1787                         initialize_vopstats(&vfsp->vfs_vopstats);
1788                         /*
1789                          * We need to set vfs_vskap to NULL because there's
1790                          * a chance it won't be set below.  This is checked
1791                          * in teardown_vopstats() so we can't have garbage.
1792                          */
1793                         vfsp->vfs_vskap = NULL;
1794                         vfsp->vfs_flag |= VFS_STATS;
1795                         vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1796                 }
1797
1798                 if (vswp->vsw_flag & VSW_XID)
1799                         vfsp->vfs_flag |= VFS_XID;
1800
1801                 vfs_unlock(vfsp);
1802         }
1803         mount_completed();
1804         if (splice)
1805                 vn_vfsunlock(vp);
1806
1807         if ((error == 0) && (copyout_error == 0)) {
1808                 if (!remount) {
1809                         /*
1810                          * Don't call get_vskstat_anchor() while holding
1811                          * locks since it allocates memory and calls
1812                          * VFS_STATVFS().  For NFS, the latter can generate
1813                          * an over-the-wire call.
1814                          */
1815                         vskap = get_vskstat_anchor(vfsp);
1816                         /* Only take the lock if we have something to do */
1817                         if (vskap != NULL) {
1818                                 vfs_lock_wait(vfsp);
1819                                 if (vfsp->vfs_flag & VFS_STATS) {
1820                                         vfsp->vfs_vskap = vskap;
1821                                 }
1822                                 vfs_unlock(vfsp);
1823                         }
1824                 }
1825                 /* Return vfsp to caller. */
1826                 *vfspp = vfsp;
1827         }
1828 errout:
1829         vfs_freeopttbl(&mnt_mntopts);
1830         if (resource != NULL)
1831                 kmem_free(resource, strlen(resource) + 1);
1832         if (mountpt != NULL)
1833                 kmem_free(mountpt, strlen(mountpt) + 1);
1834         /*
1835          * It is possible we errored prior to adding to mount in progress
1836          * table. Must free vnode we acquired with successful lookupname.
1837          */
1838         if (addmip)
1839                 VN_RELE(bvp);
1840         if (delmip)
1841                 vfs_delmip(vfsp);
1842         ASSERT(vswp != NULL);
1843         vfs_unrefvfssw(vswp);
1844         if (inargs != opts)
1845                 kmem_free(inargs, MAX_MNTOPT_STR);
1846         if (copyout_error) {
1847                 lofi_remove(vfsp);
1848                 VFS_RELE(vfsp);
1849                 error = copyout_error;
1850         }
1851         return (error);
1852 }
1853
1854 static void
1855 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1856 {
1857         size_t len;
1858         refstr_t *ref;
1859         zone_t *zone = curproc->p_zone;
1860         char *sp;
1861         int have_list_lock = 0;
1862
1863         ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1864
1865         /*
1866          * New path must be less than MAXPATHLEN because mntfs
1867          * will only display up to MAXPATHLEN bytes. This is currently
1868          * safe, because domount() uses pn_get(), and other callers
1869          * similarly cap the size to fewer than MAXPATHLEN bytes.
1870          */
1871
1872         ASSERT(strlen(newpath) < MAXPATHLEN);
1873
1874         /* mntfs requires consistency while vfs list lock is held */
1875
1876         if (VFS_ON_LIST(vfsp)) {
1877                 have_list_lock = 1;
1878                 vfs_list_lock();
1879         }
1880
1881         if (*refp != NULL)
1882                 refstr_rele(*refp);
1883
1884         /* Do we need to modify the path? */
1885
1886         if (zone == global_zone || *newpath != '/') {
1887                 ref = refstr_alloc(newpath);
1888                 goto out;
1889         }
1890
1891         /*
1892          * Truncate the trailing '/' in the zoneroot, and merge
1893          * in the zone's rootpath with the "newpath" (resource
1894          * or mountpoint) passed in.
1895          *
1896          * The size of the required buffer is thus the size of
1897          * the buffer required for the passed-in newpath
1898          * (strlen(newpath) + 1), plus the size of the buffer
1899          * required to hold zone_rootpath (zone_rootpathlen)
1900          * minus one for one of the now-superfluous NUL
1901          * terminations, minus one for the trailing '/'.
1902          *
1903          * That gives us:
1904          *
1905          * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1906          *
1907          * Which is what we have below.
1908          */
1909
1910         len = strlen(newpath) + zone->zone_rootpathlen - 1;
1911         sp = kmem_alloc(len, KM_SLEEP);
1912
1913         /*
1914          * Copy everything including the trailing slash, which
1915          * we then overwrite with the NUL character.
1916          */
1917
1918         (void) strcpy(sp, zone->zone_rootpath);
1919         sp[zone->zone_rootpathlen - 2] = '\0';
1920         (void) strcat(sp, newpath);
1921
1922         ref = refstr_alloc(sp);
1923         kmem_free(sp, len);
1924 out:
1925         *refp = ref;
1926
1927         if (have_list_lock) {
1928                 vfs_mnttab_modtimeupd();
1929                 vfs_list_unlock();
1930         }
1931 }
1932
1933 /*
1934  * Record a mounted resource name in a vfs structure.
1935  * If vfsp is already mounted, caller must hold the vfs lock.
1936  */
1937 void
1938 vfs_setresource(struct vfs *vfsp, const char *resource)
1939 {
1940         if (resource == NULL || resource[0] == '\0')
1941                 resource = VFS_NORESOURCE;
1942         vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1943 }
1944
1945 /*
1946  * Record a mount point name in a vfs structure.
1947  * If vfsp is already mounted, caller must hold the vfs lock.
1948  */
1949 void
1950 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1951 {
1952         if (mntpt == NULL || mntpt[0] == '\0')
1953                 mntpt = VFS_NOMNTPT;
1954         vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1955 }
1956
1957 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1958
1959 refstr_t *
1960 vfs_getresource(const struct vfs *vfsp)
1961 {
1962         refstr_t *resource;
1963
1964         vfs_list_read_lock();
1965         resource = vfsp->vfs_resource;
1966         refstr_hold(resource);
1967         vfs_list_unlock();
1968
1969         return (resource);
1970 }
1971
1972 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1973
1974 refstr_t *
1975 vfs_getmntpoint(const struct vfs *vfsp)
1976 {
1977         refstr_t *mntpt;
1978
1979         vfs_list_read_lock();
1980         mntpt = vfsp->vfs_mntpt;
1981         refstr_hold(mntpt);
1982         vfs_list_unlock();
1983
1984         return (mntpt);
1985 }
1986
1987 /*
1988  * Create an empty options table with enough empty slots to hold all
1989  * The options in the options string passed as an argument.
1990  * Potentially prepend another options table.
1991  *
1992  * Note: caller is responsible for locking the vfs list, if needed,
1993  *       to protect mops.
1994  */
1995 static void
1996 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1997     const mntopts_t *mtmpl)
1998 {
1999         const char *s = opts;
2000         uint_t count;
2001
2002         if (opts == NULL || *opts == '\0') {
2003                 count = 0;
2004         } else {
2005                 count = 1;
2006
2007                 /*
2008                  * Count number of options in the string
2009                  */
2010                 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
2011                         count++;
2012                         s++;
2013                 }
2014         }
2015         vfs_copyopttbl_extend(mtmpl, mops, count);
2016 }
2017
2018 /*
2019  * Create an empty options table with enough empty slots to hold all
2020  * The options in the options string passed as an argument.
2021  *
2022  * This function is *not* for general use by filesystems.
2023  *
2024  * Note: caller is responsible for locking the vfs list, if needed,
2025  *       to protect mops.
2026  */
2027 void
2028 vfs_createopttbl(mntopts_t *mops, const char *opts)
2029 {
2030         vfs_createopttbl_extend(mops, opts, NULL);
2031 }
2032
2033
2034 /*
2035  * Swap two mount options tables
2036  */
2037 static void
2038 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2039 {
2040         uint_t tmpcnt;
2041         mntopt_t *tmplist;
2042
2043         tmpcnt = optbl2->mo_count;
2044         tmplist = optbl2->mo_list;
2045         optbl2->mo_count = optbl1->mo_count;
2046         optbl2->mo_list = optbl1->mo_list;
2047         optbl1->mo_count = tmpcnt;
2048         optbl1->mo_list = tmplist;
2049 }
2050
2051 static void
2052 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2053 {
2054         vfs_list_lock();
2055         vfs_swapopttbl_nolock(optbl1, optbl2);
2056         vfs_mnttab_modtimeupd();
2057         vfs_list_unlock();
2058 }
2059
2060 static char **
2061 vfs_copycancelopt_extend(char **const moc, int extend)
2062 {
2063         int i = 0;
2064         int j;
2065         char **result;
2066
2067         if (moc != NULL) {
2068                 for (; moc[i] != NULL; i++)
2069                         /* count number of options to cancel */;
2070         }
2071
2072         if (i + extend == 0)
2073                 return (NULL);
2074
2075         result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2076
2077         for (j = 0; j < i; j++) {
2078                 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2079                 (void) strcpy(result[j], moc[j]);
2080         }
2081         for (; j <= i + extend; j++)
2082                 result[j] = NULL;
2083
2084         return (result);
2085 }
2086
2087 static void
2088 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2089 {
2090         char *sp, *dp;
2091
2092         d->mo_flags = s->mo_flags;
2093         d->mo_data = s->mo_data;
2094         sp = s->mo_name;
2095         if (sp != NULL) {
2096                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2097                 (void) strcpy(dp, sp);
2098                 d->mo_name = dp;
2099         } else {
2100                 d->mo_name = NULL; /* should never happen */
2101         }
2102
2103         d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2104
2105         sp = s->mo_arg;
2106         if (sp != NULL) {
2107                 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2108                 (void) strcpy(dp, sp);
2109                 d->mo_arg = dp;
2110         } else {
2111                 d->mo_arg = NULL;
2112         }
2113 }
2114
2115 /*
2116  * Copy a mount options table, possibly allocating some spare
2117  * slots at the end.  It is permissible to copy_extend the NULL table.
2118  */
2119 static void
2120 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2121 {
2122         uint_t i, count;
2123         mntopt_t *motbl;
2124
2125         /*
2126          * Clear out any existing stuff in the options table being initialized
2127          */
2128         vfs_freeopttbl(dmo);
2129         count = (smo == NULL) ? 0 : smo->mo_count;
2130         if ((count + extra) == 0)       /* nothing to do */
2131                 return;
2132         dmo->mo_count = count + extra;
2133         motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2134         dmo->mo_list = motbl;
2135         for (i = 0; i < count; i++) {
2136                 vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2137         }
2138         for (i = count; i < count + extra; i++) {
2139                 motbl[i].mo_flags = MO_EMPTY;
2140         }
2141 }
2142
2143 /*
2144  * Copy a mount options table.
2145  *
2146  * This function is *not* for general use by filesystems.
2147  *
2148  * Note: caller is responsible for locking the vfs list, if needed,
2149  *       to protect smo and dmo.
2150  */
2151 void
2152 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2153 {
2154         vfs_copyopttbl_extend(smo, dmo, 0);
2155 }
2156
2157 static char **
2158 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2159 {
2160         int c1 = 0;
2161         int c2 = 0;
2162         char **result;
2163         char **sp1, **sp2, **dp;
2164
2165         /*
2166          * First we count both lists of cancel options.
2167          * If either is NULL or has no elements, we return a copy of
2168          * the other.
2169          */
2170         if (mop1->mo_cancel != NULL) {
2171                 for (; mop1->mo_cancel[c1] != NULL; c1++)
2172                         /* count cancel options in mop1 */;
2173         }
2174
2175         if (c1 == 0)
2176                 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2177
2178         if (mop2->mo_cancel != NULL) {
2179                 for (; mop2->mo_cancel[c2] != NULL; c2++)
2180                         /* count cancel options in mop2 */;
2181         }
2182
2183         result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2184
2185         if (c2 == 0)
2186                 return (result);
2187
2188         /*
2189          * When we get here, we've got two sets of cancel options;
2190          * we need to merge the two sets.  We know that the result
2191          * array has "c1+c2+1" entries and in the end we might shrink
2192          * it.
2193          * Result now has a copy of the c1 entries from mop1; we'll
2194          * now lookup all the entries of mop2 in mop1 and copy it if
2195          * it is unique.
2196          * This operation is O(n^2) but it's only called once per
2197          * filesystem per duplicate option.  This is a situation
2198          * which doesn't arise with the filesystems in ON and
2199          * n is generally 1.
2200          */
2201
2202         dp = &result[c1];
2203         for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2204                 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2205                         if (strcmp(*sp1, *sp2) == 0)
2206                                 break;
2207                 }
2208                 if (*sp1 == NULL) {
2209                         /*
2210                          * Option *sp2 not found in mop1, so copy it.
2211                          * The calls to vfs_copycancelopt_extend()
2212                          * guarantee that there's enough room.
2213                          */
2214                         *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2215                         (void) strcpy(*dp++, *sp2);
2216                 }
2217         }
2218         if (dp != &result[c1+c2]) {
2219                 size_t bytes = (dp - result + 1) * sizeof (char *);
2220                 char **nres = kmem_alloc(bytes, KM_SLEEP);
2221
2222                 bcopy(result, nres, bytes);
2223                 kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2224                 result = nres;
2225         }
2226         return (result);
2227 }
2228
2229 /*
2230  * Merge two mount option tables (outer and inner) into one.  This is very
2231  * similar to "merging" global variables and automatic variables in C.
2232  *
2233  * This isn't (and doesn't have to be) fast.
2234  *
2235  * This function is *not* for general use by filesystems.
2236  *
2237  * Note: caller is responsible for locking the vfs list, if needed,
2238  *       to protect omo, imo & dmo.
2239  */
2240 void
2241 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2242 {
2243         uint_t i, count;
2244         mntopt_t *mop, *motbl;
2245         uint_t freeidx;
2246
2247         /*
2248          * First determine how much space we need to allocate.
2249          */
2250         count = omo->mo_count;
2251         for (i = 0; i < imo->mo_count; i++) {
2252                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2253                         continue;
2254                 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2255                         count++;
2256         }
2257         ASSERT(count >= omo->mo_count &&
2258             count <= omo->mo_count + imo->mo_count);
2259         motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2260         for (i = 0; i < omo->mo_count; i++)
2261                 vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2262         freeidx = omo->mo_count;
2263         for (i = 0; i < imo->mo_count; i++) {
2264                 if (imo->mo_list[i].mo_flags & MO_EMPTY)
2265                         continue;
2266                 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2267                         char **newcanp;
2268                         uint_t index = mop - omo->mo_list;
2269
2270                         newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2271
2272                         vfs_freeopt(&motbl[index]);
2273                         vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2274
2275                         vfs_freecancelopt(motbl[index].mo_cancel);
2276                         motbl[index].mo_cancel = newcanp;
2277                 } else {
2278                         /*
2279                          * If it's a new option, just copy it over to the first
2280                          * free location.
2281                          */
2282                         vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2283                 }
2284         }
2285         dmo->mo_count = count;
2286         dmo->mo_list = motbl;
2287 }
2288
2289 /*
2290  * Functions to set and clear mount options in a mount options table.
2291  */
2292
2293 /*
2294  * Clear a mount option, if it exists.
2295  *
2296  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2297  * the vfs list.
2298  */
2299 static void
2300 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2301 {
2302         struct mntopt *mop;
2303         uint_t i, count;
2304
2305         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2306
2307         count = mops->mo_count;
2308         for (i = 0; i < count; i++) {
2309                 mop = &mops->mo_list[i];
2310
2311                 if (mop->mo_flags & MO_EMPTY)
2312                         continue;
2313                 if (strcmp(opt, mop->mo_name))
2314                         continue;
2315                 mop->mo_flags &= ~MO_SET;
2316                 if (mop->mo_arg != NULL) {
2317                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2318                 }
2319                 mop->mo_arg = NULL;
2320                 if (update_mnttab)
2321                         vfs_mnttab_modtimeupd();
2322                 break;
2323         }
2324 }
2325
2326 void
2327 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2328 {
2329         int gotlock = 0;
2330
2331         if (VFS_ON_LIST(vfsp)) {
2332                 gotlock = 1;
2333                 vfs_list_lock();
2334         }
2335         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2336         if (gotlock)
2337                 vfs_list_unlock();
2338 }
2339
2340
2341 /*
2342  * Set a mount option on.  If it's not found in the table, it's silently
2343  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2344  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2345  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2346  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2347  * MO_EMPTY set is created as the option passed in.
2348  *
2349  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2350  * the vfs list.
2351  */
2352 static void
2353 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2354     const char *arg, int flags, int update_mnttab)
2355 {
2356         mntopt_t *mop;
2357         uint_t i, count;
2358         char *sp;
2359
2360         ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2361
2362         if (flags & VFS_CREATEOPT) {
2363                 if (vfs_hasopt(mops, opt) != NULL) {
2364                         flags &= ~VFS_CREATEOPT;
2365                 }
2366         }
2367         count = mops->mo_count;
2368         for (i = 0; i < count; i++) {
2369                 mop = &mops->mo_list[i];
2370
2371                 if (mop->mo_flags & MO_EMPTY) {
2372                         if ((flags & VFS_CREATEOPT) == 0)
2373                                 continue;
2374                         sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2375                         (void) strcpy(sp, opt);
2376                         mop->mo_name = sp;
2377                         if (arg != NULL)
2378                                 mop->mo_flags = MO_HASVALUE;
2379                         else
2380                                 mop->mo_flags = 0;
2381                 } else if (strcmp(opt, mop->mo_name)) {
2382                         continue;
2383                 }
2384                 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2385                         break;
2386                 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2387                         sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2388                         (void) strcpy(sp, arg);
2389                 } else {
2390                         sp = NULL;
2391                 }
2392                 if (mop->mo_arg != NULL)
2393                         kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2394                 mop->mo_arg = sp;
2395                 if (flags & VFS_DISPLAY)
2396                         mop->mo_flags &= ~MO_NODISPLAY;
2397                 if (flags & VFS_NODISPLAY)
2398                         mop->mo_flags |= MO_NODISPLAY;
2399                 mop->mo_flags |= MO_SET;
2400                 if (mop->mo_cancel != NULL) {
2401                         char **cp;
2402
2403                         for (cp = mop->mo_cancel; *cp != NULL; cp++)
2404                                 vfs_clearmntopt_nolock(mops, *cp, 0);
2405                 }
2406                 if (update_mnttab)
2407                         vfs_mnttab_modtimeupd();
2408                 break;
2409         }
2410 }
2411
2412 void
2413 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2414 {
2415         int gotlock = 0;
2416
2417         if (VFS_ON_LIST(vfsp)) {
2418                 gotlock = 1;
2419                 vfs_list_lock();
2420         }
2421         vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2422         if (gotlock)
2423                 vfs_list_unlock();
2424 }
2425
2426
2427 /*
2428  * Add a "tag" option to a mounted file system's options list.
2429  *
2430  * Note: caller is responsible for locking the vfs list, if needed,
2431  *       to protect mops.
2432  */
2433 static mntopt_t *
2434 vfs_addtag(mntopts_t *mops, const char *tag)
2435 {
2436         uint_t count;
2437         mntopt_t *mop, *motbl;
2438
2439         count = mops->mo_count + 1;
2440         motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2441         if (mops->mo_count) {
2442                 size_t len = (count - 1) * sizeof (mntopt_t);
2443
2444                 bcopy(mops->mo_list, motbl, len);
2445                 kmem_free(mops->mo_list, len);
2446         }
2447         mops->mo_count = count;
2448         mops->mo_list = motbl;
2449         mop = &motbl[count - 1];
2450         mop->mo_flags = MO_TAG;
2451         mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2452         (void) strcpy(mop->mo_name, tag);
2453         return (mop);
2454 }
2455
2456 /*
2457  * Allow users to set arbitrary "tags" in a vfs's mount options.
2458  * Broader use within the kernel is discouraged.
2459  */
2460 int
2461 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2462     cred_t *cr)
2463 {
2464         vfs_t *vfsp;
2465         mntopts_t *mops;
2466         mntopt_t *mop;
2467         int found = 0;
2468         dev_t dev = makedevice(major, minor);
2469         int err = 0;
2470         char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2471
2472         /*
2473          * Find the desired mounted file system
2474          */
2475         vfs_list_lock();
2476         vfsp = rootvfs;
2477         do {
2478                 if (vfsp->vfs_dev == dev &&
2479                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2480                         found = 1;
2481                         break;
2482                 }
2483                 vfsp = vfsp->vfs_next;
2484         } while (vfsp != rootvfs);
2485
2486         if (!found) {
2487                 err = EINVAL;
2488                 goto out;
2489         }
2490         err = secpolicy_fs_config(cr, vfsp);
2491         if (err != 0)
2492                 goto out;
2493
2494         mops = &vfsp->vfs_mntopts;
2495         /*
2496          * Add tag if it doesn't already exist
2497          */
2498         if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2499                 int len;
2500
2501                 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2502                 len = strlen(buf);
2503                 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2504                         err = ENAMETOOLONG;
2505                         goto out;
2506                 }
2507                 mop = vfs_addtag(mops, tag);
2508         }
2509         if ((mop->mo_flags & MO_TAG) == 0) {
2510                 err = EINVAL;
2511                 goto out;
2512         }
2513         vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2514 out:
2515         vfs_list_unlock();
2516         kmem_free(buf, MAX_MNTOPT_STR);
2517         return (err);
2518 }
2519
2520 /*
2521  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2522  * Broader use within the kernel is discouraged.
2523  */
2524 int
2525 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2526     cred_t *cr)
2527 {
2528         vfs_t *vfsp;
2529         mntopt_t *mop;
2530         int found = 0;
2531         dev_t dev = makedevice(major, minor);
2532         int err = 0;
2533
2534         /*
2535          * Find the desired mounted file system
2536          */
2537         vfs_list_lock();
2538         vfsp = rootvfs;
2539         do {
2540                 if (vfsp->vfs_dev == dev &&
2541                     strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2542                         found = 1;
2543                         break;
2544                 }
2545                 vfsp = vfsp->vfs_next;
2546         } while (vfsp != rootvfs);
2547
2548         if (!found) {
2549                 err = EINVAL;
2550                 goto out;
2551         }
2552         err = secpolicy_fs_config(cr, vfsp);
2553         if (err != 0)
2554                 goto out;
2555
2556         if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2557                 err = EINVAL;
2558                 goto out;
2559         }
2560         if ((mop->mo_flags & MO_TAG) == 0) {
2561                 err = EINVAL;
2562                 goto out;
2563         }
2564         vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2565 out:
2566         vfs_list_unlock();
2567         return (err);
2568 }
2569
2570 /*
2571  * Function to parse an option string and fill in a mount options table.
2572  * Unknown options are silently ignored.  The input option string is modified
2573  * by replacing separators with nulls.  If the create flag is set, options
2574  * not found in the table are just added on the fly.  The table must have
2575  * an option slot marked MO_EMPTY to add an option on the fly.
2576  *
2577  * This function is *not* for general use by filesystems.
2578  *
2579  * Note: caller is responsible for locking the vfs list, if needed,
2580  *       to protect mops..
2581  */
2582 void
2583 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2584 {
2585         char *s = osp, *p, *nextop, *valp, *cp, *ep;
2586         int setflg = VFS_NOFORCEOPT;
2587
2588         if (osp == NULL)
2589                 return;
2590         while (*s != '\0') {
2591                 p = strchr(s, ',');     /* find next option */
2592                 if (p == NULL) {
2593                         cp = NULL;
2594                         p = s + strlen(s);
2595                 } else {
2596                         cp = p;         /* save location of comma */
2597                         *p++ = '\0';    /* mark end and point to next option */
2598                 }
2599                 nextop = p;
2600                 p = strchr(s, '=');     /* look for value */
2601                 if (p == NULL) {
2602                         valp = NULL;    /* no value supplied */
2603                 } else {
2604                         ep = p;         /* save location of equals */
2605                         *p++ = '\0';    /* end option and point to value */
2606                         valp = p;
2607                 }
2608                 /*
2609                  * set option into options table
2610                  */
2611                 if (create)
2612                         setflg |= VFS_CREATEOPT;
2613                 vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2614                 if (cp != NULL)
2615                         *cp = ',';      /* restore the comma */
2616                 if (valp != NULL)
2617                         *ep = '=';      /* restore the equals */
2618                 s = nextop;
2619         }
2620 }
2621
2622 /*
2623  * Function to inquire if an option exists in a mount options table.
2624  * Returns a pointer to the option if it exists, else NULL.
2625  *
2626  * This function is *not* for general use by filesystems.
2627  *
2628  * Note: caller is responsible for locking the vfs list, if needed,
2629  *       to protect mops.
2630  */
2631 struct mntopt *
2632 vfs_hasopt(const mntopts_t *mops, const char *opt)
2633 {
2634         struct mntopt *mop;
2635         uint_t i, count;
2636
2637         count = mops->mo_count;
2638         for (i = 0; i < count; i++) {
2639                 mop = &mops->mo_list[i];
2640
2641                 if (mop->mo_flags & MO_EMPTY)
2642                         continue;
2643                 if (strcmp(opt, mop->mo_name) == 0)
2644                         return (mop);
2645         }
2646         return (NULL);
2647 }
2648
2649 /*
2650  * Function to inquire if an option is set in a mount options table.
2651  * Returns non-zero if set and fills in the arg pointer with a pointer to
2652  * the argument string or NULL if there is no argument string.
2653  */
2654 static int
2655 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2656 {
2657         struct mntopt *mop;
2658         uint_t i, count;
2659
2660         count = mops->mo_count;
2661         for (i = 0; i < count; i++) {
2662                 mop = &mops->mo_list[i];
2663
2664                 if (mop->mo_flags & MO_EMPTY)
2665                         continue;
2666                 if (strcmp(opt, mop->mo_name))
2667                         continue;
2668                 if ((mop->mo_flags & MO_SET) == 0)
2669                         return (0);
2670                 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2671                         *argp = mop->mo_arg;
2672                 return (1);
2673         }
2674         return (0);
2675 }
2676
2677
2678 int
2679 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2680 {
2681         int ret;
2682
2683         vfs_list_read_lock();
2684         ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2685         vfs_list_unlock();
2686         return (ret);
2687 }
2688
2689
2690 /*
2691  * Construct a comma separated string of the options set in the given
2692  * mount table, return the string in the given buffer.  Return non-zero if
2693  * the buffer would overflow.
2694  *
2695  * This function is *not* for general use by filesystems.
2696  *
2697  * Note: caller is responsible for locking the vfs list, if needed,
2698  *       to protect mp.
2699  */
2700 int
2701 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2702 {
2703         char *cp;
2704         uint_t i;
2705
2706         buf[0] = '\0';
2707         cp = buf;
2708         for (i = 0; i < mp->mo_count; i++) {
2709                 struct mntopt *mop;
2710
2711                 mop = &mp->mo_list[i];
2712                 if (mop->mo_flags & MO_SET) {
2713                         int optlen, comma = 0;
2714
2715                         if (buf[0] != '\0')
2716                                 comma = 1;
2717                         optlen = strlen(mop->mo_name);
2718                         if (strlen(buf) + comma + optlen + 1 > len)
2719                                 goto err;
2720                         if (comma)
2721                                 *cp++ = ',';
2722                         (void) strcpy(cp, mop->mo_name);
2723                         cp += optlen;
2724                         /*
2725                          * Append option value if there is one
2726                          */
2727                         if (mop->mo_arg != NULL) {
2728                                 int arglen;
2729
2730                                 arglen = strlen(mop->mo_arg);
2731                                 if (strlen(buf) + arglen + 2 > len)
2732                                         goto err;
2733                                 *cp++ = '=';
2734                                 (void) strcpy(cp, mop->mo_arg);
2735                                 cp += arglen;
2736                         }
2737                 }
2738         }
2739         return (0);
2740 err:
2741         return (EOVERFLOW);
2742 }
2743
/*
 * Free a NULL-terminated array of option-name strings: each string is
 * released with its strlen() + 1 size, then the pointer array itself
 * (one slot per string plus the NULL terminator).  A NULL array is a no-op.
 */
static void
vfs_freecancelopt(char **moc)
{
        char **entry;
        int nentries;

        if (moc == NULL)
                return;

        nentries = 0;
        for (entry = moc; *entry != NULL; entry++) {
                kmem_free(*entry, strlen(*entry) + 1);
                nentries++;
        }
        kmem_free(moc, (nentries + 1) * sizeof (char *));
}
2758
2759 static void
2760 vfs_freeopt(mntopt_t *mop)
2761 {
2762         if (mop->mo_name != NULL)
2763                 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2764
2765         vfs_freecancelopt(mop->mo_cancel);
2766
2767         if (mop->mo_arg != NULL)
2768                 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2769 }
2770
2771 /*
2772  * Free a mount options table
2773  *
2774  * This function is *not* for general use by filesystems.
2775  *
2776  * Note: caller is responsible for locking the vfs list, if needed,
2777  *       to protect mp.
2778  */
2779 void
2780 vfs_freeopttbl(mntopts_t *mp)
2781 {
2782         uint_t i, count;
2783
2784         count = mp->mo_count;
2785         for (i = 0; i < count; i++) {
2786                 vfs_freeopt(&mp->mo_list[i]);
2787         }
2788         if (count) {
2789                 kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2790                 mp->mo_count = 0;
2791                 mp->mo_list = NULL;
2792         }
2793 }
2794
2795
2796 /* ARGSUSED */
2797 static int
2798 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2799         caller_context_t *ct)
2800 {
2801         return (0);
2802 }
2803
2804 /* ARGSUSED */
2805 static int
2806 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2807         caller_context_t *ct)
2808 {
2809         return (0);
2810 }
2811
2812 /*
2813  * The dummy vnode is currently used only by file events notification
2814  * module which is just interested in the timestamps.
2815  */
2816 /* ARGSUSED */
2817 static int
2818 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2819     caller_context_t *ct)
2820 {
2821         bzero(vap, sizeof (vattr_t));
2822         vap->va_type = VREG;
2823         vap->va_nlink = 1;
2824         vap->va_ctime = vfs_mnttab_ctime;
2825         /*
2826          * it is ok to just copy mtime as the time will be monotonically
2827          * increasing.
2828          */
2829         vap->va_mtime = vfs_mnttab_mtime;
2830         vap->va_atime = vap->va_mtime;
2831         return (0);
2832 }
2833
2834 static void
2835 vfs_mnttabvp_setup(void)
2836 {
2837         vnode_t *tvp;
2838         vnodeops_t *vfs_mntdummyvnops;
2839         const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2840                 VOPNAME_READ,           { .vop_read = vfs_mntdummyread },
2841                 VOPNAME_WRITE,          { .vop_write = vfs_mntdummywrite },
2842                 VOPNAME_GETATTR,        { .vop_getattr = vfs_mntdummygetattr },
2843                 VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
2844                 NULL,                   NULL
2845         };
2846
2847         if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2848             &vfs_mntdummyvnops) != 0) {
2849                 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2850                 /* Shouldn't happen, but not bad enough to panic */
2851                 return;
2852         }
2853
2854         /*
2855          * A global dummy vnode is allocated to represent mntfs files.
2856          * The mntfs file (/etc/mnttab) can be monitored for file events
2857          * and receive an event when mnttab changes. Dummy VOP calls
2858          * will be made on this vnode. The file events notification module
2859          * intercepts this vnode and delivers relevant events.
2860          */
2861         tvp = vn_alloc(KM_SLEEP);
2862         tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2863         vn_setops(tvp, vfs_mntdummyvnops);
2864         tvp->v_type = VREG;
2865         /*
2866          * The mnt dummy ops do not reference v_data.
2867          * No other module intercepting this vnode should either.
2868          * Just set it to point to itself.
2869          */
2870         tvp->v_data = (caddr_t)tvp;
2871         tvp->v_vfsp = rootvfs;
2872         vfs_mntdummyvp = tvp;
2873 }
2874
2875 /*
2876  * performs fake read/write ops
2877  */
2878 static void
2879 vfs_mnttab_rwop(int rw)
2880 {
2881         struct uio      uio;
2882         struct iovec    iov;
2883         char    buf[1];
2884
2885         if (vfs_mntdummyvp == NULL)
2886                 return;
2887
2888         bzero(&uio, sizeof (uio));
2889         bzero(&iov, sizeof (iov));
2890         iov.iov_base = buf;
2891         iov.iov_len = 0;
2892         uio.uio_iov = &iov;
2893         uio.uio_iovcnt = 1;
2894         uio.uio_loffset = 0;
2895         uio.uio_segflg = UIO_SYSSPACE;
2896         uio.uio_resid = 0;
2897         if (rw) {
2898                 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2899         } else {
2900                 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2901         }
2902 }
2903
/*
 * Generate a fake write operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_writeop(void)
{
        const int do_write = 1;

        vfs_mnttab_rwop(do_write);
}
2912
/*
 * Generate a fake read operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_readop(void)
{
        const int do_write = 0;

        vfs_mnttab_rwop(do_write);
}
2921
2922 /*
2923  * Free any mnttab information recorded in the vfs struct.
2924  * The vfs must not be on the vfs list.
2925  */
2926 static void
2927 vfs_freemnttab(struct vfs *vfsp)
2928 {
2929         ASSERT(!VFS_ON_LIST(vfsp));
2930
2931         /*
2932          * Free device and mount point information
2933          */
2934         if (vfsp->vfs_mntpt != NULL) {
2935                 refstr_rele(vfsp->vfs_mntpt);
2936                 vfsp->vfs_mntpt = NULL;
2937         }
2938         if (vfsp->vfs_resource != NULL) {
2939                 refstr_rele(vfsp->vfs_resource);
2940                 vfsp->vfs_resource = NULL;
2941         }
2942         /*
2943          * Now free mount options information
2944          */
2945         vfs_freeopttbl(&vfsp->vfs_mntopts);
2946 }
2947
2948 /*
2949  * Return the last mnttab modification time
2950  */
2951 void
2952 vfs_mnttab_modtime(timespec_t *ts)
2953 {
2954         ASSERT(RW_LOCK_HELD(&vfslist));
2955         *ts = vfs_mnttab_mtime;
2956 }
2957
2958 /*
2959  * See if mnttab is changed
2960  */
2961 void
2962 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2963 {
2964         int changed;
2965
2966         *phpp = (struct pollhead *)NULL;
2967
2968         /*
2969          * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2970          * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2971          * to not grab the vfs list lock because tv_sec is monotonically
2972          * increasing.
2973          */
2974
2975         changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2976             (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2977         if (!changed) {
2978                 *phpp = &vfs_pollhd;
2979         }
2980 }
2981
2982 /* Provide a unique and monotonically-increasing timestamp. */
2983 void
2984 vfs_mono_time(timespec_t *ts)
2985 {
2986         static volatile hrtime_t hrt;           /* The saved time. */
2987         hrtime_t        newhrt, oldhrt;         /* For effecting the CAS. */
2988         timespec_t      newts;
2989
2990         /*
2991          * Try gethrestime() first, but be prepared to fabricate a sensible
2992          * answer at the first sign of any trouble.
2993          */
2994         gethrestime(&newts);
2995         newhrt = ts2hrt(&newts);
2996         for (;;) {
2997                 oldhrt = hrt;
2998                 if (newhrt <= hrt)
2999                         newhrt = hrt + 1;
3000                 if (cas64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
3001                         break;
3002         }
3003         hrt2ts(newhrt, ts);
3004 }
3005
3006 /*
3007  * Update the mnttab modification time and wake up any waiters for
3008  * mnttab changes
3009  */
3010 void
3011 vfs_mnttab_modtimeupd()
3012 {
3013         hrtime_t oldhrt, newhrt;
3014
3015         ASSERT(RW_WRITE_HELD(&vfslist));
3016         oldhrt = ts2hrt(&vfs_mnttab_mtime);
3017         gethrestime(&vfs_mnttab_mtime);
3018         newhrt = ts2hrt(&vfs_mnttab_mtime);
3019         if (oldhrt == (hrtime_t)0)
3020                 vfs_mnttab_ctime = vfs_mnttab_mtime;
3021         /*
3022          * Attempt to provide unique mtime (like uniqtime but not).
3023          */
3024         if (newhrt == oldhrt) {
3025                 newhrt++;
3026                 hrt2ts(newhrt, &vfs_mnttab_mtime);
3027         }
3028         pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
3029         vfs_mnttab_writeop();
3030 }
3031
3032 int
3033 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
3034 {
3035         vnode_t *coveredvp;
3036         int error;
3037         extern void teardown_vopstats(vfs_t *);
3038
3039         /*
3040          * Get covered vnode. This will be NULL if the vfs is not linked
3041          * into the file system name space (i.e., domount() with MNT_NOSPICE).
3042          */
3043         coveredvp = vfsp->vfs_vnodecovered;
3044         ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3045
3046         /*
3047          * Purge all dnlc entries for this vfs.
3048          */
3049         (void) dnlc_purge_vfsp(vfsp, 0);
3050
3051         /* For forcible umount, skip VFS_SYNC() since it may hang */
3052         if ((flag & MS_FORCE) == 0)
3053                 (void) VFS_SYNC(vfsp, 0, cr);
3054
3055         /*
3056          * Lock the vfs to maintain fs status quo during unmount.  This
3057          * has to be done after the sync because ufs_update tries to acquire
3058          * the vfs_reflock.
3059          */
3060         vfs_lock_wait(vfsp);
3061
3062         if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3063                 vfs_unlock(vfsp);
3064                 if (coveredvp != NULL)
3065                         vn_vfsunlock(coveredvp);
3066         } else if (coveredvp != NULL) {
3067                 teardown_vopstats(vfsp);
3068                 /*
3069                  * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3070                  * when it frees vfsp so we do a VN_HOLD() so we can
3071                  * continue to use coveredvp afterwards.
3072                  */
3073                 VN_HOLD(coveredvp);
3074                 vfs_remove(vfsp);
3075                 vn_vfsunlock(coveredvp);
3076                 VN_RELE(coveredvp);
3077         } else {
3078                 teardown_vopstats(vfsp);
3079                 /*
3080                  * Release the reference to vfs that is not linked
3081                  * into the name space.
3082                  */
3083                 vfs_unlock(vfsp);
3084                 VFS_RELE(vfsp);
3085         }
3086         return (error);
3087 }
3088
3089
3090 /*
3091  * Vfs_unmountall() is called by uadmin() to unmount all
3092  * mounted file systems (except the root file system) during shutdown.
3093  * It follows the existing locking protocol when traversing the vfs list
3094  * to sync and unmount vfses. Even though there should be no
3095  * other thread running while the system is shutting down, it is prudent
3096  * to still follow the locking protocol.
3097  */
3098 void
3099 vfs_unmountall(void)
3100 {
3101         struct vfs *vfsp;
3102         struct vfs *prev_vfsp = NULL;
3103         int error;
3104
3105         /*
3106          * Toss all dnlc entries now so that the per-vfs sync
3107          * and unmount operations don't have to slog through
3108          * a bunch of uninteresting vnodes over and over again.
3109          */
3110         dnlc_purge();
3111
3112         vfs_list_lock();
3113         for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3114                 prev_vfsp = vfsp->vfs_prev;
3115
3116                 if (vfs_lock(vfsp) != 0)
3117                         continue;
3118                 error = vn_vfswlock(vfsp->vfs_vnodecovered);
3119                 vfs_unlock(vfsp);
3120                 if (error)
3121                         continue;
3122
3123                 vfs_list_unlock();
3124
3125                 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3126                 (void) dounmount(vfsp, 0, CRED());
3127
3128                 /*
3129                  * Since we dropped the vfslist lock above we must
3130                  * verify that next_vfsp still exists, else start over.
3131                  */
3132                 vfs_list_lock();
3133                 for (vfsp = rootvfs->vfs_prev;
3134                     vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3135                         if (vfsp == prev_vfsp)
3136                                 break;
3137                 if (vfsp == rootvfs && prev_vfsp != rootvfs)
3138                         prev_vfsp = rootvfs->vfs_prev;
3139         }
3140         vfs_list_unlock();
3141 }
3142
3143 /*
3144  * Called to add an entry to the end of the vfs mount in progress list
3145  */
3146 void
3147 vfs_addmip(dev_t dev, struct vfs *vfsp)
3148 {
3149         struct ipmnt *mipp;
3150
3151         mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3152         mipp->mip_next = NULL;
3153         mipp->mip_dev = dev;
3154         mipp->mip_vfsp = vfsp;
3155         mutex_enter(&vfs_miplist_mutex);
3156         if (vfs_miplist_end != NULL)
3157                 vfs_miplist_end->mip_next = mipp;
3158         else
3159                 vfs_miplist = mipp;
3160         vfs_miplist_end = mipp;
3161         mutex_exit(&vfs_miplist_mutex);
3162 }
3163
3164 /*
3165  * Called to remove an entry from the mount in progress list
3166  * Either because the mount completed or it failed.
3167  */
3168 void
3169 vfs_delmip(struct vfs *vfsp)
3170 {
3171         struct ipmnt *mipp, *mipprev;
3172
3173         mutex_enter(&vfs_miplist_mutex);
3174         mipprev = NULL;
3175         for (mipp = vfs_miplist;
3176             mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3177                 mipprev = mipp;
3178         }
3179         if (mipp == NULL)
3180                 return; /* shouldn't happen */
3181         if (mipp == vfs_miplist_end)
3182                 vfs_miplist_end = mipprev;
3183         if (mipprev == NULL)
3184                 vfs_miplist = mipp->mip_next;
3185         else
3186                 mipprev->mip_next = mipp->mip_next;
3187         mutex_exit(&vfs_miplist_mutex);
3188         kmem_free(mipp, sizeof (struct ipmnt));
3189 }
3190
3191 /*
3192  * vfs_add is called by a specific filesystem's mount routine to add
3193  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3194  * The vfs should already have been locked by the caller.
3195  *
3196  * coveredvp is NULL if this is the root.
3197  */
3198 void
3199 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3200 {
3201         int newflag;
3202
3203         ASSERT(vfs_lock_held(vfsp));
3204         VFS_HOLD(vfsp);
3205         newflag = vfsp->vfs_flag;
3206         if (mflag & MS_RDONLY)
3207                 newflag |= VFS_RDONLY;
3208         else
3209                 newflag &= ~VFS_RDONLY;
3210         if (mflag & MS_NOSUID)
3211                 newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3212         else
3213                 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3214         if (mflag & MS_NOMNTTAB)
3215                 newflag |= VFS_NOMNTTAB;
3216         else
3217                 newflag &= ~VFS_NOMNTTAB;
3218
3219         if (coveredvp != NULL) {
3220                 ASSERT(vn_vfswlock_held(coveredvp));
3221                 coveredvp->v_vfsmountedhere = vfsp;
3222                 VN_HOLD(coveredvp);
3223         }
3224         vfsp->vfs_vnodecovered = coveredvp;
3225         vfsp->vfs_flag = newflag;
3226
3227         vfs_list_add(vfsp);
3228 }
3229
3230 /*
3231  * Remove a vfs from the vfs list, null out the pointer from the
3232  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3233  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3234  * reference to the vfs and to the covered vnode.
3235  *
3236  * Called from dounmount after it's confirmed with the file system
3237  * that the unmount is legal.
3238  */
3239 void
3240 vfs_remove(struct vfs *vfsp)
3241 {
3242         vnode_t *vp;
3243
3244         ASSERT(vfs_lock_held(vfsp));
3245
3246         /*
3247          * Can't unmount root.  Should never happen because fs will
3248          * be busy.
3249          */
3250         if (vfsp == rootvfs)
3251                 panic("vfs_remove: unmounting root");
3252
3253         vfs_list_remove(vfsp);
3254
3255         /*
3256          * Unhook from the file system name space.
3257          */
3258         vp = vfsp->vfs_vnodecovered;
3259         ASSERT(vn_vfswlock_held(vp));
3260         vp->v_vfsmountedhere = NULL;
3261         vfsp->vfs_vnodecovered = NULL;
3262         VN_RELE(vp);
3263
3264         /*
3265          * Release lock and wakeup anybody waiting.
3266          */
3267         vfs_unlock(vfsp);
3268         VFS_RELE(vfsp);
3269 }
3270
3271 /*
3272  * Lock a filesystem to prevent access to it while mounting,
3273  * unmounting and syncing.  Return EBUSY immediately if lock
3274  * can't be acquired.
3275  */
3276 int
3277 vfs_lock(vfs_t *vfsp)
3278 {
3279         vn_vfslocks_entry_t *vpvfsentry;
3280
3281         vpvfsentry = vn_vfslocks_getlock(vfsp);
3282         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3283                 return (0);
3284
3285         vn_vfslocks_rele(vpvfsentry);
3286         return (EBUSY);
3287 }
3288
3289 int
3290 vfs_rlock(vfs_t *vfsp)
3291 {
3292         vn_vfslocks_entry_t *vpvfsentry;
3293
3294         vpvfsentry = vn_vfslocks_getlock(vfsp);
3295
3296         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3297                 return (0);
3298
3299         vn_vfslocks_rele(vpvfsentry);
3300         return (EBUSY);
3301 }
3302
3303 void
3304 vfs_lock_wait(vfs_t *vfsp)
3305 {
3306         vn_vfslocks_entry_t *vpvfsentry;
3307
3308         vpvfsentry = vn_vfslocks_getlock(vfsp);
3309         rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3310 }
3311
3312 void
3313 vfs_rlock_wait(vfs_t *vfsp)
3314 {
3315         vn_vfslocks_entry_t *vpvfsentry;
3316
3317         vpvfsentry = vn_vfslocks_getlock(vfsp);
3318         rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3319 }
3320
3321 /*
3322  * Unlock a locked filesystem.
3323  */
3324 void
3325 vfs_unlock(vfs_t *vfsp)
3326 {
3327         vn_vfslocks_entry_t *vpvfsentry;
3328
3329         /*
3330          * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3331          * And these changes should remain for the patch changes as it is.
3332          */
3333         if (panicstr)
3334                 return;
3335
3336         /*
3337          * ve_refcount needs to be dropped twice here.
3338          * 1. To release refernce after a call to vfs_locks_getlock()
3339          * 2. To release the reference from the locking routines like
3340          *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3341          */
3342
3343         vpvfsentry = vn_vfslocks_getlock(vfsp);
3344         vn_vfslocks_rele(vpvfsentry);
3345
3346         rwst_exit(&vpvfsentry->ve_lock);
3347         vn_vfslocks_rele(vpvfsentry);
3348 }
3349
3350 /*
3351  * Utility routine that allows a filesystem to construct its
3352  * fsid in "the usual way" - by munging some underlying dev_t and
3353  * the filesystem type number into the 64-bit fsid.  Note that
3354  * this implicitly relies on dev_t persistence to make filesystem
3355  * id's persistent.
3356  *
3357  * There's nothing to prevent an individual fs from constructing its
3358  * fsid in a different way, and indeed they should.
3359  *
3360  * Since we want fsids to be 32-bit quantities (so that they can be
3361  * exported identically by either 32-bit or 64-bit APIs, as well as
3362  * the fact that fsid's are "known" to NFS), we compress the device
3363  * number given down to 32-bits, and panic if that isn't possible.
3364  */
3365 void
3366 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3367 {
3368         if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3369                 panic("device number too big for fsid!");
3370         fsi->val[1] = val;
3371 }
3372
3373 int
3374 vfs_lock_held(vfs_t *vfsp)
3375 {
3376         int held;
3377         vn_vfslocks_entry_t *vpvfsentry;
3378
3379         /*
3380          * vfs_lock_held will mimic sema_held behaviour
3381          * if panicstr is set. And these changes should remain
3382          * for the patch changes as it is.
3383          */
3384         if (panicstr)
3385                 return (1);
3386
3387         vpvfsentry = vn_vfslocks_getlock(vfsp);
3388         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3389
3390         vn_vfslocks_rele(vpvfsentry);
3391         return (held);
3392 }
3393
3394 struct _kthread *
3395 vfs_lock_owner(vfs_t *vfsp)
3396 {
3397         struct _kthread *owner;
3398         vn_vfslocks_entry_t *vpvfsentry;
3399
3400         /*
3401          * vfs_wlock_held will mimic sema_held behaviour
3402          * if panicstr is set. And these changes should remain
3403          * for the patch changes as it is.
3404          */
3405         if (panicstr)
3406                 return (NULL);
3407
3408         vpvfsentry = vn_vfslocks_getlock(vfsp);
3409         owner = rwst_owner(&vpvfsentry->ve_lock);
3410
3411         vn_vfslocks_rele(vpvfsentry);
3412         return (owner);
3413 }
3414
3415 /*
3416  * vfs list locking.
3417  *
3418  * Rather than manipulate the vfslist lock directly, we abstract into lock
3419  * and unlock routines to allow the locking implementation to be changed for
3420  * clustering.
3421  *
3422  * Whenever the vfs list is modified through its hash links, the overall list
3423  * lock must be obtained before locking the relevant hash bucket.  But to see
3424  * whether a given vfs is on the list, it suffices to obtain the lock for the
3425  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3426  */
3427
3428 void
3429 vfs_list_lock()
3430 {
3431         rw_enter(&vfslist, RW_WRITER);
3432 }
3433
3434 void
3435 vfs_list_read_lock()
3436 {
3437         rw_enter(&vfslist, RW_READER);
3438 }
3439
3440 void
3441 vfs_list_unlock()
3442 {
3443         rw_exit(&vfslist);
3444 }
3445
3446 /*
3447  * Low level worker routines for adding entries to and removing entries from
3448  * the vfs list.
3449  */
3450
3451 static void
3452 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3453 {
3454         int vhno;
3455         struct vfs **hp;
3456         dev_t dev;
3457
3458         ASSERT(RW_WRITE_HELD(&vfslist));
3459
3460         dev = expldev(vfsp->vfs_fsid.val[0]);
3461         vhno = VFSHASH(getmajor(dev), getminor(dev));
3462
3463         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3464
3465         /*
3466          * Link into the hash table, inserting it at the end, so that LOFS
3467          * with the same fsid as UFS (or other) file systems will not hide the
3468          * UFS.
3469          */
3470         if (insert_at_head) {
3471                 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3472                 rvfs_list[vhno].rvfs_head = vfsp;
3473         } else {
3474                 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3475                     hp = &(*hp)->vfs_hash)
3476                         continue;
3477                 /*
3478                  * hp now contains the address of the pointer to update
3479                  * to effect the insertion.
3480                  */
3481                 vfsp->vfs_hash = NULL;
3482                 *hp = vfsp;
3483         }
3484
3485         rvfs_list[vhno].rvfs_len++;
3486         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3487 }
3488
3489
3490 static void
3491 vfs_hash_remove(struct vfs *vfsp)
3492 {
3493         int vhno;
3494         struct vfs *tvfsp;
3495         dev_t dev;
3496
3497         ASSERT(RW_WRITE_HELD(&vfslist));
3498
3499         dev = expldev(vfsp->vfs_fsid.val[0]);
3500         vhno = VFSHASH(getmajor(dev), getminor(dev));
3501
3502         mutex_enter(&rvfs_list[vhno].rvfs_lock);
3503
3504         /*
3505          * Remove from hash.
3506          */
3507         if (rvfs_list[vhno].rvfs_head == vfsp) {
3508                 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3509                 rvfs_list[vhno].rvfs_len--;
3510                 goto foundit;
3511         }
3512         for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3513             tvfsp = tvfsp->vfs_hash) {
3514                 if (tvfsp->vfs_hash == vfsp) {
3515                         tvfsp->vfs_hash = vfsp->vfs_hash;
3516                         rvfs_list[vhno].rvfs_len--;
3517                         goto foundit;
3518                 }
3519         }
3520         cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3521
3522 foundit:
3523
3524         mutex_exit(&rvfs_list[vhno].rvfs_lock);
3525 }
3526
3527
3528 void
3529 vfs_list_add(struct vfs *vfsp)
3530 {
3531         zone_t *zone;
3532
3533         /*
3534          * Typically, the vfs_t will have been created on behalf of the file
3535          * system in vfs_init, where it will have been provided with a
3536          * vfs_impl_t. This, however, might be lacking if the vfs_t was created
3537          * by an unbundled file system. We therefore check for such an example
3538          * before stamping the vfs_t with its creation time for the benefit of
3539          * mntfs.
3540          */
3541         if (vfsp->vfs_implp == NULL)
3542                 vfsimpl_setup(vfsp);
3543         vfs_mono_time(&vfsp->vfs_hrctime);
3544
3545         /*
3546          * The zone that owns the mount is the one that performed the mount.
3547          * Note that this isn't necessarily the same as the zone mounted into.
3548          * The corresponding zone_rele() will be done when the vfs_t is
3549          * being free'd.
3550          */
3551         vfsp->vfs_zone = curproc->p_zone;
3552         zone_hold(vfsp->vfs_zone);
3553
3554         /*
3555          * Find the zone mounted into, and put this mount on its vfs list.
3556          */
3557         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3558         ASSERT(zone != NULL);
3559         /*
3560          * Special casing for the root vfs.  This structure is allocated
3561          * statically and hooked onto rootvfs at link time.  During the
3562          * vfs_mountroot call at system startup time, the root file system's
3563          * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3564          * as argument.  The code below must detect and handle this special
3565          * case.  The only apparent justification for this special casing is
3566          * to ensure that the root file system appears at the head of the
3567          * list.
3568          *
3569          * XXX: I'm assuming that it's ok to do normal list locking when
3570          *      adding the entry for the root file system (this used to be
3571          *      done with no locks held).
3572          */
3573         vfs_list_lock();
3574         /*
3575          * Link into the vfs list proper.
3576          */
3577         if (vfsp == &root) {
3578                 /*
3579                  * Assert: This vfs is already on the list as its first entry.
3580                  * Thus, there's nothing to do.
3581                  */
3582                 ASSERT(rootvfs == vfsp);
3583                 /*
3584                  * Add it to the head of the global zone's vfslist.
3585                  */
3586                 ASSERT(zone == global_zone);
3587                 ASSERT(zone->zone_vfslist == NULL);
3588                 zone->zone_vfslist = vfsp;
3589         } else {
3590                 /*
3591                  * Link to end of list using vfs_prev (as rootvfs is now a
3592                  * doubly linked circular list) so list is in mount order for
3593                  * mnttab use.
3594                  */
3595                 rootvfs->vfs_prev->vfs_next = vfsp;
3596                 vfsp->vfs_prev = rootvfs->vfs_prev;
3597                 rootvfs->vfs_prev = vfsp;
3598                 vfsp->vfs_next = rootvfs;
3599
3600                 /*
3601                  * Do it again for the zone-private list (which may be NULL).
3602                  */
3603                 if (zone->zone_vfslist == NULL) {
3604                         ASSERT(zone != global_zone);
3605                         zone->zone_vfslist = vfsp;
3606                 } else {
3607                         zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3608                         vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3609                         zone->zone_vfslist->vfs_zone_prev = vfsp;
3610                         vfsp->vfs_zone_next = zone->zone_vfslist;
3611                 }
3612         }
3613
3614         /*
3615          * Link into the hash table, inserting it at the end, so that LOFS
3616          * with the same fsid as UFS (or other) file systems will not hide
3617          * the UFS.
3618          */
3619         vfs_hash_add(vfsp, 0);
3620
3621         /*
3622          * update the mnttab modification time
3623          */
3624         vfs_mnttab_modtimeupd();
3625         vfs_list_unlock();
3626         zone_rele(zone);
3627 }
3628
3629 void
3630 vfs_list_remove(struct vfs *vfsp)
3631 {
3632         zone_t *zone;
3633
3634         zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3635         ASSERT(zone != NULL);
3636         /*
3637          * Callers are responsible for preventing attempts to unmount the
3638          * root.
3639          */
3640         ASSERT(vfsp != rootvfs);
3641
3642         vfs_list_lock();
3643
3644         /*
3645          * Remove from hash.
3646          */
3647         vfs_hash_remove(vfsp);
3648
3649         /*
3650          * Remove from vfs list.
3651          */
3652         vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3653         vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3654         vfsp->vfs_next = vfsp->vfs_prev = NULL;
3655
3656         /*
3657          * Remove from zone-specific vfs list.
3658          */
3659         if (zone->zone_vfslist == vfsp)
3660                 zone->zone_vfslist = vfsp->vfs_zone_next;
3661
3662         if (vfsp->vfs_zone_next == vfsp) {
3663                 ASSERT(vfsp->vfs_zone_prev == vfsp);
3664                 ASSERT(zone->zone_vfslist == vfsp);
3665                 zone->zone_vfslist = NULL;
3666         }
3667
3668         vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3669         vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3670         vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3671
3672         /*
3673          * update the mnttab modification time
3674          */
3675         vfs_mnttab_modtimeupd();
3676         vfs_list_unlock();
3677         zone_rele(zone);
3678 }
3679
3680 struct vfs *
3681 getvfs(fsid_t *fsid)
3682 {
3683         struct vfs *vfsp;
3684         int val0 = fsid->val[0];
3685         int val1 = fsid->val[1];
3686         dev_t dev = expldev(val0);
3687         int vhno = VFSHASH(getmajor(dev), getminor(dev));
3688         kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3689
3690         mutex_enter(hmp);
3691         for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3692                 if (vfsp->vfs_fsid.val[0] == val0 &&
3693                     vfsp->vfs_fsid.val[1] == val1) {
3694                         VFS_HOLD(vfsp);
3695                         mutex_exit(hmp);
3696                         return (vfsp);
3697                 }
3698         }
3699         mutex_exit(hmp);
3700         return (NULL);
3701 }
3702
3703 /*
3704  * Search the vfs mount in progress list for a specified device/vfs entry.
3705  * Returns 0 if the first entry in the list that the device matches has the
3706  * given vfs pointer as well.  If the device matches but a different vfs
3707  * pointer is encountered in the list before the given vfs pointer then
3708  * a 1 is returned.
3709  */
3710
3711 int
3712 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3713 {
3714         int retval = 0;
3715         struct ipmnt *mipp;
3716
3717         mutex_enter(&vfs_miplist_mutex);
3718         for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3719                 if (mipp->mip_dev == dev) {
3720                         if (mipp->mip_vfsp != vfsp)
3721                                 retval = 1;
3722                         break;
3723                 }
3724         }
3725         mutex_exit(&vfs_miplist_mutex);
3726         return (retval);
3727 }
3728
3729 /*
3730  * Search the vfs list for a specified device.  Returns 1, if entry is found
3731  * or 0 if no suitable entry is found.
3732  */
3733
3734 int
3735 vfs_devismounted(dev_t dev)
3736 {
3737         struct vfs *vfsp;
3738         int found;
3739
3740         vfs_list_read_lock();
3741         vfsp = rootvfs;
3742         found = 0;
3743         do {
3744                 if (vfsp->vfs_dev == dev) {
3745                         found = 1;
3746                         break;
3747                 }
3748                 vfsp = vfsp->vfs_next;
3749         } while (vfsp != rootvfs);
3750
3751         vfs_list_unlock();
3752         return (found);
3753 }
3754
3755 /*
3756  * Search the vfs list for a specified device.  Returns a pointer to it
3757  * or NULL if no suitable entry is found. The caller of this routine
3758  * is responsible for releasing the returned vfs pointer.
3759  */
3760 struct vfs *
3761 vfs_dev2vfsp(dev_t dev)
3762 {
3763         struct vfs *vfsp;
3764         int found;
3765
3766         vfs_list_read_lock();
3767         vfsp = rootvfs;
3768         found = 0;
3769         do {
3770                 /*
3771                  * The following could be made more efficient by making
3772                  * the entire loop use vfs_zone_next if the call is from
3773                  * a zone.  The only callers, however, ustat(2) and
3774                  * umount2(2), don't seem to justify the added
3775                  * complexity at present.
3776                  */
3777                 if (vfsp->vfs_dev == dev &&
3778                     ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3779                     curproc->p_zone)) {
3780                         VFS_HOLD(vfsp);
3781                         found = 1;
3782                         break;
3783                 }
3784                 vfsp = vfsp->vfs_next;
3785         } while (vfsp != rootvfs);
3786         vfs_list_unlock();
3787         return (found ? vfsp: NULL);
3788 }
3789
3790 /*
3791  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3792  * or NULL if no suitable entry is found. The caller of this routine
3793  * is responsible for releasing the returned vfs pointer.
3794  *
3795  * Note that if multiple mntpoints match, the last one matching is
3796  * returned in an attempt to return the "top" mount when overlay
3797  * mounts are covering the same mount point.  This is accomplished by starting
3798  * at the end of the list and working our way backwards, stopping at the first
3799  * matching mount.
3800  */
3801 struct vfs *
3802 vfs_mntpoint2vfsp(const char *mp)
3803 {
3804         struct vfs *vfsp;
3805         struct vfs *retvfsp = NULL;
3806         zone_t *zone = curproc->p_zone;
3807         struct vfs *list;
3808
3809         vfs_list_read_lock();
3810         if (getzoneid() == GLOBAL_ZONEID) {
3811                 /*
3812                  * The global zone may see filesystems in any zone.
3813                  */
3814                 vfsp = rootvfs->vfs_prev;
3815                 do {
3816                         if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3817                                 retvfsp = vfsp;
3818                                 break;
3819                         }
3820                         vfsp = vfsp->vfs_prev;
3821                 } while (vfsp != rootvfs->vfs_prev);
3822         } else if ((list = zone->zone_vfslist) != NULL) {
3823                 const char *mntpt;
3824
3825                 vfsp = list->vfs_zone_prev;
3826                 do {
3827                         mntpt = refstr_value(vfsp->vfs_mntpt);
3828                         mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3829                         if (strcmp(mntpt, mp) == 0) {
3830                                 retvfsp = vfsp;
3831                                 break;
3832                         }
3833                         vfsp = vfsp->vfs_zone_prev;
3834                 } while (vfsp != list->vfs_zone_prev);
3835         }
3836         if (retvfsp)
3837                 VFS_HOLD(retvfsp);
3838         vfs_list_unlock();
3839         return (retvfsp);
3840 }
3841
3842 /*
3843  * Search the vfs list for a specified vfsops.
3844  * if vfs entry is found then return 1, else 0.
3845  */
3846 int
3847 vfs_opsinuse(vfsops_t *ops)
3848 {
3849         struct vfs *vfsp;
3850         int found;
3851
3852         vfs_list_read_lock();
3853         vfsp = rootvfs;
3854         found = 0;
3855         do {
3856                 if (vfs_getops(vfsp) == ops) {
3857                         found = 1;
3858                         break;
3859                 }
3860                 vfsp = vfsp->vfs_next;
3861         } while (vfsp != rootvfs);
3862         vfs_list_unlock();
3863         return (found);
3864 }
3865
3866 /*
3867  * Allocate an entry in vfssw for a file system type
3868  */
3869 struct vfssw *
3870 allocate_vfssw(const char *type)
3871 {
3872         struct vfssw *vswp;
3873
3874         if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3875                 /*
3876                  * The vfssw table uses the empty string to identify an
3877                  * available entry; we cannot add any type which has
3878                  * a leading NUL. The string length is limited to
3879                  * the size of the st_fstype array in struct stat.
3880                  */
3881                 return (NULL);
3882         }
3883
3884         ASSERT(VFSSW_WRITE_LOCKED());
3885         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3886                 if (!ALLOCATED_VFSSW(vswp)) {
3887                         vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3888                         (void) strcpy(vswp->vsw_name, type);
3889                         ASSERT(vswp->vsw_count == 0);
3890                         vswp->vsw_count = 1;
3891                         mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3892                         return (vswp);
3893                 }
3894         return (NULL);
3895 }
3896
/*
 * Translate a vfs type name into the name of the module that implements it.
 * "proc" maps to "procfs", "fd" maps to "fdfs", and any name beginning with
 * "nfs" maps to "nfs".  Every other name is returned unchanged (the same
 * pointer that was passed in).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
        if (strcmp(vfstype, "proc") == 0)
                return ("procfs");

        if (strcmp(vfstype, "fd") == 0)
                return ("fdfs");

        /* Prefix match: every name that starts with "nfs" shares a module. */
        if (strncmp(vfstype, "nfs", 3) == 0)
                return ("nfs");

        return (vfstype);
}
3914
/*
 * Find a vfssw entry given a file system type name, trying to autoload the
 * file system module if the entry is missing or not yet installed.
 *
 * Returns the vfssw entry, or NULL if none could be found, allocated or
 * loaded.  A returned entry carries either the reference taken by
 * vfs_getvfsswbyname() or the initial count of 1 set by allocate_vfssw();
 * that count is what guards against unloading.  No vfssw table lock is held
 * on return.
 *
 * NOTE(review): in the autoload loop, when the first lookup found an entry
 * that is not installed and modload() then fails, NULL is returned without
 * an unreference in this function.  This looks like a leaked vsw_count
 * reference -- confirm against the callers and mod_removefs().
 */
struct vfssw *
vfs_getvfssw(const char *type)
{
        struct vfssw *vswp;
        const char *modname;

        RLOCK_VFSSW();
        vswp = vfs_getvfsswbyname(type);
        modname = vfs_to_modname(type);

        if (rootdir == NULL) {
                /*
                 * If we haven't yet loaded the root file system, then our
                 * _init won't be called until later. Allocate vfssw entry,
                 * because mod_installfs won't be called.
                 */
                if (vswp == NULL) {
                        /*
                         * Trade the read lock for the write lock.  The table
                         * is unlocked in between, so look the name up again
                         * before allocating a new entry.
                         */
                        RUNLOCK_VFSSW();
                        WLOCK_VFSSW();
                        if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
                                if ((vswp = allocate_vfssw(type)) == NULL) {
                                        WUNLOCK_VFSSW();
                                        return (NULL);
                                }
                        }
                        WUNLOCK_VFSSW();
                        RLOCK_VFSSW();
                }
                /*
                 * Only load the module here; the result of modloadonly()
                 * is deliberately ignored and the entry is returned either
                 * way.
                 */
                if (!VFS_INSTALLED(vswp)) {
                        RUNLOCK_VFSSW();
                        (void) modloadonly("fs", modname);
                } else
                        RUNLOCK_VFSSW();
                return (vswp);
        }

        /*
         * Try to load the filesystem.  Before calling modload(), we drop
         * our lock on the VFS switch table, and pick it up after the
         * module is loaded.  However, there is a potential race:  the
         * module could be unloaded after the call to modload() completes
         * but before we pick up the lock and drive on.  Therefore,
         * we keep reloading the module until we've loaded the module
         * _and_ we have the lock on the VFS switch table.
         */
        while (vswp == NULL || !VFS_INSTALLED(vswp)) {
                RUNLOCK_VFSSW();
                if (modload("fs", modname) == -1)
                        return (NULL);
                RLOCK_VFSSW();
                /*
                 * If there was no entry before the load, look again; if
                 * there still is none, give up and return NULL.
                 */
                if (vswp == NULL)
                        if ((vswp = vfs_getvfsswbyname(type)) == NULL)
                                break;
        }
        RUNLOCK_VFSSW();

        return (vswp);
}
3978
3979 /*
3980  * Find a vfssw entry given a file system type name.
3981  */
3982 struct vfssw *
3983 vfs_getvfsswbyname(const char *type)
3984 {
3985         struct vfssw *vswp;
3986
3987         ASSERT(VFSSW_LOCKED());
3988         if (type == NULL || *type == '\0')
3989                 return (NULL);
3990
3991         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3992                 if (strcmp(type, vswp->vsw_name) == 0) {
3993                         vfs_refvfssw(vswp);
3994                         return (vswp);
3995                 }
3996         }
3997
3998         return (NULL);
3999 }
4000
4001 /*
4002  * Find a vfssw entry given a set of vfsops.
4003  */
4004 struct vfssw *
4005 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
4006 {
4007         struct vfssw *vswp;
4008
4009         RLOCK_VFSSW();
4010         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4011                 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
4012                         vfs_refvfssw(vswp);
4013                         RUNLOCK_VFSSW();
4014                         return (vswp);
4015                 }
4016         }
4017         RUNLOCK_VFSSW();
4018
4019         return (NULL);
4020 }
4021
/*
 * Reference a vfssw entry: increment vsw_count while holding the entry's
 * vsw_lock.  Undone by vfs_unrefvfssw().
 */
void
vfs_refvfssw(struct vfssw *vswp)
{

        mutex_enter(&vswp->vsw_lock);
        vswp->vsw_count++;
        mutex_exit(&vswp->vsw_lock);
}
4033
/*
 * Unreference a vfssw entry: decrement vsw_count while holding the entry's
 * vsw_lock.  Nothing is freed or torn down here, whatever the new count.
 */
void
vfs_unrefvfssw(struct vfssw *vswp)
{

        mutex_enter(&vswp->vsw_lock);
        vswp->vsw_count--;
        mutex_exit(&vswp->vsw_lock);
}
4045
/*
 * State used by vfs_syncall() and vfs_syncprogress().
 */
int sync_timeout = 30;          /* timeout for syncing a page during panic */
int sync_timeleft;              /* portion of sync_timeout remaining */

static int sync_retries = 20;   /* number of retries when not making progress */
static int sync_triesleft;      /* portion of sync_retries remaining */

/*
 * Dirty page and buffer counts tracked by vfs_syncall(): new_* is the count
 * from the latest pass, old_* the lowest count seen on earlier passes.
 */
static pgcnt_t old_pgcnt, new_pgcnt;
static int new_bufcnt, old_bufcnt;
4054
4055 /*
4056  * Sync all of the mounted filesystems, and then wait for the actual i/o to
4057  * complete.  We wait by counting the number of dirty pages and buffers,
4058  * pushing them out using bio_busy() and page_busy(), and then counting again.
4059  * This routine is used during both the uadmin A_SHUTDOWN code as well as
4060  * the SYNC phase of the panic code (see comments in panic.c).  It should only
4061  * be used after some higher-level mechanism has quiesced the system so that
4062  * new writes are not being initiated while we are waiting for completion.
4063  *
4064  * To ensure finite running time, our algorithm uses two timeout mechanisms:
4065  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
4066  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
4067  * Together these ensure that syncing completes if our i/o paths are stuck.
4068  * The counters are declared above so they can be found easily in the debugger.
4069  *
4070  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
4071  * vfs_syncprogress() subroutine whenever we make progress through the lists of
4072  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
4073  * When vfs_syncall() decides it is done, we disable the deadman() counter by
4074  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
4075  * deadlocking or hanging inside of a broken filesystem or driver routine.
4076  *
4077  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4078  * sync_retries consecutive calls to bio_busy() and page_busy() without
4079  * decreasing either the number of dirty buffers or dirty pages below the
4080  * lowest count we have seen so far, we give up and return from vfs_syncall().
4081  *
4082  * Each loop iteration ends with a call to delay() one second to allow time for
4083  * i/o completion and to permit the user time to read our progress messages.
4084  */
4085 void
4086 vfs_syncall(void)
4087 {
4088         if (rootdir == NULL && !modrootloaded)
4089                 return; /* panic during boot - no filesystems yet */
4090
4091         printf("syncing file systems...");
4092         vfs_syncprogress();
4093         sync();
4094
4095         vfs_syncprogress();
4096         sync_triesleft = sync_retries;
4097
4098         old_bufcnt = new_bufcnt = INT_MAX;
4099         old_pgcnt = new_pgcnt = ULONG_MAX;
4100
4101         while (sync_triesleft > 0) {
4102                 old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4103                 old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4104
4105                 new_bufcnt = bio_busy(B_TRUE);
4106                 new_pgcnt = page_busy(B_TRUE);
4107                 vfs_syncprogress();
4108
4109                 if (new_bufcnt == 0 && new_pgcnt == 0)
4110                         break;
4111
4112                 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4113                         sync_triesleft = sync_retries;
4114                 else
4115                         sync_triesleft--;
4116
4117                 if (new_bufcnt)
4118                         printf(" [%d]", new_bufcnt);
4119                 if (new_pgcnt)
4120                         printf(" %lu", new_pgcnt);
4121
4122                 delay(hz);
4123         }
4124
4125         if (new_bufcnt != 0 || new_pgcnt != 0)
4126                 printf(" done (not all i/o completed)\n");
4127         else
4128                 printf(" done\n");
4129
4130         sync_timeleft = 0;
4131         delay(hz);
4132 }
4133
4134 /*
4135  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
4136  * sync_timeout to indicate that we are making progress and the deadman()
4137  * omnipresent cyclic should not yet time us out.  Note that it is safe to
4138  * store to sync_timeleft here since the deadman() is firing at high-level
4139  * on top of us.  If we are racing with the deadman(), either the deadman()
4140  * will decrement the old value and then we will reset it, or we will
4141  * reset it and then the deadman() will immediately decrement it.  In either
4142  * case, correct behavior results.
4143  */
4144 void
4145 vfs_syncprogress(void)
4146 {
4147         if (panicstr)
4148                 sync_timeleft = sync_timeout;
4149 }
4150
4151 /*
4152  * Map VFS flags to statvfs flags.  These shouldn't really be separate
4153  * flags at all.
4154  */
4155 uint_t
4156 vf_to_stf(uint_t vf)
4157 {
4158         uint_t stf = 0;
4159
4160         if (vf & VFS_RDONLY)
4161                 stf |= ST_RDONLY;
4162         if (vf & VFS_NOSETUID)
4163                 stf |= ST_NOSUID;
4164         if (vf & VFS_NOTRUNC)
4165                 stf |= ST_NOTRUNC;
4166
4167         return (stf);
4168 }
4169
/*
 * Sync entry for the (illegal) fstype 0.  Reaching it means a vfs operation
 * was dispatched to the "stray" ops vector, so it raises a CE_PANIC.  The
 * arguments are ignored; the return exists only to satisfy the signature.
 */
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
        cmn_err(CE_PANIC, "stray vfs operation");
        return (0);
}
4180
/*
 * Generic entry for the (illegal) fstype 0, used for every stray vfs
 * operation other than sync.  Raises a CE_PANIC; the return exists only to
 * satisfy the signature.
 */
int
vfsstray(void)
{
        cmn_err(CE_PANIC, "stray vfs operation");
        return (0);
}
4190
/*
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS. Could be used by any filesystem.
 * See bug 1203132.
 *
 * Generic vfs operation that does nothing and always returns EIO.
 */
int
vfs_EIO(void)
{
        return (EIO);
}
4201
/*
 * Sync operation that ignores its arguments and always returns EIO.
 *
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
        return (EIO);
}
4213
/*
 * The dummy EIO file system and its operations vector.  Both are set up by
 * vfsinit(), which also marks EIO_vfs as VFS_UNMOUNTED.
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
4216
4217 /*
4218  * Called from startup() to initialize all loaded vfs's
4219  */
4220 void
4221 vfsinit(void)
4222 {
4223         struct vfssw *vswp;
4224         int error;
4225         extern int vopstats_enabled;
4226         extern void vopstats_startup();
4227
4228         static const fs_operation_def_t EIO_vfsops_template[] = {
4229                 VFSNAME_MOUNT,          { .error = vfs_EIO },
4230                 VFSNAME_UNMOUNT,        { .error = vfs_EIO },
4231                 VFSNAME_ROOT,           { .error = vfs_EIO },
4232                 VFSNAME_STATVFS,        { .error = vfs_EIO },
4233                 VFSNAME_SYNC,           { .vfs_sync = vfs_EIO_sync },
4234                 VFSNAME_VGET,           { .error = vfs_EIO },
4235                 VFSNAME_MOUNTROOT,      { .error = vfs_EIO },
4236                 VFSNAME_FREEVFS,        { .error = vfs_EIO },
4237                 VFSNAME_VNSTATE,        { .error = vfs_EIO },
4238                 NULL, NULL
4239         };
4240
4241         static const fs_operation_def_t stray_vfsops_template[] = {
4242                 VFSNAME_MOUNT,          { .error = vfsstray },
4243                 VFSNAME_UNMOUNT,        { .error = vfsstray },
4244                 VFSNAME_ROOT,           { .error = vfsstray },
4245                 VFSNAME_STATVFS,        { .error = vfsstray },
4246                 VFSNAME_SYNC,           { .vfs_sync = vfsstray_sync },
4247                 VFSNAME_VGET,           { .error = vfsstray },
4248                 VFSNAME_MOUNTROOT,      { .error = vfsstray },
4249                 VFSNAME_FREEVFS,        { .error = vfsstray },
4250                 VFSNAME_VNSTATE,        { .error = vfsstray },
4251                 NULL, NULL
4252         };
4253
4254         /* Create vfs cache */
4255         vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4256             sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4257
4258         /* Initialize the vnode cache (file systems may use it during init). */
4259         vn_create_cache();
4260
4261         /* Setup event monitor framework */
4262         fem_init();
4263
4264         /* Initialize the dummy stray file system type. */
4265         error = vfs_setfsops(0, stray_vfsops_template, NULL);
4266
4267         /* Initialize the dummy EIO file system. */
4268         error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4269         if (error != 0) {
4270                 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4271                 /* Shouldn't happen, but not bad enough to panic */
4272         }
4273
4274         VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4275
4276         /*
4277          * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4278          * on this vfs can immediately notice it's invalid.
4279          */
4280         EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4281
4282         /*
4283          * Call the init routines of non-loadable filesystems only.
4284          * Filesystems which are loaded as separate modules will be
4285          * initialized by the module loading code instead.
4286          */
4287
4288         for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4289                 RLOCK_VFSSW();
4290                 if (vswp->vsw_init != NULL)
4291                         (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4292                 RUNLOCK_VFSSW();
4293         }
4294
4295         vopstats_startup();
4296
4297         if (vopstats_enabled) {
4298                 /* EIO_vfs can collect stats, but we don't retrieve them */
4299                 initialize_vopstats(&EIO_vfs.vfs_vopstats);
4300                 EIO_vfs.vfs_fstypevsp = NULL;
4301                 EIO_vfs.vfs_vskap = NULL;
4302                 EIO_vfs.vfs_flag |= VFS_STATS;
4303         }
4304
4305         xattr_init();
4306
4307         reparse_point_init();
4308 }
4309
4310 vfs_t *
4311 vfs_alloc(int kmflag)
4312 {
4313         vfs_t *vfsp;
4314
4315         vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4316
4317         /*
4318          * Do the simplest initialization here.
4319          * Everything else gets done in vfs_init()
4320          */
4321         bzero(vfsp, sizeof (vfs_t));
4322         return (vfsp);
4323 }
4324
4325 void
4326 vfs_free(vfs_t *vfsp)
4327 {
4328         /*
4329          * One would be tempted to assert that "vfsp->vfs_count == 0".
4330          * The problem is that this gets called out of domount() with
4331          * a partially initialized vfs and a vfs_count of 1.  This is
4332          * also called from vfs_rele() with a vfs_count of 0.  We can't
4333          * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4334          * returned.  This is because VFS_MOUNT() fully initializes the
4335          * vfs structure and its associated data.  VFS_RELE() will call
4336          * VFS_FREEVFS() which may panic the system if the data structures
4337          * aren't fully initialized from a successful VFS_MOUNT()).
4338          */
4339
4340         /* If FEM was in use, make sure everything gets cleaned up */
4341         if (vfsp->vfs_femhead) {
4342                 ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4343                 mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4344                 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4345                 vfsp->vfs_femhead = NULL;
4346         }
4347
4348         if (vfsp->vfs_implp)
4349                 vfsimpl_teardown(vfsp);
4350         sema_destroy(&vfsp->vfs_reflock);
4351         kmem_cache_free(vfs_cache, vfsp);
4352 }
4353
4354 /*
4355  * Increments the vfs reference count by one atomically.
4356  */
4357 void
4358 vfs_hold(vfs_t *vfsp)
4359 {
4360         atomic_add_32(&vfsp->vfs_count, 1);
4361         ASSERT(vfsp->vfs_count != 0);
4362 }
4363
4364 /*
4365  * Decrements the vfs reference count by one atomically. When
4366  * vfs reference count becomes zero, it calls the file system
4367  * specific vfs_freevfs() to free up the resources.
4368  */
4369 void
4370 vfs_rele(vfs_t *vfsp)
4371 {
4372         ASSERT(vfsp->vfs_count != 0);
4373         if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
4374                 VFS_FREEVFS(vfsp);
4375                 lofi_remove(vfsp);
4376                 if (vfsp->vfs_zone)
4377                         zone_rele(vfsp->vfs_zone);
4378                 vfs_freemnttab(vfsp);
4379                 vfs_free(vfsp);
4380         }
4381 }
4382
4383 /*
4384  * Generic operations vector support.
4385  *
4386  * This is used to build operations vectors for both the vfs and vnode.
4387  * It's normally called only when a file system is loaded.
4388  *
4389  * There are many possible algorithms for this, including the following:
4390  *
4391  *   (1) scan the list of known operations; for each, see if the file system
4392  *       includes an entry for it, and fill it in as appropriate.
4393  *
4394  *   (2) set up defaults for all known operations.  scan the list of ops
4395  *       supplied by the file system; for each which is both supplied and
4396  *       known, fill it in.
4397  *
4398  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4399  *       in entries as we go.
4400  *
4401  * we choose (1) for simplicity, and because performance isn't critical here.
4402  * note that (2) could be sped up using a precomputed hash table on known ops.
4403  * (3) could be faster than either, but only if the lists were very large or
4404  * supplied in sorted order.
4405  *
4406  */
4407
4408 int
4409 fs_build_vector(void *vector, int *unused_ops,
4410     const fs_operation_trans_def_t *translation,
4411     const fs_operation_def_t *operations)
4412 {
4413         int i, num_trans, num_ops, used;
4414
4415         /*
4416          * Count the number of translations and the number of supplied
4417          * operations.
4418          */
4419
4420         {
4421                 const fs_operation_trans_def_t *p;
4422
4423                 for (num_trans = 0, p = translation;
4424                     p->name != NULL;
4425                     num_trans++, p++)
4426                         ;
4427         }
4428
4429         {
4430                 const fs_operation_def_t *p;
4431
4432                 for (num_ops = 0, p = operations;
4433                     p->name != NULL;
4434                     num_ops++, p++)
4435                         ;
4436         }
4437
4438         /* Walk through each operation known to our caller.  There will be */
4439         /* one entry in the supplied "translation table" for each. */
4440
4441         used = 0;
4442
4443         for (i = 0; i < num_trans; i++) {
4444                 int j, found;
4445                 char *curname;
4446                 fs_generic_func_p result;
4447                 fs_generic_func_p *location;
4448
4449                 curname = translation[i].name;
4450
4451                 /* Look for a matching operation in the list supplied by the */
4452                 /* file system. */
4453
4454                 found = 0;
4455
4456                 for (j = 0; j < num_ops; j++) {
4457                         if (strcmp(operations[j].name, curname) == 0) {
4458                                 used++;
4459                                 found = 1;
4460                                 break;
4461                         }
4462                 }
4463
4464                 /*
4465                  * If the file system is using a "placeholder" for default
4466                  * or error functions, grab the appropriate function out of
4467                  * the translation table.  If the file system didn't supply
4468                  * this operation at all, use the default function.
4469                  */
4470
4471                 if (found) {
4472                         result = operations[j].func.fs_generic;
4473                         if (result == fs_default) {
4474                                 result = translation[i].defaultFunc;
4475                         } else if (result == fs_error) {
4476                                 result = translation[i].errorFunc;
4477                         } else if (result == NULL) {
4478                                 /* Null values are PROHIBITED */
4479                                 return (EINVAL);
4480                         }
4481                 } else {
4482                         result = translation[i].defaultFunc;
4483                 }
4484
4485                 /* Now store the function into the operations vector. */
4486
4487                 location = (fs_generic_func_p *)
4488                     (((char *)vector) + translation[i].offset);
4489
4490                 *location = result;
4491         }
4492
4493         *unused_ops = num_ops - used;
4494
4495         return (0);
4496 }
4497
4498 /* Placeholder functions, should never be called. */
4499
4500 int
4501 fs_error(void)
4502 {
4503         cmn_err(CE_PANIC, "fs_error called");
4504         return (0);
4505 }
4506
4507 int
4508 fs_default(void)
4509 {
4510         cmn_err(CE_PANIC, "fs_default called");
4511         return (0);
4512 }
4513
4514 #ifdef __sparc
4515
4516 /*
4517  * Part of the implementation of booting off a mirrored root
4518  * involves a change of dev_t for the root device.  To
4519  * accomplish this, first remove the existing hash table
4520  * entry for the root device, convert to the new dev_t,
4521  * then re-insert in the hash table at the head of the list.
4522  */
4523 void
4524 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4525 {
4526         vfs_list_lock();
4527
4528         vfs_hash_remove(vfsp);
4529
4530         vfsp->vfs_dev = ndev;
4531         vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4532
4533         vfs_hash_add(vfsp, 1);
4534
4535         vfs_list_unlock();
4536 }
4537
4538 #else /* x86 NEWBOOT */
4539
4540 #if defined(__x86)
4541 extern int hvmboot_rootconf();
4542 #endif /* __x86 */
4543
4544 extern ib_boot_prop_t *iscsiboot_prop;
4545
4546 int
4547 rootconf()
4548 {
4549         int error;
4550         struct vfssw *vsw;
4551         extern void pm_init();
4552         char *fstyp, *fsmod;
4553         int ret = -1;
4554
4555         getrootfs(&fstyp, &fsmod);
4556
4557 #if defined(__x86)
4558         /*
4559          * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4560          * which lives in /platform/i86hvm, and hence is only available when
4561          * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4562          * is not available then the modstub for this function will return 0.
4563          * If the hvm_bootstrap misc module is available it will be loaded
4564          * and hvmboot_rootconf() will be invoked.
4565          */
4566         if (error = hvmboot_rootconf())
4567                 return (error);
4568 #endif /* __x86 */
4569
4570         if (error = clboot_rootconf())
4571                 return (error);
4572
4573         if (modload("fs", fsmod) == -1)
4574                 panic("Cannot _init %s module", fsmod);
4575
4576         RLOCK_VFSSW();
4577         vsw = vfs_getvfsswbyname(fstyp);
4578         RUNLOCK_VFSSW();
4579         if (vsw == NULL) {
4580                 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4581                 return (ENXIO);
4582         }
4583         VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4584         VFS_HOLD(rootvfs);
4585
4586         /* always mount readonly first */
4587         rootvfs->vfs_flag |= VFS_RDONLY;
4588
4589         pm_init();
4590
4591         if (netboot && iscsiboot_prop) {
4592                 cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4593                     " shouldn't happen in the same time");
4594                 return (EINVAL);
4595         }
4596
4597         if (netboot || iscsiboot_prop) {
4598                 ret = strplumb();
4599                 if (ret != 0) {
4600                         cmn_err(CE_WARN, "Cannot plumb network device %d", ret);
4601                         return (EFAULT);
4602                 }
4603         }
4604
4605         if ((ret == 0) && iscsiboot_prop) {
4606                 ret = modload("drv", "iscsi");
4607                 /* -1 indicates fail */
4608                 if (ret == -1) {
4609                         cmn_err(CE_WARN, "Failed to load iscsi module");
4610                         iscsi_boot_prop_free();
4611                         return (EINVAL);
4612                 } else {
4613                         if (!i_ddi_attach_pseudo_node("iscsi")) {
4614                                 cmn_err(CE_WARN,
4615                                     "Failed to attach iscsi driver");
4616                                 iscsi_boot_prop_free();
4617                                 return (ENODEV);
4618                         }
4619                 }
4620         }
4621
4622         error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4623         vfs_unrefvfssw(vsw);
4624         rootdev = rootvfs->vfs_dev;
4625
4626         if (error)
4627                 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4628                     rootfs.bo_name, fstyp);
4629         else
4630                 cmn_err(CE_CONT, "?root on %s fstype %s\n",
4631                     rootfs.bo_name, fstyp);
4632         return (error);
4633 }
4634
4635 /*
4636  * XXX this is called by nfs only and should probably be removed
4637  * If booted with ASKNAME, prompt on the console for a filesystem
4638  * name and return it.
4639  */
4640 void
4641 getfsname(char *askfor, char *name, size_t namelen)
4642 {
4643         if (boothowto & RB_ASKNAME) {
4644                 printf("%s name: ", askfor);
4645                 console_gets(name, namelen);
4646         }
4647 }
4648
4649 /*
4650  * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4651  * property.
4652  *
4653  * Filesystem types starting with the prefix "nfs" are diskless clients;
4654  * init the root filename name (rootfs.bo_name), too.
4655  *
4656  * If we are booting via NFS we currently have these options:
4657  *      nfs -   dynamically choose NFS V2, V3, or V4 (default)
4658  *      nfs2 -  force NFS V2
4659  *      nfs3 -  force NFS V3
4660  *      nfs4 -  force NFS V4
4661  * Because we need to maintain backward compatibility with the naming
4662  * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4663  * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4664  * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4665  * This is only for root filesystems, all other uses such as cachefs
4666  * will expect that "nfs" == NFS V2.
4667  */
4668 static void
4669 getrootfs(char **fstypp, char **fsmodp)
4670 {
4671         extern char *strplumb_get_netdev_path(void);
4672         char *propstr = NULL;
4673
4674         /*
4675          * Check fstype property; for diskless it should be one of "nfs",
4676          * "nfs2", "nfs3" or "nfs4".
4677          */
4678         if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4679             DDI_PROP_DONTPASS, "fstype", &propstr)
4680             == DDI_SUCCESS) {
4681                 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4682                 ddi_prop_free(propstr);
4683
4684         /*
4685          * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4686          * assume the type of this root filesystem is 'zfs'.
4687          */
4688         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4689             DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4690             == DDI_SUCCESS) {
4691                 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4692                 ddi_prop_free(propstr);
4693         }
4694
4695         if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4696                 *fstypp = *fsmodp = rootfs.bo_fstype;
4697                 return;
4698         }
4699
4700         ++netboot;
4701
4702         if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4703                 (void) strcpy(rootfs.bo_fstype, "nfs");
4704         else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4705                 (void) strcpy(rootfs.bo_fstype, "nfsdyn");
4706
4707         /*
4708          * check if path to network interface is specified in bootpath
4709          * or by a hypervisor domain configuration file.
4710          * XXPV - enable strlumb_get_netdev_path()
4711          */
4712         if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4713             "xpv-nfsroot")) {
4714                 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4715         } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4716             DDI_PROP_DONTPASS, "bootpath", &propstr)
4717             == DDI_SUCCESS) {
4718                 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4719                 ddi_prop_free(propstr);
4720         } else {
4721                 /* attempt to determine netdev_path via boot_mac address */
4722                 netdev_path = strplumb_get_netdev_path();
4723                 if (netdev_path == NULL)
4724                         panic("cannot find boot network interface");
4725                 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4726         }
4727         *fstypp = rootfs.bo_fstype;
4728         *fsmodp = "nfs";
4729 }
4730 #endif
4731
4732 /*
4733  * VFS feature routines
4734  */
4735
/*
 * A feature value is split in two: the upper 32 bits select the word of
 * vfs_featureset[] (VFTINDEX), the lower 32 bits are the bit mask within
 * that word (VFTBITS).
 */
#define VFTINDEX(f)     (((f) >> 32) & 0xFFFFFFFF)
#define VFTBITS(f)      ((f) & 0xFFFFFFFFLL)
4738
4739 /* Register a feature in the vfs */
4740 void
4741 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4742 {
4743         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4744         if (vfsp->vfs_implp == NULL)
4745                 return;
4746
4747         vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4748 }
4749
4750 /*
4751  * Query a vfs for a feature.
4752  * Returns 1 if feature is present, 0 if not
4753  */
4754 int
4755 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4756 {
4757         int     ret = 0;
4758
4759         /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4760         if (vfsp->vfs_implp == NULL)
4761                 return (ret);
4762
4763         if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4764                 ret = 1;
4765
4766         return (ret);
4767 }
4768
4769 /*
4770  * Propagate feature set from one vfs to another
4771  */
4772 void
4773 vfs_propagate_features(vfs_t *from, vfs_t *to)
4774 {
4775         int i;
4776
4777         if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4778                 return;
4779
4780         for (i = 1; i <= to->vfs_featureset[0]; i++) {
4781                 to->vfs_featureset[i] = from->vfs_featureset[i];
4782         }
4783 }
4784
4785 #define LOFICTL_PATH "/devices/pseudo/lofi@0:%d"
4786
4787 /*
4788  * Return the vnode for the lofi node if there's a lofi mount in place.
4789  * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4790  * failure.
4791  */
4792 int
4793 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4794 {
4795         char *path = NULL;
4796         int strsize;
4797         int err;
4798
4799         if (vfsp->vfs_lofi_minor == 0) {
4800                 *vpp = NULL;
4801                 return (-1);
4802         }
4803
4804         strsize = snprintf(NULL, 0, LOFICTL_PATH, vfsp->vfs_lofi_minor);
4805         path = kmem_alloc(strsize + 1, KM_SLEEP);
4806         (void) snprintf(path, strsize + 1, LOFICTL_PATH, vfsp->vfs_lofi_minor);
4807
4808         err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4809
4810         if (err)
4811                 *vpp = NULL;
4812
4813         kmem_free(path, strsize + 1);
4814         return (err);
4815 }