usr/src/uts/common/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 /*
  27  * Zones
  28  *
  29  *   A zone is a named collection of processes, namespace constraints,
  30  *   and other system resources which comprise a secure and manageable
  31  *   application containment facility.
  32  *
  33  *   Zones (represented by the reference counted zone_t) are tracked in
  34  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  35  *   (zoneid_t) are used to track zone association.  Zone IDs are
  36  *   dynamically generated when the zone is created; if a persistent
  37  *   identifier is needed (core files, accounting logs, audit trail,
  38  *   etc.), the zone name should be used.
  39  *
  40  *
  41  *   Global Zone:
  42  *
  43  *   The global zone (zoneid 0) is automatically associated with all
  44  *   system resources that have not been bound to a user-created zone.
  45  *   This means that even systems where zones are not in active use
  46  *   have a global zone, and all processes, mounts, etc. are
  47  *   associated with that zone.  The global zone is generally
  48  *   unconstrained in terms of privileges and access, though the usual
  49  *   credential and privilege based restrictions apply.
  50  *
  51  *
  52  *   Zone States:
  53  *
  54  *   The states a zone may be in, and the transitions between them, are
  55  *   as follows:
  56  *
  57  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  58  *   initialized zone is added to the list of active zones on the system but
  59  *   isn't accessible.
  60  *
  61  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  62  *   not yet completed. Not possible to enter the zone, but attributes can
  63  *   be retrieved.
  64  *
  65  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  66  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  67  *   executed.  A zone remains in this state until it transitions into
  68  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  69  *
  70  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  71  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  72  *   state.
  73  *
  74  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  75  *   successfully started init.   A zone remains in this state until
  76  *   zone_shutdown() is called.
  77  *
  78  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  79  *   killing all processes running in the zone. The zone remains
  80  *   in this state until there are no more user processes running in the zone.
  81  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  82  *   Since zone_shutdown() is restartable, it may be called successfully
  83  *   multiple times for the same zone_t.  Setting of the zone's state to
  84  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  85  *   the zone's status without worrying about it being a moving target.
  86  *
  87  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  88  *   are no more user processes in the zone.  The zone remains in this
  89  *   state until there are no more kernel threads associated with the
  90  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  91  *   fail.
  92  *
  93  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  94  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  95  *   join the zone or create kernel threads therein.
  96  *
  97  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  98  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
  99  *   return NULL from now on.
 100  *
 101  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 102  *   processes or threads doing work on behalf of the zone.  The zone is
 103  *   removed from the list of active zones.  zone_destroy() returns, and
 104  *   the zone can be recreated.
 105  *
 106  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 107  *   callbacks are executed, and all memory associated with the zone is
 108  *   freed.
 109  *
 110  *   Threads can wait for the zone to enter a requested state by using
 111  *   zone_status_wait() or zone_status_timedwait() with the desired
 112  *   state passed in as an argument.  Zone state transitions are
 113  *   uni-directional; it is not possible to move back to an earlier state.
 114  *
 115  *
 116  *   Zone-Specific Data:
 117  *
 118  *   Subsystems needing to maintain zone-specific data can store that
 119  *   data using the ZSD mechanism.  This provides a zone-specific data
 120  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 121  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 122  *   to register callbacks to be invoked when a zone is created, shut
 123  *   down, or destroyed.  This can be used to initialize zone-specific
 124  *   data for new zones and to clean up when zones go away.
 125  *
 126  *
 127  *   Data Structures:
 128  *
 129  *   The per-zone structure (zone_t) is reference counted, and freed
 130  *   when all references are released.  zone_hold and zone_rele can be
 131  *   used to adjust the reference count.  In addition, reference counts
 132  *   associated with the cred_t structure are tracked separately using
 133  *   zone_cred_hold and zone_cred_rele.
 134  *
 135  *   Pointers to active zone_t's are stored in two hash tables; one
 136  *   for searching by id, the other for searching by name.  Lookups
 137  *   can be performed on either basis, using zone_find_by_id and
 138  *   zone_find_by_name.  Both return zone_t pointers with the zone
 139  *   held, so zone_rele should be called when the pointer is no longer
 140  *   needed.  Zones can also be searched by path; zone_find_by_path
 141  *   returns the zone with which a path name is associated (global
 142  *   zone if the path is not within some other zone's file system
 143  *   hierarchy).  This currently requires iterating through each zone,
 144  *   so it is slower than an id or name search via a hash table.
 145  *
 146  *
 147  *   Locking:
 148  *
 149  *   zonehash_lock: This is a top-level global lock used to protect the
 150  *       zone hash tables and lists.  Zones cannot be created or destroyed
 151  *       while this lock is held.
 152  *   zone_status_lock: This is a global lock protecting zone state.
 153  *       Zones cannot change state while this lock is held.  It also
 154  *       protects the list of kernel threads associated with a zone.
 155  *   zone_lock: This is a per-zone lock used to protect several fields of
 156  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 157  *       this lock means that the zone cannot go away.
 158  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 159  *       related to the zone.max-lwps rctl.
 160  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 161  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 162  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 163  *       currently just max_lofi
 164  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 165  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 166  *       list (a list of zones in the ZONE_IS_DEAD state).
 167  *
 168  *   Ordering requirements:
 169  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 170  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 171  *
 172  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 173  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 174  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 175  *
 176  *   Blocking memory allocations are permitted while holding any of the
 177  *   zone locks.
 178  *
 179  *
 180  *   System Call Interface:
 181  *
 182  *   The zone subsystem can be managed and queried from user level with
 183  *   the following system calls (all subcodes of the primary "zone"
 184  *   system call):
 185  *   - zone_create: creates a zone with selected attributes (name,
 186  *     root path, privileges, resource controls, ZFS datasets)
 187  *   - zone_enter: allows the current process to enter a zone
 188  *   - zone_getattr: reports attributes of a zone
 189  *   - zone_setattr: set attributes of a zone
 190  *   - zone_boot: set 'init' running for the zone
 191  *   - zone_list: lists all zones active in the system
 192  *   - zone_lookup: looks up zone id based on name
 193  *   - zone_shutdown: initiates shutdown process (see states above)
 194  *   - zone_destroy: completes shutdown process (see states above)
 195  *
 196  */
 197
 198 #include <sys/priv_impl.h>
 199 #include <sys/cred.h>
 200 #include <c2/audit.h>
 201 #include <sys/debug.h>
 202 #include <sys/file.h>
 203 #include <sys/kmem.h>
 204 #include <sys/kstat.h>
 205 #include <sys/mutex.h>
 206 #include <sys/note.h>
 207 #include <sys/pathname.h>
 208 #include <sys/proc.h>
 209 #include <sys/project.h>
 210 #include <sys/sysevent.h>
 211 #include <sys/task.h>
 212 #include <sys/systm.h>
 213 #include <sys/types.h>
 214 #include <sys/utsname.h>
 215 #include <sys/vnode.h>
 216 #include <sys/vfs.h>
 217 #include <sys/systeminfo.h>
 218 #include <sys/policy.h>
 219 #include <sys/cred_impl.h>
 220 #include <sys/contract_impl.h>
 221 #include <sys/contract/process_impl.h>
 222 #include <sys/class.h>
 223 #include <sys/pool.h>
 224 #include <sys/pool_pset.h>
 225 #include <sys/pset.h>
 226 #include <sys/strlog.h>
 227 #include <sys/sysmacros.h>
 228 #include <sys/callb.h>
 229 #include <sys/vmparam.h>
 230 #include <sys/corectl.h>
 231 #include <sys/ipc_impl.h>
 232 #include <sys/klpd.h>
 233
 234 #include <sys/door.h>
 235 #include <sys/cpuvar.h>
 236 #include <sys/sdt.h>
 237
 238 #include <sys/uadmin.h>
 239 #include <sys/session.h>
 240 #include <sys/cmn_err.h>
 241 #include <sys/modhash.h>
 242 #include <sys/sunddi.h>
 243 #include <sys/nvpair.h>
 244 #include <sys/rctl.h>
 245 #include <sys/fss.h>
 246 #include <sys/brand.h>
 247 #include <sys/zone.h>
 248 #include <net/if.h>
 249 #include <sys/cpucaps.h>
 250 #include <vm/seg.h>
 251 #include <sys/mac.h>
 252
 253 /*
 254  * This constant specifies the number of seconds that threads waiting for
 255  * subsystems to release a zone's general-purpose references will wait before
 256  * they log the zone's reference counts.  The constant's value shouldn't
 257  * be so small that reference counts are unnecessarily reported for zones
 258  * whose references are slowly released.  On the other hand, it shouldn't be so
 259  * large that users reboot their systems out of frustration over hung zones
 260  * before the system logs the zones' reference counts.
 261  */
 262 #define ZONE_DESTROY_TIMEOUT_SECS       60
 263
  264 /* List of data link IDs which are accessible from the zone */
  265 typedef struct zone_dl {
  266         datalink_id_t   zdl_id;         /* ID of the data link */
  267         nvlist_t        *zdl_net;       /* per-link net data? TODO confirm */
  268         list_node_t     zdl_linkage;    /* linkage for the containing list */
  269 } zone_dl_t;
 270
 271 /*
 272  * cv used to signal that all references to the zone have been released.  This
 273  * needs to be global since there may be multiple waiters, and the first to
 274  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 275  */
 276 static kcondvar_t zone_destroy_cv;
 277 /*
 278  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 279  * but then we'd need another lock for zone_destroy_cv, and why bother?
 280  */
 281 static kmutex_t zone_status_lock;
 282
 283 /*
 284  * ZSD-related global variables.
 285  */
 286 static kmutex_t zsd_key_lock;   /* protects the following two */
 287 /*
 288  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 289  */
 290 static zone_key_t zsd_keyval = 0;
 291 /*
 292  * Global list of registered keys.  We use this when a new zone is created.
 293  */
 294 static list_t zsd_registered_keys;
 295
 296 int zone_hash_size = 256;
 297 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 298 static kmutex_t zonehash_lock;
 299 static uint_t zonecount;
 300 static id_space_t *zoneid_space;
 301
 302 /*
 303  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 304  * kernel proper runs, and which manages all other zones.
 305  *
 306  * Although not declared as static, the variable "zone0" should not be used
 307  * except for by code that needs to reference the global zone early on in boot,
 308  * before it is fully initialized.  All other consumers should use
 309  * 'global_zone'.
 310  */
 311 zone_t zone0;
 312 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 313
 314 /*
 315  * List of active zones, protected by zonehash_lock.
 316  */
 317 static list_t zone_active;
 318
 319 /*
 320  * List of destroyed zones that still have outstanding cred references.
 321  * Used for debugging.  Uses a separate lock to avoid lock ordering
 322  * problems in zone_free.
 323  */
 324 static list_t zone_deathrow;
 325 static kmutex_t zone_deathrow_lock;
 326
 327 /* number of zones is limited by virtual interface limit in IP */
 328 uint_t maxzones = 8192;
 329
 330 /* Event channel to send zone state change notifications */
 331 evchan_t *zone_event_chan;
 332
 333 /*
 334  * This table holds the mapping from kernel zone states to
 335  * states visible in the state notification API.
 336  * The idea is that we only expose "obvious" states and
 337  * do not expose states which are just implementation details.
 338  */
 339 const char  *zone_status_table[] = {
 340         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 341         ZONE_EVENT_INITIALIZED,         /* initialized */
 342         ZONE_EVENT_READY,               /* ready */
 343         ZONE_EVENT_READY,               /* booting */
 344         ZONE_EVENT_RUNNING,             /* running */
 345         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 346         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 347         ZONE_EVENT_SHUTTING_DOWN,       /* down */
 348         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 349         ZONE_EVENT_UNINITIALIZED,       /* dead */
 350 };
 351
 352 /*
 353  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 354  * (see sys/zone.h).
 355  */
 356 static char *zone_ref_subsys_names[] = {
 357         "NFS",          /* ZONE_REF_NFS */
 358         "NFSv4",        /* ZONE_REF_NFSV4 */
 359         "SMBFS",        /* ZONE_REF_SMBFS */
 360         "MNTFS",        /* ZONE_REF_MNTFS */
 361         "LOFI",         /* ZONE_REF_LOFI */
 362         "VFS",          /* ZONE_REF_VFS */
 363         "IPC"           /* ZONE_REF_IPC */
 364 };
 365
 366 /*
 367  * This isn't static so lint doesn't complain.
 368  */
 369 rctl_hndl_t rc_zone_cpu_shares;
 370 rctl_hndl_t rc_zone_locked_mem;
 371 rctl_hndl_t rc_zone_max_swap;
 372 rctl_hndl_t rc_zone_max_lofi;
 373 rctl_hndl_t rc_zone_cpu_cap;
 374 rctl_hndl_t rc_zone_nlwps;
 375 rctl_hndl_t rc_zone_nprocs;
 376 rctl_hndl_t rc_zone_shmmax;
 377 rctl_hndl_t rc_zone_shmmni;
 378 rctl_hndl_t rc_zone_semmni;
 379 rctl_hndl_t rc_zone_msgmni;
 380 /*
 381  * Synchronization primitives used to synchronize between mounts and zone
 382  * creation/destruction.
 383  */
 384 static int mounts_in_progress;
 385 static kcondvar_t mount_cv;
 386 static kmutex_t mount_lock;
 387
 388 const char * const zone_default_initname = "/sbin/init";
 389 static char * const zone_prefix = "/zone/";
 390 static int zone_shutdown(zoneid_t zoneid);
 391 static int zone_add_datalink(zoneid_t, datalink_id_t);
 392 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 393 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 394 static int zone_set_network(zoneid_t, zone_net_data_t *);
 395 static int zone_get_network(zoneid_t, zone_net_data_t *);
 396
 397 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398
 399 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 400 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 401 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 402 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 403     zone_key_t);
 404 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 405 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 406     kmutex_t *);
 407 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 408     kmutex_t *);
 409
 410 /*
 411  * Bump this number when you alter the zone syscall interfaces; this is
 412  * because we need to have support for previous API versions in libc
 413  * to support patching; libc calls into the kernel to determine this number.
 414  *
 415  * Version 1 of the API is the version originally shipped with Solaris 10
 416  * Version 2 alters the zone_create system call in order to support more
 417  *     arguments by moving the args into a structure; and to do better
 418  *     error reporting when zone_create() fails.
 419  * Version 3 alters the zone_create system call in order to support the
 420  *     import of ZFS datasets to zones.
 421  * Version 4 alters the zone_create system call in order to support
 422  *     Trusted Extensions.
 423  * Version 5 alters the zone_boot system call, and converts its old
 424  *     bootargs parameter to be set by the zone_setattr API instead.
 425  * Version 6 adds the flag argument to zone_create.
 426  */
 427 static const int ZONE_SYSCALL_API_VERSION = 6;
 428
 429 /*
 430  * Certain filesystems (such as NFS and autofs) need to know which zone
 431  * the mount is being placed in.  Because of this, we need to be able to
 432  * ensure that a zone isn't in the process of being created such that
 433  * nfs_mount() thinks it is in the global zone, while by the time it
 434  * gets added to the list of mounted zones, it ends up on zoneA's mount
 435  * list.
 436  *
 437  * The following functions: block_mounts()/resume_mounts() and
 438  * mount_in_progress()/mount_completed() are used by zones and the VFS
 439  * layer (respectively) to synchronize zone creation and new mounts.
 440  *
 441  * The semantics are like a reader-reader lock such that there may
 442  * either be multiple mounts (or zone creations, if that weren't
 443  * serialized by zonehash_lock) in progress at the same time, but not
 444  * both.
 445  *
 446  * We use cv's so the user can ctrl-C out of the operation if it's
 447  * taking too long.
 448  *
 449  * The semantics are such that there is unfair bias towards the
 450  * "current" operation.  This means that zone creations may starve if
 451  * there is a rapid succession of new mounts coming in to the system, or
 452  * there is a remote possibility that zones will be created at such a
 453  * rate that new mounts will not be able to proceed.
 454  */
 455 /*
 456  * Prevent new mounts from progressing to the point of calling
 457  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 458  * them to complete.
 459  */
 460 static int
 461 block_mounts(void)
 462 {
 463         int retval = 0;
 464
 465         /*
 466          * Since it may block for a long time, block_mounts() shouldn't be
 467          * called with zonehash_lock held.
 468          */
 469         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 470         mutex_enter(&mount_lock);
 471         while (mounts_in_progress > 0) {
 472                 if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
 473                         goto signaled;
 474         }
 475         /*
 476          * A negative value of mounts_in_progress indicates that mounts
 477          * have been blocked by (-mounts_in_progress) different callers.
 478          */
 479         mounts_in_progress--;
 480         retval = 1;
 481 signaled:
 482         mutex_exit(&mount_lock);
 483         return (retval);
 484 }
 485
 486 /*
 487  * The VFS layer may progress with new mounts as far as we're concerned.
 488  * Allow them to progress if we were the last obstacle.
 489  */
 490 static void
 491 resume_mounts(void)
 492 {
 493         mutex_enter(&mount_lock);
 494         if (++mounts_in_progress == 0)
 495                 cv_broadcast(&mount_cv);
 496         mutex_exit(&mount_lock);
 497 }
 498
 499 /*
 500  * The VFS layer is busy with a mount; zones should wait until all
 501  * mounts are completed to progress.
 502  */
 503 void
 504 mount_in_progress(void)
 505 {
 506         mutex_enter(&mount_lock);
 507         while (mounts_in_progress < 0)
 508                 cv_wait(&mount_cv, &mount_lock);
 509         mounts_in_progress++;
 510         mutex_exit(&mount_lock);
 511 }
 512
 513 /*
 514  * VFS is done with one mount; wake up any waiting block_mounts()
 515  * callers if this is the last mount.
 516  */
 517 void
 518 mount_completed(void)
 519 {
 520         mutex_enter(&mount_lock);
 521         if (--mounts_in_progress == 0)
 522                 cv_broadcast(&mount_cv);
 523         mutex_exit(&mount_lock);
 524 }
 525
 526 /*
 527  * ZSD routines.
 528  *
 529  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 530  * defined by the pthread_key_create() and related interfaces.
 531  *
 532  * Kernel subsystems may register one or more data items and/or
 533  * callbacks to be executed when a zone is created, shutdown, or
 534  * destroyed.
 535  *
 536  * Unlike the thread counterpart, destructor callbacks will be executed
 537  * even if the data pointer is NULL and/or there are no constructor
 538  * callbacks, so it is the responsibility of such callbacks to check for
 539  * NULL data values if necessary.
 540  *
 541  * The locking strategy and overall picture is as follows:
 542  *
 543  * When someone calls zone_key_create(), a template ZSD entry is added to the
 544  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 545  * holding that lock all the existing zones are marked as
 546  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 547  * zone_zsd list (protected by zone_lock). The global list is updated first
 548  * (under zsd_key_lock) to make sure that newly created zones use the
 549  * most recent list of keys. Then under zonehash_lock we walk the zones
 550  * and mark them.  Similar locking is used in zone_key_delete().
 551  *
 552  * The actual create, shutdown, and destroy callbacks are done without
 553  * holding any lock. And zsd_flags are used to ensure that the operations
 554  * completed so that when zone_key_create (and zone_create) is done, as well as
 555  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 556  * are completed.
 557  *
 558  * When new zones are created constructor callbacks for all registered ZSD
 559  * entries will be called. That also uses the above two phases of marking
 560  * what needs to be done, and then running the callbacks without holding
 561  * any locks.
 562  *
 563  * The framework does not provide any locking around zone_getspecific() and
 564  * zone_setspecific() apart from that needed for internal consistency, so
 565  * callers interested in atomic "test-and-set" semantics will need to provide
 566  * their own locking.
 567  */
 568
 569 /*
 570  * Helper function to find the zsd_entry associated with the key in the
 571  * given list.
 572  */
 573 static struct zsd_entry *
 574 zsd_find(list_t *l, zone_key_t key)
 575 {
 576         struct zsd_entry *zsd;
 577
 578         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 579                 if (zsd->zsd_key == key) {
 580                         return (zsd);
 581                 }
 582         }
 583         return (NULL);
 584 }
 585
 586 /*
 587  * Helper function to find the zsd_entry associated with the key in the
 588  * given list. Move it to the front of the list.
 589  */
 590 static struct zsd_entry *
 591 zsd_find_mru(list_t *l, zone_key_t key)
 592 {
 593         struct zsd_entry *zsd;
 594
 595         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 596                 if (zsd->zsd_key == key) {
 597                         /*
 598                          * Move to head of list to keep list in MRU order.
 599                          */
 600                         if (zsd != list_head(l)) {
 601                                 list_remove(l, zsd);
 602                                 list_insert_head(l, zsd);
 603                         }
 604                         return (zsd);
 605                 }
 606         }
 607         return (NULL);
 608 }
 609
 610 void
 611 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 612     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 613 {
 614         struct zsd_entry *zsdp;
 615         struct zsd_entry *t;
 616         struct zone *zone;
 617         zone_key_t  key;
 618
 619         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 620         zsdp->zsd_data = NULL;
 621         zsdp->zsd_create = create;
 622         zsdp->zsd_shutdown = shutdown;
 623         zsdp->zsd_destroy = destroy;
 624
 625         /*
 626          * Insert in global list of callbacks. Makes future zone creations
 627          * see it.
 628          */
 629         mutex_enter(&zsd_key_lock);
 630         key = zsdp->zsd_key = ++zsd_keyval;
 631         ASSERT(zsd_keyval != 0);
 632         list_insert_tail(&zsd_registered_keys, zsdp);
 633         mutex_exit(&zsd_key_lock);
 634
 635         /*
 636          * Insert for all existing zones and mark them as needing
 637          * a create callback.
 638          */
 639         mutex_enter(&zonehash_lock);    /* stop the world */
 640         for (zone = list_head(&zone_active); zone != NULL;
 641             zone = list_next(&zone_active, zone)) {
 642                 zone_status_t status;
 643
 644                 mutex_enter(&zone->zone_lock);
 645
 646                 /* Skip zones that are on the way down or not yet up */
 647                 status = zone_status_get(zone);
 648                 if (status >= ZONE_IS_DOWN ||
 649                     status == ZONE_IS_UNINITIALIZED) {
 650                         mutex_exit(&zone->zone_lock);
 651                         continue;
 652                 }
 653
 654                 t = zsd_find_mru(&zone->zone_zsd, key);
 655                 if (t != NULL) {
 656                         /*
 657                          * A zsd_configure already inserted it after
 658                          * we dropped zsd_key_lock above.
 659                          */
 660                         mutex_exit(&zone->zone_lock);
 661                         continue;
 662                 }
 663                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 664                 t->zsd_key = key;
 665                 t->zsd_create = create;
 666                 t->zsd_shutdown = shutdown;
 667                 t->zsd_destroy = destroy;
 668                 if (create != NULL) {
 669                         t->zsd_flags = ZSD_CREATE_NEEDED;
 670                         DTRACE_PROBE2(zsd__create__needed,
 671                             zone_t *, zone, zone_key_t, key);
 672                 }
 673                 list_insert_tail(&zone->zone_zsd, t);
 674                 mutex_exit(&zone->zone_lock);
 675         }
 676         mutex_exit(&zonehash_lock);
 677
 678         if (create != NULL) {
 679                 /* Now call the create callback for this key */
 680                 zsd_apply_all_zones(zsd_apply_create, key);
 681         }
 682         /*
 683          * It is safe for consumers to use the key now, make it
 684          * globally visible. Specifically zone_getspecific() will
 685          * always successfully return the zone specific data associated
 686          * with the key.
 687          */
 688         *keyp = key;
 689
 690 }
 691
 692 /*
 693  * Function called when a module is being unloaded, or otherwise wishes
 694  * to unregister its ZSD key and callbacks.
 695  *
 696  * Remove from the global list and determine the functions that need to
 697  * be called under a global lock. Then call the functions without
 698  * holding any locks. Finally free up the zone_zsd entries. (The apply
 699  * functions need to access the zone_zsd entries to find zsd_data etc.)
 700  */
 701 int
 702 zone_key_delete(zone_key_t key)
 703 {
 704         struct zsd_entry *zsdp = NULL;
 705         zone_t *zone;
 706
 707         mutex_enter(&zsd_key_lock);
 708         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 709         if (zsdp == NULL) {
 710                 mutex_exit(&zsd_key_lock);
 711                 return (-1);
 712         }
 713         list_remove(&zsd_registered_keys, zsdp);
 714         mutex_exit(&zsd_key_lock);
 715
 716         mutex_enter(&zonehash_lock);
 717         for (zone = list_head(&zone_active); zone != NULL;
 718             zone = list_next(&zone_active, zone)) {
 719                 struct zsd_entry *del;
 720
 721                 mutex_enter(&zone->zone_lock);
 722                 del = zsd_find_mru(&zone->zone_zsd, key);
 723                 if (del == NULL) {
 724                         /*
 725                          * Somebody else got here first e.g the zone going
 726                          * away.
 727                          */
 728                         mutex_exit(&zone->zone_lock);
 729                         continue;
 730                 }
 731                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 732                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 733                 if (del->zsd_shutdown != NULL &&
 734                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 735                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 736                         DTRACE_PROBE2(zsd__shutdown__needed,
 737                             zone_t *, zone, zone_key_t, key);
 738                 }
 739                 if (del->zsd_destroy != NULL &&
 740                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 741                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 742                         DTRACE_PROBE2(zsd__destroy__needed,
 743                             zone_t *, zone, zone_key_t, key);
 744                 }
 745                 mutex_exit(&zone->zone_lock);
 746         }
 747         mutex_exit(&zonehash_lock);
 748         kmem_free(zsdp, sizeof (*zsdp));
 749
 750         /* Now call the shutdown and destroy callback for this key */
 751         zsd_apply_all_zones(zsd_apply_shutdown, key);
 752         zsd_apply_all_zones(zsd_apply_destroy, key);
 753
 754         /* Now we can free up the zsdp structures in each zone */
 755         mutex_enter(&zonehash_lock);
 756         for (zone = list_head(&zone_active); zone != NULL;
 757             zone = list_next(&zone_active, zone)) {
 758                 struct zsd_entry *del;
 759
 760                 mutex_enter(&zone->zone_lock);
 761                 del = zsd_find(&zone->zone_zsd, key);
 762                 if (del != NULL) {
 763                         list_remove(&zone->zone_zsd, del);
 764                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 765                         kmem_free(del, sizeof (*del));
 766                 }
 767                 mutex_exit(&zone->zone_lock);
 768         }
 769         mutex_exit(&zonehash_lock);
 770
 771         return (0);
 772 }
 773
 774 /*
 775  * ZSD counterpart of pthread_setspecific().
 776  *
 777  * Since all zsd callbacks, including those with no create function,
 778  * have an entry in zone_zsd, if the key is registered it is part of
 779  * the zone_zsd list.
 780  * Return an error if the key wasn't registerd.
 781  */
 782 int
 783 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 784 {
 785         struct zsd_entry *t;
 786
 787         mutex_enter(&zone->zone_lock);
 788         t = zsd_find_mru(&zone->zone_zsd, key);
 789         if (t != NULL) {
 790                 /*
 791                  * Replace old value with new
 792                  */
 793                 t->zsd_data = (void *)data;
 794                 mutex_exit(&zone->zone_lock);
 795                 return (0);
 796         }
 797         mutex_exit(&zone->zone_lock);
 798         return (-1);
 799 }
 800
 801 /*
 802  * ZSD counterpart of pthread_getspecific().
 803  */
 804 void *
 805 zone_getspecific(zone_key_t key, zone_t *zone)
 806 {
 807         struct zsd_entry *t;
 808         void *data;
 809
 810         mutex_enter(&zone->zone_lock);
 811         t = zsd_find_mru(&zone->zone_zsd, key);
 812         data = (t == NULL ? NULL : t->zsd_data);
 813         mutex_exit(&zone->zone_lock);
 814         return (data);
 815 }
 816
 817 /*
 818  * Function used to initialize a zone's list of ZSD callbacks and data
 819  * when the zone is being created.  The callbacks are initialized from
 820  * the template list (zsd_registered_keys). The constructor callback is
 821  * executed later (once the zone exists and with locks dropped).
 822  */
 823 static void
 824 zone_zsd_configure(zone_t *zone)
 825 {
 826         struct zsd_entry *zsdp;
 827         struct zsd_entry *t;
 828
 829         ASSERT(MUTEX_HELD(&zonehash_lock));
 830         ASSERT(list_head(&zone->zone_zsd) == NULL);
 831         mutex_enter(&zone->zone_lock);
 832         mutex_enter(&zsd_key_lock);
 833         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 834             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 835                 /*
 836                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 837                  * should not have added anything to it.
 838                  */
 839                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 840
 841                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 842                 t->zsd_key = zsdp->zsd_key;
 843                 t->zsd_create = zsdp->zsd_create;
 844                 t->zsd_shutdown = zsdp->zsd_shutdown;
 845                 t->zsd_destroy = zsdp->zsd_destroy;
 846                 if (zsdp->zsd_create != NULL) {
 847                         t->zsd_flags = ZSD_CREATE_NEEDED;
 848                         DTRACE_PROBE2(zsd__create__needed,
 849                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 850                 }
 851                 list_insert_tail(&zone->zone_zsd, t);
 852         }
 853         mutex_exit(&zsd_key_lock);
 854         mutex_exit(&zone->zone_lock);
 855 }
 856
 857 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 858
 859 /*
 860  * Helper function to execute shutdown or destructor callbacks.
 861  */
 862 static void
 863 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 864 {
 865         struct zsd_entry *t;
 866
 867         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 868         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 869         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 870
 871         /*
 872          * Run the callback solely based on what is registered for the zone
 873          * in zone_zsd. The global list can change independently of this
 874          * as keys are registered and unregistered and we don't register new
 875          * callbacks for a zone that is in the process of going away.
 876          */
 877         mutex_enter(&zone->zone_lock);
 878         for (t = list_head(&zone->zone_zsd); t != NULL;
 879             t = list_next(&zone->zone_zsd, t)) {
 880                 zone_key_t key = t->zsd_key;
 881
 882                 /* Skip if no callbacks registered */
 883
 884                 if (ct == ZSD_SHUTDOWN) {
 885                         if (t->zsd_shutdown != NULL &&
 886                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 887                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 888                                 DTRACE_PROBE2(zsd__shutdown__needed,
 889                                     zone_t *, zone, zone_key_t, key);
 890                         }
 891                 } else {
 892                         if (t->zsd_destroy != NULL &&
 893                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 894                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 895                                 DTRACE_PROBE2(zsd__destroy__needed,
 896                                     zone_t *, zone, zone_key_t, key);
 897                         }
 898                 }
 899         }
 900         mutex_exit(&zone->zone_lock);
 901
 902         /* Now call the shutdown and destroy callback for this key */
 903         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 904         zsd_apply_all_keys(zsd_apply_destroy, zone);
 905
 906 }
 907
 908 /*
 909  * Called when the zone is going away; free ZSD-related memory, and
 910  * destroy the zone_zsd list.
 911  */
 912 static void
 913 zone_free_zsd(zone_t *zone)
 914 {
 915         struct zsd_entry *t, *next;
 916
 917         /*
 918          * Free all the zsd_entry's we had on this zone.
 919          */
 920         mutex_enter(&zone->zone_lock);
 921         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 922                 next = list_next(&zone->zone_zsd, t);
 923                 list_remove(&zone->zone_zsd, t);
 924                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 925                 kmem_free(t, sizeof (*t));
 926         }
 927         list_destroy(&zone->zone_zsd);
 928         mutex_exit(&zone->zone_lock);
 929
 930 }
 931
 932 /*
 933  * Apply a function to all zones for particular key value.
 934  *
 935  * The applyfn has to drop zonehash_lock if it does some work, and
 936  * then reacquire it before it returns.
 937  * When the lock is dropped we don't follow list_next even
 938  * if it is possible to do so without any hazards. This is
 939  * because we want the design to allow for the list of zones
 940  * to change in any arbitrary way during the time the
 941  * lock was dropped.
 942  *
 943  * It is safe to restart the loop at list_head since the applyfn
 944  * changes the zsd_flags as it does work, so a subsequent
 945  * pass through will have no effect in applyfn, hence the loop will terminate
 946  * in at worst O(N^2).
 947  */
 948 static void
 949 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 950 {
 951         zone_t *zone;
 952
 953         mutex_enter(&zonehash_lock);
 954         zone = list_head(&zone_active);
 955         while (zone != NULL) {
 956                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 957                         /* Lock dropped - restart at head */
 958                         zone = list_head(&zone_active);
 959                 } else {
 960                         zone = list_next(&zone_active, zone);
 961                 }
 962         }
 963         mutex_exit(&zonehash_lock);
 964 }
 965
 966 /*
 967  * Apply a function to all keys for a particular zone.
 968  *
 969  * The applyfn has to drop zonehash_lock if it does some work, and
 970  * then reacquire it before it returns.
 971  * When the lock is dropped we don't follow list_next even
 972  * if it is possible to do so without any hazards. This is
 973  * because we want the design to allow for the list of zsd callbacks
 974  * to change in any arbitrary way during the time the
 975  * lock was dropped.
 976  *
 977  * It is safe to restart the loop at list_head since the applyfn
 978  * changes the zsd_flags as it does work, so a subsequent
 979  * pass through will have no effect in applyfn, hence the loop will terminate
 980  * in at worst O(N^2).
 981  */
 982 static void
 983 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 984 {
 985         struct zsd_entry *t;
 986
 987         mutex_enter(&zone->zone_lock);
 988         t = list_head(&zone->zone_zsd);
 989         while (t != NULL) {
 990                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 991                         /* Lock dropped - restart at head */
 992                         t = list_head(&zone->zone_zsd);
 993                 } else {
 994                         t = list_next(&zone->zone_zsd, t);
 995                 }
 996         }
 997         mutex_exit(&zone->zone_lock);
 998 }
 999
1000 /*
1001  * Call the create function for the zone and key if CREATE_NEEDED
1002  * is set.
1003  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1004  * we wait for that thread to complete so that we can ensure that
1005  * all the callbacks are done when we've looped over all zones/keys.
1006  *
1007  * When we call the create function, we drop the global held by the
1008  * caller, and return true to tell the caller it needs to re-evalute the
1009  * state.
1010  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1011  * remains held on exit.
1012  */
1013 static boolean_t
1014 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1015     zone_t *zone, zone_key_t key)
1016 {
1017         void *result;
1018         struct zsd_entry *t;
1019         boolean_t dropped;
1020
1021         if (lockp != NULL) {
1022                 ASSERT(MUTEX_HELD(lockp));
1023         }
1024         if (zone_lock_held) {
1025                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1026         } else {
1027                 mutex_enter(&zone->zone_lock);
1028         }
1029
1030         t = zsd_find(&zone->zone_zsd, key);
1031         if (t == NULL) {
1032                 /*
1033                  * Somebody else got here first e.g the zone going
1034                  * away.
1035                  */
1036                 if (!zone_lock_held)
1037                         mutex_exit(&zone->zone_lock);
1038                 return (B_FALSE);
1039         }
1040         dropped = B_FALSE;
1041         if (zsd_wait_for_inprogress(zone, t, lockp))
1042                 dropped = B_TRUE;
1043
1044         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1045                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1046                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1047                 DTRACE_PROBE2(zsd__create__inprogress,
1048                     zone_t *, zone, zone_key_t, key);
1049                 mutex_exit(&zone->zone_lock);
1050                 if (lockp != NULL)
1051                         mutex_exit(lockp);
1052
1053                 dropped = B_TRUE;
1054                 ASSERT(t->zsd_create != NULL);
1055                 DTRACE_PROBE2(zsd__create__start,
1056                     zone_t *, zone, zone_key_t, key);
1057
1058                 result = (*t->zsd_create)(zone->zone_id);
1059
1060                 DTRACE_PROBE2(zsd__create__end,
1061                     zone_t *, zone, voidn *, result);
1062
1063                 ASSERT(result != NULL);
1064                 if (lockp != NULL)
1065                         mutex_enter(lockp);
1066                 mutex_enter(&zone->zone_lock);
1067                 t->zsd_data = result;
1068                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1069                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1070                 cv_broadcast(&t->zsd_cv);
1071                 DTRACE_PROBE2(zsd__create__completed,
1072                     zone_t *, zone, zone_key_t, key);
1073         }
1074         if (!zone_lock_held)
1075                 mutex_exit(&zone->zone_lock);
1076         return (dropped);
1077 }
1078
1079 /*
1080  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1081  * is set.
1082  * If some other thread gets here first and sets *_INPROGRESS, then
1083  * we wait for that thread to complete so that we can ensure that
1084  * all the callbacks are done when we've looped over all zones/keys.
1085  *
1086  * When we call the shutdown function, we drop the global held by the
1087  * caller, and return true to tell the caller it needs to re-evalute the
1088  * state.
1089  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1090  * remains held on exit.
1091  */
1092 static boolean_t
1093 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1094     zone_t *zone, zone_key_t key)
1095 {
1096         struct zsd_entry *t;
1097         void *data;
1098         boolean_t dropped;
1099
1100         if (lockp != NULL) {
1101                 ASSERT(MUTEX_HELD(lockp));
1102         }
1103         if (zone_lock_held) {
1104                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1105         } else {
1106                 mutex_enter(&zone->zone_lock);
1107         }
1108
1109         t = zsd_find(&zone->zone_zsd, key);
1110         if (t == NULL) {
1111                 /*
1112                  * Somebody else got here first e.g the zone going
1113                  * away.
1114                  */
1115                 if (!zone_lock_held)
1116                         mutex_exit(&zone->zone_lock);
1117                 return (B_FALSE);
1118         }
1119         dropped = B_FALSE;
1120         if (zsd_wait_for_creator(zone, t, lockp))
1121                 dropped = B_TRUE;
1122
1123         if (zsd_wait_for_inprogress(zone, t, lockp))
1124                 dropped = B_TRUE;
1125
1126         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1127                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1128                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1129                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1130                     zone_t *, zone, zone_key_t, key);
1131                 mutex_exit(&zone->zone_lock);
1132                 if (lockp != NULL)
1133                         mutex_exit(lockp);
1134                 dropped = B_TRUE;
1135
1136                 ASSERT(t->zsd_shutdown != NULL);
1137                 data = t->zsd_data;
1138
1139                 DTRACE_PROBE2(zsd__shutdown__start,
1140                     zone_t *, zone, zone_key_t, key);
1141
1142                 (t->zsd_shutdown)(zone->zone_id, data);
1143                 DTRACE_PROBE2(zsd__shutdown__end,
1144                     zone_t *, zone, zone_key_t, key);
1145
1146                 if (lockp != NULL)
1147                         mutex_enter(lockp);
1148                 mutex_enter(&zone->zone_lock);
1149                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1150                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1151                 cv_broadcast(&t->zsd_cv);
1152                 DTRACE_PROBE2(zsd__shutdown__completed,
1153                     zone_t *, zone, zone_key_t, key);
1154         }
1155         if (!zone_lock_held)
1156                 mutex_exit(&zone->zone_lock);
1157         return (dropped);
1158 }
1159
1160 /*
1161  * Call the destroy function for the zone and key if DESTROY_NEEDED
1162  * is set.
1163  * If some other thread gets here first and sets *_INPROGRESS, then
1164  * we wait for that thread to complete so that we can ensure that
1165  * all the callbacks are done when we've looped over all zones/keys.
1166  *
1167  * When we call the destroy function, we drop the global held by the
1168  * caller, and return true to tell the caller it needs to re-evalute the
1169  * state.
1170  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1171  * remains held on exit.
1172  */
1173 static boolean_t
1174 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1175     zone_t *zone, zone_key_t key)
1176 {
1177         struct zsd_entry *t;
1178         void *data;
1179         boolean_t dropped;
1180
1181         if (lockp != NULL) {
1182                 ASSERT(MUTEX_HELD(lockp));
1183         }
1184         if (zone_lock_held) {
1185                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1186         } else {
1187                 mutex_enter(&zone->zone_lock);
1188         }
1189
1190         t = zsd_find(&zone->zone_zsd, key);
1191         if (t == NULL) {
1192                 /*
1193                  * Somebody else got here first e.g the zone going
1194                  * away.
1195                  */
1196                 if (!zone_lock_held)
1197                         mutex_exit(&zone->zone_lock);
1198                 return (B_FALSE);
1199         }
1200         dropped = B_FALSE;
1201         if (zsd_wait_for_creator(zone, t, lockp))
1202                 dropped = B_TRUE;
1203
1204         if (zsd_wait_for_inprogress(zone, t, lockp))
1205                 dropped = B_TRUE;
1206
1207         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1208                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1209                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1210                 DTRACE_PROBE2(zsd__destroy__inprogress,
1211                     zone_t *, zone, zone_key_t, key);
1212                 mutex_exit(&zone->zone_lock);
1213                 if (lockp != NULL)
1214                         mutex_exit(lockp);
1215                 dropped = B_TRUE;
1216
1217                 ASSERT(t->zsd_destroy != NULL);
1218                 data = t->zsd_data;
1219                 DTRACE_PROBE2(zsd__destroy__start,
1220                     zone_t *, zone, zone_key_t, key);
1221
1222                 (t->zsd_destroy)(zone->zone_id, data);
1223                 DTRACE_PROBE2(zsd__destroy__end,
1224                     zone_t *, zone, zone_key_t, key);
1225
1226                 if (lockp != NULL)
1227                         mutex_enter(lockp);
1228                 mutex_enter(&zone->zone_lock);
1229                 t->zsd_data = NULL;
1230                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1231                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1232                 cv_broadcast(&t->zsd_cv);
1233                 DTRACE_PROBE2(zsd__destroy__completed,
1234                     zone_t *, zone, zone_key_t, key);
1235         }
1236         if (!zone_lock_held)
1237                 mutex_exit(&zone->zone_lock);
1238         return (dropped);
1239 }
1240
1241 /*
1242  * Wait for any CREATE_NEEDED flag to be cleared.
1243  * Returns true if lockp was temporarily dropped while waiting.
1244  */
1245 static boolean_t
1246 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1247 {
1248         boolean_t dropped = B_FALSE;
1249
1250         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1251                 DTRACE_PROBE2(zsd__wait__for__creator,
1252                     zone_t *, zone, struct zsd_entry *, t);
1253                 if (lockp != NULL) {
1254                         dropped = B_TRUE;
1255                         mutex_exit(lockp);
1256                 }
1257                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1258                 if (lockp != NULL) {
1259                         /* First drop zone_lock to preserve order */
1260                         mutex_exit(&zone->zone_lock);
1261                         mutex_enter(lockp);
1262                         mutex_enter(&zone->zone_lock);
1263                 }
1264         }
1265         return (dropped);
1266 }
1267
1268 /*
1269  * Wait for any INPROGRESS flag to be cleared.
1270  * Returns true if lockp was temporarily dropped while waiting.
1271  */
1272 static boolean_t
1273 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1274 {
1275         boolean_t dropped = B_FALSE;
1276
1277         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1278                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1279                     zone_t *, zone, struct zsd_entry *, t);
1280                 if (lockp != NULL) {
1281                         dropped = B_TRUE;
1282                         mutex_exit(lockp);
1283                 }
1284                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1285                 if (lockp != NULL) {
1286                         /* First drop zone_lock to preserve order */
1287                         mutex_exit(&zone->zone_lock);
1288                         mutex_enter(lockp);
1289                         mutex_enter(&zone->zone_lock);
1290                 }
1291         }
1292         return (dropped);
1293 }
1294
1295 /*
1296  * Frees memory associated with the zone dataset list.
1297  */
1298 static void
1299 zone_free_datasets(zone_t *zone)
1300 {
1301         zone_dataset_t *t, *next;
1302
1303         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1304                 next = list_next(&zone->zone_datasets, t);
1305                 list_remove(&zone->zone_datasets, t);
1306                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1307                 kmem_free(t, sizeof (*t));
1308         }
1309         list_destroy(&zone->zone_datasets);
1310 }
1311
1312 /*
1313  * zone.cpu-shares resource control support.
1314  */
1315 /*ARGSUSED*/
1316 static rctl_qty_t
1317 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1318 {
1319         ASSERT(MUTEX_HELD(&p->p_lock));
1320         return (p->p_zone->zone_shares);
1321 }
1322
1323 /*ARGSUSED*/
1324 static int
1325 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1326     rctl_qty_t nv)
1327 {
1328         ASSERT(MUTEX_HELD(&p->p_lock));
1329         ASSERT(e->rcep_t == RCENTITY_ZONE);
1330         if (e->rcep_p.zone == NULL)
1331                 return (0);
1332
1333         e->rcep_p.zone->zone_shares = nv;
1334         return (0);
1335 }
1336
/*
 * rctl_ops_t for zone.cpu-shares: wires in zone_cpu_shares_usage() and
 * zone_cpu_shares_set(); the other two slots use the generic
 * rcop_no_action and rcop_no_test entries.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
1343
1344 /*
1345  * zone.cpu-cap resource control support.
1346  */
1347 /*ARGSUSED*/
1348 static rctl_qty_t
1349 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1350 {
1351         ASSERT(MUTEX_HELD(&p->p_lock));
1352         return (cpucaps_zone_get(p->p_zone));
1353 }
1354
1355 /*ARGSUSED*/
1356 static int
1357 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1358     rctl_qty_t nv)
1359 {
1360         zone_t *zone = e->rcep_p.zone;
1361
1362         ASSERT(MUTEX_HELD(&p->p_lock));
1363         ASSERT(e->rcep_t == RCENTITY_ZONE);
1364
1365         if (zone == NULL)
1366                 return (0);
1367
1368         /*
1369          * set cap to the new value.
1370          */
1371         return (cpucaps_zone_set(zone, nv));
1372 }
1373
/*
 * rctl_ops_t for zone.cpu-cap: wires in zone_cpu_cap_get() and
 * zone_cpu_cap_set(); the other two slots use the generic rcop_no_action
 * and rcop_no_test entries.
 */
static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};
1380
1381 /*ARGSUSED*/
1382 static rctl_qty_t
1383 zone_lwps_usage(rctl_t *r, proc_t *p)
1384 {
1385         rctl_qty_t nlwps;
1386         zone_t *zone = p->p_zone;
1387
1388         ASSERT(MUTEX_HELD(&p->p_lock));
1389
1390         mutex_enter(&zone->zone_nlwps_lock);
1391         nlwps = zone->zone_nlwps;
1392         mutex_exit(&zone->zone_nlwps_lock);
1393
1394         return (nlwps);
1395 }
1396
1397 /*ARGSUSED*/
1398 static int
1399 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1400     rctl_qty_t incr, uint_t flags)
1401 {
1402         rctl_qty_t nlwps;
1403
1404         ASSERT(MUTEX_HELD(&p->p_lock));
1405         ASSERT(e->rcep_t == RCENTITY_ZONE);
1406         if (e->rcep_p.zone == NULL)
1407                 return (0);
1408         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1409         nlwps = e->rcep_p.zone->zone_nlwps;
1410
1411         if (nlwps + incr > rcntl->rcv_value)
1412                 return (1);
1413
1414         return (0);
1415 }
1416
1417 /*ARGSUSED*/
1418 static int
1419 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1420 {
1421         ASSERT(MUTEX_HELD(&p->p_lock));
1422         ASSERT(e->rcep_t == RCENTITY_ZONE);
1423         if (e->rcep_p.zone == NULL)
1424                 return (0);
1425         e->rcep_p.zone->zone_nlwps_ctl = nv;
1426         return (0);
1427 }
1428
/*
 * rctl_ops_t wiring in zone_lwps_usage(), zone_lwps_set() and
 * zone_lwps_test(); the first slot uses the generic rcop_no_action entry.
 */
static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};
1435
1436 /*ARGSUSED*/
1437 static rctl_qty_t
1438 zone_procs_usage(rctl_t *r, proc_t *p)
1439 {
1440         rctl_qty_t nprocs;
1441         zone_t *zone = p->p_zone;
1442
1443         ASSERT(MUTEX_HELD(&p->p_lock));
1444
1445         mutex_enter(&zone->zone_nlwps_lock);
1446         nprocs = zone->zone_nprocs;
1447         mutex_exit(&zone->zone_nlwps_lock);
1448
1449         return (nprocs);
1450 }
1451
1452 /*ARGSUSED*/
1453 static int
1454 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1455     rctl_qty_t incr, uint_t flags)
1456 {
1457         rctl_qty_t nprocs;
1458
1459         ASSERT(MUTEX_HELD(&p->p_lock));
1460         ASSERT(e->rcep_t == RCENTITY_ZONE);
1461         if (e->rcep_p.zone == NULL)
1462                 return (0);
1463         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1464         nprocs = e->rcep_p.zone->zone_nprocs;
1465
1466         if (nprocs + incr > rcntl->rcv_value)
1467                 return (1);
1468
1469         return (0);
1470 }
1471
1472 /*ARGSUSED*/
1473 static int
1474 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1475 {
1476         ASSERT(MUTEX_HELD(&p->p_lock));
1477         ASSERT(e->rcep_t == RCENTITY_ZONE);
1478         if (e->rcep_p.zone == NULL)
1479                 return (0);
1480         e->rcep_p.zone->zone_nprocs_ctl = nv;
1481         return (0);
1482 }
1483
/*
 * rctl_ops_t wiring in zone_procs_usage(), zone_procs_set() and
 * zone_procs_test(); the first slot uses the generic rcop_no_action entry.
 */
static rctl_ops_t zone_procs_ops = {
        rcop_no_action,
        zone_procs_usage,
        zone_procs_set,
        zone_procs_test,
};
1490
1491 /*ARGSUSED*/
1492 static int
1493 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1494     rctl_qty_t incr, uint_t flags)
1495 {
1496         rctl_qty_t v;
1497         ASSERT(MUTEX_HELD(&p->p_lock));
1498         ASSERT(e->rcep_t == RCENTITY_ZONE);
1499         v = e->rcep_p.zone->zone_shmmax + incr;
1500         if (v > rval->rcv_value)
1501                 return (1);
1502         return (0);
1503 }
1504
/*
 * rctl_ops_t wiring in zone_shmmax_test() only; the other three slots use
 * the generic rcop_no_action, rcop_no_usage and rcop_no_set entries.
 */
static rctl_ops_t zone_shmmax_ops = {
        rcop_no_action,
        rcop_no_usage,
        rcop_no_set,
        zone_shmmax_test
};
1511
1512 /*ARGSUSED*/
1513 static int
1514 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1515     rctl_qty_t incr, uint_t flags)
1516 {
1517         rctl_qty_t v;
1518         ASSERT(MUTEX_HELD(&p->p_lock));
1519         ASSERT(e->rcep_t == RCENTITY_ZONE);
1520         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1521         if (v > rval->rcv_value)
1522                 return (1);
1523         return (0);
1524 }
1525
/*
 * rctl_ops_t wiring in zone_shmmni_test() only; the other three slots use
 * the generic rcop_no_action, rcop_no_usage and rcop_no_set entries.
 */
static rctl_ops_t zone_shmmni_ops = {
        rcop_no_action,
        rcop_no_usage,
        rcop_no_set,
        zone_shmmni_test
};
1532
1533 /*ARGSUSED*/
1534 static int
1535 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1536     rctl_qty_t incr, uint_t flags)
1537 {
1538         rctl_qty_t v;
1539         ASSERT(MUTEX_HELD(&p->p_lock));
1540         ASSERT(e->rcep_t == RCENTITY_ZONE);
1541         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1542         if (v > rval->rcv_value)
1543                 return (1);
1544         return (0);
1545 }
1546
/*
 * rctl_ops_t wiring in zone_semmni_test() only; the other three slots use
 * the generic rcop_no_action, rcop_no_usage and rcop_no_set entries.
 */
static rctl_ops_t zone_semmni_ops = {
        rcop_no_action,
        rcop_no_usage,
        rcop_no_set,
        zone_semmni_test
};
1553
1554 /*ARGSUSED*/
1555 static int
1556 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1557     rctl_qty_t incr, uint_t flags)
1558 {
1559         rctl_qty_t v;
1560         ASSERT(MUTEX_HELD(&p->p_lock));
1561         ASSERT(e->rcep_t == RCENTITY_ZONE);
1562         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1563         if (v > rval->rcv_value)
1564                 return (1);
1565         return (0);
1566 }
1567
/*
 * rctl_ops_t wiring in zone_msgmni_test() only; the other three slots use
 * the generic rcop_no_action, rcop_no_usage and rcop_no_set entries.
 */
static rctl_ops_t zone_msgmni_ops = {
        rcop_no_action,
        rcop_no_usage,
        rcop_no_set,
        zone_msgmni_test
};
1574
1575 /*ARGSUSED*/
1576 static rctl_qty_t
1577 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1578 {
1579         rctl_qty_t q;
1580         ASSERT(MUTEX_HELD(&p->p_lock));
1581         mutex_enter(&p->p_zone->zone_mem_lock);
1582         q = p->p_zone->zone_locked_mem;
1583         mutex_exit(&p->p_zone->zone_mem_lock);
1584         return (q);
1585 }
1586
1587 /*ARGSUSED*/
1588 static int
1589 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1590     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1591 {
1592         rctl_qty_t q;
1593         zone_t *z;
1594
1595         z = e->rcep_p.zone;
1596         ASSERT(MUTEX_HELD(&p->p_lock));
1597         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1598         q = z->zone_locked_mem;
1599         if (q + incr > rcntl->rcv_value)
1600                 return (1);
1601         return (0);
1602 }
1603
1604 /*ARGSUSED*/
1605 static int
1606 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1607     rctl_qty_t nv)
1608 {
1609         ASSERT(MUTEX_HELD(&p->p_lock));
1610         ASSERT(e->rcep_t == RCENTITY_ZONE);
1611         if (e->rcep_p.zone == NULL)
1612                 return (0);
1613         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1614         return (0);
1615 }
1616
/*
 * rctl_ops_t wiring in zone_locked_mem_usage(), zone_locked_mem_set() and
 * zone_locked_mem_test(); the first slot uses the generic rcop_no_action
 * entry.
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,
        zone_locked_mem_set,
        zone_locked_mem_test
};
1623
1624 /*ARGSUSED*/
1625 static rctl_qty_t
1626 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1627 {
1628         rctl_qty_t q;
1629         zone_t *z = p->p_zone;
1630
1631         ASSERT(MUTEX_HELD(&p->p_lock));
1632         mutex_enter(&z->zone_mem_lock);
1633         q = z->zone_max_swap;
1634         mutex_exit(&z->zone_mem_lock);
1635         return (q);
1636 }
1637
1638 /*ARGSUSED*/
1639 static int
1640 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1641     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1642 {
1643         rctl_qty_t q;
1644         zone_t *z;
1645
1646         z = e->rcep_p.zone;
1647         ASSERT(MUTEX_HELD(&p->p_lock));
1648         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1649         q = z->zone_max_swap;
1650         if (q + incr > rcntl->rcv_value)
1651                 return (1);
1652         return (0);
1653 }
1654
1655 /*ARGSUSED*/
1656 static int
1657 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1658     rctl_qty_t nv)
1659 {
1660         ASSERT(MUTEX_HELD(&p->p_lock));
1661         ASSERT(e->rcep_t == RCENTITY_ZONE);
1662         if (e->rcep_p.zone == NULL)
1663                 return (0);
1664         e->rcep_p.zone->zone_max_swap_ctl = nv;
1665         return (0);
1666 }
1667
1668 static rctl_ops_t zone_max_swap_ops = {
1669         rcop_no_action,
1670         zone_max_swap_usage,
1671         zone_max_swap_set,
1672         zone_max_swap_test
1673 };
1674
1675 /*ARGSUSED*/
1676 static rctl_qty_t
1677 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1678 {
1679         rctl_qty_t q;
1680         zone_t *z = p->p_zone;
1681
1682         ASSERT(MUTEX_HELD(&p->p_lock));
1683         mutex_enter(&z->zone_rctl_lock);
1684         q = z->zone_max_lofi;
1685         mutex_exit(&z->zone_rctl_lock);
1686         return (q);
1687 }
1688
1689 /*ARGSUSED*/
1690 static int
1691 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1692     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1693 {
1694         rctl_qty_t q;
1695         zone_t *z;
1696
1697         z = e->rcep_p.zone;
1698         ASSERT(MUTEX_HELD(&p->p_lock));
1699         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1700         q = z->zone_max_lofi;
1701         if (q + incr > rcntl->rcv_value)
1702                 return (1);
1703         return (0);
1704 }
1705
1706 /*ARGSUSED*/
1707 static int
1708 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1709     rctl_qty_t nv)
1710 {
1711         ASSERT(MUTEX_HELD(&p->p_lock));
1712         ASSERT(e->rcep_t == RCENTITY_ZONE);
1713         if (e->rcep_p.zone == NULL)
1714                 return (0);
1715         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1716         return (0);
1717 }
1718
1719 static rctl_ops_t zone_max_lofi_ops = {
1720         rcop_no_action,
1721         zone_max_lofi_usage,
1722         zone_max_lofi_set,
1723         zone_max_lofi_test
1724 };
1725
1726 /*
1727  * Helper function to brand the zone with a unique ID.
1728  */
1729 static void
1730 zone_uniqid(zone_t *zone)
1731 {
1732         static uint64_t uniqid = 0;
1733
1734         ASSERT(MUTEX_HELD(&zonehash_lock));
1735         zone->zone_uniqid = uniqid++;
1736 }
1737
1738 /*
1739  * Returns a held pointer to the "kcred" for the specified zone.
1740  */
1741 struct cred *
1742 zone_get_kcred(zoneid_t zoneid)
1743 {
1744         zone_t *zone;
1745         cred_t *cr;
1746
1747         if ((zone = zone_find_by_id(zoneid)) == NULL)
1748                 return (NULL);
1749         cr = zone->zone_kcred;
1750         crhold(cr);
1751         zone_rele(zone);
1752         return (cr);
1753 }
1754
1755 static int
1756 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1757 {
1758         zone_t *zone = ksp->ks_private;
1759         zone_kstat_t *zk = ksp->ks_data;
1760
1761         if (rw == KSTAT_WRITE)
1762                 return (EACCES);
1763
1764         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1765         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1766         return (0);
1767 }
1768
1769 static int
1770 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1771 {
1772         zone_t *zone = ksp->ks_private;
1773         zone_kstat_t *zk = ksp->ks_data;
1774
1775         if (rw == KSTAT_WRITE)
1776                 return (EACCES);
1777
1778         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1779         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1780         return (0);
1781 }
1782
1783 static int
1784 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1785 {
1786         zone_t *zone = ksp->ks_private;
1787         zone_kstat_t *zk = ksp->ks_data;
1788
1789         if (rw == KSTAT_WRITE)
1790                 return (EACCES);
1791
1792         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1793         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1794         return (0);
1795 }
1796
1797 static kstat_t *
1798 zone_kstat_create_common(zone_t *zone, char *name,
1799     int (*updatefunc) (kstat_t *, int))
1800 {
1801         kstat_t *ksp;
1802         zone_kstat_t *zk;
1803
1804         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1805             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1806             KSTAT_FLAG_VIRTUAL);
1807
1808         if (ksp == NULL)
1809                 return (NULL);
1810
1811         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1812         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1813         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1814         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1815         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1816         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1817         ksp->ks_update = updatefunc;
1818         ksp->ks_private = zone;
1819         kstat_install(ksp);
1820         return (ksp);
1821 }
1822
1823 static void
1824 zone_kstat_create(zone_t *zone)
1825 {
1826         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1827             "lockedmem", zone_lockedmem_kstat_update);
1828         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1829             "swapresv", zone_swapresv_kstat_update);
1830         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1831             "nprocs", zone_nprocs_kstat_update);
1832 }
1833
1834 static void
1835 zone_kstat_delete_common(kstat_t **pkstat)
1836 {
1837         void *data;
1838
1839         if (*pkstat != NULL) {
1840                 data = (*pkstat)->ks_data;
1841                 kstat_delete(*pkstat);
1842                 kmem_free(data, sizeof (zone_kstat_t));
1843                 *pkstat = NULL;
1844         }
1845 }
1846
1847 static void
1848 zone_kstat_delete(zone_t *zone)
1849 {
1850         zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
1851         zone_kstat_delete_common(&zone->zone_swapresv_kstat);
1852         zone_kstat_delete_common(&zone->zone_nprocs_kstat);
1853 }
1854
1855 /*
1856  * Called very early on in boot to initialize the ZSD list so that
1857  * zone_key_create() can be called before zone_init().  It also initializes
1858  * portions of zone0 which may be used before zone_init() is called.  The
1859  * variable "global_zone" will be set when zone0 is fully initialized by
1860  * zone_init().
1861  */
1862 void
1863 zone_zsd_init(void)
1864 {
1865         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1866         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1867         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1868             offsetof(struct zsd_entry, zsd_linkage));
1869         list_create(&zone_active, sizeof (zone_t),
1870             offsetof(zone_t, zone_linkage));
1871         list_create(&zone_deathrow, sizeof (zone_t),
1872             offsetof(zone_t, zone_linkage));
1873
1874         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1875         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1876         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1877         zone0.zone_shares = 1;
1878         zone0.zone_nlwps = 0;
1879         zone0.zone_nlwps_ctl = INT_MAX;
1880         zone0.zone_nprocs = 0;
1881         zone0.zone_nprocs_ctl = INT_MAX;
1882         zone0.zone_locked_mem = 0;
1883         zone0.zone_locked_mem_ctl = UINT64_MAX;
1884         ASSERT(zone0.zone_max_swap == 0);
1885         zone0.zone_max_swap_ctl = UINT64_MAX;
1886         zone0.zone_max_lofi = 0;
1887         zone0.zone_max_lofi_ctl = UINT64_MAX;
1888         zone0.zone_shmmax = 0;
1889         zone0.zone_ipc.ipcq_shmmni = 0;
1890         zone0.zone_ipc.ipcq_semmni = 0;
1891         zone0.zone_ipc.ipcq_msgmni = 0;
1892         zone0.zone_name = GLOBAL_ZONENAME;
1893         zone0.zone_nodename = utsname.nodename;
1894         zone0.zone_domain = srpc_domain;
1895         zone0.zone_hostid = HW_INVALID_HOSTID;
1896         zone0.zone_fs_allowed = NULL;
1897         zone0.zone_ref = 1;
1898         zone0.zone_id = GLOBAL_ZONEID;
1899         zone0.zone_status = ZONE_IS_RUNNING;
1900         zone0.zone_rootpath = "/";
1901         zone0.zone_rootpathlen = 2;
1902         zone0.zone_psetid = ZONE_PS_INVAL;
1903         zone0.zone_ncpus = 0;
1904         zone0.zone_ncpus_online = 0;
1905         zone0.zone_proc_initpid = 1;
1906         zone0.zone_initname = initname;
1907         zone0.zone_lockedmem_kstat = NULL;
1908         zone0.zone_swapresv_kstat = NULL;
1909         zone0.zone_nprocs_kstat = NULL;
1910         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
1911             offsetof(zone_ref_t, zref_linkage));
1912         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
1913             offsetof(struct zsd_entry, zsd_linkage));
1914         list_insert_head(&zone_active, &zone0);
1915
1916         /*
1917          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
1918          * to anything meaningful.  It is assigned to be 'rootdir' in
1919          * vfs_mountroot().
1920          */
1921         zone0.zone_rootvp = NULL;
1922         zone0.zone_vfslist = NULL;
1923         zone0.zone_bootargs = initargs;
1924         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
1925         /*
1926          * The global zone has all privileges
1927          */
1928         priv_fillset(zone0.zone_privset);
1929         /*
1930          * Add p0 to the global zone
1931          */
1932         zone0.zone_zsched = &p0;
1933         p0.p_zone = &zone0;
1934 }
1935
1936 /*
1937  * Compute a hash value based on the contents of the label and the DOI.  The
1938  * hash algorithm is somewhat arbitrary, but is based on the observation that
1939  * humans will likely pick labels that differ by amounts that work out to be
1940  * multiples of the number of hash chains, and thus stirring in some primes
1941  * should help.
1942  */
1943 static uint_t
1944 hash_bylabel(void *hdata, mod_hash_key_t key)
1945 {
1946         const ts_label_t *lab = (ts_label_t *)key;
1947         const uint32_t *up, *ue;
1948         uint_t hash;
1949         int i;
1950
1951         _NOTE(ARGUNUSED(hdata));
1952
1953         hash = lab->tsl_doi + (lab->tsl_doi << 1);
1954         /* we depend on alignment of label, but not representation */
1955         up = (const uint32_t *)&lab->tsl_label;
1956         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1957         i = 1;
1958         while (up < ue) {
1959                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1960                 hash += *up + (*up << ((i % 16) + 1));
1961                 up++;
1962                 i++;
1963         }
1964         return (hash);
1965 }
1966
1967 /*
1968  * All that mod_hash cares about here is zero (equal) versus non-zero (not
1969  * equal).  This may need to be changed if less than / greater than is ever
1970  * needed.
1971  */
1972 static int
1973 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1974 {
1975         ts_label_t *lab1 = (ts_label_t *)key1;
1976         ts_label_t *lab2 = (ts_label_t *)key2;
1977
1978         return (label_equal(lab1, lab2) ? 0 : 1);
1979 }
1980
1981 /*
1982  * Called by main() to initialize the zones framework.
1983  */
1984 void
1985 zone_init(void)
1986 {
1987         rctl_dict_entry_t *rde;
1988         rctl_val_t *dval;
1989         rctl_set_t *set;
1990         rctl_alloc_gp_t *gp;
1991         rctl_entity_p_t e;
1992         int res;
1993
1994         ASSERT(curproc == &p0);
1995
1996         /*
1997          * Create ID space for zone IDs.  ID 0 is reserved for the
1998          * global zone.
1999          */
2000         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2001
2002         /*
2003          * Initialize generic zone resource controls, if any.
2004          */
2005         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2006             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2007             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2008             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2009
2010         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2011             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2012             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2013             RCTL_GLOBAL_INFINITE,
2014             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2015
2016         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2017             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2018             INT_MAX, INT_MAX, &zone_lwps_ops);
2019
2020         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2021             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2022             INT_MAX, INT_MAX, &zone_procs_ops);
2023
2024         /*
2025          * System V IPC resource controls
2026          */
2027         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2028             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2029             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2030
2031         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2032             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2033             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2034
2035         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2036             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2037             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2038
2039         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2040             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2041             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2042
2043         /*
2044          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2045          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2046          */
2047         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2048         bzero(dval, sizeof (rctl_val_t));
2049         dval->rcv_value = 1;
2050         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2051         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2052         dval->rcv_action_recip_pid = -1;
2053
2054         rde = rctl_dict_lookup("zone.cpu-shares");
2055         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2056
2057         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2058             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2059             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2060             &zone_locked_mem_ops);
2061
2062         rc_zone_max_swap = rctl_register("zone.max-swap",
2063             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2064             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2065             &zone_max_swap_ops);
2066
2067         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2068             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2069             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2070             &zone_max_lofi_ops);
2071
2072         /*
2073          * Initialize the ``global zone''.
2074          */
2075         set = rctl_set_create();
2076         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2077         mutex_enter(&p0.p_lock);
2078         e.rcep_p.zone = &zone0;
2079         e.rcep_t = RCENTITY_ZONE;
2080         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2081             gp);
2082
2083         zone0.zone_nlwps = p0.p_lwpcnt;
2084         zone0.zone_nprocs = 1;
2085         zone0.zone_ntasks = 1;
2086         mutex_exit(&p0.p_lock);
2087         zone0.zone_restart_init = B_TRUE;
2088         zone0.zone_brand = &native_brand;
2089         rctl_prealloc_destroy(gp);
2090         /*
2091          * pool_default hasn't been initialized yet, so we let pool_init()
2092          * take care of making sure the global zone is in the default pool.
2093          */
2094
2095         /*
2096          * Initialize global zone kstats
2097          */
2098         zone_kstat_create(&zone0);
2099
2100         /*
2101          * Initialize zone label.
2102          * mlp are initialized when tnzonecfg is loaded.
2103          */
2104         zone0.zone_slabel = l_admin_low;
2105         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2106         label_hold(l_admin_low);
2107
2108         /*
2109          * Initialise the lock for the database structure used by mntfs.
2110          */
2111         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2112
2113         mutex_enter(&zonehash_lock);
2114         zone_uniqid(&zone0);
2115         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2116
2117         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2118             mod_hash_null_valdtor);
2119         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2120             zone_hash_size, mod_hash_null_valdtor);
2121         /*
2122          * maintain zonehashbylabel only for labeled systems
2123          */
2124         if (is_system_labeled())
2125                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2126                     zone_hash_size, mod_hash_null_keydtor,
2127                     mod_hash_null_valdtor, hash_bylabel, NULL,
2128                     hash_labelkey_cmp, KM_SLEEP);
2129         zonecount = 1;
2130
2131         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2132             (mod_hash_val_t)&zone0);
2133         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2134             (mod_hash_val_t)&zone0);
2135         if (is_system_labeled()) {
2136                 zone0.zone_flags |= ZF_HASHED_LABEL;
2137                 (void) mod_hash_insert(zonehashbylabel,
2138                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2139         }
2140         mutex_exit(&zonehash_lock);
2141
2142         /*
2143          * We avoid setting zone_kcred until now, since kcred is initialized
2144          * sometime after zone_zsd_init() and before zone_init().
2145          */
2146         zone0.zone_kcred = kcred;
2147         /*
2148          * The global zone is fully initialized (except for zone_rootvp which
2149          * will be set when the root filesystem is mounted).
2150          */
2151         global_zone = &zone0;
2152
2153         /*
2154          * Setup an event channel to send zone status change notifications on
2155          */
2156         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2157             EVCH_CREAT);
2158
2159         if (res)
2160                 panic("Sysevent_evc_bind failed during zone setup.\n");
2161
2162 }
2163
2164 static void
2165 zone_free(zone_t *zone)
2166 {
2167         ASSERT(zone != global_zone);
2168         ASSERT(zone->zone_ntasks == 0);
2169         ASSERT(zone->zone_nlwps == 0);
2170         ASSERT(zone->zone_nprocs == 0);
2171         ASSERT(zone->zone_cred_ref == 0);
2172         ASSERT(zone->zone_kcred == NULL);
2173         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2174             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2175         ASSERT(list_is_empty(&zone->zone_ref_list));
2176
2177         /*
2178          * Remove any zone caps.
2179          */
2180         cpucaps_zone_remove(zone);
2181
2182         ASSERT(zone->zone_cpucap == NULL);
2183
2184         /* remove from deathrow list */
2185         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2186                 ASSERT(zone->zone_ref == 0);
2187                 mutex_enter(&zone_deathrow_lock);
2188                 list_remove(&zone_deathrow, zone);
2189                 mutex_exit(&zone_deathrow_lock);
2190         }
2191
2192         list_destroy(&zone->zone_ref_list);
2193         zone_free_zsd(zone);
2194         zone_free_datasets(zone);
2195         list_destroy(&zone->zone_dl_list);
2196
2197         if (zone->zone_rootvp != NULL)
2198                 VN_RELE(zone->zone_rootvp);
2199         if (zone->zone_rootpath)
2200                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2201         if (zone->zone_name != NULL)
2202                 kmem_free(zone->zone_name, ZONENAME_MAX);
2203         if (zone->zone_slabel != NULL)
2204                 label_rele(zone->zone_slabel);
2205         if (zone->zone_nodename != NULL)
2206                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2207         if (zone->zone_domain != NULL)
2208                 kmem_free(zone->zone_domain, _SYS_NMLN);
2209         if (zone->zone_privset != NULL)
2210                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2211         if (zone->zone_rctls != NULL)
2212                 rctl_set_free(zone->zone_rctls);
2213         if (zone->zone_bootargs != NULL)
2214                 strfree(zone->zone_bootargs);
2215         if (zone->zone_initname != NULL)
2216                 strfree(zone->zone_initname);
2217         if (zone->zone_fs_allowed != NULL)
2218                 strfree(zone->zone_fs_allowed);
2219         if (zone->zone_pfexecd != NULL)
2220                 klpd_freelist(&zone->zone_pfexecd);
2221         id_free(zoneid_space, zone->zone_id);
2222         mutex_destroy(&zone->zone_lock);
2223         cv_destroy(&zone->zone_cv);
2224         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2225         rw_destroy(&zone->zone_mntfs_db_lock);
2226         kmem_free(zone, sizeof (zone_t));
2227 }
2228
2229 /*
2230  * See block comment at the top of this file for information about zone
2231  * status values.
2232  */
2233 /*
2234  * Convenience function for setting zone status.
2235  */
2236 static void
2237 zone_status_set(zone_t *zone, zone_status_t status)
2238 {
2239
2240         nvlist_t *nvl = NULL;
2241         ASSERT(MUTEX_HELD(&zone_status_lock));
2242         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2243             status >= zone_status_get(zone));
2244
2245         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2246             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2247             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2248             zone_status_table[status]) ||
2249             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2250             zone_status_table[zone->zone_status]) ||
2251             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2252             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2253             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2254             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2255 #ifdef DEBUG
2256                 (void) printf(
2257                     "Failed to allocate and send zone state change event.\n");
2258 #endif
2259         }
2260         nvlist_free(nvl);
2261
2262         zone->zone_status = status;
2263
2264         cv_broadcast(&zone->zone_cv);
2265 }
2266
2267 /*
2268  * Public function to retrieve the zone status.  The zone status may
2269  * change after it is retrieved.
2270  */
2271 zone_status_t
2272 zone_status_get(zone_t *zone)
2273 {
2274         return (zone->zone_status);
2275 }
2276
2277 static int
2278 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2279 {
2280         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2281         int err = 0;
2282
2283         ASSERT(zone != global_zone);
2284         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2285                 goto done;      /* EFAULT or ENAMETOOLONG */
2286
2287         if (zone->zone_bootargs != NULL)
2288                 strfree(zone->zone_bootargs);
2289
2290         zone->zone_bootargs = strdup(buf);
2291
2292 done:
2293         kmem_free(buf, BOOTARGS_MAX);
2294         return (err);
2295 }
2296
2297 static int
2298 zone_set_brand(zone_t *zone, const char *brand)
2299 {
2300         struct brand_attr *attrp;
2301         brand_t *bp;
2302
2303         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2304         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2305                 kmem_free(attrp, sizeof (struct brand_attr));
2306                 return (EFAULT);
2307         }
2308
2309         bp = brand_register_zone(attrp);
2310         kmem_free(attrp, sizeof (struct brand_attr));
2311         if (bp == NULL)
2312                 return (EINVAL);
2313
2314         /*
2315          * This is the only place where a zone can change it's brand.
2316          * We already need to hold zone_status_lock to check the zone
2317          * status, so we'll just use that lock to serialize zone
2318          * branding requests as well.
2319          */
2320         mutex_enter(&zone_status_lock);
2321
2322         /* Re-Branding is not allowed and the zone can't be booted yet */
2323         if ((ZONE_IS_BRANDED(zone)) ||
2324             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2325                 mutex_exit(&zone_status_lock);
2326                 brand_unregister_zone(bp);
2327                 return (EINVAL);
2328         }
2329
2330         /* set up the brand specific data */
2331         zone->zone_brand = bp;
2332         ZBROP(zone)->b_init_brand_data(zone);
2333
2334         mutex_exit(&zone_status_lock);
2335         return (0);
2336 }
2337
2338 static int
2339 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2340 {
2341         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2342         int err = 0;
2343
2344         ASSERT(zone != global_zone);
2345         if ((err = copyinstr(zone_fs_allowed, buf,
2346             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2347                 goto done;
2348
2349         if (zone->zone_fs_allowed != NULL)
2350                 strfree(zone->zone_fs_allowed);
2351
2352         zone->zone_fs_allowed = strdup(buf);
2353
2354 done:
2355         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2356         return (err);
2357 }
2358
2359 static int
2360 zone_set_initname(zone_t *zone, const char *zone_initname)
2361 {
2362         char initname[INITNAME_SZ];
2363         size_t len;
2364         int err = 0;
2365
2366         ASSERT(zone != global_zone);
2367         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2368                 return (err);   /* EFAULT or ENAMETOOLONG */
2369
2370         if (zone->zone_initname != NULL)
2371                 strfree(zone->zone_initname);
2372
2373         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2374         (void) strcpy(zone->zone_initname, initname);
2375         return (0);
2376 }
2377
2378 static int
2379 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2380 {
2381         uint64_t mcap;
2382         int err = 0;
2383
2384         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2385                 zone->zone_phys_mcap = mcap;
2386
2387         return (err);
2388 }
2389
2390 static int
2391 zone_set_sched_class(zone_t *zone, const char *new_class)
2392 {
2393         char sched_class[PC_CLNMSZ];
2394         id_t classid;
2395         int err;
2396
2397         ASSERT(zone != global_zone);
2398         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2399                 return (err);   /* EFAULT or ENAMETOOLONG */
2400
2401         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2402                 return (set_errno(EINVAL));
2403         zone->zone_defaultcid = classid;
2404         ASSERT(zone->zone_defaultcid > 0 &&
2405             zone->zone_defaultcid < loaded_classes);
2406
2407         return (0);
2408 }
2409
2410 /*
2411  * Block indefinitely waiting for (zone_status >= status)
2412  */
2413 void
2414 zone_status_wait(zone_t *zone, zone_status_t status)
2415 {
2416         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2417
2418         mutex_enter(&zone_status_lock);
2419         while (zone->zone_status < status) {
2420                 cv_wait(&zone->zone_cv, &zone_status_lock);
2421         }
2422         mutex_exit(&zone_status_lock);
2423 }
2424
2425 /*
2426  * Private CPR-safe version of zone_status_wait().
2427  */
2428 static void
2429 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2430 {
2431         callb_cpr_t cprinfo;
2432
2433         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2434
2435         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2436             str);
2437         mutex_enter(&zone_status_lock);
2438         while (zone->zone_status < status) {
2439                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2440                 cv_wait(&zone->zone_cv, &zone_status_lock);
2441                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2442         }
2443         /*
2444          * zone_status_lock is implicitly released by the following.
2445          */
2446         CALLB_CPR_EXIT(&cprinfo);
2447 }
2448
2449 /*
2450  * Block until zone enters requested state or signal is received.  Return (0)
2451  * if signaled, non-zero otherwise.
2452  */
2453 int
2454 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2455 {
2456         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2457
2458         mutex_enter(&zone_status_lock);
2459         while (zone->zone_status < status) {
2460                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2461                         mutex_exit(&zone_status_lock);
2462                         return (0);
2463                 }
2464         }
2465         mutex_exit(&zone_status_lock);
2466         return (1);
2467 }
2468
2469 /*
2470  * Block until the zone enters the requested state or the timeout expires,
2471  * whichever happens first.  Return (-1) if operation timed out, time remaining
2472  * otherwise.
2473  */
2474 clock_t
2475 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2476 {
2477         clock_t timeleft = 0;
2478
2479         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2480
2481         mutex_enter(&zone_status_lock);
2482         while (zone->zone_status < status && timeleft != -1) {
2483                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2484         }
2485         mutex_exit(&zone_status_lock);
2486         return (timeleft);
2487 }
2488
2489 /*
2490  * Block until the zone enters the requested state, the current process is
2491  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2492  * operation timed out, 0 if signaled, time remaining otherwise.
2493  */
2494 clock_t
2495 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2496 {
2497         clock_t timeleft = tim - ddi_get_lbolt();
2498
2499         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2500
2501         mutex_enter(&zone_status_lock);
2502         while (zone->zone_status < status) {
2503                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2504                     tim);
2505                 if (timeleft <= 0)
2506                         break;
2507         }
2508         mutex_exit(&zone_status_lock);
2509         return (timeleft);
2510 }
2511
2512 /*
2513  * Zones have two reference counts: one for references from credential
2514  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2515  * This is so we can allow a zone to be rebooted while there are still
2516  * outstanding cred references, since certain drivers cache dblks (which
2517  * implicitly results in cached creds).  We wait for zone_ref to drop to
2518  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2519  * later freed when the zone_cred_ref drops to 0, though nothing other
2520  * than the zone id and privilege set should be accessed once the zone
2521  * is "dead".
2522  *
2523  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2524  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2525  * to 0.  This can be useful to flush out other sources of cached creds
2526  * that may be less innocuous than the driver case.
2527  *
2528  * Zones also provide a tracked reference counting mechanism in which zone
2529  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2530  * debuggers determine the sources of leaked zone references.  See
2531  * zone_hold_ref() and zone_rele_ref() below for more information.
2532  */
2533
/*
 * Debugging flag: when non-zero, a zone is only treated as unreferenced
 * once its cred reference count has also dropped to 0.
 */
int zone_wait_for_cred = 0;
2535
/*
 * Take a general (non-cred) reference on the zone by bumping zone_ref.
 * The caller must already hold z->zone_lock.
 */
static void
zone_hold_locked(zone_t *z)
{
        ASSERT(MUTEX_HELD(&z->zone_lock));
        z->zone_ref++;
        /* The count must not have wrapped around to zero. */
        ASSERT(z->zone_ref != 0);
}
2543
/*
 * Increment the specified zone's reference count.  The zone's zone_t structure
 * will not be freed as long as the zone's reference count is nonzero.
 * Decrement the zone's reference count via zone_rele().
 *
 * NOTE: This function should only be used to hold zones for short periods of
 * time.  Use zone_hold_ref() if the zone must be held for a long time.
 */
void
zone_hold(zone_t *z)
{
        /* zone_hold_locked() expects zone_lock to be held by its caller. */
        mutex_enter(&z->zone_lock);
        zone_hold_locked(z);
        mutex_exit(&z->zone_lock);
}
2559
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and either its cred reference count is 0 or we aren't
 * waiting for cred references (zone_wait_for_cred is zero).
 *
 * Note that the macro argument is evaluated more than once.
 */
#define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
            (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2567
2568 /*
2569  * Common zone reference release function invoked by zone_rele() and
2570  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2571  * zone's subsystem-specific reference counters are not affected by the
2572  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2573  * removed from the specified zone's reference list.  ref must be non-NULL iff
2574  * subsys is not ZONE_REF_NUM_SUBSYS.
2575  */
2576 static void
2577 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2578 {
2579         boolean_t wakeup;
2580
2581         mutex_enter(&z->zone_lock);
2582         ASSERT(z->zone_ref != 0);
2583         z->zone_ref--;
2584         if (subsys != ZONE_REF_NUM_SUBSYS) {
2585                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2586                 z->zone_subsys_ref[subsys]--;
2587                 list_remove(&z->zone_ref_list, ref);
2588         }
2589         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2590                 /* no more refs, free the structure */
2591                 mutex_exit(&z->zone_lock);
2592                 zone_free(z);
2593                 return;
2594         }
2595         /* signal zone_destroy so the zone can finish halting */
2596         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2597         mutex_exit(&z->zone_lock);
2598
2599         if (wakeup) {
2600                 /*
2601                  * Grabbing zonehash_lock here effectively synchronizes with
2602                  * zone_destroy() to avoid missed signals.
2603                  */
2604                 mutex_enter(&zonehash_lock);
2605                 cv_broadcast(&zone_destroy_cv);
2606                 mutex_exit(&zonehash_lock);
2607         }
2608 }
2609
/*
 * Decrement the specified zone's reference count.  The specified zone will
 * cease to exist after this function returns if the reference count drops to
 * zero.  This function should be paired with zone_hold().
 */
void
zone_rele(zone_t *z)
{
        /*
         * No zone_ref_t is involved, so pass ZONE_REF_NUM_SUBSYS to leave
         * the subsystem counters and the reference list alone.
         */
        zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
}
2620
/*
 * Initialize a zone reference structure.  This function must be invoked for
 * a reference structure before the structure is passed to zone_hold_ref().
 */
void
zone_init_ref(zone_ref_t *ref)
{
        /* A NULL zref_zone marks the structure as not holding a reference. */
        ref->zref_zone = NULL;
        list_link_init(&ref->zref_linkage);
}
2631
/*
 * Acquire a reference to zone z.  The caller must specify the
 * zone_ref_subsys_t constant associated with its subsystem.  The specified
 * zone_ref_t structure will represent a reference to the specified zone.  Use
 * zone_rele_ref() to release the reference.
 *
 * The referenced zone_t structure will not be freed as long as the zone_t's
 * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
 * references.
 *
 * NOTE: The zone_ref_t structure must be initialized before it is used.
 * See zone_init_ref() above.
 */
void
zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
{
        ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);

        /*
         * Prevent consumers from reusing a reference structure before
         * releasing it.
         */
        VERIFY(ref->zref_zone == NULL);

        ref->zref_zone = z;
        mutex_enter(&z->zone_lock);
        /* Bump the overall count, then the per-subsystem count. */
        zone_hold_locked(z);
        z->zone_subsys_ref[subsys]++;
        ASSERT(z->zone_subsys_ref[subsys] != 0);
        /* Record the crumb on the zone's list of tracked references. */
        list_insert_head(&z->zone_ref_list, ref);
        mutex_exit(&z->zone_lock);
}
2664
/*
 * Release the zone reference represented by the specified zone_ref_t.
 * The reference is invalid after it's released; however, the zone_ref_t
 * structure can be reused without having to invoke zone_init_ref().
 * subsys should be the same value that was passed to zone_hold_ref()
 * when the reference was acquired.
 */
void
zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
{
        /* Drops the count and unlinks ref from the zone's reference list. */
        zone_rele_common(ref->zref_zone, ref, subsys);

        /*
         * Set the zone_ref_t's zref_zone field to NULL to generate panics
         * when consumers dereference the reference.  This helps us catch
         * consumers who use released references.  Furthermore, this lets
         * consumers reuse the zone_ref_t structure without having to
         * invoke zone_init_ref().
         */
        ref->zref_zone = NULL;
}
2686
/*
 * Take a credential reference on the zone.  These are counted in
 * zone_cred_ref, separately from the general zone_ref count.
 */
void
zone_cred_hold(zone_t *z)
{
        mutex_enter(&z->zone_lock);
        z->zone_cred_ref++;
        /* The count must not have wrapped around to zero. */
        ASSERT(z->zone_cred_ref != 0);
        mutex_exit(&z->zone_lock);
}
2695
2696 void
2697 zone_cred_rele(zone_t *z)
2698 {
2699         boolean_t wakeup;
2700
2701         mutex_enter(&z->zone_lock);
2702         ASSERT(z->zone_cred_ref != 0);
2703         z->zone_cred_ref--;
2704         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2705                 /* no more refs, free the structure */
2706                 mutex_exit(&z->zone_lock);
2707                 zone_free(z);
2708                 return;
2709         }
2710         /*
2711          * If zone_destroy is waiting for the cred references to drain
2712          * out, and they have, signal it.
2713          */
2714         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2715             zone_status_get(z) >= ZONE_IS_DEAD);
2716         mutex_exit(&z->zone_lock);
2717
2718         if (wakeup) {
2719                 /*
2720                  * Grabbing zonehash_lock here effectively synchronizes with
2721                  * zone_destroy() to avoid missed signals.
2722                  */
2723                 mutex_enter(&zonehash_lock);
2724                 cv_broadcast(&zone_destroy_cv);
2725                 mutex_exit(&zonehash_lock);
2726         }
2727 }
2728
/*
 * Increment the zone's task count (zone_ntasks) under zone_lock.
 * zone_task_rele() performs the matching decrement.
 */
void
zone_task_hold(zone_t *z)
{
        mutex_enter(&z->zone_lock);
        z->zone_ntasks++;
        /* The count must not have wrapped around to zero. */
        ASSERT(z->zone_ntasks != 0);
        mutex_exit(&z->zone_lock);
}
2737
2738 void
2739 zone_task_rele(zone_t *zone)
2740 {
2741         uint_t refcnt;
2742
2743         mutex_enter(&zone->zone_lock);
2744         ASSERT(zone->zone_ntasks != 0);
2745         refcnt = --zone->zone_ntasks;
2746         if (refcnt > 1) {       /* Common case */
2747                 mutex_exit(&zone->zone_lock);
2748                 return;
2749         }
2750         zone_hold_locked(zone); /* so we can use the zone_t later */
2751         mutex_exit(&zone->zone_lock);
2752         if (refcnt == 1) {
2753                 /*
2754                  * See if the zone is shutting down.
2755                  */
2756                 mutex_enter(&zone_status_lock);
2757                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2758                         goto out;
2759                 }
2760
2761                 /*
2762                  * Make sure the ntasks didn't change since we
2763                  * dropped zone_lock.
2764                  */
2765                 mutex_enter(&zone->zone_lock);
2766                 if (refcnt != zone->zone_ntasks) {
2767                         mutex_exit(&zone->zone_lock);
2768                         goto out;
2769                 }
2770                 mutex_exit(&zone->zone_lock);
2771
2772                 /*
2773                  * No more user processes in the zone.  The zone is empty.
2774                  */
2775                 zone_status_set(zone, ZONE_IS_EMPTY);
2776                 goto out;
2777         }
2778
2779         ASSERT(refcnt == 0);
2780         /*
2781          * zsched has exited; the zone is dead.
2782          */
2783         zone->zone_zsched = NULL;               /* paranoia */
2784         mutex_enter(&zone_status_lock);
2785         zone_status_set(zone, ZONE_IS_DEAD);
2786 out:
2787         mutex_exit(&zone_status_lock);
2788         zone_rele(zone);
2789 }
2790
/*
 * Return the zone ID of the zone to which the current process belongs.
 */
zoneid_t
getzoneid(void)
{
        return (curproc->p_zone->zone_id);
}
2796
2797 /*
2798  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2799  * check the validity of a zone's state.
2800  */
2801 static zone_t *
2802 zone_find_all_by_id(zoneid_t zoneid)
2803 {
2804         mod_hash_val_t hv;
2805         zone_t *zone = NULL;
2806
2807         ASSERT(MUTEX_HELD(&zonehash_lock));
2808
2809         if (mod_hash_find(zonehashbyid,
2810             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2811                 zone = (zone_t *)hv;
2812         return (zone);
2813 }
2814
2815 static zone_t *
2816 zone_find_all_by_label(const ts_label_t *label)
2817 {
2818         mod_hash_val_t hv;
2819         zone_t *zone = NULL;
2820
2821         ASSERT(MUTEX_HELD(&zonehash_lock));
2822
2823         /*
2824          * zonehashbylabel is not maintained for unlabeled systems
2825          */
2826         if (!is_system_labeled())
2827                 return (NULL);
2828         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2829                 zone = (zone_t *)hv;
2830         return (zone);
2831 }
2832
2833 static zone_t *
2834 zone_find_all_by_name(char *name)
2835 {
2836         mod_hash_val_t hv;
2837         zone_t *zone = NULL;
2838
2839         ASSERT(MUTEX_HELD(&zonehash_lock));
2840
2841         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2842                 zone = (zone_t *)hv;
2843         return (zone);
2844 }
2845
2846 /*
2847  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2848  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2849  * Caller must call zone_rele() once it is done with the zone.
2850  *
2851  * The zone may begin the zone_destroy() sequence immediately after this
2852  * function returns, but may be safely used until zone_rele() is called.
2853  */
2854 zone_t *
2855 zone_find_by_id(zoneid_t zoneid)
2856 {
2857         zone_t *zone;
2858         zone_status_t status;
2859
2860         mutex_enter(&zonehash_lock);
2861         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2862                 mutex_exit(&zonehash_lock);
2863                 return (NULL);
2864         }
2865         status = zone_status_get(zone);
2866         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2867                 /*
2868                  * For all practical purposes the zone doesn't exist.
2869                  */
2870                 mutex_exit(&zonehash_lock);
2871                 return (NULL);
2872         }
2873         zone_hold(zone);
2874         mutex_exit(&zonehash_lock);
2875         return (zone);
2876 }
2877
2878 /*
2879  * Similar to zone_find_by_id, but using zone label as the key.
2880  */
2881 zone_t *
2882 zone_find_by_label(const ts_label_t *label)
2883 {
2884         zone_t *zone;
2885         zone_status_t status;
2886
2887         mutex_enter(&zonehash_lock);
2888         if ((zone = zone_find_all_by_label(label)) == NULL) {
2889                 mutex_exit(&zonehash_lock);
2890                 return (NULL);
2891         }
2892
2893         status = zone_status_get(zone);
2894         if (status > ZONE_IS_DOWN) {
2895                 /*
2896                  * For all practical purposes the zone doesn't exist.
2897                  */
2898                 mutex_exit(&zonehash_lock);
2899                 return (NULL);
2900         }
2901         zone_hold(zone);
2902         mutex_exit(&zonehash_lock);
2903         return (zone);
2904 }
2905
2906 /*
2907  * Similar to zone_find_by_id, but using zone name as the key.
2908  */
2909 zone_t *
2910 zone_find_by_name(char *name)
2911 {
2912         zone_t *zone;
2913         zone_status_t status;
2914
2915         mutex_enter(&zonehash_lock);
2916         if ((zone = zone_find_all_by_name(name)) == NULL) {
2917                 mutex_exit(&zonehash_lock);
2918                 return (NULL);
2919         }
2920         status = zone_status_get(zone);
2921         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2922                 /*
2923                  * For all practical purposes the zone doesn't exist.
2924                  */
2925                 mutex_exit(&zonehash_lock);
2926                 return (NULL);
2927         }
2928         zone_hold(zone);
2929         mutex_exit(&zonehash_lock);
2930         return (zone);
2931 }
2932
2933 /*
2934  * Similar to zone_find_by_id(), using the path as a key.  For instance,
2935  * if there is a zone "foo" rooted at /foo/root, and the path argument
2936  * is "/foo/root/proc", it will return the held zone_t corresponding to
2937  * zone "foo".
2938  *
2939  * zone_find_by_path() always returns a non-NULL value, since at the
2940  * very least every path will be contained in the global zone.
2941  *
2942  * As with the other zone_find_by_*() functions, the caller is
2943  * responsible for zone_rele()ing the return value of this function.
2944  */
2945 zone_t *
2946 zone_find_by_path(const char *path)
2947 {
2948         zone_t *zone;
2949         zone_t *zret = NULL;
2950         zone_status_t status;
2951
2952         if (path == NULL) {
2953                 /*
2954                  * Call from rootconf().
2955                  */
2956                 zone_hold(global_zone);
2957                 return (global_zone);
2958         }
2959         ASSERT(*path == '/');
2960         mutex_enter(&zonehash_lock);
2961         for (zone = list_head(&zone_active); zone != NULL;
2962             zone = list_next(&zone_active, zone)) {
2963                 if (ZONE_PATH_VISIBLE(path, zone))
2964                         zret = zone;
2965         }
2966         ASSERT(zret != NULL);
2967         status = zone_status_get(zret);
2968         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2969                 /*
2970                  * Zone practically doesn't exist.
2971                  */
2972                 zret = global_zone;
2973         }
2974         zone_hold(zret);
2975         mutex_exit(&zonehash_lock);
2976         return (zret);
2977 }
2978
2979 /*
2980  * Get the number of cpus visible to this zone.  The system-wide global
2981  * 'ncpus' is returned if pools are disabled, the caller is in the
2982  * global zone, or a NULL zone argument is passed in.
2983  */
2984 int
2985 zone_ncpus_get(zone_t *zone)
2986 {
2987         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2988
2989         return (myncpus != 0 ? myncpus : ncpus);
2990 }
2991
2992 /*
2993  * Get the number of online cpus visible to this zone.  The system-wide
2994  * global 'ncpus_online' is returned if pools are disabled, the caller
2995  * is in the global zone, or a NULL zone argument is passed in.
2996  */
2997 int
2998 zone_ncpus_online_get(zone_t *zone)
2999 {
3000         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3001
3002         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3003 }
3004
/*
 * Return the pool to which the zone is currently bound.  The caller must
 * hold the pool lock.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
        ASSERT(pool_lock_held());

        return (zone->zone_pool);
}
3015
/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.  The caller must hold both the pool lock
 * and cpu_lock.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
        ASSERT(pool_lock_held());
        ASSERT(MUTEX_HELD(&cpu_lock));

        zone->zone_pool = pool;
        /* Rebind the zone to the processor set backing the new pool. */
        zone_pset_set(zone, pool->pool_pset->pset_id);
}
3029
/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.  The caller must hold cpu_lock.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
        ASSERT(MUTEX_HELD(&cpu_lock));

        return (zone->zone_psetid);
}
3042
3043 /*
3044  * Set the cached value of the id of the processor set to which the zone
3045  * is currently bound.  Also update the zone's visibility to match the
3046  * resources in the new processor set.
3047  */
3048 void
3049 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3050 {
3051         psetid_t oldpsetid;
3052
3053         ASSERT(MUTEX_HELD(&cpu_lock));
3054         oldpsetid = zone_pset_get(zone);
3055
3056         if (oldpsetid == newpsetid)
3057                 return;
3058         /*
3059          * Global zone sees all.
3060          */
3061         if (zone != global_zone) {
3062                 zone->zone_psetid = newpsetid;
3063                 if (newpsetid != ZONE_PS_INVAL)
3064                         pool_pset_visibility_add(newpsetid, zone);
3065                 if (oldpsetid != ZONE_PS_INVAL)
3066                         pool_pset_visibility_remove(oldpsetid, zone);
3067         }
3068         /*
3069          * Disabling pools, so we should start using the global values
3070          * for ncpus and ncpus_online.
3071          */
3072         if (newpsetid == ZONE_PS_INVAL) {
3073                 zone->zone_ncpus = 0;
3074                 zone->zone_ncpus_online = 0;
3075         }
3076 }
3077
3078 /*
3079  * Walk the list of active zones and issue the provided callback for
3080  * each of them.
3081  *
3082  * Caller must not be holding any locks that may be acquired under
3083  * zonehash_lock.  See comment at the beginning of the file for a list of
3084  * common locks and their interactions with zones.
3085  */
3086 int
3087 zone_walk(int (*cb)(zone_t *, void *), void *data)
3088 {
3089         zone_t *zone;
3090         int ret = 0;
3091         zone_status_t status;
3092
3093         mutex_enter(&zonehash_lock);
3094         for (zone = list_head(&zone_active); zone != NULL;
3095             zone = list_next(&zone_active, zone)) {
3096                 /*
3097                  * Skip zones that shouldn't be externally visible.
3098                  */
3099                 status = zone_status_get(zone);
3100                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3101                         continue;
3102                 /*
3103                  * Bail immediately if any callback invocation returns a
3104                  * non-zero value.
3105                  */
3106                 ret = (*cb)(zone, data);
3107                 if (ret != 0)
3108                         break;
3109         }
3110         mutex_exit(&zonehash_lock);
3111         return (ret);
3112 }
3113
3114 static int
3115 zone_set_root(zone_t *zone, const char *upath)
3116 {
3117         vnode_t *vp;
3118         int trycount;
3119         int error = 0;
3120         char *path;
3121         struct pathname upn, pn;
3122         size_t pathlen;
3123
3124         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3125                 return (error);
3126
3127         pn_alloc(&pn);
3128
3129         /* prevent infinite loop */
3130         trycount = 10;
3131         for (;;) {
3132                 if (--trycount <= 0) {
3133                         error = ESTALE;
3134                         goto out;
3135                 }
3136
3137                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3138                         /*
3139                          * VOP_ACCESS() may cover 'vp' with a new
3140                          * filesystem, if 'vp' is an autoFS vnode.
3141                          * Get the new 'vp' if so.
3142                          */
3143                         if ((error =
3144                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3145                             (!vn_ismntpt(vp) ||
3146                             (error = traverse(&vp)) == 0)) {
3147                                 pathlen = pn.pn_pathlen + 2;
3148                                 path = kmem_alloc(pathlen, KM_SLEEP);
3149                                 (void) strncpy(path, pn.pn_path,
3150                                     pn.pn_pathlen + 1);
3151                                 path[pathlen - 2] = '/';
3152                                 path[pathlen - 1] = '\0';
3153                                 pn_free(&pn);
3154                                 pn_free(&upn);
3155
3156                                 /* Success! */
3157                                 break;
3158                         }
3159                         VN_RELE(vp);
3160                 }
3161                 if (error != ESTALE)
3162                         goto out;
3163         }
3164
3165         ASSERT(error == 0);
3166         zone->zone_rootvp = vp;         /* we hold a reference to vp */
3167         zone->zone_rootpath = path;
3168         zone->zone_rootpathlen = pathlen;
3169         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3170                 zone->zone_flags |= ZF_IS_SCRATCH;
3171         return (0);
3172
3173 out:
3174         pn_free(&pn);
3175         pn_free(&upn);
3176         return (error);
3177 }
3178
/*
 * Local alphanumeric test: true for the characters '0'-'9', 'a'-'z' and
 * 'A'-'Z'.  The argument is evaluated more than once, so it must not have
 * side effects.
 */
#define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
                        ((c) >= 'a' && (c) <= 'z') || \
                        ((c) >= 'A' && (c) <= 'Z'))
3182
3183 static int
3184 zone_set_name(zone_t *zone, const char *uname)
3185 {
3186         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3187         size_t len;
3188         int i, err;
3189
3190         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3191                 kmem_free(kname, ZONENAME_MAX);
3192                 return (err);   /* EFAULT or ENAMETOOLONG */
3193         }
3194
3195         /* must be less than ZONENAME_MAX */
3196         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3197                 kmem_free(kname, ZONENAME_MAX);
3198                 return (EINVAL);
3199         }
3200
3201         /*
3202          * Name must start with an alphanumeric and must contain only
3203          * alphanumerics, '-', '_' and '.'.
3204          */
3205         if (!isalnum(kname[0])) {
3206                 kmem_free(kname, ZONENAME_MAX);
3207                 return (EINVAL);
3208         }
3209         for (i = 1; i < len - 1; i++) {
3210                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3211                     kname[i] != '.') {
3212                         kmem_free(kname, ZONENAME_MAX);
3213                         return (EINVAL);
3214                 }
3215         }
3216
3217         zone->zone_name = kname;
3218         return (0);
3219 }
3220
3221 /*
3222  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3223  * is NULL or it points to a zone with no hostid emulation, then the machine's
3224  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3225  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3226  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3227  * hostid and the machine's hostid is invalid.
3228  */
3229 uint32_t
3230 zone_get_hostid(zone_t *zonep)
3231 {
3232         unsigned long machine_hostid;
3233
3234         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3235                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3236                         return (HW_INVALID_HOSTID);
3237                 return ((uint32_t)machine_hostid);
3238         }
3239         return (zonep->zone_hostid);
3240 }
3241
3242 /*
3243  * Similar to thread_create(), but makes sure the thread is in the appropriate
3244  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3245  */
3246 /*ARGSUSED*/
3247 kthread_t *
3248 zthread_create(
3249     caddr_t stk,
3250     size_t stksize,
3251     void (*proc)(),
3252     void *arg,
3253     size_t len,
3254     pri_t pri)
3255 {
3256         kthread_t *t;
3257         zone_t *zone = curproc->p_zone;
3258         proc_t *pp = zone->zone_zsched;
3259
3260         zone_hold(zone);        /* Reference to be dropped when thread exits */
3261
3262         /*
3263          * No-one should be trying to create threads if the zone is shutting
3264          * down and there aren't any kernel threads around.  See comment
3265          * in zthread_exit().
3266          */
3267         ASSERT(!(zone->zone_kthreads == NULL &&
3268             zone_status_get(zone) >= ZONE_IS_EMPTY));
3269         /*
3270          * Create a thread, but don't let it run until we've finished setting
3271          * things up.
3272          */
3273         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3274         ASSERT(t->t_forw == NULL);
3275         mutex_enter(&zone_status_lock);
3276         if (zone->zone_kthreads == NULL) {
3277                 t->t_forw = t->t_back = t;
3278         } else {
3279                 kthread_t *tx = zone->zone_kthreads;
3280
3281                 t->t_forw = tx;
3282                 t->t_back = tx->t_back;
3283                 tx->t_back->t_forw = t;
3284                 tx->t_back = t;
3285         }
3286         zone->zone_kthreads = t;
3287         mutex_exit(&zone_status_lock);
3288
3289         mutex_enter(&pp->p_lock);
3290         t->t_proc_flag |= TP_ZTHREAD;
3291         project_rele(t->t_proj);
3292         t->t_proj = project_hold(pp->p_task->tk_proj);
3293
3294         /*
3295          * Setup complete, let it run.
3296          */
3297         thread_lock(t);
3298         t->t_schedflag |= TS_ALLSTART;
3299         setrun_locked(t);
3300         thread_unlock(t);
3301
3302         mutex_exit(&pp->p_lock);
3303
3304         return (t);
3305 }
3306
3307 /*
3308  * Similar to thread_exit().  Must be called by threads created via
3309  * zthread_exit().
3310  */
3311 void
3312 zthread_exit(void)
3313 {
3314         kthread_t *t = curthread;
3315         proc_t *pp = curproc;
3316         zone_t *zone = pp->p_zone;
3317
3318         mutex_enter(&zone_status_lock);
3319
3320         /*
3321          * Reparent to p0
3322          */
3323         kpreempt_disable();
3324         mutex_enter(&pp->p_lock);
3325         t->t_proc_flag &= ~TP_ZTHREAD;
3326         t->t_procp = &p0;
3327         hat_thread_exit(t);
3328         mutex_exit(&pp->p_lock);
3329         kpreempt_enable();
3330
3331         if (t->t_back == t) {
3332                 ASSERT(t->t_forw == t);
3333                 /*
3334                  * If the zone is empty, once the thread count
3335                  * goes to zero no further kernel threads can be
3336                  * created.  This is because if the creator is a process
3337                  * in the zone, then it must have exited before the zone
3338                  * state could be set to ZONE_IS_EMPTY.
3339                  * Otherwise, if the creator is a kernel thread in the
3340                  * zone, the thread count is non-zero.
3341                  *
3342                  * This really means that non-zone kernel threads should
3343                  * not create zone kernel threads.
3344                  */
3345                 zone->zone_kthreads = NULL;
3346                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3347                         zone_status_set(zone, ZONE_IS_DOWN);
3348                         /*
3349                          * Remove any CPU caps on this zone.
3350                          */
3351                         cpucaps_zone_remove(zone);
3352                 }
3353         } else {
3354                 t->t_forw->t_back = t->t_back;
3355                 t->t_back->t_forw = t->t_forw;
3356                 if (zone->zone_kthreads == t)
3357                         zone->zone_kthreads = t->t_forw;
3358         }
3359         mutex_exit(&zone_status_lock);
3360         zone_rele(zone);
3361         thread_exit();
3362         /* NOTREACHED */
3363 }
3364
3365 static void
3366 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3367 {
3368         vnode_t *oldvp;
3369
3370         /* we're going to hold a reference here to the directory */
3371         VN_HOLD(vp);
3372
3373         /* update abs cwd/root path see c2/audit.c */
3374         if (AU_AUDITING())
3375                 audit_chdirec(vp, vpp);
3376
3377         mutex_enter(&pp->p_lock);
3378         oldvp = *vpp;
3379         *vpp = vp;
3380         mutex_exit(&pp->p_lock);
3381         if (oldvp != NULL)
3382                 VN_RELE(oldvp);
3383 }
3384
3385 /*
3386  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3387  */
3388 static int
3389 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3390 {
3391         nvpair_t *nvp = NULL;
3392         boolean_t priv_set = B_FALSE;
3393         boolean_t limit_set = B_FALSE;
3394         boolean_t action_set = B_FALSE;
3395
3396         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3397                 const char *name;
3398                 uint64_t ui64;
3399
3400                 name = nvpair_name(nvp);
3401                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3402                         return (EINVAL);
3403                 (void) nvpair_value_uint64(nvp, &ui64);
3404                 if (strcmp(name, "privilege") == 0) {
3405                         /*
3406                          * Currently only privileged values are allowed, but
3407                          * this may change in the future.
3408                          */
3409                         if (ui64 != RCPRIV_PRIVILEGED)
3410                                 return (EINVAL);
3411                         rv->rcv_privilege = ui64;
3412                         priv_set = B_TRUE;
3413                 } else if (strcmp(name, "limit") == 0) {
3414                         rv->rcv_value = ui64;
3415                         limit_set = B_TRUE;
3416                 } else if (strcmp(name, "action") == 0) {
3417                         if (ui64 != RCTL_LOCAL_NOACTION &&
3418                             ui64 != RCTL_LOCAL_DENY)
3419                                 return (EINVAL);
3420                         rv->rcv_flagaction = ui64;
3421                         action_set = B_TRUE;
3422                 } else {
3423                         return (EINVAL);
3424                 }
3425         }
3426
3427         if (!(priv_set && limit_set && action_set))
3428                 return (EINVAL);
3429         rv->rcv_action_signal = 0;
3430         rv->rcv_action_recipient = NULL;
3431         rv->rcv_action_recip_pid = -1;
3432         rv->rcv_firing_time = 0;
3433
3434         return (0);
3435 }
3436
3437 /*
3438  * Non-global zone version of start_init.
3439  */
3440 void
3441 zone_start_init(void)
3442 {
3443         proc_t *p = ttoproc(curthread);
3444         zone_t *z = p->p_zone;
3445
3446         ASSERT(!INGLOBALZONE(curproc));
3447
3448         /*
3449          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3450          * storing just the pid of init is sufficient.
3451          */
3452         z->zone_proc_initpid = p->p_pid;
3453
3454         /*
3455          * We maintain zone_boot_err so that we can return the cause of the
3456          * failure back to the caller of the zone_boot syscall.
3457          */
3458         p->p_zone->zone_boot_err = start_init_common();
3459
3460         /*
3461          * We will prevent booting zones from becoming running zones if the
3462          * global zone is shutting down.
3463          */
3464         mutex_enter(&zone_status_lock);
3465         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3466             ZONE_IS_SHUTTING_DOWN) {
3467                 /*
3468                  * Make sure we are still in the booting state-- we could have
3469                  * raced and already be shutting down, or even further along.
3470                  */
3471                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3472                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3473                 }
3474                 mutex_exit(&zone_status_lock);
3475                 /* It's gone bad, dispose of the process */
3476                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3477                         mutex_enter(&p->p_lock);
3478                         ASSERT(p->p_flag & SEXITLWPS);
3479                         lwp_exit();
3480                 }
3481         } else {
3482                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3483                         zone_status_set(z, ZONE_IS_RUNNING);
3484                 mutex_exit(&zone_status_lock);
3485                 /* cause the process to return to userland. */
3486                 lwp_rtt();
3487         }
3488 }
3489
3490 struct zsched_arg {     /* argument block zsched() receives through its void *arg */
3491         zone_t *zone;           /* zone that the new zsched process will serve */
3492         nvlist_t *nvlist;       /* rctl nvlist that zsched() applies to the zone */
3493 };
3494
3495 /*
3496  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3497  * anything to do with scheduling, but rather with the fact that
3498  * per-zone kernel threads are parented to zsched, just like regular
3499  * kernel threads are parented to sched (p0).
3500  *
3501  * zsched is also responsible for launching init for the zone.
      *
      * 'arg' points to a struct zsched_arg naming the zone to serve and the
      * nvlist of rctls to apply to it.  In order, this function:
      *
      *   - relabels the current process as "zsched" and passes its file
      *     table to closeall();
      *   - takes a hold on the zone and records itself as zone_zsched;
      *   - creates a session and reparents itself under proc_init;
      *   - moves its lwp, process and locked-memory accounting from
      *     global_zone (and its project) to the new zone (and the project of
      *     the task it joins);
      *   - switches to the zone_kcred credentials and points both u_cdir and
      *     u_rdir at zone_rootvp;
      *   - creates the zone's rctl set and installs the values from the
      *     nvlist;
      *   - moves the zone from UNINITIALIZED to INITIALIZED, runs the ZSD
      *     create callbacks, and then marks the zone READY;
      *   - waits for ZONE_IS_BOOTING and, if that state is observed, calls
      *     newproc() with zone_start_init;
      *   - waits for ZONE_IS_DYING, abandons the contract (if any), drops
      *     zone_kcred and exits.
3502  */
3503 static void
3504 zsched(void *arg)
3505 {
3506         struct zsched_arg *za = arg;
3507         proc_t *pp = curproc;
3508         proc_t *initp = proc_init;
3509         zone_t *zone = za->zone;
3510         cred_t *cr, *oldcred;
3511         rctl_set_t *set;
3512         rctl_alloc_gp_t *gp;
3513         contract_t *ct = NULL;
3514         task_t *tk, *oldtk;
3515         rctl_entity_p_t e;
3516         kproject_t *pj;
3517
3518         nvlist_t *nvl = za->nvlist;
3519         nvpair_t *nvp = NULL;
3520
             /*
              * Relabel the process: u_psargs and u_comm both become "zsched"
              * (sizeof of the literal includes the terminating NUL), and the
              * argument and environment pointers are cleared.
              */
3521         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3522         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3523         PTOU(pp)->u_argc = 0;
3524         PTOU(pp)->u_argv = NULL;
3525         PTOU(pp)->u_envp = NULL;
3526         closeall(P_FINFO(pp));
3527
3528         /*
3529          * We are this zone's "zsched" process.  As the zone isn't generally
3530          * visible yet we don't need to grab any locks before initializing its
3531          * zone_proc pointer.
3532          */
3533         zone_hold(zone);  /* this hold is released by zone_destroy() */
3534         zone->zone_zsched = pp;
3535         mutex_enter(&pp->p_lock);
3536         pp->p_zone = zone;
3537         mutex_exit(&pp->p_lock);
3538
3539         /*
3540          * Disassociate process from its 'parent'; parent ourselves to init
3541          * (pid 1) and change other values as needed.
3542          */
3543         sess_create();
3544
3545         mutex_enter(&pidlock);
3546         proc_detach(pp);
3547         pp->p_ppid = 1;
3548         pp->p_flag |= SZONETOP;
3549         pp->p_ancpid = 1;
3550         pp->p_parent = initp;
             /* Link ourselves in at the head of init's list of children. */
3551         pp->p_psibling = NULL;
3552         if (initp->p_child)
3553                 initp->p_child->p_psibling = pp;
3554         pp->p_sibling = initp->p_child;
3555         initp->p_child = pp;
3556
3557         /* Decrement what newproc() incremented. */
3558         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3559         /*
3560          * Our credentials are about to become kcred-like, so we don't care
3561          * about the caller's ruid.
3562          */
3563         upcount_inc(crgetruid(kcred), zone->zone_id);
3564         mutex_exit(&pidlock);
3565
3566         /*
3567          * getting out of global zone, so decrement lwp and process counts
3568          */
3569         pj = pp->p_task->tk_proj;
3570         mutex_enter(&global_zone->zone_nlwps_lock);
3571         pj->kpj_nlwps -= pp->p_lwpcnt;
3572         global_zone->zone_nlwps -= pp->p_lwpcnt;
3573         pj->kpj_nprocs--;
3574         global_zone->zone_nprocs--;
3575         mutex_exit(&global_zone->zone_nlwps_lock);
3576
3577         /*
3578          * Decrement locked memory counts on old zone and project.
3579          */
3580         mutex_enter(&global_zone->zone_mem_lock);
3581         global_zone->zone_locked_mem -= pp->p_locked_mem;
3582         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3583         mutex_exit(&global_zone->zone_mem_lock);
3584
3585         /*
3586          * Create and join a new task in project '0' of this zone.
3587          *
3588          * We don't need to call holdlwps() since we know we're the only lwp in
3589          * this process.
3590          *
3591          * task_join() returns with p_lock held.
3592          */
3593         tk = task_create(0, zone);
3594         mutex_enter(&cpu_lock);
3595         oldtk = task_join(tk, 0);
3596
             /* Re-read the project: p_task now refers to the task just joined. */
3597         pj = pp->p_task->tk_proj;
3598
3599         mutex_enter(&zone->zone_mem_lock);
3600         zone->zone_locked_mem += pp->p_locked_mem;
3601         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3602         mutex_exit(&zone->zone_mem_lock);
3603
3604         /*
3605          * add lwp and process counts to zsched's zone, and increment
3606          * project's task and process count due to the task created in
3607          * the above task_create.
3608          */
3609         mutex_enter(&zone->zone_nlwps_lock);
3610         pj->kpj_nlwps += pp->p_lwpcnt;
3611         pj->kpj_ntasks += 1;
3612         zone->zone_nlwps += pp->p_lwpcnt;
3613         pj->kpj_nprocs++;
3614         zone->zone_nprocs++;
3615         mutex_exit(&zone->zone_nlwps_lock);
3616
             /* Drop the p_lock that task_join() returned holding, then cpu_lock. */
3617         mutex_exit(&curproc->p_lock);
3618         mutex_exit(&cpu_lock);
3619         task_rele(oldtk);
3620
3621         /*
3622          * The process was created by a process in the global zone, hence the
3623          * credentials are wrong.  We might as well have kcred-ish credentials.
3624          */
3625         cr = zone->zone_kcred;
3626         crhold(cr);
3627         mutex_enter(&pp->p_crlock);
3628         oldcred = pp->p_cred;
3629         pp->p_cred = cr;
3630         mutex_exit(&pp->p_crlock);
3631         crfree(oldcred);
3632
3633         /*
3634          * Hold credentials again (for thread)
3635          */
3636         crhold(cr);
3637
3638         /*
3639          * p_lwpcnt can't change since this is a kernel process.
3640          */
3641         crset(pp, cr);
3642
3643         /*
3644          * Chroot
              * (both the current directory and the root directory are pointed
              * at the zone's root vnode)
3645          */
3646         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3647         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3648
3649         /*
3650          * Initialize zone's rctl set.
3651          */
3652         set = rctl_set_create();
3653         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3654         mutex_enter(&pp->p_lock);
3655         e.rcep_p.zone = zone;
3656         e.rcep_t = RCENTITY_ZONE;
3657         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3658         mutex_exit(&pp->p_lock);
3659         rctl_prealloc_destroy(gp);
3660
3661         /*
3662          * Apply the rctls passed in to zone_create().  This is basically a list
3663          * assignment: all of the old values are removed and the new ones
3664          * inserted.  That is, if an empty list is passed in, all values are
3665          * removed.
3666          */
3667         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3668                 rctl_dict_entry_t *rde;
3669                 rctl_hndl_t hndl;
3670                 char *name;
3671                 nvlist_t **nvlarray;
3672                 uint_t i, nelem;
3673                 int error;      /* For ASSERT()s */
3674
3675                 name = nvpair_name(nvp);
3676                 hndl = rctl_hndl_lookup(name);
3677                 ASSERT(hndl != -1);
3678                 rde = rctl_dict_lookup_hndl(hndl);
3679                 ASSERT(rde != NULL);
3680
                     /*
                      * Remove the existing local values: each pass fetches a
                      * value and deletes it, and the loop ends as soon as the
                      * value fetched carries RCPRIV_SYSTEM privilege.
                      */
3681                 for (; /* ever */; ) {
3682                         rctl_val_t oval;
3683
3684                         mutex_enter(&pp->p_lock);
3685                         error = rctl_local_get(hndl, NULL, &oval, pp);
3686                         mutex_exit(&pp->p_lock);
3687                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3688                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3689                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3690                                 break;
3691                         mutex_enter(&pp->p_lock);
3692                         error = rctl_local_delete(hndl, &oval, pp);
3693                         mutex_exit(&pp->p_lock);
3694                         ASSERT(error == 0);
3695                 }
3696                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3697                 ASSERT(error == 0);
                     /* Convert each array element and insert it as a new value. */
3698                 for (i = 0; i < nelem; i++) {
3699                         rctl_val_t *nvalp;
3700
3701                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3702                         error = nvlist2rctlval(nvlarray[i], nvalp);
3703                         ASSERT(error == 0);
3704                         /*
3705                          * rctl_local_insert can fail if the value being
3706                          * inserted is a duplicate; this is OK.
3707                          */
3708                         mutex_enter(&pp->p_lock);
3709                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
3710                                 kmem_cache_free(rctl_val_cache, nvalp);
3711                         mutex_exit(&pp->p_lock);
3712                 }
3713         }
3714         /*
3715          * Tell the world that we're done setting up.
3716          *
3717          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3718          * and atomically set the zone's processor set visibility.  Once
3719          * we drop pool_lock() this zone will automatically get updated
3720          * to reflect any future changes to the pools configuration.
3721          *
3722          * Note that after we drop the locks below (zonehash_lock in
3723          * particular) other operations such as a zone_getattr call can
3724          * now proceed and observe the zone. That is the reason for doing a
3725          * state transition to the INITIALIZED state.
3726          */
3727         pool_lock();
3728         mutex_enter(&cpu_lock);
3729         mutex_enter(&zonehash_lock);
3730         zone_uniqid(zone);
3731         zone_zsd_configure(zone);
3732         if (pool_state == POOL_ENABLED)
3733                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
3734         mutex_enter(&zone_status_lock);
3735         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3736         zone_status_set(zone, ZONE_IS_INITIALIZED);
3737         mutex_exit(&zone_status_lock);
3738         mutex_exit(&zonehash_lock);
3739         mutex_exit(&cpu_lock);
3740         pool_unlock();
3741
3742         /* Now call the create callback for this key */
3743         zsd_apply_all_keys(zsd_apply_create, zone);
3744
3745         /* The callbacks are complete. Mark ZONE_IS_READY */
3746         mutex_enter(&zone_status_lock);
3747         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3748         zone_status_set(zone, ZONE_IS_READY);
3749         mutex_exit(&zone_status_lock);
3750
3751         /*
3752          * Once we see the zone transition to the ZONE_IS_BOOTING state,
3753          * we launch init, and set the state to running.
3754          */
3755         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3756
             /*
              * Only launch init when the state seen after the wait is exactly
              * BOOTING; for any other state the block below is skipped.
              */
3757         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3758                 id_t cid;
3759
3760                 /*
3761                  * Ok, this is a little complicated.  We need to grab the
3762                  * zone's pool's scheduling class ID; note that by now, we
3763                  * are already bound to a pool if we need to be (zoneadmd
3764                  * will have done that to us while we're in the READY
3765                  * state).  *But* the scheduling class for the zone's 'init'
3766                  * must be explicitly passed to newproc, which doesn't
3767                  * respect pool bindings.
3768                  *
3769                  * We hold the pool_lock across the call to newproc() to
3770                  * close the obvious race: the pool's scheduling class
3771                  * could change before we manage to create the LWP with
3772                  * classid 'cid'.
3773                  */
3774                 pool_lock();
                     /*
                      * Class selection order: the zone's own default class if
                      * one is set, else the pool's class, else defaultcid.
                      */
3775                 if (zone->zone_defaultcid > 0)
3776                         cid = zone->zone_defaultcid;
3777                 else
3778                         cid = pool_get_class(zone->zone_pool);
3779                 if (cid == -1)
3780                         cid = defaultcid;
3781
3782                 /*
3783                  * If this fails, zone_boot will ultimately fail.  The
3784                  * state of the zone will be set to SHUTTING_DOWN-- userland
3785                  * will have to tear down the zone, and fail, or try again.
                      *
                      * NOTE(review): &ct is presumably filled in by newproc()
                      * with the contract that is abandoned further down --
                      * confirm against newproc().
3786                  */
3787                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3788                     minclsyspri - 1, &ct, 0)) != 0) {
3789                         mutex_enter(&zone_status_lock);
3790                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3791                         mutex_exit(&zone_status_lock);
3792                 } else {
3793                         zone->zone_boot_time = gethrestime_sec();
3794                 }
3795
3796                 pool_unlock();
3797         }
3798
3799         /*
3800          * Wait for zone_destroy() to be called.  This is what we spend
3801          * most of our life doing.
3802          */
3803         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3804
3805         if (ct)
3806                 /*
3807                  * At this point the process contract should be empty.
3808                  * (Though if it isn't, it's not the end of the world.)
3809                  */
3810                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3811
3812         /*
3813          * Allow kcred to be freed when all referring processes
3814          * (including this one) go away.  We can't just do this in
3815          * zone_free because we need to wait for the zone_cred_ref to
3816          * drop to 0 before calling zone_free, and the existence of
3817          * zone_kcred will prevent that.  Thus, we call crfree here to
3818          * balance the crdup in zone_create.  The crhold calls earlier
3819          * in zsched will be dropped when the thread and process exit.
3820          */
3821         crfree(zone->zone_kcred);
3822         zone->zone_kcred = NULL;
3823
3824         exit(CLD_EXITED, 0);
3825 }
3826
3827 /*
3828  * Helper function to determine if there are any submounts of the
3829  * provided path.  Used to make sure the zone doesn't "inherit" any
3830  * mounts from before it is created.
3831  */
3832 static uint_t
3833 zone_mount_count(const char *rootpath)
3834 {
3835         vfs_t *vfsp;
3836         uint_t count = 0;
3837         size_t rootpathlen = strlen(rootpath);
3838
3839         /*
3840          * Holding zonehash_lock prevents race conditions with
3841          * vfs_list_add()/vfs_list_remove() since we serialize with
3842          * zone_find_by_path().
3843          */
3844         ASSERT(MUTEX_HELD(&zonehash_lock));
3845         /*
3846          * The rootpath must end with a '/'
3847          */
3848         ASSERT(rootpath[rootpathlen - 1] == '/');
3849
3850         /*
3851          * This intentionally does not count the rootpath itself if that
3852          * happens to be a mount point.
3853          */
3854         vfs_list_read_lock();
3855         vfsp = rootvfs;
3856         do {
3857                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3858                     rootpathlen) == 0)
3859                         count++;
3860                 vfsp = vfsp->vfs_next;
3861         } while (vfsp != rootvfs);
3862         vfs_list_unlock();
3863         return (count);
3864 }
3865
3866 /*
3867  * Helper function to make sure that a zone created on 'rootpath'
3868  * wouldn't end up containing other zones' rootpaths.
3869  */
3870 static boolean_t
3871 zone_is_nested(const char *rootpath)
3872 {
3873         zone_t *zone;
3874         size_t rootpathlen = strlen(rootpath);
3875         size_t len;
3876
3877         ASSERT(MUTEX_HELD(&zonehash_lock));
3878
3879         /*
3880          * zone_set_root() appended '/' and '\0' at the end of rootpath
3881          */
3882         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
3883             (rootpath[1] == '/') && (rootpath[2] == '\0'))
3884                 return (B_TRUE);
3885
3886         for (zone = list_head(&zone_active); zone != NULL;
3887             zone = list_next(&zone_active, zone)) {
3888                 if (zone == global_zone)
3889                         continue;
3890                 len = strlen(zone->zone_rootpath);
3891                 if (strncmp(rootpath, zone->zone_rootpath,
3892                     MIN(rootpathlen, len)) == 0)
3893                         return (B_TRUE);
3894         }
3895         return (B_FALSE);
3896 }
3897
3898 static int
3899 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3900     size_t zone_privssz)
3901 {
3902         priv_set_t *privs;
3903
3904         if (zone_privssz < sizeof (priv_set_t))
3905                 return (ENOMEM);
3906
3907         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3908
3909         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3910                 kmem_free(privs, sizeof (priv_set_t));
3911                 return (EFAULT);
3912         }
3913
3914         zone->zone_privset = privs;
3915         return (0);
3916 }
3917
3918 /*
3919  * We make creative use of nvlists to pass in rctls from userland.  The list is
3920  * a list of the following structures:
3921  *
3922  * (name = rctl_name, value = nvpair_list_array)
3923  *
3924  * Where each element of the nvpair_list_array is of the form:
3925  *
3926  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3927  *      (name = "limit", value = uint64_t),
3928  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3929  */
3930 static int
3931 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3932 {
3933         nvpair_t *nvp = NULL;
3934         nvlist_t *nvl = NULL;
3935         char *kbuf;
3936         int error;
3937         rctl_val_t rv;
3938
3939         *nvlp = NULL;
3940
3941         if (buflen == 0)
3942                 return (0);
3943
3944         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3945                 return (ENOMEM);
3946         if (copyin(ubuf, kbuf, buflen)) {
3947                 error = EFAULT;
3948                 goto out;
3949         }
3950         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3951                 /*
3952                  * nvl may have been allocated/free'd, but the value set to
3953                  * non-NULL, so we reset it here.
3954                  */
3955                 nvl = NULL;
3956                 error = EINVAL;
3957                 goto out;
3958         }
3959         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3960                 rctl_dict_entry_t *rde;
3961                 rctl_hndl_t hndl;
3962                 nvlist_t **nvlarray;
3963                 uint_t i, nelem;
3964                 char *name;
3965
3966                 error = EINVAL;
3967                 name = nvpair_name(nvp);
3968                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3969                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3970                         goto out;
3971                 }
3972                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
3973                         goto out;
3974                 }
3975                 rde = rctl_dict_lookup_hndl(hndl);
3976                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3977                 ASSERT(error == 0);
3978                 for (i = 0; i < nelem; i++) {
3979                         if (error = nvlist2rctlval(nvlarray[i], &rv))
3980                                 goto out;
3981                 }
3982                 if (rctl_invalid_value(rde, &rv)) {
3983                         error = EINVAL;
3984                         goto out;
3985                 }
3986         }
3987         error = 0;
3988         *nvlp = nvl;
3989 out:
3990         kmem_free(kbuf, buflen);
3991         if (error && nvl != NULL)
3992                 nvlist_free(nvl);
3993         return (error);
3994 }
3995
3996 int
3997 zone_create_error(int er_error, int er_ext, int *er_out) {
3998         if (er_out != NULL) {
3999                 if (copyout(&er_ext, er_out, sizeof (int))) {
4000                         return (set_errno(EFAULT));
4001                 }
4002         }
4003         return (set_errno(er_error));
4004 }
4005
4006 static int
4007 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4008 {
4009         ts_label_t *tsl;
4010         bslabel_t blab;
4011
4012         /* Get label from user */
4013         if (copyin(lab, &blab, sizeof (blab)) != 0)
4014                 return (EFAULT);
4015         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4016         if (tsl == NULL)
4017                 return (ENOMEM);
4018
4019         zone->zone_slabel = tsl;
4020         return (0);
4021 }
4022
4023 /*
4024  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4025  */
4026 static int
4027 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4028 {
4029         char *kbuf;
4030         char *dataset, *next;
4031         zone_dataset_t *zd;
4032         size_t len;
4033
4034         if (ubuf == NULL || buflen == 0)
4035                 return (0);
4036
4037         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4038                 return (ENOMEM);
4039
4040         if (copyin(ubuf, kbuf, buflen) != 0) {
4041                 kmem_free(kbuf, buflen);
4042                 return (EFAULT);
4043         }
4044
4045         dataset = next = kbuf;
4046         for (;;) {
4047                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4048
4049                 next = strchr(dataset, ',');
4050
4051                 if (next == NULL)
4052                         len = strlen(dataset);
4053                 else
4054                         len = next - dataset;
4055
4056                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4057                 bcopy(dataset, zd->zd_dataset, len);
4058                 zd->zd_dataset[len] = '\0';
4059
4060                 list_insert_head(&zone->zone_datasets, zd);
4061
4062                 if (next == NULL)
4063                         break;
4064
4065                 dataset = next + 1;
4066         }
4067
4068         kmem_free(kbuf, buflen);
4069         return (0);
4070 }
4071
4072 /*
4073  * System call to create/initialize a new zone named 'zone_name', rooted
4074  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4075  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4076  * with labeling set by 'match', 'doi', and 'label'.
4077  *
4078  * If extended error is non-null, we may use it to return more detailed
4079  * error information.
4080  */
4081 static zoneid_t
4082 zone_create(const char *zone_name, const char *zone_root,
4083     const priv_set_t *zone_privs, size_t zone_privssz,
4084     caddr_t rctlbuf, size_t rctlbufsz,
4085     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4086     int match, uint32_t doi, const bslabel_t *label,
4087     int flags)
4088 {
4089         struct zsched_arg zarg;
4090         nvlist_t *rctls = NULL;
4091         proc_t *pp = curproc;
4092         zone_t *zone, *ztmp;
4093         zoneid_t zoneid;
4094         int error;
4095         int error2 = 0;
4096         char *str;
4097         cred_t *zkcr;
4098         boolean_t insert_label_hash;
4099
4100         if (secpolicy_zone_config(CRED()) != 0)
4101                 return (set_errno(EPERM));
4102
4103         /* can't boot zone from within chroot environment */
4104         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4105                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4106                     extended_error));
4107
4108         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4109         zoneid = zone->zone_id = id_alloc(zoneid_space);
4110         zone->zone_status = ZONE_IS_UNINITIALIZED;
4111         zone->zone_pool = pool_default;
4112         zone->zone_pool_mod = gethrtime();
4113         zone->zone_psetid = ZONE_PS_INVAL;
4114         zone->zone_ncpus = 0;
4115         zone->zone_ncpus_online = 0;
4116         zone->zone_restart_init = B_TRUE;
4117         zone->zone_brand = &native_brand;
4118         zone->zone_initname = NULL;
4119         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4120         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4121         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4122         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4123         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4124             offsetof(zone_ref_t, zref_linkage));
4125         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4126             offsetof(struct zsd_entry, zsd_linkage));
4127         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4128             offsetof(zone_dataset_t, zd_linkage));
4129         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4130             offsetof(zone_dl_t, zdl_linkage));
4131         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4132         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4133
4134         if (flags & ZCF_NET_EXCL) {
4135                 zone->zone_flags |= ZF_NET_EXCL;
4136         }
4137
4138         if ((error = zone_set_name(zone, zone_name)) != 0) {
4139                 zone_free(zone);
4140                 return (zone_create_error(error, 0, extended_error));
4141         }
4142
4143         if ((error = zone_set_root(zone, zone_root)) != 0) {
4144                 zone_free(zone);
4145                 return (zone_create_error(error, 0, extended_error));
4146         }
4147         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4148                 zone_free(zone);
4149                 return (zone_create_error(error, 0, extended_error));
4150         }
4151
4152         /* initialize node name to be the same as zone name */
4153         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4154         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4155         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4156
4157         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4158         zone->zone_domain[0] = '\0';
4159         zone->zone_hostid = HW_INVALID_HOSTID;
4160         zone->zone_shares = 1;
4161         zone->zone_shmmax = 0;
4162         zone->zone_ipc.ipcq_shmmni = 0;
4163         zone->zone_ipc.ipcq_semmni = 0;
4164         zone->zone_ipc.ipcq_msgmni = 0;
4165         zone->zone_bootargs = NULL;
4166         zone->zone_fs_allowed = NULL;
4167         zone->zone_initname =
4168             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4169         (void) strcpy(zone->zone_initname, zone_default_initname);
4170         zone->zone_nlwps = 0;
4171         zone->zone_nlwps_ctl = INT_MAX;
4172         zone->zone_nprocs = 0;
4173         zone->zone_nprocs_ctl = INT_MAX;
4174         zone->zone_locked_mem = 0;
4175         zone->zone_locked_mem_ctl = UINT64_MAX;
4176         zone->zone_max_swap = 0;
4177         zone->zone_max_swap_ctl = UINT64_MAX;
4178         zone->zone_max_lofi = 0;
4179         zone->zone_max_lofi_ctl = UINT64_MAX;
4180         zone0.zone_lockedmem_kstat = NULL;
4181         zone0.zone_swapresv_kstat = NULL;
4182
4183         /*
4184          * Zsched initializes the rctls.
4185          */
4186         zone->zone_rctls = NULL;
4187
4188         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4189                 zone_free(zone);
4190                 return (zone_create_error(error, 0, extended_error));
4191         }
4192
4193         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4194                 zone_free(zone);
4195                 return (set_errno(error));
4196         }
4197
4198         /*
4199          * Read in the trusted system parameters:
4200          * match flag and sensitivity label.
4201          */
4202         zone->zone_match = match;
4203         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4204                 /* Fail if requested to set doi to anything but system's doi */
4205                 if (doi != 0 && doi != default_doi) {
4206                         zone_free(zone);
4207                         return (set_errno(EINVAL));
4208                 }
4209                 /* Always apply system's doi to the zone */
4210                 error = zone_set_label(zone, label, default_doi);
4211                 if (error != 0) {
4212                         zone_free(zone);
4213                         return (set_errno(error));
4214                 }
4215                 insert_label_hash = B_TRUE;
4216         } else {
4217                 /* all zones get an admin_low label if system is not labeled */
4218                 zone->zone_slabel = l_admin_low;
4219                 label_hold(l_admin_low);
4220                 insert_label_hash = B_FALSE;
4221         }
4222
4223         /*
4224          * Stop all lwps since that's what normally happens as part of fork().
4225          * This needs to happen before we grab any locks to avoid deadlock
4226          * (another lwp in the process could be waiting for the held lock).
4227          */
4228         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4229                 zone_free(zone);
4230                 if (rctls)
4231                         nvlist_free(rctls);
4232                 return (zone_create_error(error, 0, extended_error));
4233         }
4234
4235         if (block_mounts() == 0) {
4236                 mutex_enter(&pp->p_lock);
4237                 if (curthread != pp->p_agenttp)
4238                         continuelwps(pp);
4239                 mutex_exit(&pp->p_lock);
4240                 zone_free(zone);
4241                 if (rctls)
4242                         nvlist_free(rctls);
4243                 return (zone_create_error(error, 0, extended_error));
4244         }
4245
4246         /*
4247          * Set up credential for kernel access.  After this, any errors
4248          * should go through the dance in errout rather than calling
4249          * zone_free directly.
4250          */
4251         zone->zone_kcred = crdup(kcred);
4252         crsetzone(zone->zone_kcred, zone);
4253         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4254         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4255         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4256         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4257
4258         mutex_enter(&zonehash_lock);
4259         /*
4260          * Make sure zone doesn't already exist.
4261          *
4262          * If the system and zone are labeled,
4263          * make sure no other zone exists that has the same label.
4264          */
4265         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4266             (insert_label_hash &&
4267             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4268                 zone_status_t status;
4269
4270                 status = zone_status_get(ztmp);
4271                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4272                         error = EEXIST;
4273                 else
4274                         error = EBUSY;
4275
4276                 if (insert_label_hash)
4277                         error2 = ZE_LABELINUSE;
4278
4279                 goto errout;
4280         }
4281
4282         /*
4283          * Don't allow zone creations which would cause one zone's rootpath to
4284          * be accessible from that of another (non-global) zone.
4285          */
4286         if (zone_is_nested(zone->zone_rootpath)) {
4287                 error = EBUSY;
4288                 goto errout;
4289         }
4290
4291         ASSERT(zonecount != 0);         /* check for leaks */
4292         if (zonecount + 1 > maxzones) {
4293                 error = ENOMEM;
4294                 goto errout;
4295         }
4296
4297         if (zone_mount_count(zone->zone_rootpath) != 0) {
4298                 error = EBUSY;
4299                 error2 = ZE_AREMOUNTS;
4300                 goto errout;
4301         }
4302
4303         /*
4304          * Zone is still incomplete, but we need to drop all locks while
4305          * zsched() initializes this zone's kernel process.  We
4306          * optimistically add the zone to the hashtable and associated
4307          * lists so a parallel zone_create() doesn't try to create the
4308          * same zone.
4309          */
4310         zonecount++;
4311         (void) mod_hash_insert(zonehashbyid,
4312             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4313             (mod_hash_val_t)(uintptr_t)zone);
4314         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4315         (void) strcpy(str, zone->zone_name);
4316         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4317             (mod_hash_val_t)(uintptr_t)zone);
4318         if (insert_label_hash) {
4319                 (void) mod_hash_insert(zonehashbylabel,
4320                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4321                 zone->zone_flags |= ZF_HASHED_LABEL;
4322         }
4323
4324         /*
4325          * Insert into active list.  At this point there are no 'hold's
4326          * on the zone, but everyone else knows not to use it, so we can
4327          * continue to use it.  zsched() will do a zone_hold() if the
4328          * newproc() is successful.
4329          */
4330         list_insert_tail(&zone_active, zone);
4331         mutex_exit(&zonehash_lock);
4332
4333         zarg.zone = zone;
4334         zarg.nvlist = rctls;
4335         /*
4336          * The process, task, and project rctls are probably wrong;
4337          * we need an interface to get the default values of all rctls,
4338          * and initialize zsched appropriately.  I'm not sure that that
4339          * makes much of a difference, though.
4340          */
4341         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4342         if (error != 0) {
4343                 /*
4344                  * We need to undo all globally visible state.
4345                  */
4346                 mutex_enter(&zonehash_lock);
4347                 list_remove(&zone_active, zone);
4348                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4349                         ASSERT(zone->zone_slabel != NULL);
4350                         (void) mod_hash_destroy(zonehashbylabel,
4351                             (mod_hash_key_t)zone->zone_slabel);
4352                 }
4353                 (void) mod_hash_destroy(zonehashbyname,
4354                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4355                 (void) mod_hash_destroy(zonehashbyid,
4356                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4357                 ASSERT(zonecount > 1);
4358                 zonecount--;
4359                 goto errout;
4360         }
4361
4362         /*
4363          * Zone creation can't fail from now on.
4364          */
4365
4366         /*
4367          * Create zone kstats
4368          */
4369         zone_kstat_create(zone);
4370
4371         /*
4372          * Let the other lwps continue.
4373          */
4374         mutex_enter(&pp->p_lock);
4375         if (curthread != pp->p_agenttp)
4376                 continuelwps(pp);
4377         mutex_exit(&pp->p_lock);
4378
4379         /*
4380          * Wait for zsched to finish initializing the zone.
4381          */
4382         zone_status_wait(zone, ZONE_IS_READY);
4383         /*
4384          * The zone is fully visible, so we can let mounts progress.
4385          */
4386         resume_mounts();
4387         if (rctls)
4388                 nvlist_free(rctls);
4389
4390         return (zoneid);
4391
4392 errout:
4393         mutex_exit(&zonehash_lock);
4394         /*
4395          * Let the other lwps continue.
4396          */
4397         mutex_enter(&pp->p_lock);
4398         if (curthread != pp->p_agenttp)
4399                 continuelwps(pp);
4400         mutex_exit(&pp->p_lock);
4401
4402         resume_mounts();
4403         if (rctls)
4404                 nvlist_free(rctls);
4405         /*
4406          * There is currently one reference to the zone, a cred_ref from
4407          * zone_kcred.  To free the zone, we call crfree, which will call
4408          * zone_cred_rele, which will call zone_free.
4409          */
4410         ASSERT(zone->zone_cred_ref == 1);
4411         ASSERT(zone->zone_kcred->cr_ref == 1);
4412         ASSERT(zone->zone_ref == 0);
4413         zkcr = zone->zone_kcred;
4414         zone->zone_kcred = NULL;
4415         crfree(zkcr);                           /* triggers call to zone_free */
4416         return (zone_create_error(error, error2, extended_error));
4417 }
4418
4419 /*
4420  * Cause the zone to boot.  zoneadmd does the heavy lifting; this call only
4421  * advances a ZONE_IS_READY zone to ZONE_IS_BOOTING and then waits for it to
4422  * reach ZONE_IS_RUNNING.  Fails with EPERM, EINVAL (bad or unknown zoneid,
4423  * or zone not ready), EINTR, or the errno saved in zone_boot_err.
4424  */
4425 static int
4426 zone_boot(zoneid_t zoneid)
4427 {
4428         zone_t *zp;
4429         boolean_t ready;
4430         int boot_err;
4431
4432         if (secpolicy_zone_config(CRED()) != 0)
4433                 return (set_errno(EPERM));
4434         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4435                 return (set_errno(EINVAL));
4436
4437         /* Hold zonehash_lock for the lookup: no races with shutdown/destroy. */
4438         mutex_enter(&zonehash_lock);
4439         zp = zone_find_all_by_id(zoneid);
4440         if (zp == NULL) {
4441                 mutex_exit(&zonehash_lock);
4442                 return (set_errno(EINVAL));
4443         }
4444
4445         /* Only a zone that is exactly ZONE_IS_READY may start booting. */
4446         mutex_enter(&zone_status_lock);
4447         ready = (zone_status_get(zp) == ZONE_IS_READY) ? B_TRUE : B_FALSE;
4448         if (ready)
4449                 zone_status_set(zp, ZONE_IS_BOOTING);
4450         mutex_exit(&zone_status_lock);
4451         if (!ready) {
4452                 mutex_exit(&zonehash_lock);
4453                 return (set_errno(EINVAL));
4454         }
4455
4456         zone_hold(zp);          /* keep the zone_t usable after unlocking */
4457         mutex_exit(&zonehash_lock);
4458
4459         if (zone_status_wait_sig(zp, ZONE_IS_RUNNING) == 0) {
4460                 zone_rele(zp);
4461                 return (set_errno(EINTR));
4462         }
4463
4464         /*
4465          * If starting init failed, the zone goes to the SHUTTING_DOWN state
4466          * and zone_boot_err holds the errno to report; zero means success.
4467          */
4468         boot_err = zp->zone_boot_err;
4469         zone_rele(zp);
4470         return (boot_err != 0 ? set_errno(boot_err) : 0);
4471 }
4472
4473 /*
4474  * Kill all user processes in the zone and wait for every one of them to
4475  * exit.  Returns 0 once the zone is empty, or EINTR if we were signaled.
4476  */
4477 static int
4478 zone_empty(zone_t *zone)
4479 {
4480         int rv;
4481
4482         /*
4483          * zonehash_lock must be dropped before killing processes, otherwise
4484          * we'd deadlock with zone_find_*, which the exit path can call.
4485          */
4486         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4487         /* Issue killall() again whenever an hz-tick wait comes back -1. */
4488         for (;;) {
4489                 clock_t deadline = ddi_get_lbolt() + hz;
4490
4491                 rv = zone_status_timedwait_sig(zone, deadline, ZONE_IS_EMPTY);
4492                 if (rv != -1)
4493                         break;
4494                 killall(zone->zone_id);
4495         }
4496         /* A wait status of zero means we were signaled. */
4497         return (rv == 0 ? EINTR : 0);
4498 }
4499
4500 /*
4501  * Zone visibility policy: may the calling process view attributes of 'zone'?
4502  *
4503  * In standard Solaris, a non-global zone can only see itself.
4504  *
4505  * In Trusted Extensions, a labeled zone can look up any zone whose label it
4506  * dominates.  For this test the global zone's label is treated as admin_high,
4507  * so the global zone is special-cased instead of being checked for dominance.
4508  *
4509  * Returns B_TRUE if zone attributes are viewable, B_FALSE otherwise.
4510  */
4511 static boolean_t
4512 zone_list_access(zone_t *zone)
4513 {
4514         zone_t *caller = curproc->p_zone;
4515         bslabel_t *caller_bsl;
4516         bslabel_t *target_bsl;
4517
4518         /* The global zone sees everything; every zone sees itself. */
4519         if (caller == global_zone || caller == zone)
4520                 return (B_TRUE);
4521
4522         /* Without labeling, or for a scratch zone, nothing else is visible. */
4523         if (!is_system_labeled() || (zone->zone_flags & ZF_IS_SCRATCH))
4524                 return (B_FALSE);
4525
4526         caller_bsl = label2bslabel(caller->zone_slabel);
4527         target_bsl = label2bslabel(zone->zone_slabel);
4528
4529         /* The global zone is never visible via label dominance. */
4530         if (zone->zone_id == GLOBAL_ZONEID)
4531                 return (B_FALSE);
4532
4533         return (bldominates(caller_bsl, target_bsl) ? B_TRUE : B_FALSE);
4534 }
4535
4536 /*
4537  * Systemcall to start the zone's halt sequence.  By the time this function
4538  * successfully returns, all user processes and kernel threads executing in
4539  * the zone will have exited, ZSD shutdown callbacks executed, and the zone
4540  * status set to ZONE_IS_DOWN.  The call may interrupt itself if the caller is
4541  * the parent of a process running in the zone and doesn't block SIGCHLD.
4542  * Returns 0 on success; otherwise sets errno to EPERM, EINVAL (bad zoneid,
4543  * no such zone, or zone not yet ZONE_IS_READY) or EINTR.
4544  */
4545 static int
4546 zone_shutdown(zoneid_t zoneid)
4547 {
4548         int error;
4549         zone_t *zone;
4550         zone_status_t status;
4551
4552         if (secpolicy_zone_config(CRED()) != 0)
4553                 return (set_errno(EPERM));
4554         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4555                 return (set_errno(EINVAL));
4556
4557         /*
4558          * Block mounts so that VFS_MOUNT() can get an accurate view of
4559          * the zone's status with regards to ZONE_IS_SHUTTING_DOWN.
4560          *
4561          * e.g. NFS can fail the mount if it determines that the zone
4562          * has already begun the shutdown sequence.
4563          */
4564         if (block_mounts() == 0)
4565                 return (set_errno(EINTR));
4566         mutex_enter(&zonehash_lock);
4567         /*
4568          * Look for zone under hash lock to prevent races with other
4569          * calls to zone_shutdown and zone_destroy.
4570          */
4571         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4572                 mutex_exit(&zonehash_lock);
4573                 resume_mounts();
4574                 return (set_errno(EINVAL));
4575         }
4576         mutex_enter(&zone_status_lock);
4577         status = zone_status_get(zone);
4578         /*
4579          * Fail if the zone isn't fully initialized yet.
4580          */
4581         if (status < ZONE_IS_READY) {
4582                 mutex_exit(&zone_status_lock);
4583                 mutex_exit(&zonehash_lock);
4584                 resume_mounts();
4585                 return (set_errno(EINVAL));
4586         }
4587         /*
4588          * If conditions required for zone_shutdown() to return have been met,
4589          * return success.
4590          */
4591         if (status >= ZONE_IS_DOWN) {
4592                 mutex_exit(&zone_status_lock);
4593                 mutex_exit(&zonehash_lock);
4594                 resume_mounts();
4595                 return (0);
4596         }
4597         /*
4598          * If zone_shutdown() hasn't been called before, go through the motions.
4599          * If it has, there's nothing to do but wait for the kernel threads to
4600          * drain.
4601          */
4602         if (status < ZONE_IS_EMPTY) {
4603                 uint_t ntasks;
4604
4605                 mutex_enter(&zone->zone_lock);
4606                 if ((ntasks = zone->zone_ntasks) != 1) {
4607                         /*
4608                          * There's still stuff running.
4609                          */
4610                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4611                 }
4612                 mutex_exit(&zone->zone_lock);
4613                 if (ntasks == 1) {
4614                         /*
4615                          * The only way to create another task is through
4616                          * zone_enter(), which will block until we drop
4617                          * zonehash_lock.  The zone is empty.
4618                          */
4619                         if (zone->zone_kthreads == NULL) {
4620                                 /*
4621                                  * Skip ahead to ZONE_IS_DOWN
4622                                  */
4623                                 zone_status_set(zone, ZONE_IS_DOWN);
4624                         } else {
4625                                 zone_status_set(zone, ZONE_IS_EMPTY);
4626                         }
4627                 }
4628         }
4629         zone_hold(zone);        /* so we can use the zone_t later */
4630         mutex_exit(&zone_status_lock);
4631         mutex_exit(&zonehash_lock);
4632         resume_mounts();
4633
4634         if ((error = zone_empty(zone)) != 0) {
4635                 zone_rele(zone);
4636                 return (set_errno(error));
4637         }
4638         /*
4639          * After the zone status goes to ZONE_IS_DOWN this zone will no
4640          * longer be notified of changes to the pools configuration, so
4641          * in order to not end up with a stale pool pointer, we point
4642          * ourselves at the default pool and remove all resource
4643          * visibility.  This is especially important as the zone_t may
4644          * languish on the deathrow for a very long time waiting for
4645          * cred's to drain out.
4646          *
4647          * This rebinding of the zone can happen multiple times
4648          * (presumably due to interrupted or parallel systemcalls)
4649          * without any adverse effects.
4650          */
4651         if (pool_lock_intr() != 0) {
4652                 zone_rele(zone);
4653                 return (set_errno(EINTR));
4654         }
4655         if (pool_state == POOL_ENABLED) {
4656                 mutex_enter(&cpu_lock);
4657                 zone_pool_set(zone, pool_default);
4658                 /*
4659                  * The zone no longer needs to be able to see any cpus.
4660                  */
4661                 zone_pset_set(zone, ZONE_PS_INVAL);
4662                 mutex_exit(&cpu_lock);
4663         }
4664         pool_unlock();
4665
4666         /*
4667          * ZSD shutdown callbacks can be executed multiple times, hence
4668          * it is safe to not be holding any locks across this call.
4669          */
4670         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4671
4672         mutex_enter(&zone_status_lock);
4673         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4674                 zone_status_set(zone, ZONE_IS_DOWN);
4675         mutex_exit(&zone_status_lock);
4676
4677         /*
4678          * Wait for kernel threads to drain.
4679          */
4680         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4681                 zone_rele(zone);
4682                 return (set_errno(EINTR));
4683         }
4684
4685         /*
4686          * Zone can become down/destroyable even if the above wait
4687          * returns EINTR, so any code added here may never execute.
4688          * (i.e. don't add code here)
4689          */
4690
4691         zone_rele(zone);
4692         return (0);
4693 }
4694
4695 /*
4696  * Log the specified zone's reference counts: the general-purpose and
4697  * credential counts plus, in square brackets, every nonzero subsystem count.
4698  * The caller should not be holding the zone's zone_lock, which is taken here.
4699  *
4700  * As a side effect the zone is flagged ZF_REFCOUNTS_LOGGED.
4701  */
4702 static void
4703 zone_log_refcounts(zone_t *zone)
4704 {
4705         char *msg;
4706         char *cursor;
4707         uint32_t msg_size;
4708         uint32_t i;
4709         uint_t zrefs;
4710         uint_t crefs;
4711
4712         /*
4713          * Build a string of the subsystem-specific reference counts, in
4714          * ascending order by index into the zone_t::zone_subsys_ref array,
4715          * surrounded by square brackets [] and listing only nonzero counts.
4716          *
4717          * Pass 1: size the string.  msg_size starts at two for the square
4718          * brackets; each nonzero count then adds its subsystem name plus 13
4719          * bytes (one colon, one space, one comma, and up to ten decimal
4720          * digits for an unsigned 32-bit value).  The last count's comma is
4721          * later replaced by the closing bracket, which leaves room for the
4722          * terminating NUL.
4723          *
4724          * NOTE: zone_lock is held across both passes so that the counters
4725          * form a consistent snapshot.
4726          */
4727         msg_size = 2;
4728         mutex_enter(&zone->zone_lock);
4729         /* Remember that this zone's reference counts have been logged. */
4730         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4731         zrefs = zone->zone_ref;
4732         crefs = zone->zone_cred_ref;
4733         for (i = 0; i < ZONE_REF_NUM_SUBSYS; i++) {
4734                 if (zone->zone_subsys_ref[i] == 0)
4735                         continue;
4736                 msg_size += strlen(zone_ref_subsys_names[i]) + 13;
4737         }
4738
4739         if (msg_size == 2) {
4740                 /*
4741                  * Every subsystem count is zero, so there is no list to
4742                  * build: log just the general-purpose and credential
4743                  * reference counts.
4744                  */
4745                 mutex_exit(&zone->zone_lock);
4746                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4747                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
4748                     "references and %u credential references are still extant",
4749                     zone->zone_name, zone->zone_id, zrefs, crefs);
4750                 return;
4751         }
4752
4753         /*
4754          * Pass 2: msg_size is enough for the whole list, so allocate it and
4755          * append one "name: count," entry per nonzero subsystem count,
4756          * starting just after the slot reserved for the opening bracket.
4757          *
4758          * NOTE: The DDI's version of sprintf() returns a pointer to the
4759          * modified buffer rather than the number of bytes written, so
4760          * snprintf() is used to learn how far to advance the cursor.
4761          * Passing INT_MAX as the limit is safe because the buffer was sized
4762          * for the data in pass 1: we'll never overrun it.
4763          */
4764         msg = kmem_alloc(msg_size, KM_SLEEP);
4765         cursor = msg + 1;
4766         for (i = 0; i < ZONE_REF_NUM_SUBSYS; i++) {
4767                 if (zone->zone_subsys_ref[i] == 0)
4768                         continue;
4769                 cursor += snprintf(cursor, INT_MAX, "%s: %u,",
4770                     zone_ref_subsys_names[i], zone->zone_subsys_ref[i]);
4771         }
4772         mutex_exit(&zone->zone_lock);
4773
4774         /* Add the brackets; the closing one overwrites the final comma. */
4775         msg[0] = '[';
4776         ASSERT((uintptr_t)(cursor - msg) < msg_size);
4777         ASSERT(cursor[0] == '\0' && cursor[-1] == ',');
4778         cursor[-1] = ']';
4779
4780         /*
4781          * Log the reference counts and free the message buffer.
4782          */
4783         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4784             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4785             "%u credential references are still extant %s", zone->zone_name,
4786             zone->zone_id, zrefs, crefs, msg);
4787         kmem_free(msg, msg_size);
4788 }
4789
4790 /*
4791  * Systemcall entry point to finalize the zone halt process.  The caller must
4792  * have already successfully called zone_shutdown().  Upon successful
4793  * completion the zone is fully destroyed: zsched has exited, destructor
4794  * callbacks have run, and the zone is off the list of active zones.
4795  * Returns 0, or sets errno to EPERM, EINVAL (bad or unknown zoneid), EBUSY
4796  * (mounts remain or zone not yet ZONE_IS_DOWN) or EINTR (signaled in a wait).
4797  */
4798 static int
4799 zone_destroy(zoneid_t zoneid)
4800 {
4801         uint64_t uniqid;
4802         zone_t *zone;
4803         zone_status_t status;
4804         clock_t wait_time;
4805         boolean_t log_refcounts;
4806
4807         if (secpolicy_zone_config(CRED()) != 0)
4808                 return (set_errno(EPERM));
4809         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4810                 return (set_errno(EINVAL));
4811
4812         mutex_enter(&zonehash_lock);
4813         /*
4814          * Look for zone under hash lock to prevent races with other
4815          * calls to zone_destroy.
4816          */
4817         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4818                 mutex_exit(&zonehash_lock);
4819                 return (set_errno(EINVAL));
4820         }
4821
4822         if (zone_mount_count(zone->zone_rootpath) != 0) {
4823                 mutex_exit(&zonehash_lock);
4824                 return (set_errno(EBUSY));
4825         }
4826         mutex_enter(&zone_status_lock);
4827         status = zone_status_get(zone);
4828         if (status < ZONE_IS_DOWN) {
4829                 mutex_exit(&zone_status_lock);
4830                 mutex_exit(&zonehash_lock);
4831                 return (set_errno(EBUSY));
4832         } else if (status == ZONE_IS_DOWN) {
4833                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
4834         }
4835         mutex_exit(&zone_status_lock);
4836         zone_hold(zone);        /* keeps zone_t valid across the wait below */
4837         mutex_exit(&zonehash_lock);
4838
4839         /*
4840          * Wait for zsched to exit, then run the ZSD destroy callbacks.
4841          */
4842         zone_status_wait(zone, ZONE_IS_DEAD);
4843         zone_zsd_callbacks(zone, ZSD_DESTROY);
4844         zone->zone_netstack = NULL;
4845         uniqid = zone->zone_uniqid;
4846         zone_rele(zone);
4847         zone = NULL;    /* potentially freed */
4848
4849         log_refcounts = B_FALSE;
4850         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
4851         mutex_enter(&zonehash_lock);
4852         for (; /* ever */; ) {
4853                 boolean_t unref;
4854                 boolean_t refs_have_been_logged;
4855
4856                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
4857                     zone->zone_uniqid != uniqid) {
4858                         /*
4859                          * The zone has gone away (or the ID now names a zone
4860                          * with a different zone_uniqid), so return success.
4861                          */
4862                         mutex_exit(&zonehash_lock);
4863                         return (0);
4864                 }
4865                 mutex_enter(&zone->zone_lock);
4866                 unref = ZONE_IS_UNREF(zone);
4867                 refs_have_been_logged = (zone->zone_flags &
4868                     ZF_REFCOUNTS_LOGGED);
4869                 mutex_exit(&zone->zone_lock);
4870                 if (unref) {
4871                         /*
4872                          * There is only one reference to the zone -- that
4873                          * added when the zone was added to the hashtables --
4874                          * and things will remain this way until we drop
4875                          * zonehash_lock... we can go ahead and cleanup the
4876                          * zone.
4877                          */
4878                         break;
4879                 }
4880
4881                 /*
4882                  * Wait for zone_rele_common() or zone_cred_rele() to signal
4883                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
4884                  * some zone's general-purpose reference count reaches one.
4885                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
4886                  * on zone_destroy_cv, then log the zone's reference counts and
4887                  * continue to wait for zone_rele() and zone_cred_rele().
4888                  */
4889                 if (!refs_have_been_logged) {
4890                         if (!log_refcounts) {
4891                                 /*
4892                                  * This thread hasn't timed out waiting on
4893                                  * zone_destroy_cv yet.  Wait wait_time clock
4894                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
4895                                  * seconds) for the zone's references to clear.
4896                                  */
4897                                 ASSERT(wait_time > 0);
4898                                 wait_time = cv_reltimedwait_sig(
4899                                     &zone_destroy_cv, &zonehash_lock, wait_time,
4900                                     TR_SEC);
4901                                 if (wait_time > 0) {
4902                                         /*
4903                                          * A thread in zone_rele() or
4904                                          * zone_cred_rele() signaled
4905                                          * zone_destroy_cv before this thread's
4906                                          * wait timed out.  The zone might have
4907                                          * only one reference left; find out!
4908                                          */
4909                                         continue;
4910                                 } else if (wait_time == 0) {
4911                                         /* The thread's process was signaled. */
4912                                         mutex_exit(&zonehash_lock);
4913                                         return (set_errno(EINTR));
4914                                 }
4915
4916                                 /*
4917                                  * The thread timed out while waiting on
4918                                  * zone_destroy_cv.  Even though the thread
4919                                  * timed out, it has to check whether another
4920                                  * thread woke up from zone_destroy_cv and
4921                                  * destroyed the zone.
4922                                  *
4923                                  * If the zone still exists and has more than
4924                                  * one unreleased general-purpose reference,
4925                                  * then log the zone's reference counts.
4926                                  */
4927                                 log_refcounts = B_TRUE;
4928                                 continue;
4929                         }
4930
4931                         /*
4932                          * The thread already timed out on zone_destroy_cv while
4933                          * waiting for subsystems to release the zone's last
4934                          * general-purpose references.  Log the zone's reference
4935                          * counts and wait indefinitely on zone_destroy_cv.
4936                          */
4937                         zone_log_refcounts(zone);
4938                 }
4939                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
4940                         /* The thread's process was signaled. */
4941                         mutex_exit(&zonehash_lock);
4942                         return (set_errno(EINTR));
4943                 }
4944         }
4945
4946         /*
4947          * Remove CPU cap for this zone now since we're not going to
4948          * fail below this point.
4949          */
4950         cpucaps_zone_remove(zone);
4951
4952         /* Get rid of the zone's kstats */
4953         zone_kstat_delete(zone);
4954
4955         /* remove the pfexecd doors */
4956         if (zone->zone_pfexecd != NULL) {
4957                 klpd_freelist(&zone->zone_pfexecd);
4958                 zone->zone_pfexecd = NULL;
4959         }
4960
4961         /* free brand specific data */
4962         if (ZONE_IS_BRANDED(zone))
4963                 ZBROP(zone)->b_free_brand_data(zone);
4964
4965         /* Say goodbye to brand framework. */
4966         brand_unregister_zone(zone->zone_brand);
4967
4968         /*
4969          * It is now safe to let the zone be recreated; remove it from the
4970          * lists.  The memory will not be freed until the last cred
4971          * reference goes away.
4972          */
4973         ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
4974         zonecount--;
4975         /* remove from active list and hash tables */
4976         list_remove(&zone_active, zone);
4977         (void) mod_hash_destroy(zonehashbyname,
4978             (mod_hash_key_t)zone->zone_name);
4979         (void) mod_hash_destroy(zonehashbyid,
4980             (mod_hash_key_t)(uintptr_t)zone->zone_id);
4981         if (zone->zone_flags & ZF_HASHED_LABEL)
4982                 (void) mod_hash_destroy(zonehashbylabel,
4983                     (mod_hash_key_t)zone->zone_slabel);
4984         mutex_exit(&zonehash_lock);
4985
4986         /*
4987          * Release the root vnode; we're not using it anymore.  Nor should any
4988          * other thread that might access it exist.
4989          */
4990         if (zone->zone_rootvp != NULL) {
4991                 VN_RELE(zone->zone_rootvp);
4992                 zone->zone_rootvp = NULL;
4993         }
4994
4995         /* add to deathrow list */
4996         mutex_enter(&zone_deathrow_lock);
4997         list_insert_tail(&zone_deathrow, zone);
4998         mutex_exit(&zone_deathrow_lock);
4999
5000         /*
5001          * Drop last reference (which was added by zsched()), this will
5002          * free the zone unless there are outstanding cred references.
5003          */
5004         zone_rele(zone);
5005         return (0);
5006 }
5007
5008 /*
5009  * Systemcall entry point for zone_getattr(2).
5010  */
5011 static ssize_t
5012 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5013 {
5014         size_t size;
5015         int error = 0, err;
5016         zone_t *zone;
5017         char *zonepath;
5018         char *outstr;
5019         zone_status_t zone_status;
5020         pid_t initpid;
5021         boolean_t global = (curzone == global_zone);
5022         boolean_t inzone = (curzone->zone_id == zoneid);
5023         ushort_t flags;
5024         zone_net_data_t *zbuf;
5025
5026         mutex_enter(&zonehash_lock);
5027         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5028                 mutex_exit(&zonehash_lock);
5029                 return (set_errno(EINVAL));
5030         }
5031         zone_status = zone_status_get(zone);
5032         if (zone_status < ZONE_IS_INITIALIZED) {
5033                 mutex_exit(&zonehash_lock);
5034                 return (set_errno(EINVAL));
5035         }
5036         zone_hold(zone);
5037         mutex_exit(&zonehash_lock);
5038
5039         /*
5040          * If not in the global zone, don't show information about other zones,
5041          * unless the system is labeled and the local zone's label dominates
5042          * the other zone.
5043          */
5044         if (!zone_list_access(zone)) {
5045                 zone_rele(zone);
5046                 return (set_errno(EINVAL));
5047         }
5048
5049         switch (attr) {
5050         case ZONE_ATTR_ROOT:
5051                 if (global) {
5052                         /*
5053                          * Copy the path to trim the trailing "/" (except for
5054                          * the global zone).
5055                          */
5056                         if (zone != global_zone)
5057                                 size = zone->zone_rootpathlen - 1;
5058                         else
5059                                 size = zone->zone_rootpathlen;
5060                         zonepath = kmem_alloc(size, KM_SLEEP);
5061                         bcopy(zone->zone_rootpath, zonepath, size);
5062                         zonepath[size - 1] = '\0';
5063                 } else {
5064                         if (inzone || !is_system_labeled()) {
5065                                 /*
5066                                  * Caller is not in the global zone.
5067                                  * if the query is on the current zone
5068                                  * or the system is not labeled,
5069                                  * just return faked-up path for current zone.
5070                                  */
5071                                 zonepath = "/";
5072                                 size = 2;
5073                         } else {
5074                                 /*
5075                                  * Return related path for current zone.
5076                                  */
5077                                 int prefix_len = strlen(zone_prefix);
5078                                 int zname_len = strlen(zone->zone_name);
5079
5080                                 size = prefix_len + zname_len + 1;
5081                                 zonepath = kmem_alloc(size, KM_SLEEP);
5082                                 bcopy(zone_prefix, zonepath, prefix_len);
5083                                 bcopy(zone->zone_name, zonepath +
5084                                     prefix_len, zname_len);
5085                                 zonepath[size - 1] = '\0';
5086                         }
5087                 }
5088                 if (bufsize > size)
5089                         bufsize = size;
5090                 if (buf != NULL) {
5091                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5092                         if (err != 0 && err != ENAMETOOLONG)
5093                                 error = EFAULT;
5094                 }
5095                 if (global || (is_system_labeled() && !inzone))
5096                         kmem_free(zonepath, size);
5097                 break;
5098
5099         case ZONE_ATTR_NAME:
5100                 size = strlen(zone->zone_name) + 1;
5101                 if (bufsize > size)
5102                         bufsize = size;
5103                 if (buf != NULL) {
5104                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5105                         if (err != 0 && err != ENAMETOOLONG)
5106                                 error = EFAULT;
5107                 }
5108                 break;
5109
5110         case ZONE_ATTR_STATUS:
5111                 /*
5112                  * Since we're not holding zonehash_lock, the zone status
5113                  * may be anything; leave it up to userland to sort it out.
5114                  */
5115                 size = sizeof (zone_status);
5116                 if (bufsize > size)
5117                         bufsize = size;
5118                 zone_status = zone_status_get(zone);
5119                 if (buf != NULL &&
5120                     copyout(&zone_status, buf, bufsize) != 0)
5121                         error = EFAULT;
5122                 break;
5123         case ZONE_ATTR_FLAGS:
5124                 size = sizeof (zone->zone_flags);
5125                 if (bufsize > size)
5126                         bufsize = size;
5127                 flags = zone->zone_flags;
5128                 if (buf != NULL &&
5129                     copyout(&flags, buf, bufsize) != 0)
5130                         error = EFAULT;
5131                 break;
5132         case ZONE_ATTR_PRIVSET:
5133                 size = sizeof (priv_set_t);
5134                 if (bufsize > size)
5135                         bufsize = size;
5136                 if (buf != NULL &&
5137                     copyout(zone->zone_privset, buf, bufsize) != 0)
5138                         error = EFAULT;
5139                 break;
5140         case ZONE_ATTR_UNIQID:
5141                 size = sizeof (zone->zone_uniqid);
5142                 if (bufsize > size)
5143                         bufsize = size;
5144                 if (buf != NULL &&
5145                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5146                         error = EFAULT;
5147                 break;
5148         case ZONE_ATTR_POOLID:
5149                 {
5150                         pool_t *pool;
5151                         poolid_t poolid;
5152
5153                         if (pool_lock_intr() != 0) {
5154                                 error = EINTR;
5155                                 break;
5156                         }
5157                         pool = zone_pool_get(zone);
5158                         poolid = pool->pool_id;
5159                         pool_unlock();
5160                         size = sizeof (poolid);
5161                         if (bufsize > size)
5162                                 bufsize = size;
5163                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5164                                 error = EFAULT;
5165                 }
5166                 break;
5167         case ZONE_ATTR_SLBL:
5168                 size = sizeof (bslabel_t);
5169                 if (bufsize > size)
5170                         bufsize = size;
5171                 if (zone->zone_slabel == NULL)
5172                         error = EINVAL;
5173                 else if (buf != NULL &&
5174                     copyout(label2bslabel(zone->zone_slabel), buf,
5175                     bufsize) != 0)
5176                         error = EFAULT;
5177                 break;
5178         case ZONE_ATTR_INITPID:
5179                 size = sizeof (initpid);
5180                 if (bufsize > size)
5181                         bufsize = size;
5182                 initpid = zone->zone_proc_initpid;
5183                 if (initpid == -1) {
5184                         error = ESRCH;
5185                         break;
5186                 }
5187                 if (buf != NULL &&
5188                     copyout(&initpid, buf, bufsize) != 0)
5189                         error = EFAULT;
5190                 break;
5191         case ZONE_ATTR_BRAND:
5192                 size = strlen(zone->zone_brand->b_name) + 1;
5193
5194                 if (bufsize > size)
5195                         bufsize = size;
5196                 if (buf != NULL) {
5197                         err = copyoutstr(zone->zone_brand->b_name, buf,
5198                             bufsize, NULL);
5199                         if (err != 0 && err != ENAMETOOLONG)
5200                                 error = EFAULT;
5201                 }
5202                 break;
5203         case ZONE_ATTR_INITNAME:
5204                 size = strlen(zone->zone_initname) + 1;
5205                 if (bufsize > size)
5206                         bufsize = size;
5207                 if (buf != NULL) {
5208                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5209                             NULL);
5210                         if (err != 0 && err != ENAMETOOLONG)
5211                                 error = EFAULT;
5212                 }
5213                 break;
5214         case ZONE_ATTR_BOOTARGS:
5215                 if (zone->zone_bootargs == NULL)
5216                         outstr = "";
5217                 else
5218                         outstr = zone->zone_bootargs;
5219                 size = strlen(outstr) + 1;
5220                 if (bufsize > size)
5221                         bufsize = size;
5222                 if (buf != NULL) {
5223                         err = copyoutstr(outstr, buf, bufsize, NULL);
5224                         if (err != 0 && err != ENAMETOOLONG)
5225                                 error = EFAULT;
5226                 }
5227                 break;
5228         case ZONE_ATTR_PHYS_MCAP:
5229                 size = sizeof (zone->zone_phys_mcap);
5230                 if (bufsize > size)
5231                         bufsize = size;
5232                 if (buf != NULL &&
5233                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5234                         error = EFAULT;
5235                 break;
5236         case ZONE_ATTR_SCHED_CLASS:
5237                 mutex_enter(&class_lock);
5238
5239                 if (zone->zone_defaultcid >= loaded_classes)
5240                         outstr = "";
5241                 else
5242                         outstr = sclass[zone->zone_defaultcid].cl_name;
5243                 size = strlen(outstr) + 1;
5244                 if (bufsize > size)
5245                         bufsize = size;
5246                 if (buf != NULL) {
5247                         err = copyoutstr(outstr, buf, bufsize, NULL);
5248                         if (err != 0 && err != ENAMETOOLONG)
5249                                 error = EFAULT;
5250                 }
5251
5252                 mutex_exit(&class_lock);
5253                 break;
5254         case ZONE_ATTR_HOSTID:
5255                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5256                     bufsize == sizeof (zone->zone_hostid)) {
5257                         size = sizeof (zone->zone_hostid);
5258                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5259                             bufsize) != 0)
5260                                 error = EFAULT;
5261                 } else {
5262                         error = EINVAL;
5263                 }
5264                 break;
5265         case ZONE_ATTR_FS_ALLOWED:
5266                 if (zone->zone_fs_allowed == NULL)
5267                         outstr = "";
5268                 else
5269                         outstr = zone->zone_fs_allowed;
5270                 size = strlen(outstr) + 1;
5271                 if (bufsize > size)
5272                         bufsize = size;
5273                 if (buf != NULL) {
5274                         err = copyoutstr(outstr, buf, bufsize, NULL);
5275                         if (err != 0 && err != ENAMETOOLONG)
5276                                 error = EFAULT;
5277                 }
5278                 break;
5279         case ZONE_ATTR_NETWORK:
5280                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5281                 if (copyin(buf, zbuf, bufsize) != 0) {
5282                         error = EFAULT;
5283                 } else {
5284                         error = zone_get_network(zoneid, zbuf);
5285                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5286                                 error = EFAULT;
5287                 }
5288                 kmem_free(zbuf, bufsize);
5289                 break;
5290         default:
5291                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5292                         size = bufsize;
5293                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5294                 } else {
5295                         error = EINVAL;
5296                 }
5297         }
5298         zone_rele(zone);
5299
5300         if (error)
5301                 return (set_errno(error));
5302         return ((ssize_t)size);
5303 }
5304
5305 /*
5306  * Systemcall entry point for zone_setattr(2).
5307  */
5308 /*ARGSUSED*/
5309 static int
5310 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5311 {
5312         zone_t *zone;
5313         zone_status_t zone_status;
5314         int err = -1;
5315         zone_net_data_t *zbuf;
5316
5317         if (secpolicy_zone_config(CRED()) != 0)
5318                 return (set_errno(EPERM));
5319
5320         /*
5321          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5322          * global zone.
5323          */
5324         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5325                 return (set_errno(EINVAL));
5326         }
5327
5328         mutex_enter(&zonehash_lock);
5329         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5330                 mutex_exit(&zonehash_lock);
5331                 return (set_errno(EINVAL));
5332         }
5333         zone_hold(zone);
5334         mutex_exit(&zonehash_lock);
5335
5336         /*
5337          * At present most attributes can only be set on non-running,
5338          * non-global zones.
5339          */
5340         zone_status = zone_status_get(zone);
5341         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5342                 err = EINVAL;
5343                 goto done;
5344         }
5345
5346         switch (attr) {
5347         case ZONE_ATTR_INITNAME:
5348                 err = zone_set_initname(zone, (const char *)buf);
5349                 break;
5350         case ZONE_ATTR_BOOTARGS:
5351                 err = zone_set_bootargs(zone, (const char *)buf);
5352                 break;
5353         case ZONE_ATTR_BRAND:
5354                 err = zone_set_brand(zone, (const char *)buf);
5355                 break;
5356         case ZONE_ATTR_FS_ALLOWED:
5357                 err = zone_set_fs_allowed(zone, (const char *)buf);
5358                 break;
5359         case ZONE_ATTR_PHYS_MCAP:
5360                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5361                 break;
5362         case ZONE_ATTR_SCHED_CLASS:
5363                 err = zone_set_sched_class(zone, (const char *)buf);
5364                 break;
5365         case ZONE_ATTR_HOSTID:
5366                 if (bufsize == sizeof (zone->zone_hostid)) {
5367                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5368                                 err = 0;
5369                         else
5370                                 err = EFAULT;
5371                 } else {
5372                         err = EINVAL;
5373                 }
5374                 break;
5375         case ZONE_ATTR_NETWORK:
5376                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5377                         err = EINVAL;
5378                         break;
5379                 }
5380                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5381                 if (copyin(buf, zbuf, bufsize) != 0) {
5382                         kmem_free(zbuf, bufsize);
5383                         err = EFAULT;
5384                         break;
5385                 }
5386                 err = zone_set_network(zoneid, zbuf);
5387                 kmem_free(zbuf, bufsize);
5388                 break;
5389         default:
5390                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5391                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5392                 else
5393                         err = EINVAL;
5394         }
5395
5396 done:
5397         zone_rele(zone);
5398         ASSERT(err != -1);
5399         return (err != 0 ? set_errno(err) : 0);
5400 }
5401
5402 /*
5403  * Return zero if the process has at least one vnode mapped in to its
5404  * address space which shouldn't be allowed to change zones.
5405  *
5406  * Also return zero if the process has any shared mappings which reserve
5407  * swap.  This is because the counting for zone.max-swap does not allow swap
5408  * reservation to be shared between zones.  zone swap reservation is counted
5409  * on zone->zone_max_swap.
5410  */
5411 static int
5412 as_can_change_zones(void)
5413 {
5414         proc_t *pp = curproc;
5415         struct seg *seg;
5416         struct as *as = pp->p_as;
5417         vnode_t *vp;
5418         int allow = 1;
5419
5420         ASSERT(pp->p_as != &kas);
5421         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5422         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5423
5424                 /*
5425                  * Cannot enter zone with shared anon memory which
5426                  * reserves swap.  See comment above.
5427                  */
5428                 if (seg_can_change_zones(seg) == B_FALSE) {
5429                         allow = 0;
5430                         break;
5431                 }
5432                 /*
5433                  * if we can't get a backing vnode for this segment then skip
5434                  * it.
5435                  */
5436                 vp = NULL;
5437                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5438                         continue;
5439                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5440                         allow = 0;
5441                         break;
5442                 }
5443         }
5444         AS_LOCK_EXIT(as, &as->a_lock);
5445         return (allow);
5446 }
5447
5448 /*
5449  * Count swap reserved by curproc's address space
5450  */
5451 static size_t
5452 as_swresv(void)
5453 {
5454         proc_t *pp = curproc;
5455         struct seg *seg;
5456         struct as *as = pp->p_as;
5457         size_t swap = 0;
5458
5459         ASSERT(pp->p_as != &kas);
5460         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5461         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5462                 swap += seg_swresv(seg);
5463
5464         return (swap);
5465 }
5466
5467 /*
5468  * Systemcall entry point for zone_enter().
5469  *
5470  * The current process is injected into said zone.  In the process
5471  * it will change its project membership, privileges, rootdir/cwd,
5472  * zone-wide rctls, and pool association to match those of the zone.
5473  *
5474  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5475  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5476  * enter a zone that is "ready" or "running".
5477  */
5478 static int
5479 zone_enter(zoneid_t zoneid)
5480 {
5481         zone_t *zone;
5482         vnode_t *vp;
5483         proc_t *pp = curproc;
5484         contract_t *ct;
5485         cont_process_t *ctp;
5486         task_t *tk, *oldtk;
5487         kproject_t *zone_proj0;
5488         cred_t *cr, *newcr;
5489         pool_t *oldpool, *newpool;
5490         sess_t *sp;
5491         uid_t uid;
5492         zone_status_t status;
5493         int err = 0;
5494         rctl_entity_p_t e;
5495         size_t swap;
5496         kthread_id_t t;
5497
5498         if (secpolicy_zone_config(CRED()) != 0)
5499                 return (set_errno(EPERM));
5500         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5501                 return (set_errno(EINVAL));
5502
5503         /*
5504          * Stop all lwps so we don't need to hold a lock to look at
5505          * curproc->p_zone.  This needs to happen before we grab any
5506          * locks to avoid deadlock (another lwp in the process could
5507          * be waiting for the held lock).
5508          */
5509         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5510                 return (set_errno(EINTR));
5511
5512         /*
5513          * Make sure we're not changing zones with files open or mapped in
5514          * to our address space which shouldn't be changing zones.
5515          */
5516         if (!files_can_change_zones()) {
5517                 err = EBADF;
5518                 goto out;
5519         }
5520         if (!as_can_change_zones()) {
5521                 err = EFAULT;
5522                 goto out;
5523         }
5524
5525         mutex_enter(&zonehash_lock);
5526         if (pp->p_zone != global_zone) {
5527                 mutex_exit(&zonehash_lock);
5528                 err = EINVAL;
5529                 goto out;
5530         }
5531
5532         zone = zone_find_all_by_id(zoneid);
5533         if (zone == NULL) {
5534                 mutex_exit(&zonehash_lock);
5535                 err = EINVAL;
5536                 goto out;
5537         }
5538
5539         /*
5540          * To prevent processes in a zone from holding contracts on
5541          * extrazonal resources, and to avoid process contract
5542          * memberships which span zones, contract holders and processes
5543          * which aren't the sole members of their encapsulating process
5544          * contracts are not allowed to zone_enter.
5545          */
5546         ctp = pp->p_ct_process;
5547         ct = &ctp->conp_contract;
5548         mutex_enter(&ct->ct_lock);
5549         mutex_enter(&pp->p_lock);
5550         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5551                 mutex_exit(&pp->p_lock);
5552                 mutex_exit(&ct->ct_lock);
5553                 mutex_exit(&zonehash_lock);
5554                 err = EINVAL;
5555                 goto out;
5556         }
5557
5558         /*
5559          * Moreover, we don't allow processes whose encapsulating
5560          * process contracts have inherited extrazonal contracts.
5561          * While it would be easier to eliminate all process contracts
5562          * with inherited contracts, we need to be able to give a
5563          * restarted init (or other zone-penetrating process) its
5564          * predecessor's contracts.
5565          */
5566         if (ctp->conp_ninherited != 0) {
5567                 contract_t *next;
5568                 for (next = list_head(&ctp->conp_inherited); next;
5569                     next = list_next(&ctp->conp_inherited, next)) {
5570                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5571                                 mutex_exit(&pp->p_lock);
5572                                 mutex_exit(&ct->ct_lock);
5573                                 mutex_exit(&zonehash_lock);
5574                                 err = EINVAL;
5575                                 goto out;
5576                         }
5577                 }
5578         }
5579
5580         mutex_exit(&pp->p_lock);
5581         mutex_exit(&ct->ct_lock);
5582
5583         status = zone_status_get(zone);
5584         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5585                 /*
5586                  * Can't join
5587                  */
5588                 mutex_exit(&zonehash_lock);
5589                 err = EINVAL;
5590                 goto out;
5591         }
5592
5593         /*
5594          * Make sure new priv set is within the permitted set for caller
5595          */
5596         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5597                 mutex_exit(&zonehash_lock);
5598                 err = EPERM;
5599                 goto out;
5600         }
5601         /*
5602          * We want to momentarily drop zonehash_lock while we optimistically
5603          * bind curproc to the pool it should be running in.  This is safe
5604          * since the zone can't disappear (we have a hold on it).
5605          */
5606         zone_hold(zone);
5607         mutex_exit(&zonehash_lock);
5608
5609         /*
5610          * Grab pool_lock to keep the pools configuration from changing
5611          * and to stop ourselves from getting rebound to another pool
5612          * until we join the zone.
5613          */
5614         if (pool_lock_intr() != 0) {
5615                 zone_rele(zone);
5616                 err = EINTR;
5617                 goto out;
5618         }
5619         ASSERT(secpolicy_pool(CRED()) == 0);
5620         /*
5621          * Bind ourselves to the pool currently associated with the zone.
5622          */
5623         oldpool = curproc->p_pool;
5624         newpool = zone_pool_get(zone);
5625         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5626             (err = pool_do_bind(newpool, P_PID, P_MYID,
5627             POOL_BIND_ALL)) != 0) {
5628                 pool_unlock();
5629                 zone_rele(zone);
5630                 goto out;
5631         }
5632
5633         /*
5634          * Grab cpu_lock now; we'll need it later when we call
5635          * task_join().
5636          */
5637         mutex_enter(&cpu_lock);
5638         mutex_enter(&zonehash_lock);
5639         /*
5640          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5641          */
5642         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5643                 /*
5644                  * Can't join anymore.
5645                  */
5646                 mutex_exit(&zonehash_lock);
5647                 mutex_exit(&cpu_lock);
5648                 if (pool_state == POOL_ENABLED &&
5649                     newpool != oldpool)
5650                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5651                             POOL_BIND_ALL);
5652                 pool_unlock();
5653                 zone_rele(zone);
5654                 err = EINVAL;
5655                 goto out;
5656         }
5657
5658         /*
5659          * a_lock must be held while transferring locked memory and swap
5660          * reservation from the global zone to the non global zone because
5661          * asynchronous faults on the processes' address space can lock
5662          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5663          * segments respectively.
5664          */
5665         AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
5666         swap = as_swresv();
5667         mutex_enter(&pp->p_lock);
5668         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5669         /* verify that we do not exceed any task or lwp limits */
5670         mutex_enter(&zone->zone_nlwps_lock);
5671         /* add new lwps to zone and zone's proj0 */
5672         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5673         zone->zone_nlwps += pp->p_lwpcnt;
5674         /* add 1 task to zone's proj0 */
5675         zone_proj0->kpj_ntasks += 1;
5676
5677         zone_proj0->kpj_nprocs++;
5678         zone->zone_nprocs++;
5679         mutex_exit(&zone->zone_nlwps_lock);
5680
5681         mutex_enter(&zone->zone_mem_lock);
5682         zone->zone_locked_mem += pp->p_locked_mem;
5683         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5684         zone->zone_max_swap += swap;
5685         mutex_exit(&zone->zone_mem_lock);
5686
5687         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5688         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5689         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5690
5691         /* remove lwps and process from proc's old zone and old project */
5692         mutex_enter(&pp->p_zone->zone_nlwps_lock);
5693         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5694         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5695         pp->p_task->tk_proj->kpj_nprocs--;
5696         pp->p_zone->zone_nprocs--;
5697         mutex_exit(&pp->p_zone->zone_nlwps_lock);
5698
5699         mutex_enter(&pp->p_zone->zone_mem_lock);
5700         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5701         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5702         pp->p_zone->zone_max_swap -= swap;
5703         mutex_exit(&pp->p_zone->zone_mem_lock);
5704
5705         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5706         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5707         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5708
5709         pp->p_flag |= SZONETOP;
5710         pp->p_zone = zone;
5711         mutex_exit(&pp->p_lock);
5712         AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5713
5714         /*
5715          * Joining the zone cannot fail from now on.
5716          *
5717          * This means that a lot of the following code can be commonized and
5718          * shared with zsched().
5719          */
5720
5721         /*
5722          * If the process contract fmri was inherited, we need to
5723          * flag this so that any contract status will not leak
5724          * extra zone information, svc_fmri in this case
5725          */
5726         if (ctp->conp_svc_ctid != ct->ct_id) {
5727                 mutex_enter(&ct->ct_lock);
5728                 ctp->conp_svc_zone_enter = ct->ct_id;
5729                 mutex_exit(&ct->ct_lock);
5730         }
5731
5732         /*
5733          * Reset the encapsulating process contract's zone.
5734          */
5735         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5736         contract_setzuniqid(ct, zone->zone_uniqid);
5737
5738         /*
5739          * Create a new task and associate the process with the project keyed
5740          * by (projid,zoneid).
5741          *
5742          * We might as well be in project 0; the global zone's projid doesn't
5743          * make much sense in a zone anyhow.
5744          *
5745          * This also increments zone_ntasks, and returns with p_lock held.
5746          */
5747         tk = task_create(0, zone);
5748         oldtk = task_join(tk, 0);
5749         mutex_exit(&cpu_lock);
5750
5751         /*
5752          * call RCTLOP_SET functions on this proc
5753          */
5754         e.rcep_p.zone = zone;
5755         e.rcep_t = RCENTITY_ZONE;
5756         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5757             RCD_CALLBACK);
5758         mutex_exit(&pp->p_lock);
5759
5760         /*
5761          * We don't need to hold any of zsched's locks here; not only do we know
5762          * the process and zone aren't going away, we know its session isn't
5763          * changing either.
5764          *
5765          * By joining zsched's session here, we mimic the behavior in the
5766          * global zone of init's sid being the pid of sched.  We extend this
5767          * to all zlogin-like zone_enter()'ing processes as well.
5768          */
5769         mutex_enter(&pidlock);
5770         sp = zone->zone_zsched->p_sessp;
5771         sess_hold(zone->zone_zsched);
5772         mutex_enter(&pp->p_lock);
5773         pgexit(pp);
5774         sess_rele(pp->p_sessp, B_TRUE);
5775         pp->p_sessp = sp;
5776         pgjoin(pp, zone->zone_zsched->p_pidp);
5777
5778         /*
5779          * If any threads are scheduled to be placed on zone wait queue they
5780          * should abandon the idea since the wait queue is changing.
5781          * We need to be holding pidlock & p_lock to do this.
5782          */
5783         if ((t = pp->p_tlist) != NULL) {
5784                 do {
5785                         thread_lock(t);
5786                         /*
5787                          * Kick this thread so that he doesn't sit
5788                          * on a wrong wait queue.
5789                          */
5790                         if (ISWAITING(t))
5791                                 setrun_locked(t);
5792
5793                         if (t->t_schedflag & TS_ANYWAITQ)
5794                                 t->t_schedflag &= ~ TS_ANYWAITQ;
5795
5796                         thread_unlock(t);
5797                 } while ((t = t->t_forw) != pp->p_tlist);
5798         }
5799
5800         /*
5801          * If there is a default scheduling class for the zone and it is not
5802          * the class we are currently in, change all of the threads in the
5803          * process to the new class.  We need to be holding pidlock & p_lock
5804          * when we call parmsset so this is a good place to do it.
5805          */
5806         if (zone->zone_defaultcid > 0 &&
5807             zone->zone_defaultcid != curthread->t_cid) {
5808                 pcparms_t pcparms;
5809
5810                 pcparms.pc_cid = zone->zone_defaultcid;
5811                 pcparms.pc_clparms[0] = 0;
5812
5813                 /*
5814                  * If setting the class fails, we still want to enter the zone.
5815                  */
5816                 if ((t = pp->p_tlist) != NULL) {
5817                         do {
5818                                 (void) parmsset(&pcparms, t);
5819                         } while ((t = t->t_forw) != pp->p_tlist);
5820                 }
5821         }
5822
5823         mutex_exit(&pp->p_lock);
5824         mutex_exit(&pidlock);
5825
5826         mutex_exit(&zonehash_lock);
5827         /*
5828          * We're firmly in the zone; let pools progress.
5829          */
5830         pool_unlock();
5831         task_rele(oldtk);
5832         /*
5833          * We don't need to retain a hold on the zone since we already
5834          * incremented zone_ntasks, so the zone isn't going anywhere.
5835          */
5836         zone_rele(zone);
5837
5838         /*
5839          * Chroot
5840          */
5841         vp = zone->zone_rootvp;
5842         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
5843         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
5844
5845         /*
5846          * Change process credentials
5847          */
5848         newcr = cralloc();
5849         mutex_enter(&pp->p_crlock);
5850         cr = pp->p_cred;
5851         crcopy_to(cr, newcr);
5852         crsetzone(newcr, zone);
5853         pp->p_cred = newcr;
5854
5855         /*
5856          * Restrict all process privilege sets to zone limit
5857          */
5858         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
5859         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
5860         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
5861         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
5862         mutex_exit(&pp->p_crlock);
5863         crset(pp, newcr);
5864
5865         /*
5866          * Adjust upcount to reflect zone entry.
5867          */
5868         uid = crgetruid(newcr);
5869         mutex_enter(&pidlock);
5870         upcount_dec(uid, GLOBAL_ZONEID);
5871         upcount_inc(uid, zoneid);
5872         mutex_exit(&pidlock);
5873
5874         /*
5875          * Set up core file path and content.
5876          */
5877         set_core_defaults();
5878
5879 out:
5880         /*
5881          * Let the other lwps continue.
5882          */
5883         mutex_enter(&pp->p_lock);
5884         if (curthread != pp->p_agenttp)
5885                 continuelwps(pp);
5886         mutex_exit(&pp->p_lock);
5887
5888         return (err != 0 ? set_errno(err) : 0);
5889 }
5890
5891 /*
5892  * Systemcall entry point for zone_list(2).
5893  *
5894  * Processes running in a (non-global) zone only see themselves.
5895  * On labeled systems, they see all zones whose label they dominate.
5896  */
5897 static int
5898 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
5899 {
5900         zoneid_t *zoneids;
5901         zone_t *zone, *myzone;
5902         uint_t user_nzones, real_nzones;
5903         uint_t domi_nzones;
5904         int error;
5905
5906         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
5907                 return (set_errno(EFAULT));
5908
5909         myzone = curproc->p_zone;
5910         if (myzone != global_zone) {
5911                 bslabel_t *mybslab;
5912
5913                 if (!is_system_labeled()) {
5914                         /* just return current zone */
5915                         real_nzones = domi_nzones = 1;
5916                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
5917                         zoneids[0] = myzone->zone_id;
5918                 } else {
5919                         /* return all zones that are dominated */
5920                         mutex_enter(&zonehash_lock);
5921                         real_nzones = zonecount;
5922                         domi_nzones = 0;
5923                         if (real_nzones > 0) {
5924                                 zoneids = kmem_alloc(real_nzones *
5925                                     sizeof (zoneid_t), KM_SLEEP);
5926                                 mybslab = label2bslabel(myzone->zone_slabel);
5927                                 for (zone = list_head(&zone_active);
5928                                     zone != NULL;
5929                                     zone = list_next(&zone_active, zone)) {
5930                                         if (zone->zone_id == GLOBAL_ZONEID)
5931                                                 continue;
5932                                         if (zone != myzone &&
5933                                             (zone->zone_flags & ZF_IS_SCRATCH))
5934                                                 continue;
5935                                         /*
5936                                          * Note that a label always dominates
5937                                          * itself, so myzone is always included
5938                                          * in the list.
5939                                          */
5940                                         if (bldominates(mybslab,
5941                                             label2bslabel(zone->zone_slabel))) {
5942                                                 zoneids[domi_nzones++] =
5943                                                     zone->zone_id;
5944                                         }
5945                                 }
5946                         }
5947                         mutex_exit(&zonehash_lock);
5948                 }
5949         } else {
5950                 mutex_enter(&zonehash_lock);
5951                 real_nzones = zonecount;
5952                 domi_nzones = 0;
5953                 if (real_nzones > 0) {
5954                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
5955                             KM_SLEEP);
5956                         for (zone = list_head(&zone_active); zone != NULL;
5957                             zone = list_next(&zone_active, zone))
5958                                 zoneids[domi_nzones++] = zone->zone_id;
5959                         ASSERT(domi_nzones == real_nzones);
5960                 }
5961                 mutex_exit(&zonehash_lock);
5962         }
5963
5964         /*
5965          * If user has allocated space for fewer entries than we found, then
5966          * return only up to his limit.  Either way, tell him exactly how many
5967          * we found.
5968          */
5969         if (domi_nzones < user_nzones)
5970                 user_nzones = domi_nzones;
5971         error = 0;
5972         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
5973                 error = EFAULT;
5974         } else if (zoneidlist != NULL && user_nzones != 0) {
5975                 if (copyout(zoneids, zoneidlist,
5976                     user_nzones * sizeof (zoneid_t)) != 0)
5977                         error = EFAULT;
5978         }
5979
5980         if (real_nzones > 0)
5981                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
5982
5983         if (error != 0)
5984                 return (set_errno(error));
5985         else
5986                 return (0);
5987 }
5988
5989 /*
5990  * Systemcall entry point for zone_lookup(2).
5991  *
5992  * Non-global zones are only able to see themselves and (on labeled systems)
5993  * the zones they dominate.
5994  */
5995 static zoneid_t
5996 zone_lookup(const char *zone_name)
5997 {
5998         char *kname;
5999         zone_t *zone;
6000         zoneid_t zoneid;
6001         int err;
6002
6003         if (zone_name == NULL) {
6004                 /* return caller's zone id */
6005                 return (getzoneid());
6006         }
6007
6008         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6009         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6010                 kmem_free(kname, ZONENAME_MAX);
6011                 return (set_errno(err));
6012         }
6013
6014         mutex_enter(&zonehash_lock);
6015         zone = zone_find_all_by_name(kname);
6016         kmem_free(kname, ZONENAME_MAX);
6017         /*
6018          * In a non-global zone, can only lookup global and own name.
6019          * In Trusted Extensions zone label dominance rules apply.
6020          */
6021         if (zone == NULL ||
6022             zone_status_get(zone) < ZONE_IS_READY ||
6023             !zone_list_access(zone)) {
6024                 mutex_exit(&zonehash_lock);
6025                 return (set_errno(EINVAL));
6026         } else {
6027                 zoneid = zone->zone_id;
6028                 mutex_exit(&zonehash_lock);
6029                 return (zoneid);
6030         }
6031 }
6032
6033 static int
6034 zone_version(int *version_arg)
6035 {
6036         int version = ZONE_SYSCALL_API_VERSION;
6037
6038         if (copyout(&version, version_arg, sizeof (int)) != 0)
6039                 return (set_errno(EFAULT));
6040         return (0);
6041 }
6042
6043 /* ARGSUSED */
6044 long
6045 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6046 {
6047         zone_def zs;
6048         int err;
6049
6050         switch (cmd) {
6051         case ZONE_CREATE:
6052                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6053                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6054                                 return (set_errno(EFAULT));
6055                         }
6056                 } else {
6057 #ifdef _SYSCALL32_IMPL
6058                         zone_def32 zs32;
6059
6060                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6061                                 return (set_errno(EFAULT));
6062                         }
6063                         zs.zone_name =
6064                             (const char *)(unsigned long)zs32.zone_name;
6065                         zs.zone_root =
6066                             (const char *)(unsigned long)zs32.zone_root;
6067                         zs.zone_privs =
6068                             (const struct priv_set *)
6069                             (unsigned long)zs32.zone_privs;
6070                         zs.zone_privssz = zs32.zone_privssz;
6071                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6072                         zs.rctlbufsz = zs32.rctlbufsz;
6073                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6074                         zs.zfsbufsz = zs32.zfsbufsz;
6075                         zs.extended_error =
6076                             (int *)(unsigned long)zs32.extended_error;
6077                         zs.match = zs32.match;
6078                         zs.doi = zs32.doi;
6079                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6080                         zs.flags = zs32.flags;
6081 #else
6082                         panic("get_udatamodel() returned bogus result\n");
6083 #endif
6084                 }
6085
6086                 return (zone_create(zs.zone_name, zs.zone_root,
6087                     zs.zone_privs, zs.zone_privssz,
6088                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6089                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6090                     zs.extended_error, zs.match, zs.doi,
6091                     zs.label, zs.flags));
6092         case ZONE_BOOT:
6093                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6094         case ZONE_DESTROY:
6095                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6096         case ZONE_GETATTR:
6097                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6098                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6099         case ZONE_SETATTR:
6100                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6101                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6102         case ZONE_ENTER:
6103                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6104         case ZONE_LIST:
6105                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6106         case ZONE_SHUTDOWN:
6107                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6108         case ZONE_LOOKUP:
6109                 return (zone_lookup((const char *)arg1));
6110         case ZONE_VERSION:
6111                 return (zone_version((int *)arg1));
6112         case ZONE_ADD_DATALINK:
6113                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6114                     (datalink_id_t)(uintptr_t)arg2));
6115         case ZONE_DEL_DATALINK:
6116                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6117                     (datalink_id_t)(uintptr_t)arg2));
6118         case ZONE_CHECK_DATALINK: {
6119                 zoneid_t        zoneid;
6120                 boolean_t       need_copyout;
6121
6122                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6123                         return (EFAULT);
6124                 need_copyout = (zoneid == ALL_ZONES);
6125                 err = zone_check_datalink(&zoneid,
6126                     (datalink_id_t)(uintptr_t)arg2);
6127                 if (err == 0 && need_copyout) {
6128                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6129                                 err = EFAULT;
6130                 }
6131                 return (err == 0 ? 0 : set_errno(err));
6132         }
6133         case ZONE_LIST_DATALINK:
6134                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6135                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6136         default:
6137                 return (set_errno(EINVAL));
6138         }
6139 }
6140
/*
 * Argument bundle handed from zone_kadmin() to the zone_ki_call_zoneadmd()
 * kernel thread.  zone_kadmin() allocates it and takes a hold on the zone;
 * zone_ki_call_zoneadmd() frees it and releases that hold.
 */
struct zarg {
        zone_t *zone;           /* zone being halted or rebooted */
        zone_cmd_arg_t arg;     /* command passed to zoneadmd over its door */
};
6145
6146 static int
6147 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6148 {
6149         char *buf;
6150         size_t buflen;
6151         int error;
6152
6153         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6154         buf = kmem_alloc(buflen, KM_SLEEP);
6155         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6156         error = door_ki_open(buf, doorp);
6157         kmem_free(buf, buflen);
6158         return (error);
6159 }
6160
6161 static void
6162 zone_release_door(door_handle_t *doorp)
6163 {
6164         door_ki_rele(*doorp);
6165         *doorp = NULL;
6166 }
6167
/*
 * Body of the kernel thread created by zone_kadmin().
 *
 * Takes ownership of zargp (freed here) and of the zone hold that came with
 * it (released here once the zone has been emptied).  It then tries to
 * deliver the saved zone_cmd_arg_t to zoneadmd through the zone's door,
 * retrying once per second until the upcall succeeds, an unexpected error is
 * seen, or the zone identified by (zoneid, uniqid) no longer exists.  The
 * thread exits when done; this function does not return to a caller.
 */
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
        door_handle_t door = NULL;
        door_arg_t darg, save_arg;
        char *zone_name;
        size_t zone_namelen;
        zoneid_t zoneid;
        zone_t *zone;
        zone_cmd_arg_t arg;
        uint64_t uniqid;
        size_t size;
        int error;
        int retry;      /* loop counter; not otherwise consulted here */

        /* Copy what we need out of the argument bundle, then free it. */
        zone = zargp->zone;
        arg = zargp->arg;
        kmem_free(zargp, sizeof (*zargp));

        /*
         * Take private copies of the zone's name, id and uniqid; the zone
         * hold is dropped below, after which the zone_t may not be used.
         */
        zone_namelen = strlen(zone->zone_name) + 1;
        zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
        bcopy(zone->zone_name, zone_name, zone_namelen);
        zoneid = zone->zone_id;
        uniqid = zone->zone_uniqid;
        /*
         * zoneadmd may be down, but at least we can empty out the zone.
         * We can ignore the return value of zone_empty() since we're called
         * from a kernel thread and know we won't be delivered any signals.
         */
        ASSERT(curproc == &p0);
        (void) zone_empty(zone);
        ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
        zone_rele(zone);

        /*
         * The same zone_cmd_arg_t serves as both the request data and the
         * result buffer of the door call.
         */
        size = sizeof (arg);
        darg.rbuf = (char *)&arg;
        darg.data_ptr = (char *)&arg;
        darg.rsize = size;
        darg.data_size = size;
        darg.desc_ptr = NULL;
        darg.desc_num = 0;

        /* Pristine copy used to reset darg before each retry. */
        save_arg = darg;
        /*
         * Since we're not holding a reference to the zone, any number of
         * things can go wrong, including the zone disappearing before we get a
         * chance to talk to zoneadmd.
         */
        for (retry = 0; /* forever */; retry++) {
                /* (Re)open the door if we do not currently have one. */
                if (door == NULL &&
                    (error = zone_lookup_door(zone_name, &door)) != 0) {
                        goto next;
                }
                ASSERT(door != NULL);

                /* A successful upcall ends the loop. */
                if ((error = door_ki_upcall_limited(door, &darg, NULL,
                    SIZE_MAX, 0)) == 0) {
                        break;
                }
                switch (error) {
                case EINTR:
                        /* FALLTHROUGH */
                case EAGAIN:    /* process may be forking */
                        /*
                         * Back off for a bit
                         */
                        break;
                case EBADF:
                        /* Drop the stale handle and try to open a new one. */
                        zone_release_door(&door);
                        if (zone_lookup_door(zone_name, &door) != 0) {
                                /*
                                 * zoneadmd may be dead, but it may come back to
                                 * life later.
                                 */
                                break;
                        }
                        break;
                default:
                        /* Any other error is not retried. */
                        cmn_err(CE_WARN,
                            "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
                            error);
                        goto out;
                }
next:
                /*
                 * If this isn't the same zone_t that we originally had in mind,
                 * then this is the same as if two kadmin requests come in at
                 * the same time: the first one wins.  This means we lose, so we
                 * bail.
                 */
                if ((zone = zone_find_by_id(zoneid)) == NULL) {
                        /*
                         * Problem is solved.
                         */
                        break;
                }
                if (zone->zone_uniqid != uniqid) {
                        /*
                         * zoneid recycled
                         */
                        zone_rele(zone);
                        break;
                }
                /*
                 * We could zone_status_timedwait(), but there doesn't seem to
                 * be much point in doing that (plus, it would mean that
                 * zone_free() isn't called until this thread exits).
                 */
                zone_rele(zone);
                /* Wait hz ticks, then retry with a fresh copy of darg. */
                delay(hz);
                darg = save_arg;
        }
out:
        if (door != NULL) {
                zone_release_door(&door);
        }
        kmem_free(zone_name, zone_namelen);
        thread_exit();
}
6287
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
 * kadmin().  The caller is a process in the zone.
 *
 * In order to shutdown the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job at
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 *
 * Arguments:
 *   cmd, fcn   uadmin-style command and function; A_SHUTDOWN with AD_HALT or
 *              AD_POWEROFF halts the zone, A_SHUTDOWN with AD_BOOT and
 *              A_REBOOT reboot it.
 *   mdep       optional boot argument string, already copied into the
 *              kernel by uadmin; may be NULL.
 *   credp      credentials checked with secpolicy_zone_admin().
 *
 * Returns an errno value directly (not via set_errno()): ENOTSUP or EINVAL
 * for commands that are not handled, EPERM if the policy check fails, and 0
 * if the zone is already past ZONE_IS_RUNNING.  Otherwise the calling process
 * is terminated via exit().
 */
int
zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
{
        struct zarg *zargp;
        zone_cmd_t zcmd;
        zone_t *zone;

        zone = curproc->p_zone;
        ASSERT(getzoneid() != GLOBAL_ZONEID);

        /* Map (cmd, fcn) onto the zoneadmd command to issue. */
        switch (cmd) {
        case A_SHUTDOWN:
                switch (fcn) {
                case AD_HALT:
                case AD_POWEROFF:
                        zcmd = Z_HALT;
                        break;
                case AD_BOOT:
                        zcmd = Z_REBOOT;
                        break;
                case AD_IBOOT:
                case AD_SBOOT:
                case AD_SIBOOT:
                case AD_NOSYNC:
                        return (ENOTSUP);
                default:
                        return (EINVAL);
                }
                break;
        case A_REBOOT:
                zcmd = Z_REBOOT;
                break;
        case A_FTRACE:
        case A_REMOUNT:
        case A_FREEZE:
        case A_DUMP:
        case A_CONFIG:
                return (ENOTSUP);
        default:
                ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
                return (EINVAL);
        }

        if (secpolicy_zone_admin(credp, B_FALSE))
                return (EPERM);
        mutex_enter(&zone_status_lock);

        /*
         * zone_status can't be ZONE_IS_EMPTY or higher since curproc
         * is in the zone.
         */
        ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
        if (zone_status_get(zone) > ZONE_IS_RUNNING) {
                /*
                 * This zone is already on its way down.
                 */
                mutex_exit(&zone_status_lock);
                return (0);
        }
        /*
         * Prevent future zone_enter()s
         */
        zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
        mutex_exit(&zone_status_lock);

        /*
         * Kill everyone now and call zoneadmd later.
         * zone_ki_call_zoneadmd() will do a more thorough job of this
         * later.
         */
        killall(zone->zone_id);
        /*
         * Now, create the thread to contact zoneadmd and do the rest of the
         * work.  This thread can't be created in our zone otherwise
         * zone_destroy() would deadlock.
         */
        zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
        zargp->arg.cmd = zcmd;
        zargp->arg.uniqid = zone->zone_uniqid;
        zargp->zone = zone;
        (void) strcpy(zargp->arg.locale, "C");
        /* mdep was already copied in for us by uadmin */
        if (mdep != NULL)
                (void) strlcpy(zargp->arg.bootbuf, mdep,
                    sizeof (zargp->arg.bootbuf));
        /*
         * This hold is handed to the new thread together with zargp;
         * zone_ki_call_zoneadmd() releases it.
         */
        zone_hold(zone);

        /* The thread is created in p0, i.e. outside this zone. */
        (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
            TS_RUN, minclsyspri);
        exit(CLD_EXITED, 0);

        /* NOTE(review): presumably unreachable, since exit() should not return. */
        return (EINVAL);
}
6392
6393 /*
6394  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6395  * status to ZONE_IS_SHUTTING_DOWN.
6396  *
6397  * This function also shuts down all running zones to ensure that they won't
6398  * fork new processes.
6399  */
6400 void
6401 zone_shutdown_global(void)
6402 {
6403         zone_t *current_zonep;
6404
6405         ASSERT(INGLOBALZONE(curproc));
6406         mutex_enter(&zonehash_lock);
6407         mutex_enter(&zone_status_lock);
6408
6409         /* Modify the global zone's status first. */
6410         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6411         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6412
6413         /*
6414          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6415          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6416          * could cause assertions to fail (e.g., assertions about a zone's
6417          * state during initialization, readying, or booting) or produce races.
6418          * We'll let threads continue to initialize and ready new zones: they'll
6419          * fail to boot the new zones when they see that the global zone is
6420          * shutting down.
6421          */
6422         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6423             current_zonep = list_next(&zone_active, current_zonep)) {
6424                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6425                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6426         }
6427         mutex_exit(&zone_status_lock);
6428         mutex_exit(&zonehash_lock);
6429 }
6430
6431 /*
6432  * Returns true if the named dataset is visible in the current zone.
6433  * The 'write' parameter is set to 1 if the dataset is also writable.
6434  */
6435 int
6436 zone_dataset_visible(const char *dataset, int *write)
6437 {
6438         static int zfstype = -1;
6439         zone_dataset_t *zd;
6440         size_t len;
6441         zone_t *zone = curproc->p_zone;
6442         const char *name = NULL;
6443         vfs_t *vfsp = NULL;
6444
6445         if (dataset[0] == '\0')
6446                 return (0);
6447
6448         /*
6449          * Walk the list once, looking for datasets which match exactly, or
6450          * specify a dataset underneath an exported dataset.  If found, return
6451          * true and note that it is writable.
6452          */
6453         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6454             zd = list_next(&zone->zone_datasets, zd)) {
6455
6456                 len = strlen(zd->zd_dataset);
6457                 if (strlen(dataset) >= len &&
6458                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6459                     (dataset[len] == '\0' || dataset[len] == '/' ||
6460                     dataset[len] == '@')) {
6461                         if (write)
6462                                 *write = 1;
6463                         return (1);
6464                 }
6465         }
6466
6467         /*
6468          * Walk the list a second time, searching for datasets which are parents
6469          * of exported datasets.  These should be visible, but read-only.
6470          *
6471          * Note that we also have to support forms such as 'pool/dataset/', with
6472          * a trailing slash.
6473          */
6474         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6475             zd = list_next(&zone->zone_datasets, zd)) {
6476
6477                 len = strlen(dataset);
6478                 if (dataset[len - 1] == '/')
6479                         len--;  /* Ignore trailing slash */
6480                 if (len < strlen(zd->zd_dataset) &&
6481                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6482                     zd->zd_dataset[len] == '/') {
6483                         if (write)
6484                                 *write = 0;
6485                         return (1);
6486                 }
6487         }
6488
6489         /*
6490          * We reach here if the given dataset is not found in the zone_dataset
6491          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6492          * instead of delegation. For this we search for the dataset in the
6493          * zone_vfslist of this zone. If found, return true and note that it is
6494          * not writable.
6495          */
6496
6497         /*
6498          * Initialize zfstype if it is not initialized yet.
6499          */
6500         if (zfstype == -1) {
6501                 struct vfssw *vswp = vfs_getvfssw("zfs");
6502                 zfstype = vswp - vfssw;
6503                 vfs_unrefvfssw(vswp);
6504         }
6505
6506         vfs_list_read_lock();
6507         vfsp = zone->zone_vfslist;
6508         do {
6509                 ASSERT(vfsp);
6510                 if (vfsp->vfs_fstype == zfstype) {
6511                         name = refstr_value(vfsp->vfs_resource);
6512
6513                         /*
6514                          * Check if we have an exact match.
6515                          */
6516                         if (strcmp(dataset, name) == 0) {
6517                                 vfs_list_unlock();
6518                                 if (write)
6519                                         *write = 0;
6520                                 return (1);
6521                         }
6522                         /*
6523                          * We need to check if we are looking for parents of
6524                          * a dataset. These should be visible, but read-only.
6525                          */
6526                         len = strlen(dataset);
6527                         if (dataset[len - 1] == '/')
6528                                 len--;
6529
6530                         if (len < strlen(name) &&
6531                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6532                                 vfs_list_unlock();
6533                                 if (write)
6534                                         *write = 0;
6535                                 return (1);
6536                         }
6537                 }
6538                 vfsp = vfsp->vfs_zone_next;
6539         } while (vfsp != zone->zone_vfslist);
6540
6541         vfs_list_unlock();
6542         return (0);
6543 }
6544
6545 /*
6546  * zone_find_by_any_path() -
6547  *
6548  * kernel-private routine similar to zone_find_by_path(), but which
6549  * effectively compares against zone paths rather than zonerootpath
6550  * (i.e., the last component of zonerootpaths, which should be "root/",
6551  * are not compared.)  This is done in order to accurately identify all
6552  * paths, whether zone-visible or not, including those which are parallel
6553  * to /root/, such as /dev/, /home/, etc...
6554  *
6555  * If the specified path does not fall under any zone path then global
6556  * zone is returned.
6557  *
6558  * The treat_abs parameter indicates whether the path should be treated as
6559  * an absolute path although it does not begin with "/".  (This supports
6560  * nfs mount syntax such as host:any/path.)
6561  *
6562  * The caller is responsible for zone_rele of the returned zone.
6563  */
6564 zone_t *
6565 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6566 {
6567         zone_t *zone;
6568         int path_offset = 0;
6569
6570         if (path == NULL) {
6571                 zone_hold(global_zone);
6572                 return (global_zone);
6573         }
6574
6575         if (*path != '/') {
6576                 ASSERT(treat_abs);
6577                 path_offset = 1;
6578         }
6579
6580         mutex_enter(&zonehash_lock);
6581         for (zone = list_head(&zone_active); zone != NULL;
6582             zone = list_next(&zone_active, zone)) {
6583                 char    *c;
6584                 size_t  pathlen;
6585                 char *rootpath_start;
6586
6587                 if (zone == global_zone)        /* skip global zone */
6588                         continue;
6589
6590                 /* scan backwards to find start of last component */
6591                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6592                 do {
6593                         c--;
6594                 } while (*c != '/');
6595
6596                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6597                 rootpath_start = (zone->zone_rootpath + path_offset);
6598                 if (strncmp(path, rootpath_start, pathlen) == 0)
6599                         break;
6600         }
6601         if (zone == NULL)
6602                 zone = global_zone;
6603         zone_hold(zone);
6604         mutex_exit(&zonehash_lock);
6605         return (zone);
6606 }
6607
6608 /*
6609  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6610  * zone_dl_t pointer if found, and NULL otherwise.
6611  */
6612 static zone_dl_t *
6613 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6614 {
6615         zone_dl_t *zdl;
6616
6617         ASSERT(mutex_owned(&zone->zone_lock));
6618         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6619             zdl = list_next(&zone->zone_dl_list, zdl)) {
6620                 if (zdl->zdl_id == linkid)
6621                         break;
6622         }
6623         return (zdl);
6624 }
6625
6626 static boolean_t
6627 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6628 {
6629         boolean_t exists;
6630
6631         mutex_enter(&zone->zone_lock);
6632         exists = (zone_find_dl(zone, linkid) != NULL);
6633         mutex_exit(&zone->zone_lock);
6634         return (exists);
6635 }
6636
6637 /*
6638  * Add an data link name for the zone.
6639  */
6640 static int
6641 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6642 {
6643         zone_dl_t *zdl;
6644         zone_t *zone;
6645         zone_t *thiszone;
6646
6647         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6648                 return (set_errno(ENXIO));
6649
6650         /* Verify that the datalink ID doesn't already belong to a zone. */
6651         mutex_enter(&zonehash_lock);
6652         for (zone = list_head(&zone_active); zone != NULL;
6653             zone = list_next(&zone_active, zone)) {
6654                 if (zone_dl_exists(zone, linkid)) {
6655                         mutex_exit(&zonehash_lock);
6656                         zone_rele(thiszone);
6657                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6658                 }
6659         }
6660
6661         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6662         zdl->zdl_id = linkid;
6663         zdl->zdl_net = NULL;
6664         mutex_enter(&thiszone->zone_lock);
6665         list_insert_head(&thiszone->zone_dl_list, zdl);
6666         mutex_exit(&thiszone->zone_lock);
6667         mutex_exit(&zonehash_lock);
6668         zone_rele(thiszone);
6669         return (0);
6670 }
6671
6672 static int
6673 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6674 {
6675         zone_dl_t *zdl;
6676         zone_t *zone;
6677         int err = 0;
6678
6679         if ((zone = zone_find_by_id(zoneid)) == NULL)
6680                 return (set_errno(EINVAL));
6681
6682         mutex_enter(&zone->zone_lock);
6683         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6684                 err = ENXIO;
6685         } else {
6686                 list_remove(&zone->zone_dl_list, zdl);
6687                 if (zdl->zdl_net != NULL)
6688                         nvlist_free(zdl->zdl_net);
6689                 kmem_free(zdl, sizeof (zone_dl_t));
6690         }
6691         mutex_exit(&zone->zone_lock);
6692         zone_rele(zone);
6693         return (err == 0 ? 0 : set_errno(err));
6694 }
6695
6696 /*
6697  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6698  * the linkid.  Otherwise we just check if the specified zoneidp has been
6699  * assigned the supplied linkid.
6700  */
6701 int
6702 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6703 {
6704         zone_t *zone;
6705         int err = ENXIO;
6706
6707         if (*zoneidp != ALL_ZONES) {
6708                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6709                         if (zone_dl_exists(zone, linkid))
6710                                 err = 0;
6711                         zone_rele(zone);
6712                 }
6713                 return (err);
6714         }
6715
6716         mutex_enter(&zonehash_lock);
6717         for (zone = list_head(&zone_active); zone != NULL;
6718             zone = list_next(&zone_active, zone)) {
6719                 if (zone_dl_exists(zone, linkid)) {
6720                         *zoneidp = zone->zone_id;
6721                         err = 0;
6722                         break;
6723                 }
6724         }
6725         mutex_exit(&zonehash_lock);
6726         return (err);
6727 }
6728
6729 /*
6730  * Get the list of datalink IDs assigned to a zone.
6731  *
6732  * On input, *nump is the number of datalink IDs that can fit in the supplied
6733  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6734  * that were placed in the array if the array was large enough, or to the
6735  * number of datalink IDs that the function needs to place in the array if the
6736  * array is too small.
6737  */
6738 static int
6739 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6740 {
6741         uint_t num, dlcount;
6742         zone_t *zone;
6743         zone_dl_t *zdl;
6744         datalink_id_t *idptr = idarray;
6745
6746         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6747                 return (set_errno(EFAULT));
6748         if ((zone = zone_find_by_id(zoneid)) == NULL)
6749                 return (set_errno(ENXIO));
6750
6751         num = 0;
6752         mutex_enter(&zone->zone_lock);
6753         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6754             zdl = list_next(&zone->zone_dl_list, zdl)) {
6755                 /*
6756                  * If the list is bigger than what the caller supplied, just
6757                  * count, don't do copyout.
6758                  */
6759                 if (++num > dlcount)
6760                         continue;
6761                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6762                         mutex_exit(&zone->zone_lock);
6763                         zone_rele(zone);
6764                         return (set_errno(EFAULT));
6765                 }
6766                 idptr++;
6767         }
6768         mutex_exit(&zone->zone_lock);
6769         zone_rele(zone);
6770
6771         /* Increased or decreased, caller should be notified. */
6772         if (num != dlcount) {
6773                 if (copyout(&num, nump, sizeof (num)) != 0)
6774                         return (set_errno(EFAULT));
6775         }
6776         return (0);
6777 }
6778
6779 /*
6780  * Public interface for looking up a zone by zoneid. It's a customized version
6781  * for netstack_zone_create(). It can only be called from the zsd create
6782  * callbacks, since it doesn't have reference on the zone structure hence if
6783  * it is called elsewhere the zone could disappear after the zonehash_lock
6784  * is dropped.
6785  *
6786  * Furthermore it
6787  * 1. Doesn't check the status of the zone.
6788  * 2. It will be called even before zone_init is called, in that case the
6789  *    address of zone0 is returned directly, and netstack_zone_create()
6790  *    will only assign a value to zone0.zone_netstack, won't break anything.
6791  * 3. Returns without the zone being held.
6792  */
6793 zone_t *
6794 zone_find_by_id_nolock(zoneid_t zoneid)
6795 {
6796         zone_t *zone;
6797
6798         mutex_enter(&zonehash_lock);
6799         if (zonehashbyid == NULL)
6800                 zone = &zone0;
6801         else
6802                 zone = zone_find_all_by_id(zoneid);
6803         mutex_exit(&zonehash_lock);
6804         return (zone);
6805 }
6806
6807 /*
6808  * Walk the datalinks for a given zone
6809  */
6810 int
6811 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6812     void *data)
6813 {
6814         zone_t          *zone;
6815         zone_dl_t       *zdl;
6816         datalink_id_t   *idarray;
6817         uint_t          idcount = 0;
6818         int             i, ret = 0;
6819
6820         if ((zone = zone_find_by_id(zoneid)) == NULL)
6821                 return (ENOENT);
6822
6823         /*
6824          * We first build an array of linkid's so that we can walk these and
6825          * execute the callback with the zone_lock dropped.
6826          */
6827         mutex_enter(&zone->zone_lock);
6828         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6829             zdl = list_next(&zone->zone_dl_list, zdl)) {
6830                 idcount++;
6831         }
6832
6833         if (idcount == 0) {
6834                 mutex_exit(&zone->zone_lock);
6835                 zone_rele(zone);
6836                 return (0);
6837         }
6838
6839         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6840         if (idarray == NULL) {
6841                 mutex_exit(&zone->zone_lock);
6842                 zone_rele(zone);
6843                 return (ENOMEM);
6844         }
6845
6846         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6847             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
6848                 idarray[i] = zdl->zdl_id;
6849         }
6850
6851         mutex_exit(&zone->zone_lock);
6852
6853         for (i = 0; i < idcount && ret == 0; i++) {
6854                 if ((ret = (*cb)(idarray[i], data)) != 0)
6855                         break;
6856         }
6857
6858         zone_rele(zone);
6859         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
6860         return (ret);
6861 }
6862
6863 static char *
6864 zone_net_type2name(int type)
6865 {
6866         switch (type) {
6867         case ZONE_NETWORK_ADDRESS:
6868                 return (ZONE_NET_ADDRNAME);
6869         case ZONE_NETWORK_DEFROUTER:
6870                 return (ZONE_NET_RTRNAME);
6871         default:
6872                 return (NULL);
6873         }
6874 }
6875
6876 static int
6877 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
6878 {
6879         zone_t *zone;
6880         zone_dl_t *zdl;
6881         nvlist_t *nvl;
6882         int err = 0;
6883         uint8_t *new = NULL;
6884         char *nvname;
6885         int bufsize;
6886         datalink_id_t linkid = znbuf->zn_linkid;
6887
6888         if (secpolicy_zone_config(CRED()) != 0)
6889                 return (set_errno(EPERM));
6890
6891         if (zoneid == GLOBAL_ZONEID)
6892                 return (set_errno(EINVAL));
6893
6894         nvname = zone_net_type2name(znbuf->zn_type);
6895         bufsize = znbuf->zn_len;
6896         new = znbuf->zn_val;
6897         if (nvname == NULL)
6898                 return (set_errno(EINVAL));
6899
6900         if ((zone = zone_find_by_id(zoneid)) == NULL) {
6901                 return (set_errno(EINVAL));
6902         }
6903
6904         mutex_enter(&zone->zone_lock);
6905         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6906                 err = ENXIO;
6907                 goto done;
6908         }
6909         if ((nvl = zdl->zdl_net) == NULL) {
6910                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
6911                         err = ENOMEM;
6912                         goto done;
6913                 } else {
6914                         zdl->zdl_net = nvl;
6915                 }
6916         }
6917         if (nvlist_exists(nvl, nvname)) {
6918                 err = EINVAL;
6919                 goto done;
6920         }
6921         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
6922         ASSERT(err == 0);
6923 done:
6924         mutex_exit(&zone->zone_lock);
6925         zone_rele(zone);
6926         if (err != 0)
6927                 return (set_errno(err));
6928         else
6929                 return (0);
6930 }
6931
6932 static int
6933 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
6934 {
6935         zone_t *zone;
6936         zone_dl_t *zdl;
6937         nvlist_t *nvl;
6938         uint8_t *ptr;
6939         uint_t psize;
6940         int err = 0;
6941         char *nvname;
6942         int bufsize;
6943         void *buf;
6944         datalink_id_t linkid = znbuf->zn_linkid;
6945
6946         if (zoneid == GLOBAL_ZONEID)
6947                 return (set_errno(EINVAL));
6948
6949         nvname = zone_net_type2name(znbuf->zn_type);
6950         bufsize = znbuf->zn_len;
6951         buf = znbuf->zn_val;
6952
6953         if (nvname == NULL)
6954                 return (set_errno(EINVAL));
6955         if ((zone = zone_find_by_id(zoneid)) == NULL)
6956                 return (set_errno(EINVAL));
6957
6958         mutex_enter(&zone->zone_lock);
6959         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6960                 err = ENXIO;
6961                 goto done;
6962         }
6963         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
6964                 err = ENOENT;
6965                 goto done;
6966         }
6967         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
6968         ASSERT(err == 0);
6969
6970         if (psize > bufsize) {
6971                 err = ENOBUFS;
6972                 goto done;
6973         }
6974         znbuf->zn_len = psize;
6975         bcopy(ptr, buf, psize);
6976 done:
6977         mutex_exit(&zone->zone_lock);
6978         zone_rele(zone);
6979         if (err != 0)
6980                 return (set_errno(err));
6981         else
6982                 return (0);
6983 }