kernel/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27
  28 /*
  29  * Zones
  30  *
  31  *   A zone is a named collection of processes, namespace constraints,
  32  *   and other system resources which comprise a secure and manageable
  33  *   application containment facility.
  34  *
  35  *   Zones (represented by the reference counted zone_t) are tracked in
  36  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  37  *   (zoneid_t) are used to track zone association.  Zone IDs are
  38  *   dynamically generated when the zone is created; if a persistent
  39  *   identifier is needed (core files, accounting logs, audit trail,
  40  *   etc.), the zone name should be used.
  41  *
  42  *
  43  *   Global Zone:
  44  *
  45  *   The global zone (zoneid 0) is automatically associated with all
  46  *   system resources that have not been bound to a user-created zone.
  47  *   This means that even systems where zones are not in active use
  48  *   have a global zone, and all processes, mounts, etc. are
  49  *   associated with that zone.  The global zone is generally
  50  *   unconstrained in terms of privileges and access, though the usual
  51  *   credential and privilege based restrictions apply.
  52  *
  53  *
  54  *   Zone States:
  55  *
  56  *   The states a zone may be in, and the transitions between them, are
  57  *   as follows:
  58  *
  59  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  60  *   initialized zone is added to the list of active zones on the system but
  61  *   isn't accessible.
  62  *
  63  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  64  *   not yet completed. Not possible to enter the zone, but attributes can
  65  *   be retrieved.
  66  *
  67  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  68  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  69  *   executed.  A zone remains in this state until it transitions into
  70  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  71  *
  72  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  73  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  74  *   state.
  75  *
  76  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  77  *   successfully started init.   A zone remains in this state until
  78  *   zone_shutdown() is called.
  79  *
  80  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  81  *   killing all processes running in the zone. The zone remains
  82  *   in this state until there are no more user processes running in the zone.
  83  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  84  *   Since zone_shutdown() is restartable, it may be called successfully
  85  *   multiple times for the same zone_t.  Setting of the zone's state to
  86  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  87  *   the zone's status without worrying about it being a moving target.
  88  *
  89  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  90  *   are no more user processes in the zone.  The zone remains in this
  91  *   state until there are no more kernel threads associated with the
  92  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  93  *   fail.
  94  *
  95  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  96  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  97  *   join the zone or create kernel threads therein.
  98  *
  99  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 100  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 101  *   return NULL from now on.
 102  *
 103  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 104  *   processes or threads doing work on behalf of the zone.  The zone is
 105  *   removed from the list of active zones.  zone_destroy() returns, and
 106  *   the zone can be recreated.
 107  *
 108  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 109  *   callbacks are executed, and all memory associated with the zone is
 110  *   freed.
 111  *
 112  *   Threads can wait for the zone to enter a requested state by using
 113  *   zone_status_wait() or zone_status_timedwait() with the desired
 114  *   state passed in as an argument.  Zone state transitions are
 115  *   uni-directional; it is not possible to move back to an earlier state.
 116  *
 117  *
 118  *   Zone-Specific Data:
 119  *
 120  *   Subsystems needing to maintain zone-specific data can store that
 121  *   data using the ZSD mechanism.  This provides a zone-specific data
 122  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 123  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 124  *   to register callbacks to be invoked when a zone is created, shut
 125  *   down, or destroyed.  This can be used to initialize zone-specific
 126  *   data for new zones and to clean up when zones go away.
 127  *
 128  *
 129  *   Data Structures:
 130  *
 131  *   The per-zone structure (zone_t) is reference counted, and freed
 132  *   when all references are released.  zone_hold and zone_rele can be
 133  *   used to adjust the reference count.  In addition, reference counts
 134  *   associated with the cred_t structure are tracked separately using
 135  *   zone_cred_hold and zone_cred_rele.
 136  *
 137  *   Pointers to active zone_t's are stored in two hash tables; one
 138  *   for searching by id, the other for searching by name.  Lookups
 139  *   can be performed on either basis, using zone_find_by_id and
 140  *   zone_find_by_name.  Both return zone_t pointers with the zone
 141  *   held, so zone_rele should be called when the pointer is no longer
 142  *   needed.  Zones can also be searched by path; zone_find_by_path
 143  *   returns the zone with which a path name is associated (global
 144  *   zone if the path is not within some other zone's file system
 145  *   hierarchy).  This currently requires iterating through each zone,
 146  *   so it is slower than an id or name search via a hash table.
 147  *
 148  *
 149  *   Locking:
 150  *
 151  *   zonehash_lock: This is a top-level global lock used to protect the
 152  *       zone hash tables and lists.  Zones cannot be created or destroyed
 153  *       while this lock is held.
 154  *   zone_status_lock: This is a global lock protecting zone state.
 155  *       Zones cannot change state while this lock is held.  It also
 156  *       protects the list of kernel threads associated with a zone.
 157  *   zone_lock: This is a per-zone lock used to protect several fields of
 158  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 159  *       this lock means that the zone cannot go away.
 160  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 161  *       related to the zone.max-lwps rctl.
 162  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 163  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 164  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 165  *       currently just max_lofi
 166  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 167  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 168  *       list (a list of zones in the ZONE_IS_DEAD state).
 169  *
 170  *   Ordering requirements:
 171  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 172  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 173  *
 174  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 175  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 177  *
 178  *   Blocking memory allocations are permitted while holding any of the
 179  *   zone locks.
 180  *
 181  *
 182  *   System Call Interface:
 183  *
 184  *   The zone subsystem can be managed and queried from user level with
 185  *   the following system calls (all subcodes of the primary "zone"
 186  *   system call):
 187  *   - zone_create: creates a zone with selected attributes (name,
 188  *     root path, privileges, resource controls, ZFS datasets)
 189  *   - zone_enter: allows the current process to enter a zone
 190  *   - zone_getattr: reports attributes of a zone
 191  *   - zone_setattr: set attributes of a zone
 192  *   - zone_boot: set 'init' running for the zone
 193  *   - zone_list: lists all zones active in the system
 194  *   - zone_lookup: looks up zone id based on name
 195  *   - zone_shutdown: initiates shutdown process (see states above)
 196  *   - zone_destroy: completes shutdown process (see states above)
 197  *
 198  */
 199
 200 #include <sys/priv_impl.h>
 201 #include <sys/cred.h>
 202 #include <c2/audit.h>
 203 #include <sys/debug.h>
 204 #include <sys/file.h>
 205 #include <sys/kmem.h>
 206 #include <sys/kstat.h>
 207 #include <sys/mutex.h>
 208 #include <sys/note.h>
 209 #include <sys/pathname.h>
 210 #include <sys/proc.h>
 211 #include <sys/project.h>
 212 #include <sys/sysevent.h>
 213 #include <sys/task.h>
 214 #include <sys/systm.h>
 215 #include <sys/types.h>
 216 #include <sys/utsname.h>
 217 #include <sys/vnode.h>
 218 #include <sys/vfs.h>
 219 #include <sys/systeminfo.h>
 220 #include <sys/policy.h>
 221 #include <sys/cred_impl.h>
 222 #include <sys/contract_impl.h>
 223 #include <sys/contract/process_impl.h>
 224 #include <sys/class.h>
 225 #include <sys/pool.h>
 226 #include <sys/pool_pset.h>
 227 #include <sys/pset.h>
 228 #include <sys/strlog.h>
 229 #include <sys/sysmacros.h>
 230 #include <sys/callb.h>
 231 #include <sys/vmparam.h>
 232 #include <sys/corectl.h>
 233 #include <sys/ipc_impl.h>
 234 #include <sys/klpd.h>
 235
 236 #include <sys/door.h>
 237 #include <sys/cpuvar.h>
 238 #include <sys/sdt.h>
 239
 240 #include <sys/uadmin.h>
 241 #include <sys/session.h>
 242 #include <sys/cmn_err.h>
 243 #include <sys/modhash.h>
 244 #include <sys/sunddi.h>
 245 #include <sys/nvpair.h>
 246 #include <sys/rctl.h>
 247 #include <sys/fss.h>
 248 #include <sys/brand.h>
 249 #include <sys/zone.h>
 250 #include <net/if.h>
 251 #include <sys/cpucaps.h>
 252 #include <vm/seg.h>
 253 #include <sys/mac.h>
 254
 255 /*
 256  * This constant specifies the number of seconds that threads waiting for
 257  * subsystems to release a zone's general-purpose references will wait before
 258  * they log the zone's reference counts.  The constant's value shouldn't
 259  * be so small that reference counts are unnecessarily reported for zones
 260  * whose references are slowly released.  On the other hand, it shouldn't be so
 261  * large that users reboot their systems out of frustration over hung zones
 262  * before the system logs the zones' reference counts.
 263  */
 264 #define ZONE_DESTROY_TIMEOUT_SECS       60
 265
  266 /* List of data link IDs which are accessible from the zone */
  267 typedef struct zone_dl {
  268         datalink_id_t   zdl_id;         /* ID of the data link */
  269         nvlist_t        *zdl_net;       /* NOTE(review): presumably per-link network data -- confirm */
  270         list_node_t     zdl_linkage;    /* linkage for the list holding this entry */
  271 } zone_dl_t;
 272
 273 /*
 274  * cv used to signal that all references to the zone have been released.  This
 275  * needs to be global since there may be multiple waiters, and the first to
 276  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 277  */
 278 static kcondvar_t zone_destroy_cv;
 279 /*
 280  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 281  * but then we'd need another lock for zone_destroy_cv, and why bother?
 282  */
 283 static kmutex_t zone_status_lock;
 284
 285 /*
 286  * ZSD-related global variables.
 287  */
 288 static kmutex_t zsd_key_lock;   /* protects the following two */
 289 /*
 290  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 291  */
 292 static zone_key_t zsd_keyval = 0;
 293 /*
 294  * Global list of registered keys.  We use this when a new zone is created.
 295  */
 296 static list_t zsd_registered_keys;
 297
  298 int zone_hash_size = 256;       /* NOTE(review): presumably the size of the zone hash tables -- confirm */
  299 static mod_hash_t *zonehashbyname, *zonehashbyid; /* active zones, searchable by name and by id */
  300 static kmutex_t zonehash_lock;  /* protects the zone hash tables and lists */
  301 static uint_t zonecount;        /* NOTE(review): presumably the number of zones -- confirm */
  302 static id_space_t *zoneid_space; /* NOTE(review): presumably where new zone IDs come from -- confirm */
 303
 304 /*
 305  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 306  * kernel proper runs, and which manages all other zones.
 307  *
 308  * Although not declared as static, the variable "zone0" should not be used
 309  * except for by code that needs to reference the global zone early on in boot,
 310  * before it is fully initialized.  All other consumers should use
 311  * 'global_zone'.
 312  */
 313 zone_t zone0;
 314 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 315
 316 /*
 317  * List of active zones, protected by zonehash_lock.
 318  */
 319 static list_t zone_active;
 320
 321 /*
 322  * List of destroyed zones that still have outstanding cred references.
 323  * Used for debugging.  Uses a separate lock to avoid lock ordering
 324  * problems in zone_free.
 325  */
 326 static list_t zone_deathrow;
 327 static kmutex_t zone_deathrow_lock;
 328
 329 /* number of zones is limited by virtual interface limit in IP */
 330 uint_t maxzones = 8192;
 331
  332 /* Event channel to send zone state change notifications */
 333 evchan_t *zone_event_chan;
 334
  335 /*
  336  * This table holds the mapping from kernel zone states to
  337  * states visible in the state notification API.
  338  * The idea is that we only expose "obvious" states and
  339  * do not expose states which are just implementation details.
       *
       * Each entry is annotated with the kernel state it stands for.  Several
       * kernel states share one visible state: "ready" and "booting" both map
       * to ZONE_EVENT_READY, "shutting_down" through "dying" all map to
       * ZONE_EVENT_SHUTTING_DOWN, and "dead" maps back to
       * ZONE_EVENT_UNINITIALIZED.
       *
       * NOTE(review): presumably indexed by zone_status_t, in which case the
       * entry order must track that enum -- confirm against <sys/zone.h>.
  340  */
  341 const char  *zone_status_table[] = {
  342         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
  343         ZONE_EVENT_INITIALIZED,         /* initialized */
  344         ZONE_EVENT_READY,               /* ready */
  345         ZONE_EVENT_READY,               /* booting */
  346         ZONE_EVENT_RUNNING,             /* running */
  347         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
  348         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
  349         ZONE_EVENT_SHUTTING_DOWN,       /* down */
  350         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
  351         ZONE_EVENT_UNINITIALIZED,       /* dead */
  352 };
 353
  354 /*
  355  * This array contains the names of the subsystems listed in zone_ref_subsys_t
  356  * (see sys/zone.h).
       *
       * Each entry is annotated with the zone_ref_subsys_t constant it names.
       * NOTE(review): presumably indexed by that constant, so a new subsystem
       * needs a new entry in the matching position -- confirm against
       * <sys/zone.h>.
  357  */
  358 static char *zone_ref_subsys_names[] = {
  359         "NFS",          /* ZONE_REF_NFS */
  360         "NFSv4",        /* ZONE_REF_NFSV4 */
  361         "SMBFS",        /* ZONE_REF_SMBFS */
  362         "MNTFS",        /* ZONE_REF_MNTFS */
  363         "LOFI",         /* ZONE_REF_LOFI */
  364         "VFS",          /* ZONE_REF_VFS */
  365         "IPC"           /* ZONE_REF_IPC */
  366 };
 367
 368 /*
 369  * This isn't static so lint doesn't complain.
 370  */
 371 rctl_hndl_t rc_zone_cpu_shares;
 372 rctl_hndl_t rc_zone_locked_mem;
 373 rctl_hndl_t rc_zone_max_swap;
 374 rctl_hndl_t rc_zone_max_lofi;
 375 rctl_hndl_t rc_zone_cpu_cap;
 376 rctl_hndl_t rc_zone_nlwps;
 377 rctl_hndl_t rc_zone_nprocs;
 378 rctl_hndl_t rc_zone_shmmax;
 379 rctl_hndl_t rc_zone_shmmni;
 380 rctl_hndl_t rc_zone_semmni;
 381 rctl_hndl_t rc_zone_msgmni;
 382
 383 const char * const zone_default_initname = "/sbin/init";
 384 static char * const zone_prefix = "/zone/";
 385 static int zone_shutdown(zoneid_t zoneid);
 386 static int zone_add_datalink(zoneid_t, datalink_id_t);
 387 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 388 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 389 static int zone_set_network(zoneid_t, zone_net_data_t *);
 390 static int zone_get_network(zoneid_t, zone_net_data_t *);
 391
 392 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 393
 394 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 395 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 396 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 397 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 398     zone_key_t);
 399 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 400 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 401     kmutex_t *);
 402 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 403     kmutex_t *);
 404
 405 /*
 406  * Bump this number when you alter the zone syscall interfaces; this is
 407  * because we need to have support for previous API versions in libc
 408  * to support patching; libc calls into the kernel to determine this number.
 409  *
 410  * Version 1 of the API is the version originally shipped with Solaris 10
 411  * Version 2 alters the zone_create system call in order to support more
 412  *     arguments by moving the args into a structure; and to do better
 413  *     error reporting when zone_create() fails.
 414  * Version 3 alters the zone_create system call in order to support the
 415  *     import of ZFS datasets to zones.
 416  * Version 4 alters the zone_create system call in order to support
 417  *     Trusted Extensions.
 418  * Version 5 alters the zone_boot system call, and converts its old
 419  *     bootargs parameter to be set by the zone_setattr API instead.
 420  * Version 6 adds the flag argument to zone_create.
 421  */
 422 static const int ZONE_SYSCALL_API_VERSION = 6;
 423
 424 /*
 425  * Certain filesystems (such as NFS and autofs) need to know which zone
 426  * the mount is being placed in.  Because of this, we need to be able to
 427  * ensure that a zone isn't in the process of being created/destroyed such
 428  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
  429  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 430  * mount list. Since a zone can't reside on an NFS file system, we don't
 431  * have to worry about the zonepath itself.
 432  *
 433  * The following functions: block_mounts()/resume_mounts() and
 434  * mount_in_progress()/mount_completed() are used by zones and the VFS
 435  * layer (respectively) to synchronize zone state transitions and new
  436  * mounts within a zone. This synchronization is on a per-zone basis, so
 437  * activity for one zone will not interfere with activity for another zone.
 438  *
 439  * The semantics are like a reader-reader lock such that there may
 440  * either be multiple mounts (or zone state transitions, if that weren't
 441  * serialized by zonehash_lock) in progress at the same time, but not
 442  * both.
 443  *
 444  * We use cv's so the user can ctrl-C out of the operation if it's
 445  * taking too long.
 446  *
 447  * The semantics are such that there is unfair bias towards the
 448  * "current" operation.  This means that zone halt may starve if
 449  * there is a rapid succession of new mounts coming in to the zone.
 450  */
 451 /*
 452  * Prevent new mounts from progressing to the point of calling
 453  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 454  * them to complete.
 455  */
 456 static int
 457 block_mounts(zone_t *zp)
 458 {
 459         int retval = 0;
 460
 461         /*
 462          * Since it may block for a long time, block_mounts() shouldn't be
 463          * called with zonehash_lock held.
 464          */
 465         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 466         mutex_enter(&zp->zone_mount_lock);
 467         while (zp->zone_mounts_in_progress > 0) {
 468                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 469                         goto signaled;
 470         }
 471         /*
 472          * A negative value of mounts_in_progress indicates that mounts
 473          * have been blocked by (-mounts_in_progress) different callers
 474          * (remotely possible if two threads enter zone_shutdown at the same
 475          * time).
 476          */
 477         zp->zone_mounts_in_progress--;
 478         retval = 1;
 479 signaled:
 480         mutex_exit(&zp->zone_mount_lock);
 481         return (retval);
 482 }
 483
 484 /*
 485  * The VFS layer may progress with new mounts as far as we're concerned.
 486  * Allow them to progress if we were the last obstacle.
 487  */
 488 static void
 489 resume_mounts(zone_t *zp)
 490 {
 491         mutex_enter(&zp->zone_mount_lock);
 492         if (++zp->zone_mounts_in_progress == 0)
 493                 cv_broadcast(&zp->zone_mount_cv);
 494         mutex_exit(&zp->zone_mount_lock);
 495 }
 496
 497 /*
 498  * The VFS layer is busy with a mount; this zone should wait until all
 499  * of its mounts are completed to progress.
 500  */
 501 void
 502 mount_in_progress(zone_t *zp)
 503 {
 504         mutex_enter(&zp->zone_mount_lock);
 505         while (zp->zone_mounts_in_progress < 0)
 506                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 507         zp->zone_mounts_in_progress++;
 508         mutex_exit(&zp->zone_mount_lock);
 509 }
 510
 511 /*
 512  * VFS is done with one mount; wake up any waiting block_mounts()
 513  * callers if this is the last mount.
 514  */
 515 void
 516 mount_completed(zone_t *zp)
 517 {
 518         mutex_enter(&zp->zone_mount_lock);
 519         if (--zp->zone_mounts_in_progress == 0)
 520                 cv_broadcast(&zp->zone_mount_cv);
 521         mutex_exit(&zp->zone_mount_lock);
 522 }
 523
 524 /*
 525  * ZSD routines.
 526  *
 527  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 528  * defined by the pthread_key_create() and related interfaces.
 529  *
 530  * Kernel subsystems may register one or more data items and/or
 531  * callbacks to be executed when a zone is created, shutdown, or
 532  * destroyed.
 533  *
 534  * Unlike the thread counterpart, destructor callbacks will be executed
 535  * even if the data pointer is NULL and/or there are no constructor
 536  * callbacks, so it is the responsibility of such callbacks to check for
 537  * NULL data values if necessary.
 538  *
 539  * The locking strategy and overall picture is as follows:
 540  *
 541  * When someone calls zone_key_create(), a template ZSD entry is added to the
 542  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 543  * holding that lock all the existing zones are marked as
 544  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 545  * zone_zsd list (protected by zone_lock). The global list is updated first
  546  * (under zsd_key_lock) to make sure that newly created zones use the
 547  * most recent list of keys. Then under zonehash_lock we walk the zones
 548  * and mark them.  Similar locking is used in zone_key_delete().
 549  *
 550  * The actual create, shutdown, and destroy callbacks are done without
 551  * holding any lock. And zsd_flags are used to ensure that the operations
 552  * completed so that when zone_key_create (and zone_create) is done, as well as
 553  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 554  * are completed.
 555  *
 556  * When new zones are created constructor callbacks for all registered ZSD
 557  * entries will be called. That also uses the above two phases of marking
 558  * what needs to be done, and then running the callbacks without holding
 559  * any locks.
 560  *
 561  * The framework does not provide any locking around zone_getspecific() and
 562  * zone_setspecific() apart from that needed for internal consistency, so
 563  * callers interested in atomic "test-and-set" semantics will need to provide
 564  * their own locking.
 565  */
 566
 567 /*
 568  * Helper function to find the zsd_entry associated with the key in the
 569  * given list.
 570  */
 571 static struct zsd_entry *
 572 zsd_find(list_t *l, zone_key_t key)
 573 {
 574         struct zsd_entry *zsd;
 575
 576         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 577                 if (zsd->zsd_key == key) {
 578                         return (zsd);
 579                 }
 580         }
 581         return (NULL);
 582 }
 583
 584 /*
 585  * Helper function to find the zsd_entry associated with the key in the
 586  * given list. Move it to the front of the list.
 587  */
 588 static struct zsd_entry *
 589 zsd_find_mru(list_t *l, zone_key_t key)
 590 {
 591         struct zsd_entry *zsd;
 592
 593         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 594                 if (zsd->zsd_key == key) {
 595                         /*
 596                          * Move to head of list to keep list in MRU order.
 597                          */
 598                         if (zsd != list_head(l)) {
 599                                 list_remove(l, zsd);
 600                                 list_insert_head(l, zsd);
 601                         }
 602                         return (zsd);
 603                 }
 604         }
 605         return (NULL);
 606 }
 607
 608 void
 609 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 610     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 611 {
 612         struct zsd_entry *zsdp;
 613         struct zsd_entry *t;
 614         struct zone *zone;
 615         zone_key_t  key;
 616
 617         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 618         zsdp->zsd_data = NULL;
 619         zsdp->zsd_create = create;
 620         zsdp->zsd_shutdown = shutdown;
 621         zsdp->zsd_destroy = destroy;
 622
 623         /*
 624          * Insert in global list of callbacks. Makes future zone creations
 625          * see it.
 626          */
 627         mutex_enter(&zsd_key_lock);
 628         key = zsdp->zsd_key = ++zsd_keyval;
 629         ASSERT(zsd_keyval != 0);
 630         list_insert_tail(&zsd_registered_keys, zsdp);
 631         mutex_exit(&zsd_key_lock);
 632
 633         /*
 634          * Insert for all existing zones and mark them as needing
 635          * a create callback.
 636          */
 637         mutex_enter(&zonehash_lock);    /* stop the world */
 638         for (zone = list_head(&zone_active); zone != NULL;
 639             zone = list_next(&zone_active, zone)) {
 640                 zone_status_t status;
 641
 642                 mutex_enter(&zone->zone_lock);
 643
 644                 /* Skip zones that are on the way down or not yet up */
 645                 status = zone_status_get(zone);
 646                 if (status >= ZONE_IS_DOWN ||
 647                     status == ZONE_IS_UNINITIALIZED) {
 648                         mutex_exit(&zone->zone_lock);
 649                         continue;
 650                 }
 651
 652                 t = zsd_find_mru(&zone->zone_zsd, key);
 653                 if (t != NULL) {
 654                         /*
 655                          * A zsd_configure already inserted it after
 656                          * we dropped zsd_key_lock above.
 657                          */
 658                         mutex_exit(&zone->zone_lock);
 659                         continue;
 660                 }
 661                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 662                 t->zsd_key = key;
 663                 t->zsd_create = create;
 664                 t->zsd_shutdown = shutdown;
 665                 t->zsd_destroy = destroy;
 666                 if (create != NULL) {
 667                         t->zsd_flags = ZSD_CREATE_NEEDED;
 668                         DTRACE_PROBE2(zsd__create__needed,
 669                             zone_t *, zone, zone_key_t, key);
 670                 }
 671                 list_insert_tail(&zone->zone_zsd, t);
 672                 mutex_exit(&zone->zone_lock);
 673         }
 674         mutex_exit(&zonehash_lock);
 675
 676         if (create != NULL) {
 677                 /* Now call the create callback for this key */
 678                 zsd_apply_all_zones(zsd_apply_create, key);
 679         }
 680         /*
 681          * It is safe for consumers to use the key now, make it
 682          * globally visible. Specifically zone_getspecific() will
 683          * always successfully return the zone specific data associated
 684          * with the key.
 685          */
 686         *keyp = key;
 687
 688 }
 689
 690 /*
 691  * Function called when a module is being unloaded, or otherwise wishes
 692  * to unregister its ZSD key and callbacks.
 693  *
 694  * Remove from the global list and determine the functions that need to
 695  * be called under a global lock. Then call the functions without
 696  * holding any locks. Finally free up the zone_zsd entries. (The apply
 697  * functions need to access the zone_zsd entries to find zsd_data etc.)
 698  */
 699 int
 700 zone_key_delete(zone_key_t key)
 701 {
 702         struct zsd_entry *zsdp = NULL;
 703         zone_t *zone;
 704
 705         mutex_enter(&zsd_key_lock);
 706         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 707         if (zsdp == NULL) {
 708                 mutex_exit(&zsd_key_lock);
 709                 return (-1);
 710         }
 711         list_remove(&zsd_registered_keys, zsdp);
 712         mutex_exit(&zsd_key_lock);
 713
 714         mutex_enter(&zonehash_lock);
 715         for (zone = list_head(&zone_active); zone != NULL;
 716             zone = list_next(&zone_active, zone)) {
 717                 struct zsd_entry *del;
 718
 719                 mutex_enter(&zone->zone_lock);
 720                 del = zsd_find_mru(&zone->zone_zsd, key);
 721                 if (del == NULL) {
 722                         /*
 723                          * Somebody else got here first e.g the zone going
 724                          * away.
 725                          */
 726                         mutex_exit(&zone->zone_lock);
 727                         continue;
 728                 }
 729                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 730                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 731                 if (del->zsd_shutdown != NULL &&
 732                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 733                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 734                         DTRACE_PROBE2(zsd__shutdown__needed,
 735                             zone_t *, zone, zone_key_t, key);
 736                 }
 737                 if (del->zsd_destroy != NULL &&
 738                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 739                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 740                         DTRACE_PROBE2(zsd__destroy__needed,
 741                             zone_t *, zone, zone_key_t, key);
 742                 }
 743                 mutex_exit(&zone->zone_lock);
 744         }
 745         mutex_exit(&zonehash_lock);
 746         kmem_free(zsdp, sizeof (*zsdp));
 747
 748         /* Now call the shutdown and destroy callback for this key */
 749         zsd_apply_all_zones(zsd_apply_shutdown, key);
 750         zsd_apply_all_zones(zsd_apply_destroy, key);
 751
 752         /* Now we can free up the zsdp structures in each zone */
 753         mutex_enter(&zonehash_lock);
 754         for (zone = list_head(&zone_active); zone != NULL;
 755             zone = list_next(&zone_active, zone)) {
 756                 struct zsd_entry *del;
 757
 758                 mutex_enter(&zone->zone_lock);
 759                 del = zsd_find(&zone->zone_zsd, key);
 760                 if (del != NULL) {
 761                         list_remove(&zone->zone_zsd, del);
 762                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 763                         kmem_free(del, sizeof (*del));
 764                 }
 765                 mutex_exit(&zone->zone_lock);
 766         }
 767         mutex_exit(&zonehash_lock);
 768
 769         return (0);
 770 }
 771
 772 /*
 773  * ZSD counterpart of pthread_setspecific().
 774  *
 775  * Since all zsd callbacks, including those with no create function,
 776  * have an entry in zone_zsd, if the key is registered it is part of
 777  * the zone_zsd list.
 778  * Return an error if the key wasn't registerd.
 779  */
 780 int
 781 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 782 {
 783         struct zsd_entry *t;
 784
 785         mutex_enter(&zone->zone_lock);
 786         t = zsd_find_mru(&zone->zone_zsd, key);
 787         if (t != NULL) {
 788                 /*
 789                  * Replace old value with new
 790                  */
 791                 t->zsd_data = (void *)data;
 792                 mutex_exit(&zone->zone_lock);
 793                 return (0);
 794         }
 795         mutex_exit(&zone->zone_lock);
 796         return (-1);
 797 }
 798
 799 /*
 800  * ZSD counterpart of pthread_getspecific().
 801  */
 802 void *
 803 zone_getspecific(zone_key_t key, zone_t *zone)
 804 {
 805         struct zsd_entry *t;
 806         void *data;
 807
 808         mutex_enter(&zone->zone_lock);
 809         t = zsd_find_mru(&zone->zone_zsd, key);
 810         data = (t == NULL ? NULL : t->zsd_data);
 811         mutex_exit(&zone->zone_lock);
 812         return (data);
 813 }
 814
 815 /*
 816  * Function used to initialize a zone's list of ZSD callbacks and data
 817  * when the zone is being created.  The callbacks are initialized from
 818  * the template list (zsd_registered_keys). The constructor callback is
 819  * executed later (once the zone exists and with locks dropped).
 820  */
 821 static void
 822 zone_zsd_configure(zone_t *zone)
 823 {
 824         struct zsd_entry *zsdp;
 825         struct zsd_entry *t;
 826
 827         ASSERT(MUTEX_HELD(&zonehash_lock));
 828         ASSERT(list_head(&zone->zone_zsd) == NULL);
 829         mutex_enter(&zone->zone_lock);
 830         mutex_enter(&zsd_key_lock);
 831         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 832             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 833                 /*
 834                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 835                  * should not have added anything to it.
 836                  */
 837                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 838
 839                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 840                 t->zsd_key = zsdp->zsd_key;
 841                 t->zsd_create = zsdp->zsd_create;
 842                 t->zsd_shutdown = zsdp->zsd_shutdown;
 843                 t->zsd_destroy = zsdp->zsd_destroy;
 844                 if (zsdp->zsd_create != NULL) {
 845                         t->zsd_flags = ZSD_CREATE_NEEDED;
 846                         DTRACE_PROBE2(zsd__create__needed,
 847                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 848                 }
 849                 list_insert_tail(&zone->zone_zsd, t);
 850         }
 851         mutex_exit(&zsd_key_lock);
 852         mutex_exit(&zone->zone_lock);
 853 }
 854
 855 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 856
 857 /*
 858  * Helper function to execute shutdown or destructor callbacks.
 859  */
 860 static void
 861 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 862 {
 863         struct zsd_entry *t;
 864
 865         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 866         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 867         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 868
 869         /*
 870          * Run the callback solely based on what is registered for the zone
 871          * in zone_zsd. The global list can change independently of this
 872          * as keys are registered and unregistered and we don't register new
 873          * callbacks for a zone that is in the process of going away.
 874          */
 875         mutex_enter(&zone->zone_lock);
 876         for (t = list_head(&zone->zone_zsd); t != NULL;
 877             t = list_next(&zone->zone_zsd, t)) {
 878                 zone_key_t key = t->zsd_key;
 879
 880                 /* Skip if no callbacks registered */
 881
 882                 if (ct == ZSD_SHUTDOWN) {
 883                         if (t->zsd_shutdown != NULL &&
 884                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 885                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 886                                 DTRACE_PROBE2(zsd__shutdown__needed,
 887                                     zone_t *, zone, zone_key_t, key);
 888                         }
 889                 } else {
 890                         if (t->zsd_destroy != NULL &&
 891                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 892                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 893                                 DTRACE_PROBE2(zsd__destroy__needed,
 894                                     zone_t *, zone, zone_key_t, key);
 895                         }
 896                 }
 897         }
 898         mutex_exit(&zone->zone_lock);
 899
 900         /* Now call the shutdown and destroy callback for this key */
 901         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 902         zsd_apply_all_keys(zsd_apply_destroy, zone);
 903
 904 }
 905
 906 /*
 907  * Called when the zone is going away; free ZSD-related memory, and
 908  * destroy the zone_zsd list.
 909  */
 910 static void
 911 zone_free_zsd(zone_t *zone)
 912 {
 913         struct zsd_entry *t, *next;
 914
 915         /*
 916          * Free all the zsd_entry's we had on this zone.
 917          */
 918         mutex_enter(&zone->zone_lock);
 919         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 920                 next = list_next(&zone->zone_zsd, t);
 921                 list_remove(&zone->zone_zsd, t);
 922                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 923                 kmem_free(t, sizeof (*t));
 924         }
 925         list_destroy(&zone->zone_zsd);
 926         mutex_exit(&zone->zone_lock);
 927
 928 }
 929
 930 /*
 931  * Apply a function to all zones for particular key value.
 932  *
 933  * The applyfn has to drop zonehash_lock if it does some work, and
 934  * then reacquire it before it returns.
 935  * When the lock is dropped we don't follow list_next even
 936  * if it is possible to do so without any hazards. This is
 937  * because we want the design to allow for the list of zones
 938  * to change in any arbitrary way during the time the
 939  * lock was dropped.
 940  *
 941  * It is safe to restart the loop at list_head since the applyfn
 942  * changes the zsd_flags as it does work, so a subsequent
 943  * pass through will have no effect in applyfn, hence the loop will terminate
 944  * in at worst O(N^2).
 945  */
 946 static void
 947 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 948 {
 949         zone_t *zone;
 950
 951         mutex_enter(&zonehash_lock);
 952         zone = list_head(&zone_active);
 953         while (zone != NULL) {
 954                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 955                         /* Lock dropped - restart at head */
 956                         zone = list_head(&zone_active);
 957                 } else {
 958                         zone = list_next(&zone_active, zone);
 959                 }
 960         }
 961         mutex_exit(&zonehash_lock);
 962 }
 963
 964 /*
 965  * Apply a function to all keys for a particular zone.
 966  *
 967  * The applyfn has to drop zonehash_lock if it does some work, and
 968  * then reacquire it before it returns.
 969  * When the lock is dropped we don't follow list_next even
 970  * if it is possible to do so without any hazards. This is
 971  * because we want the design to allow for the list of zsd callbacks
 972  * to change in any arbitrary way during the time the
 973  * lock was dropped.
 974  *
 975  * It is safe to restart the loop at list_head since the applyfn
 976  * changes the zsd_flags as it does work, so a subsequent
 977  * pass through will have no effect in applyfn, hence the loop will terminate
 978  * in at worst O(N^2).
 979  */
 980 static void
 981 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 982 {
 983         struct zsd_entry *t;
 984
 985         mutex_enter(&zone->zone_lock);
 986         t = list_head(&zone->zone_zsd);
 987         while (t != NULL) {
 988                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 989                         /* Lock dropped - restart at head */
 990                         t = list_head(&zone->zone_zsd);
 991                 } else {
 992                         t = list_next(&zone->zone_zsd, t);
 993                 }
 994         }
 995         mutex_exit(&zone->zone_lock);
 996 }
 997
 998 /*
 999  * Call the create function for the zone and key if CREATE_NEEDED
1000  * is set.
1001  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1002  * we wait for that thread to complete so that we can ensure that
1003  * all the callbacks are done when we've looped over all zones/keys.
1004  *
1005  * When we call the create function, we drop the global held by the
1006  * caller, and return true to tell the caller it needs to re-evalute the
1007  * state.
1008  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1009  * remains held on exit.
1010  */
1011 static boolean_t
1012 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1013     zone_t *zone, zone_key_t key)
1014 {
1015         void *result;
1016         struct zsd_entry *t;
1017         boolean_t dropped;
1018
1019         if (lockp != NULL) {
1020                 ASSERT(MUTEX_HELD(lockp));
1021         }
1022         if (zone_lock_held) {
1023                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1024         } else {
1025                 mutex_enter(&zone->zone_lock);
1026         }
1027
1028         t = zsd_find(&zone->zone_zsd, key);
1029         if (t == NULL) {
1030                 /*
1031                  * Somebody else got here first e.g the zone going
1032                  * away.
1033                  */
1034                 if (!zone_lock_held)
1035                         mutex_exit(&zone->zone_lock);
1036                 return (B_FALSE);
1037         }
1038         dropped = B_FALSE;
1039         if (zsd_wait_for_inprogress(zone, t, lockp))
1040                 dropped = B_TRUE;
1041
1042         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1043                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1044                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1045                 DTRACE_PROBE2(zsd__create__inprogress,
1046                     zone_t *, zone, zone_key_t, key);
1047                 mutex_exit(&zone->zone_lock);
1048                 if (lockp != NULL)
1049                         mutex_exit(lockp);
1050
1051                 dropped = B_TRUE;
1052                 ASSERT(t->zsd_create != NULL);
1053                 DTRACE_PROBE2(zsd__create__start,
1054                     zone_t *, zone, zone_key_t, key);
1055
1056                 result = (*t->zsd_create)(zone->zone_id);
1057
1058                 DTRACE_PROBE2(zsd__create__end,
1059                     zone_t *, zone, voidn *, result);
1060
1061                 ASSERT(result != NULL);
1062                 if (lockp != NULL)
1063                         mutex_enter(lockp);
1064                 mutex_enter(&zone->zone_lock);
1065                 t->zsd_data = result;
1066                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1067                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1068                 cv_broadcast(&t->zsd_cv);
1069                 DTRACE_PROBE2(zsd__create__completed,
1070                     zone_t *, zone, zone_key_t, key);
1071         }
1072         if (!zone_lock_held)
1073                 mutex_exit(&zone->zone_lock);
1074         return (dropped);
1075 }
1076
1077 /*
1078  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1079  * is set.
1080  * If some other thread gets here first and sets *_INPROGRESS, then
1081  * we wait for that thread to complete so that we can ensure that
1082  * all the callbacks are done when we've looped over all zones/keys.
1083  *
1084  * When we call the shutdown function, we drop the global held by the
1085  * caller, and return true to tell the caller it needs to re-evalute the
1086  * state.
1087  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1088  * remains held on exit.
1089  */
1090 static boolean_t
1091 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1092     zone_t *zone, zone_key_t key)
1093 {
1094         struct zsd_entry *t;
1095         void *data;
1096         boolean_t dropped;
1097
1098         if (lockp != NULL) {
1099                 ASSERT(MUTEX_HELD(lockp));
1100         }
1101         if (zone_lock_held) {
1102                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1103         } else {
1104                 mutex_enter(&zone->zone_lock);
1105         }
1106
1107         t = zsd_find(&zone->zone_zsd, key);
1108         if (t == NULL) {
1109                 /*
1110                  * Somebody else got here first e.g the zone going
1111                  * away.
1112                  */
1113                 if (!zone_lock_held)
1114                         mutex_exit(&zone->zone_lock);
1115                 return (B_FALSE);
1116         }
1117         dropped = B_FALSE;
1118         if (zsd_wait_for_creator(zone, t, lockp))
1119                 dropped = B_TRUE;
1120
1121         if (zsd_wait_for_inprogress(zone, t, lockp))
1122                 dropped = B_TRUE;
1123
1124         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1125                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1126                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1127                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1128                     zone_t *, zone, zone_key_t, key);
1129                 mutex_exit(&zone->zone_lock);
1130                 if (lockp != NULL)
1131                         mutex_exit(lockp);
1132                 dropped = B_TRUE;
1133
1134                 ASSERT(t->zsd_shutdown != NULL);
1135                 data = t->zsd_data;
1136
1137                 DTRACE_PROBE2(zsd__shutdown__start,
1138                     zone_t *, zone, zone_key_t, key);
1139
1140                 (t->zsd_shutdown)(zone->zone_id, data);
1141                 DTRACE_PROBE2(zsd__shutdown__end,
1142                     zone_t *, zone, zone_key_t, key);
1143
1144                 if (lockp != NULL)
1145                         mutex_enter(lockp);
1146                 mutex_enter(&zone->zone_lock);
1147                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1148                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1149                 cv_broadcast(&t->zsd_cv);
1150                 DTRACE_PROBE2(zsd__shutdown__completed,
1151                     zone_t *, zone, zone_key_t, key);
1152         }
1153         if (!zone_lock_held)
1154                 mutex_exit(&zone->zone_lock);
1155         return (dropped);
1156 }
1157
1158 /*
1159  * Call the destroy function for the zone and key if DESTROY_NEEDED
1160  * is set.
1161  * If some other thread gets here first and sets *_INPROGRESS, then
1162  * we wait for that thread to complete so that we can ensure that
1163  * all the callbacks are done when we've looped over all zones/keys.
1164  *
1165  * When we call the destroy function, we drop the global held by the
1166  * caller, and return true to tell the caller it needs to re-evalute the
1167  * state.
1168  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1169  * remains held on exit.
1170  */
1171 static boolean_t
1172 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1173     zone_t *zone, zone_key_t key)
1174 {
1175         struct zsd_entry *t;
1176         void *data;
1177         boolean_t dropped;
1178
1179         if (lockp != NULL) {
1180                 ASSERT(MUTEX_HELD(lockp));
1181         }
1182         if (zone_lock_held) {
1183                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1184         } else {
1185                 mutex_enter(&zone->zone_lock);
1186         }
1187
1188         t = zsd_find(&zone->zone_zsd, key);
1189         if (t == NULL) {
1190                 /*
1191                  * Somebody else got here first e.g the zone going
1192                  * away.
1193                  */
1194                 if (!zone_lock_held)
1195                         mutex_exit(&zone->zone_lock);
1196                 return (B_FALSE);
1197         }
1198         dropped = B_FALSE;
1199         if (zsd_wait_for_creator(zone, t, lockp))
1200                 dropped = B_TRUE;
1201
1202         if (zsd_wait_for_inprogress(zone, t, lockp))
1203                 dropped = B_TRUE;
1204
1205         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1206                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1207                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1208                 DTRACE_PROBE2(zsd__destroy__inprogress,
1209                     zone_t *, zone, zone_key_t, key);
1210                 mutex_exit(&zone->zone_lock);
1211                 if (lockp != NULL)
1212                         mutex_exit(lockp);
1213                 dropped = B_TRUE;
1214
1215                 ASSERT(t->zsd_destroy != NULL);
1216                 data = t->zsd_data;
1217                 DTRACE_PROBE2(zsd__destroy__start,
1218                     zone_t *, zone, zone_key_t, key);
1219
1220                 (t->zsd_destroy)(zone->zone_id, data);
1221                 DTRACE_PROBE2(zsd__destroy__end,
1222                     zone_t *, zone, zone_key_t, key);
1223
1224                 if (lockp != NULL)
1225                         mutex_enter(lockp);
1226                 mutex_enter(&zone->zone_lock);
1227                 t->zsd_data = NULL;
1228                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1229                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1230                 cv_broadcast(&t->zsd_cv);
1231                 DTRACE_PROBE2(zsd__destroy__completed,
1232                     zone_t *, zone, zone_key_t, key);
1233         }
1234         if (!zone_lock_held)
1235                 mutex_exit(&zone->zone_lock);
1236         return (dropped);
1237 }
1238
1239 /*
1240  * Wait for any CREATE_NEEDED flag to be cleared.
1241  * Returns true if lockp was temporarily dropped while waiting.
1242  */
1243 static boolean_t
1244 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1245 {
1246         boolean_t dropped = B_FALSE;
1247
1248         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1249                 DTRACE_PROBE2(zsd__wait__for__creator,
1250                     zone_t *, zone, struct zsd_entry *, t);
1251                 if (lockp != NULL) {
1252                         dropped = B_TRUE;
1253                         mutex_exit(lockp);
1254                 }
1255                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1256                 if (lockp != NULL) {
1257                         /* First drop zone_lock to preserve order */
1258                         mutex_exit(&zone->zone_lock);
1259                         mutex_enter(lockp);
1260                         mutex_enter(&zone->zone_lock);
1261                 }
1262         }
1263         return (dropped);
1264 }
1265
1266 /*
1267  * Wait for any INPROGRESS flag to be cleared.
1268  * Returns true if lockp was temporarily dropped while waiting.
1269  */
1270 static boolean_t
1271 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1272 {
1273         boolean_t dropped = B_FALSE;
1274
1275         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1276                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1277                     zone_t *, zone, struct zsd_entry *, t);
1278                 if (lockp != NULL) {
1279                         dropped = B_TRUE;
1280                         mutex_exit(lockp);
1281                 }
1282                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1283                 if (lockp != NULL) {
1284                         /* First drop zone_lock to preserve order */
1285                         mutex_exit(&zone->zone_lock);
1286                         mutex_enter(lockp);
1287                         mutex_enter(&zone->zone_lock);
1288                 }
1289         }
1290         return (dropped);
1291 }
1292
1293 /*
1294  * Frees memory associated with the zone dataset list.
1295  */
1296 static void
1297 zone_free_datasets(zone_t *zone)
1298 {
1299         zone_dataset_t *t, *next;
1300
1301         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1302                 next = list_next(&zone->zone_datasets, t);
1303                 list_remove(&zone->zone_datasets, t);
1304                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1305                 kmem_free(t, sizeof (*t));
1306         }
1307         list_destroy(&zone->zone_datasets);
1308 }
1309
1310 /*
1311  * zone.cpu-shares resource control support.
1312  */
1313 /*ARGSUSED*/
1314 static rctl_qty_t
1315 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1316 {
1317         ASSERT(MUTEX_HELD(&p->p_lock));
1318         return (p->p_zone->zone_shares);
1319 }
1320
1321 /*ARGSUSED*/
1322 static int
1323 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1324     rctl_qty_t nv)
1325 {
1326         ASSERT(MUTEX_HELD(&p->p_lock));
1327         ASSERT(e->rcep_t == RCENTITY_ZONE);
1328         if (e->rcep_p.zone == NULL)
1329                 return (0);
1330
1331         e->rcep_p.zone->zone_shares = nv;
1332         return (0);
1333 }
1334
/*
 * Operations vector for the zone.cpu-shares resource control.  The
 * initializers are positional; judging by the handler names the slots are
 * presumably action, usage, set and test, in that order -- confirm
 * against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
1341
1342 /*
1343  * zone.cpu-cap resource control support.
1344  */
1345 /*ARGSUSED*/
1346 static rctl_qty_t
1347 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1348 {
1349         ASSERT(MUTEX_HELD(&p->p_lock));
1350         return (cpucaps_zone_get(p->p_zone));
1351 }
1352
1353 /*ARGSUSED*/
1354 static int
1355 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1356     rctl_qty_t nv)
1357 {
1358         zone_t *zone = e->rcep_p.zone;
1359
1360         ASSERT(MUTEX_HELD(&p->p_lock));
1361         ASSERT(e->rcep_t == RCENTITY_ZONE);
1362
1363         if (zone == NULL)
1364                 return (0);
1365
1366         /*
1367          * set cap to the new value.
1368          */
1369         return (cpucaps_zone_set(zone, nv));
1370 }
1371
/*
 * Operations vector for the zone.cpu-cap resource control.  The
 * initializers are positional; judging by the handler names the slots are
 * presumably action, usage, set and test, in that order -- confirm
 * against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};
1378
1379 /*ARGSUSED*/
1380 static rctl_qty_t
1381 zone_lwps_usage(rctl_t *r, proc_t *p)
1382 {
1383         rctl_qty_t nlwps;
1384         zone_t *zone = p->p_zone;
1385
1386         ASSERT(MUTEX_HELD(&p->p_lock));
1387
1388         mutex_enter(&zone->zone_nlwps_lock);
1389         nlwps = zone->zone_nlwps;
1390         mutex_exit(&zone->zone_nlwps_lock);
1391
1392         return (nlwps);
1393 }
1394
1395 /*ARGSUSED*/
1396 static int
1397 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1398     rctl_qty_t incr, uint_t flags)
1399 {
1400         rctl_qty_t nlwps;
1401
1402         ASSERT(MUTEX_HELD(&p->p_lock));
1403         ASSERT(e->rcep_t == RCENTITY_ZONE);
1404         if (e->rcep_p.zone == NULL)
1405                 return (0);
1406         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1407         nlwps = e->rcep_p.zone->zone_nlwps;
1408
1409         if (nlwps + incr > rcntl->rcv_value)
1410                 return (1);
1411
1412         return (0);
1413 }
1414
1415 /*ARGSUSED*/
1416 static int
1417 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1418 {
1419         ASSERT(MUTEX_HELD(&p->p_lock));
1420         ASSERT(e->rcep_t == RCENTITY_ZONE);
1421         if (e->rcep_p.zone == NULL)
1422                 return (0);
1423         e->rcep_p.zone->zone_nlwps_ctl = nv;
1424         return (0);
1425 }
1426
/*
 * Operations vector built from the zone_lwps_* handlers.  The
 * initializers are positional; judging by the handler names the slots are
 * presumably action, usage, set and test, in that order -- confirm
 * against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};
1433
1434 /*ARGSUSED*/
1435 static rctl_qty_t
1436 zone_procs_usage(rctl_t *r, proc_t *p)
1437 {
1438         rctl_qty_t nprocs;
1439         zone_t *zone = p->p_zone;
1440
1441         ASSERT(MUTEX_HELD(&p->p_lock));
1442
1443         mutex_enter(&zone->zone_nlwps_lock);
1444         nprocs = zone->zone_nprocs;
1445         mutex_exit(&zone->zone_nlwps_lock);
1446
1447         return (nprocs);
1448 }
1449
1450 /*ARGSUSED*/
1451 static int
1452 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1453     rctl_qty_t incr, uint_t flags)
1454 {
1455         rctl_qty_t nprocs;
1456
1457         ASSERT(MUTEX_HELD(&p->p_lock));
1458         ASSERT(e->rcep_t == RCENTITY_ZONE);
1459         if (e->rcep_p.zone == NULL)
1460                 return (0);
1461         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1462         nprocs = e->rcep_p.zone->zone_nprocs;
1463
1464         if (nprocs + incr > rcntl->rcv_value)
1465                 return (1);
1466
1467         return (0);
1468 }
1469
1470 /*ARGSUSED*/
1471 static int
1472 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1473 {
1474         ASSERT(MUTEX_HELD(&p->p_lock));
1475         ASSERT(e->rcep_t == RCENTITY_ZONE);
1476         if (e->rcep_p.zone == NULL)
1477                 return (0);
1478         e->rcep_p.zone->zone_nprocs_ctl = nv;
1479         return (0);
1480 }
1481
/*
 * Operations vector built from the zone_procs_* handlers.  The
 * initializers are positional; judging by the handler names the slots are
 * presumably action, usage, set and test, in that order -- confirm
 * against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_procs_ops = {
        rcop_no_action,
        zone_procs_usage,
        zone_procs_set,
        zone_procs_test,
};
1488
1489 /*ARGSUSED*/
1490 static rctl_qty_t
1491 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1492 {
1493         ASSERT(MUTEX_HELD(&p->p_lock));
1494         return (p->p_zone->zone_shmmax);
1495 }
1496
1497 /*ARGSUSED*/
1498 static int
1499 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1500     rctl_qty_t incr, uint_t flags)
1501 {
1502         rctl_qty_t v;
1503         ASSERT(MUTEX_HELD(&p->p_lock));
1504         ASSERT(e->rcep_t == RCENTITY_ZONE);
1505         v = e->rcep_p.zone->zone_shmmax + incr;
1506         if (v > rval->rcv_value)
1507                 return (1);
1508         return (0);
1509 }
1510
/*
 * Operations vector built from the zone_shmmax_* handlers; no set handler
 * is supplied (rcop_no_set).  The initializers are positional; judging by
 * the handler names the slots are presumably action, usage, set and test,
 * in that order -- confirm against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_shmmax_ops = {
        rcop_no_action,
        zone_shmmax_usage,
        rcop_no_set,
        zone_shmmax_test
};
1517
1518 /*ARGSUSED*/
1519 static rctl_qty_t
1520 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1521 {
1522         ASSERT(MUTEX_HELD(&p->p_lock));
1523         return (p->p_zone->zone_ipc.ipcq_shmmni);
1524 }
1525
1526 /*ARGSUSED*/
1527 static int
1528 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1529     rctl_qty_t incr, uint_t flags)
1530 {
1531         rctl_qty_t v;
1532         ASSERT(MUTEX_HELD(&p->p_lock));
1533         ASSERT(e->rcep_t == RCENTITY_ZONE);
1534         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1535         if (v > rval->rcv_value)
1536                 return (1);
1537         return (0);
1538 }
1539
/*
 * Operations vector built from the zone_shmmni_* handlers; no set handler
 * is supplied (rcop_no_set).  The initializers are positional; judging by
 * the handler names the slots are presumably action, usage, set and test,
 * in that order -- confirm against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_shmmni_ops = {
        rcop_no_action,
        zone_shmmni_usage,
        rcop_no_set,
        zone_shmmni_test
};
1546
1547 /*ARGSUSED*/
1548 static rctl_qty_t
1549 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1550 {
1551         ASSERT(MUTEX_HELD(&p->p_lock));
1552         return (p->p_zone->zone_ipc.ipcq_semmni);
1553 }
1554
1555 /*ARGSUSED*/
1556 static int
1557 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1558     rctl_qty_t incr, uint_t flags)
1559 {
1560         rctl_qty_t v;
1561         ASSERT(MUTEX_HELD(&p->p_lock));
1562         ASSERT(e->rcep_t == RCENTITY_ZONE);
1563         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1564         if (v > rval->rcv_value)
1565                 return (1);
1566         return (0);
1567 }
1568
/*
 * Operations vector built from the zone_semmni_* handlers; no set handler
 * is supplied (rcop_no_set).  The initializers are positional; judging by
 * the handler names the slots are presumably action, usage, set and test,
 * in that order -- confirm against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_semmni_ops = {
        rcop_no_action,
        zone_semmni_usage,
        rcop_no_set,
        zone_semmni_test
};
1575
1576 /*ARGSUSED*/
1577 static rctl_qty_t
1578 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1579 {
1580         ASSERT(MUTEX_HELD(&p->p_lock));
1581         return (p->p_zone->zone_ipc.ipcq_msgmni);
1582 }
1583
1584 /*ARGSUSED*/
1585 static int
1586 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1587     rctl_qty_t incr, uint_t flags)
1588 {
1589         rctl_qty_t v;
1590         ASSERT(MUTEX_HELD(&p->p_lock));
1591         ASSERT(e->rcep_t == RCENTITY_ZONE);
1592         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1593         if (v > rval->rcv_value)
1594                 return (1);
1595         return (0);
1596 }
1597
/*
 * Operations vector built from the zone_msgmni_* handlers; no set handler
 * is supplied (rcop_no_set).  The initializers are positional; judging by
 * the handler names the slots are presumably action, usage, set and test,
 * in that order -- confirm against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_msgmni_ops = {
        rcop_no_action,
        zone_msgmni_usage,
        rcop_no_set,
        zone_msgmni_test
};
1604
1605 /*ARGSUSED*/
1606 static rctl_qty_t
1607 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1608 {
1609         rctl_qty_t q;
1610         ASSERT(MUTEX_HELD(&p->p_lock));
1611         mutex_enter(&p->p_zone->zone_mem_lock);
1612         q = p->p_zone->zone_locked_mem;
1613         mutex_exit(&p->p_zone->zone_mem_lock);
1614         return (q);
1615 }
1616
1617 /*ARGSUSED*/
1618 static int
1619 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1620     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1621 {
1622         rctl_qty_t q;
1623         zone_t *z;
1624
1625         z = e->rcep_p.zone;
1626         ASSERT(MUTEX_HELD(&p->p_lock));
1627         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1628         q = z->zone_locked_mem;
1629         if (q + incr > rcntl->rcv_value)
1630                 return (1);
1631         return (0);
1632 }
1633
1634 /*ARGSUSED*/
1635 static int
1636 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1637     rctl_qty_t nv)
1638 {
1639         ASSERT(MUTEX_HELD(&p->p_lock));
1640         ASSERT(e->rcep_t == RCENTITY_ZONE);
1641         if (e->rcep_p.zone == NULL)
1642                 return (0);
1643         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1644         return (0);
1645 }
1646
/*
 * Operations vector built from the zone_locked_mem_* handlers.  The
 * initializers are positional; judging by the handler names the slots are
 * presumably action, usage, set and test, in that order -- confirm
 * against the rctl_ops_t declaration.
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,
        zone_locked_mem_set,
        zone_locked_mem_test
};
1653
1654 /*ARGSUSED*/
1655 static rctl_qty_t
1656 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1657 {
1658         rctl_qty_t q;
1659         zone_t *z = p->p_zone;
1660
1661         ASSERT(MUTEX_HELD(&p->p_lock));
1662         mutex_enter(&z->zone_mem_lock);
1663         q = z->zone_max_swap;
1664         mutex_exit(&z->zone_mem_lock);
1665         return (q);
1666 }
1667
1668 /*ARGSUSED*/
1669 static int
1670 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1671     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1672 {
1673         rctl_qty_t q;
1674         zone_t *z;
1675
1676         z = e->rcep_p.zone;
1677         ASSERT(MUTEX_HELD(&p->p_lock));
1678         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1679         q = z->zone_max_swap;
1680         if (q + incr > rcntl->rcv_value)
1681                 return (1);
1682         return (0);
1683 }
1684
1685 /*ARGSUSED*/
1686 static int
1687 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1688     rctl_qty_t nv)
1689 {
1690         ASSERT(MUTEX_HELD(&p->p_lock));
1691         ASSERT(e->rcep_t == RCENTITY_ZONE);
1692         if (e->rcep_p.zone == NULL)
1693                 return (0);
1694         e->rcep_p.zone->zone_max_swap_ctl = nv;
1695         return (0);
1696 }
1697
/*
 * Operations vector for the zone swap resource control; it is passed to
 * rctl_register() for "zone.max-swap" in zone_init().  Usage, set and test
 * are zone-specific; the first slot uses the generic rcop_no_action entry.
 * NOTE(review): slot order is presumably action, usage, set, test --
 * confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
1704
1705 /*ARGSUSED*/
1706 static rctl_qty_t
1707 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1708 {
1709         rctl_qty_t q;
1710         zone_t *z = p->p_zone;
1711
1712         ASSERT(MUTEX_HELD(&p->p_lock));
1713         mutex_enter(&z->zone_rctl_lock);
1714         q = z->zone_max_lofi;
1715         mutex_exit(&z->zone_rctl_lock);
1716         return (q);
1717 }
1718
1719 /*ARGSUSED*/
1720 static int
1721 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1722     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1723 {
1724         rctl_qty_t q;
1725         zone_t *z;
1726
1727         z = e->rcep_p.zone;
1728         ASSERT(MUTEX_HELD(&p->p_lock));
1729         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1730         q = z->zone_max_lofi;
1731         if (q + incr > rcntl->rcv_value)
1732                 return (1);
1733         return (0);
1734 }
1735
1736 /*ARGSUSED*/
1737 static int
1738 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1739     rctl_qty_t nv)
1740 {
1741         ASSERT(MUTEX_HELD(&p->p_lock));
1742         ASSERT(e->rcep_t == RCENTITY_ZONE);
1743         if (e->rcep_p.zone == NULL)
1744                 return (0);
1745         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1746         return (0);
1747 }
1748
/*
 * Operations vector for the zone lofi resource control; it is passed to
 * rctl_register() for "zone.max-lofi" in zone_init().  Usage, set and test
 * are zone-specific; the first slot uses the generic rcop_no_action entry.
 * NOTE(review): slot order is presumably action, usage, set, test --
 * confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_max_lofi_ops = {
	rcop_no_action,
	zone_max_lofi_usage,
	zone_max_lofi_set,
	zone_max_lofi_test
};
1755
1756 /*
1757  * Helper function to brand the zone with a unique ID.
1758  */
1759 static void
1760 zone_uniqid(zone_t *zone)
1761 {
1762         static uint64_t uniqid = 0;
1763
1764         ASSERT(MUTEX_HELD(&zonehash_lock));
1765         zone->zone_uniqid = uniqid++;
1766 }
1767
1768 /*
1769  * Returns a held pointer to the "kcred" for the specified zone.
1770  */
1771 struct cred *
1772 zone_get_kcred(zoneid_t zoneid)
1773 {
1774         zone_t *zone;
1775         cred_t *cr;
1776
1777         if ((zone = zone_find_by_id(zoneid)) == NULL)
1778                 return (NULL);
1779         cr = zone->zone_kcred;
1780         crhold(cr);
1781         zone_rele(zone);
1782         return (cr);
1783 }
1784
1785 static int
1786 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1787 {
1788         zone_t *zone = ksp->ks_private;
1789         zone_kstat_t *zk = ksp->ks_data;
1790
1791         if (rw == KSTAT_WRITE)
1792                 return (EACCES);
1793
1794         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1795         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1796         return (0);
1797 }
1798
1799 static int
1800 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1801 {
1802         zone_t *zone = ksp->ks_private;
1803         zone_kstat_t *zk = ksp->ks_data;
1804
1805         if (rw == KSTAT_WRITE)
1806                 return (EACCES);
1807
1808         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1809         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1810         return (0);
1811 }
1812
1813 static int
1814 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1815 {
1816         zone_t *zone = ksp->ks_private;
1817         zone_kstat_t *zk = ksp->ks_data;
1818
1819         if (rw == KSTAT_WRITE)
1820                 return (EACCES);
1821
1822         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1823         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1824         return (0);
1825 }
1826
1827 static kstat_t *
1828 zone_kstat_create_common(zone_t *zone, char *name,
1829     int (*updatefunc) (kstat_t *, int))
1830 {
1831         kstat_t *ksp;
1832         zone_kstat_t *zk;
1833
1834         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1835             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1836             KSTAT_FLAG_VIRTUAL);
1837
1838         if (ksp == NULL)
1839                 return (NULL);
1840
1841         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1842         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1843         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1844         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1845         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1846         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1847         ksp->ks_update = updatefunc;
1848         ksp->ks_private = zone;
1849         kstat_install(ksp);
1850         return (ksp);
1851 }
1852
1853
1854 static int
1855 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1856 {
1857         zone_t *zone = ksp->ks_private;
1858         zone_mcap_kstat_t *zmp = ksp->ks_data;
1859
1860         if (rw == KSTAT_WRITE)
1861                 return (EACCES);
1862
1863         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1864         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1865         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1866         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1867         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1868
1869         return (0);
1870 }
1871
1872 static kstat_t *
1873 zone_mcap_kstat_create(zone_t *zone)
1874 {
1875         kstat_t *ksp;
1876         zone_mcap_kstat_t *zmp;
1877
1878         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1879             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1880             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1881             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1882                 return (NULL);
1883
1884         if (zone->zone_id != GLOBAL_ZONEID)
1885                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1886
1887         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1888         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1889         ksp->ks_lock = &zone->zone_mcap_lock;
1890         zone->zone_mcap_stats = zmp;
1891
1892         /* The kstat "name" field is not large enough for a full zonename */
1893         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1894         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1895         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1896         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1900             KSTAT_DATA_UINT64);
1901
1902         ksp->ks_update = zone_mcap_kstat_update;
1903         ksp->ks_private = zone;
1904
1905         kstat_install(ksp);
1906         return (ksp);
1907 }
1908
1909 static int
1910 zone_misc_kstat_update(kstat_t *ksp, int rw)
1911 {
1912         zone_t *zone = ksp->ks_private;
1913         zone_misc_kstat_t *zmp = ksp->ks_data;
1914         hrtime_t tmp;
1915
1916         if (rw == KSTAT_WRITE)
1917                 return (EACCES);
1918
1919         tmp = zone->zone_utime;
1920         scalehrtime(&tmp);
1921         zmp->zm_utime.value.ui64 = tmp;
1922         tmp = zone->zone_stime;
1923         scalehrtime(&tmp);
1924         zmp->zm_stime.value.ui64 = tmp;
1925         tmp = zone->zone_wtime;
1926         scalehrtime(&tmp);
1927         zmp->zm_wtime.value.ui64 = tmp;
1928
1929         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1930         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1931         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1932
1933         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1934         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1935         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1936         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1937
1938         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1939
1940         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1941         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1942
1943         return (0);
1944 }
1945
1946 static kstat_t *
1947 zone_misc_kstat_create(zone_t *zone)
1948 {
1949         kstat_t *ksp;
1950         zone_misc_kstat_t *zmp;
1951
1952         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1953             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1954             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1955             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1956                 return (NULL);
1957
1958         if (zone->zone_id != GLOBAL_ZONEID)
1959                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1960
1961         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1962         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1963         ksp->ks_lock = &zone->zone_misc_lock;
1964         zone->zone_misc_stats = zmp;
1965
1966         /* The kstat "name" field is not large enough for a full zonename */
1967         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1968         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1969         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1970         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1971         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1972         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1973         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1974         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1975             KSTAT_DATA_UINT32);
1976         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1977         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1978             KSTAT_DATA_UINT32);
1979         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1982             KSTAT_DATA_UINT32);
1983         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1985
1986         ksp->ks_update = zone_misc_kstat_update;
1987         ksp->ks_private = zone;
1988
1989         kstat_install(ksp);
1990         return (ksp);
1991 }
1992
1993 static void
1994 zone_kstat_create(zone_t *zone)
1995 {
1996         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1997             "lockedmem", zone_lockedmem_kstat_update);
1998         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1999             "swapresv", zone_swapresv_kstat_update);
2000         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2001             "nprocs", zone_nprocs_kstat_update);
2002
2003         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2004                 zone->zone_mcap_stats = kmem_zalloc(
2005                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2006         }
2007
2008         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2009                 zone->zone_misc_stats = kmem_zalloc(
2010                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2011         }
2012 }
2013
2014 static void
2015 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2016 {
2017         void *data;
2018
2019         if (*pkstat != NULL) {
2020                 data = (*pkstat)->ks_data;
2021                 kstat_delete(*pkstat);
2022                 kmem_free(data, datasz);
2023                 *pkstat = NULL;
2024         }
2025 }
2026
2027 static void
2028 zone_kstat_delete(zone_t *zone)
2029 {
2030         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2031             sizeof (zone_kstat_t));
2032         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2033             sizeof (zone_kstat_t));
2034         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2035             sizeof (zone_kstat_t));
2036         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2037             sizeof (zone_mcap_kstat_t));
2038         zone_kstat_delete_common(&zone->zone_misc_ksp,
2039             sizeof (zone_misc_kstat_t));
2040 }
2041
2042 /*
2043  * Called very early on in boot to initialize the ZSD list so that
2044  * zone_key_create() can be called before zone_init().  It also initializes
2045  * portions of zone0 which may be used before zone_init() is called.  The
2046  * variable "global_zone" will be set when zone0 is fully initialized by
2047  * zone_init().
2048  */
2049 void
2050 zone_zsd_init(void)
2051 {
2052         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2053         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2054         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2055             offsetof(struct zsd_entry, zsd_linkage));
2056         list_create(&zone_active, sizeof (zone_t),
2057             offsetof(zone_t, zone_linkage));
2058         list_create(&zone_deathrow, sizeof (zone_t),
2059             offsetof(zone_t, zone_linkage));
2060
2061         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2062         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2063         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2064         zone0.zone_shares = 1;
2065         zone0.zone_nlwps = 0;
2066         zone0.zone_nlwps_ctl = INT_MAX;
2067         zone0.zone_nprocs = 0;
2068         zone0.zone_nprocs_ctl = INT_MAX;
2069         zone0.zone_locked_mem = 0;
2070         zone0.zone_locked_mem_ctl = UINT64_MAX;
2071         ASSERT(zone0.zone_max_swap == 0);
2072         zone0.zone_max_swap_ctl = UINT64_MAX;
2073         zone0.zone_max_lofi = 0;
2074         zone0.zone_max_lofi_ctl = UINT64_MAX;
2075         zone0.zone_shmmax = 0;
2076         zone0.zone_ipc.ipcq_shmmni = 0;
2077         zone0.zone_ipc.ipcq_semmni = 0;
2078         zone0.zone_ipc.ipcq_msgmni = 0;
2079         zone0.zone_name = GLOBAL_ZONENAME;
2080         zone0.zone_nodename = utsname.nodename;
2081         zone0.zone_domain = srpc_domain;
2082         zone0.zone_hostid = HW_INVALID_HOSTID;
2083         zone0.zone_fs_allowed = NULL;
2084         psecflags_default(&zone0.zone_secflags);
2085         zone0.zone_ref = 1;
2086         zone0.zone_id = GLOBAL_ZONEID;
2087         zone0.zone_status = ZONE_IS_RUNNING;
2088         zone0.zone_rootpath = "/";
2089         zone0.zone_rootpathlen = 2;
2090         zone0.zone_psetid = ZONE_PS_INVAL;
2091         zone0.zone_ncpus = 0;
2092         zone0.zone_ncpus_online = 0;
2093         zone0.zone_proc_initpid = 1;
2094         zone0.zone_initname = initname;
2095         zone0.zone_lockedmem_kstat = NULL;
2096         zone0.zone_swapresv_kstat = NULL;
2097         zone0.zone_nprocs_kstat = NULL;
2098
2099         zone0.zone_stime = 0;
2100         zone0.zone_utime = 0;
2101         zone0.zone_wtime = 0;
2102
2103         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2104             offsetof(zone_ref_t, zref_linkage));
2105         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2106             offsetof(struct zsd_entry, zsd_linkage));
2107         list_insert_head(&zone_active, &zone0);
2108
2109         /*
2110          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2111          * to anything meaningful.  It is assigned to be 'rootdir' in
2112          * vfs_mountroot().
2113          */
2114         zone0.zone_rootvp = NULL;
2115         zone0.zone_vfslist = NULL;
2116         zone0.zone_bootargs = initargs;
2117         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2118         /*
2119          * The global zone has all privileges
2120          */
2121         priv_fillset(zone0.zone_privset);
2122         /*
2123          * Add p0 to the global zone
2124          */
2125         zone0.zone_zsched = &p0;
2126         p0.p_zone = &zone0;
2127 }
2128
2129 /*
2130  * Called by main() to initialize the zones framework.
2131  */
2132 void
2133 zone_init(void)
2134 {
2135         rctl_dict_entry_t *rde;
2136         rctl_val_t *dval;
2137         rctl_set_t *set;
2138         rctl_alloc_gp_t *gp;
2139         rctl_entity_p_t e;
2140         int res;
2141
2142         ASSERT(curproc == &p0);
2143
2144         /*
2145          * Create ID space for zone IDs.  ID 0 is reserved for the
2146          * global zone.
2147          */
2148         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2149
2150         /*
2151          * Initialize generic zone resource controls, if any.
2152          */
2153         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2154             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2155             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2156             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2157
2158         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2159             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2160             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2161             RCTL_GLOBAL_INFINITE,
2162             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2163
2164         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2165             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2166             INT_MAX, INT_MAX, &zone_lwps_ops);
2167
2168         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2169             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2170             INT_MAX, INT_MAX, &zone_procs_ops);
2171
2172         /*
2173          * System V IPC resource controls
2174          */
2175         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2176             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2177             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2178
2179         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2180             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2181             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2182
2183         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2184             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2185             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2186
2187         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2188             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2189             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2190
2191         /*
2192          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2193          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2194          */
2195         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2196         bzero(dval, sizeof (rctl_val_t));
2197         dval->rcv_value = 1;
2198         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2199         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2200         dval->rcv_action_recip_pid = -1;
2201
2202         rde = rctl_dict_lookup("zone.cpu-shares");
2203         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2204
2205         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2206             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2207             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2208             &zone_locked_mem_ops);
2209
2210         rc_zone_max_swap = rctl_register("zone.max-swap",
2211             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2212             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2213             &zone_max_swap_ops);
2214
2215         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2216             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2217             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2218             &zone_max_lofi_ops);
2219
2220         /*
2221          * Initialize the ``global zone''.
2222          */
2223         set = rctl_set_create();
2224         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2225         mutex_enter(&p0.p_lock);
2226         e.rcep_p.zone = &zone0;
2227         e.rcep_t = RCENTITY_ZONE;
2228         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2229             gp);
2230
2231         zone0.zone_nlwps = p0.p_lwpcnt;
2232         zone0.zone_nprocs = 1;
2233         zone0.zone_ntasks = 1;
2234         mutex_exit(&p0.p_lock);
2235         zone0.zone_restart_init = B_TRUE;
2236         zone0.zone_brand = &native_brand;
2237         rctl_prealloc_destroy(gp);
2238         /*
2239          * pool_default hasn't been initialized yet, so we let pool_init()
2240          * take care of making sure the global zone is in the default pool.
2241          */
2242
2243         /*
2244          * Initialize global zone kstats
2245          */
2246         zone_kstat_create(&zone0);
2247
2248         /*
2249          * Initialise the lock for the database structure used by mntfs.
2250          */
2251         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2252
2253         mutex_enter(&zonehash_lock);
2254         zone_uniqid(&zone0);
2255         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2256
2257         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2258             mod_hash_null_valdtor);
2259         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2260             zone_hash_size, mod_hash_null_valdtor);
2261         zonecount = 1;
2262
2263         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2264             (mod_hash_val_t)&zone0);
2265         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2266             (mod_hash_val_t)&zone0);
2267         mutex_exit(&zonehash_lock);
2268
2269         /*
2270          * We avoid setting zone_kcred until now, since kcred is initialized
2271          * sometime after zone_zsd_init() and before zone_init().
2272          */
2273         zone0.zone_kcred = kcred;
2274         /*
2275          * The global zone is fully initialized (except for zone_rootvp which
2276          * will be set when the root filesystem is mounted).
2277          */
2278         global_zone = &zone0;
2279
2280         /*
2281          * Setup an event channel to send zone status change notifications on
2282          */
2283         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2284             EVCH_CREAT);
2285
2286         if (res)
2287                 panic("Sysevent_evc_bind failed during zone setup.\n");
2288
2289 }
2290
2291 static void
2292 zone_free(zone_t *zone)
2293 {
2294         ASSERT(zone != global_zone);
2295         ASSERT(zone->zone_ntasks == 0);
2296         ASSERT(zone->zone_nlwps == 0);
2297         ASSERT(zone->zone_nprocs == 0);
2298         ASSERT(zone->zone_cred_ref == 0);
2299         ASSERT(zone->zone_kcred == NULL);
2300         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2301             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2302         ASSERT(list_is_empty(&zone->zone_ref_list));
2303
2304         /*
2305          * Remove any zone caps.
2306          */
2307         cpucaps_zone_remove(zone);
2308
2309         ASSERT(zone->zone_cpucap == NULL);
2310
2311         /* remove from deathrow list */
2312         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2313                 ASSERT(zone->zone_ref == 0);
2314                 mutex_enter(&zone_deathrow_lock);
2315                 list_remove(&zone_deathrow, zone);
2316                 mutex_exit(&zone_deathrow_lock);
2317         }
2318
2319         list_destroy(&zone->zone_ref_list);
2320         zone_free_zsd(zone);
2321         zone_free_datasets(zone);
2322         list_destroy(&zone->zone_dl_list);
2323
2324         if (zone->zone_rootvp != NULL)
2325                 VN_RELE(zone->zone_rootvp);
2326         if (zone->zone_rootpath)
2327                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2328         if (zone->zone_name != NULL)
2329                 kmem_free(zone->zone_name, ZONENAME_MAX);
2330         if (zone->zone_nodename != NULL)
2331                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2332         if (zone->zone_domain != NULL)
2333                 kmem_free(zone->zone_domain, _SYS_NMLN);
2334         if (zone->zone_privset != NULL)
2335                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2336         if (zone->zone_rctls != NULL)
2337                 rctl_set_free(zone->zone_rctls);
2338         if (zone->zone_bootargs != NULL)
2339                 strfree(zone->zone_bootargs);
2340         if (zone->zone_initname != NULL)
2341                 strfree(zone->zone_initname);
2342         if (zone->zone_fs_allowed != NULL)
2343                 strfree(zone->zone_fs_allowed);
2344         if (zone->zone_pfexecd != NULL)
2345                 klpd_freelist(&zone->zone_pfexecd);
2346         id_free(zoneid_space, zone->zone_id);
2347         mutex_destroy(&zone->zone_lock);
2348         cv_destroy(&zone->zone_cv);
2349         rw_destroy(&zone->zone_mntfs_db_lock);
2350         kmem_free(zone, sizeof (zone_t));
2351 }
2352
2353 /*
2354  * See block comment at the top of this file for information about zone
2355  * status values.
2356  */
2357 /*
2358  * Convenience function for setting zone status.
2359  */
2360 static void
2361 zone_status_set(zone_t *zone, zone_status_t status)
2362 {
2363
2364         nvlist_t *nvl = NULL;
2365         ASSERT(MUTEX_HELD(&zone_status_lock));
2366         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2367             status >= zone_status_get(zone));
2368
2369         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2370             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2371             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2372             zone_status_table[status]) ||
2373             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2374             zone_status_table[zone->zone_status]) ||
2375             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2376             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2377             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2378             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2379 #ifdef DEBUG
2380                 (void) printf(
2381                     "Failed to allocate and send zone state change event.\n");
2382 #endif
2383         }
2384         nvlist_free(nvl);
2385
2386         zone->zone_status = status;
2387
2388         cv_broadcast(&zone->zone_cv);
2389 }
2390
2391 /*
2392  * Public function to retrieve the zone status.  The zone status may
2393  * change after it is retrieved.
2394  */
2395 zone_status_t
2396 zone_status_get(zone_t *zone)
2397 {
2398         return (zone->zone_status);
2399 }
2400
2401 static int
2402 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2403 {
2404         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2405         int err = 0;
2406
2407         ASSERT(zone != global_zone);
2408         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2409                 goto done;      /* EFAULT or ENAMETOOLONG */
2410
2411         if (zone->zone_bootargs != NULL)
2412                 strfree(zone->zone_bootargs);
2413
2414         zone->zone_bootargs = strdup(buf);
2415
2416 done:
2417         kmem_free(buf, BOOTARGS_MAX);
2418         return (err);
2419 }
2420
2421 static int
2422 zone_set_brand(zone_t *zone, const char *brand)
2423 {
2424         struct brand_attr *attrp;
2425         brand_t *bp;
2426
2427         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2428         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2429                 kmem_free(attrp, sizeof (struct brand_attr));
2430                 return (EFAULT);
2431         }
2432
2433         bp = brand_register_zone(attrp);
2434         kmem_free(attrp, sizeof (struct brand_attr));
2435         if (bp == NULL)
2436                 return (EINVAL);
2437
2438         /*
2439          * This is the only place where a zone can change it's brand.
2440          * We already need to hold zone_status_lock to check the zone
2441          * status, so we'll just use that lock to serialize zone
2442          * branding requests as well.
2443          */
2444         mutex_enter(&zone_status_lock);
2445
2446         /* Re-Branding is not allowed and the zone can't be booted yet */
2447         if ((ZONE_IS_BRANDED(zone)) ||
2448             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2449                 mutex_exit(&zone_status_lock);
2450                 brand_unregister_zone(bp);
2451                 return (EINVAL);
2452         }
2453
2454         /* set up the brand specific data */
2455         zone->zone_brand = bp;
2456         ZBROP(zone)->b_init_brand_data(zone);
2457
2458         mutex_exit(&zone_status_lock);
2459         return (0);
2460 }
2461
2462 static int
2463 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2464 {
2465         int err = 0;
2466         psecflags_t psf;
2467
2468         ASSERT(zone != global_zone);
2469
2470         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2471                 return (err);
2472
2473         if (zone_status_get(zone) > ZONE_IS_READY)
2474                 return (EINVAL);
2475
2476         if (!psecflags_validate(&psf))
2477                 return (EINVAL);
2478
2479         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2480
2481         /* Set security flags on the zone's zsched */
2482         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2483             sizeof (zone->zone_zsched->p_secflags));
2484
2485         return (0);
2486 }
2487
2488 static int
2489 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2490 {
2491         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2492         int err = 0;
2493
2494         ASSERT(zone != global_zone);
2495         if ((err = copyinstr(zone_fs_allowed, buf,
2496             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2497                 goto done;
2498
2499         if (zone->zone_fs_allowed != NULL)
2500                 strfree(zone->zone_fs_allowed);
2501
2502         zone->zone_fs_allowed = strdup(buf);
2503
2504 done:
2505         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2506         return (err);
2507 }
2508
2509 static int
2510 zone_set_initname(zone_t *zone, const char *zone_initname)
2511 {
2512         char initname[INITNAME_SZ];
2513         size_t len;
2514         int err = 0;
2515
2516         ASSERT(zone != global_zone);
2517         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2518                 return (err);   /* EFAULT or ENAMETOOLONG */
2519
2520         if (zone->zone_initname != NULL)
2521                 strfree(zone->zone_initname);
2522
2523         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2524         (void) strcpy(zone->zone_initname, initname);
2525         return (0);
2526 }
2527
2528 static int
2529 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2530 {
2531         uint64_t mcap;
2532         int err = 0;
2533
2534         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2535                 zone->zone_phys_mcap = mcap;
2536
2537         return (err);
2538 }
2539
2540 static int
2541 zone_set_sched_class(zone_t *zone, const char *new_class)
2542 {
2543         char sched_class[PC_CLNMSZ];
2544         id_t classid;
2545         int err;
2546
2547         ASSERT(zone != global_zone);
2548         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2549                 return (err);   /* EFAULT or ENAMETOOLONG */
2550
2551         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2552                 return (set_errno(EINVAL));
2553         zone->zone_defaultcid = classid;
2554         ASSERT(zone->zone_defaultcid > 0 &&
2555             zone->zone_defaultcid < loaded_classes);
2556
2557         return (0);
2558 }
2559
2560 /*
2561  * Block indefinitely waiting for (zone_status >= status)
2562  */
2563 void
2564 zone_status_wait(zone_t *zone, zone_status_t status)
2565 {
2566         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2567
2568         mutex_enter(&zone_status_lock);
2569         while (zone->zone_status < status) {
2570                 cv_wait(&zone->zone_cv, &zone_status_lock);
2571         }
2572         mutex_exit(&zone_status_lock);
2573 }
2574
2575 /*
2576  * Private CPR-safe version of zone_status_wait().
2577  */
2578 static void
2579 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2580 {
2581         callb_cpr_t cprinfo;
2582
2583         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2584
2585         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2586             str);
2587         mutex_enter(&zone_status_lock);
2588         while (zone->zone_status < status) {
2589                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2590                 cv_wait(&zone->zone_cv, &zone_status_lock);
2591                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2592         }
2593         /*
2594          * zone_status_lock is implicitly released by the following.
2595          */
2596         CALLB_CPR_EXIT(&cprinfo);
2597 }
2598
2599 /*
2600  * Block until zone enters requested state or signal is received.  Return (0)
2601  * if signaled, non-zero otherwise.
2602  */
2603 int
2604 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2605 {
2606         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2607
2608         mutex_enter(&zone_status_lock);
2609         while (zone->zone_status < status) {
2610                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2611                         mutex_exit(&zone_status_lock);
2612                         return (0);
2613                 }
2614         }
2615         mutex_exit(&zone_status_lock);
2616         return (1);
2617 }
2618
2619 /*
2620  * Block until the zone enters the requested state or the timeout expires,
2621  * whichever happens first.  Return (-1) if operation timed out, time remaining
2622  * otherwise.
2623  */
2624 clock_t
2625 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2626 {
2627         clock_t timeleft = 0;
2628
2629         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2630
2631         mutex_enter(&zone_status_lock);
2632         while (zone->zone_status < status && timeleft != -1) {
2633                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2634         }
2635         mutex_exit(&zone_status_lock);
2636         return (timeleft);
2637 }
2638
2639 /*
2640  * Block until the zone enters the requested state, the current process is
2641  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2642  * operation timed out, 0 if signaled, time remaining otherwise.
2643  */
2644 clock_t
2645 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2646 {
2647         clock_t timeleft = tim - ddi_get_lbolt();
2648
2649         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2650
2651         mutex_enter(&zone_status_lock);
2652         while (zone->zone_status < status) {
2653                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2654                     tim);
2655                 if (timeleft <= 0)
2656                         break;
2657         }
2658         mutex_exit(&zone_status_lock);
2659         return (timeleft);
2660 }
2661
2662 /*
2663  * Zones have two reference counts: one for references from credential
2664  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2665  * This is so we can allow a zone to be rebooted while there are still
2666  * outstanding cred references, since certain drivers cache dblks (which
2667  * implicitly results in cached creds).  We wait for zone_ref to drop to
2668  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2669  * later freed when the zone_cred_ref drops to 0, though nothing other
2670  * than the zone id and privilege set should be accessed once the zone
2671  * is "dead".
2672  *
2673  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2674  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2675  * to 0.  This can be useful to flush out other sources of cached creds
2676  * that may be less innocuous than the driver case.
2677  *
2678  * Zones also provide a tracked reference counting mechanism in which zone
2679  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2680  * debuggers determine the sources of leaked zone references.  See
2681  * zone_hold_ref() and zone_rele_ref() below for more information.
2682  */
2683
/*
 * Debugging flag, off by default.  Consulted by ZONE_IS_UNREF() and
 * zone_cred_rele() to decide whether outstanding cred references matter.
 */
int zone_wait_for_cred = 0;
2685
2686 static void
2687 zone_hold_locked(zone_t *z)
2688 {
2689         ASSERT(MUTEX_HELD(&z->zone_lock));
2690         z->zone_ref++;
2691         ASSERT(z->zone_ref != 0);
2692 }
2693
2694 /*
2695  * Increment the specified zone's reference count.  The zone's zone_t structure
2696  * will not be freed as long as the zone's reference count is nonzero.
2697  * Decrement the zone's reference count via zone_rele().
2698  *
2699  * NOTE: This function should only be used to hold zones for short periods of
2700  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2701  */
2702 void
2703 zone_hold(zone_t *z)
2704 {
2705         mutex_enter(&z->zone_lock);
2706         zone_hold_locked(z);
2707         mutex_exit(&z->zone_lock);
2708 }
2709
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1, unless we are waiting for cred references
 * (zone_wait_for_cred is set) and some are still outstanding.
 */
#define	ZONE_IS_UNREF(zone)						\
	((zone)->zone_ref == 1 &&					\
	!(zone_wait_for_cred && (zone)->zone_cred_ref != 0))
2717
2718 /*
2719  * Common zone reference release function invoked by zone_rele() and
2720  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2721  * zone's subsystem-specific reference counters are not affected by the
2722  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2723  * removed from the specified zone's reference list.  ref must be non-NULL iff
2724  * subsys is not ZONE_REF_NUM_SUBSYS.
2725  */
2726 static void
2727 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2728 {
2729         boolean_t wakeup;
2730
2731         mutex_enter(&z->zone_lock);
2732         ASSERT(z->zone_ref != 0);
2733         z->zone_ref--;
2734         if (subsys != ZONE_REF_NUM_SUBSYS) {
2735                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2736                 z->zone_subsys_ref[subsys]--;
2737                 list_remove(&z->zone_ref_list, ref);
2738         }
2739         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2740                 /* no more refs, free the structure */
2741                 mutex_exit(&z->zone_lock);
2742                 zone_free(z);
2743                 return;
2744         }
2745         /* signal zone_destroy so the zone can finish halting */
2746         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2747         mutex_exit(&z->zone_lock);
2748
2749         if (wakeup) {
2750                 /*
2751                  * Grabbing zonehash_lock here effectively synchronizes with
2752                  * zone_destroy() to avoid missed signals.
2753                  */
2754                 mutex_enter(&zonehash_lock);
2755                 cv_broadcast(&zone_destroy_cv);
2756                 mutex_exit(&zonehash_lock);
2757         }
2758 }
2759
/*
 * Decrement the specified zone's reference count.  The specified zone will
 * cease to exist after this function returns if the reference count drops to
 * zero.  This function should be paired with zone_hold().
 *
 * Implemented as an untracked release: zone_rele_common() is called with a
 * NULL zone_ref_t and ZONE_REF_NUM_SUBSYS, so no subsystem-specific
 * counter is touched.
 */
void
zone_rele(zone_t *z)
{
	zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
}
2770
2771 /*
2772  * Initialize a zone reference structure.  This function must be invoked for
2773  * a reference structure before the structure is passed to zone_hold_ref().
2774  */
2775 void
2776 zone_init_ref(zone_ref_t *ref)
2777 {
2778         ref->zref_zone = NULL;
2779         list_link_init(&ref->zref_linkage);
2780 }
2781
2782 /*
2783  * Acquire a reference to zone z.  The caller must specify the
2784  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2785  * zone_ref_t structure will represent a reference to the specified zone.  Use
2786  * zone_rele_ref() to release the reference.
2787  *
2788  * The referenced zone_t structure will not be freed as long as the zone_t's
2789  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2790  * references.
2791  *
2792  * NOTE: The zone_ref_t structure must be initialized before it is used.
2793  * See zone_init_ref() above.
2794  */
2795 void
2796 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2797 {
2798         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2799
2800         /*
2801          * Prevent consumers from reusing a reference structure before
2802          * releasing it.
2803          */
2804         VERIFY(ref->zref_zone == NULL);
2805
2806         ref->zref_zone = z;
2807         mutex_enter(&z->zone_lock);
2808         zone_hold_locked(z);
2809         z->zone_subsys_ref[subsys]++;
2810         ASSERT(z->zone_subsys_ref[subsys] != 0);
2811         list_insert_head(&z->zone_ref_list, ref);
2812         mutex_exit(&z->zone_lock);
2813 }
2814
2815 /*
2816  * Release the zone reference represented by the specified zone_ref_t.
2817  * The reference is invalid after it's released; however, the zone_ref_t
2818  * structure can be reused without having to invoke zone_init_ref().
2819  * subsys should be the same value that was passed to zone_hold_ref()
2820  * when the reference was acquired.
2821  */
2822 void
2823 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2824 {
2825         zone_rele_common(ref->zref_zone, ref, subsys);
2826
2827         /*
2828          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2829          * when consumers dereference the reference.  This helps us catch
2830          * consumers who use released references.  Furthermore, this lets
2831          * consumers reuse the zone_ref_t structure without having to
2832          * invoke zone_init_ref().
2833          */
2834         ref->zref_zone = NULL;
2835 }
2836
2837 void
2838 zone_cred_hold(zone_t *z)
2839 {
2840         mutex_enter(&z->zone_lock);
2841         z->zone_cred_ref++;
2842         ASSERT(z->zone_cred_ref != 0);
2843         mutex_exit(&z->zone_lock);
2844 }
2845
2846 void
2847 zone_cred_rele(zone_t *z)
2848 {
2849         boolean_t wakeup;
2850
2851         mutex_enter(&z->zone_lock);
2852         ASSERT(z->zone_cred_ref != 0);
2853         z->zone_cred_ref--;
2854         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2855                 /* no more refs, free the structure */
2856                 mutex_exit(&z->zone_lock);
2857                 zone_free(z);
2858                 return;
2859         }
2860         /*
2861          * If zone_destroy is waiting for the cred references to drain
2862          * out, and they have, signal it.
2863          */
2864         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2865             zone_status_get(z) >= ZONE_IS_DEAD);
2866         mutex_exit(&z->zone_lock);
2867
2868         if (wakeup) {
2869                 /*
2870                  * Grabbing zonehash_lock here effectively synchronizes with
2871                  * zone_destroy() to avoid missed signals.
2872                  */
2873                 mutex_enter(&zonehash_lock);
2874                 cv_broadcast(&zone_destroy_cv);
2875                 mutex_exit(&zonehash_lock);
2876         }
2877 }
2878
2879 void
2880 zone_task_hold(zone_t *z)
2881 {
2882         mutex_enter(&z->zone_lock);
2883         z->zone_ntasks++;
2884         ASSERT(z->zone_ntasks != 0);
2885         mutex_exit(&z->zone_lock);
2886 }
2887
2888 void
2889 zone_task_rele(zone_t *zone)
2890 {
2891         uint_t refcnt;
2892
2893         mutex_enter(&zone->zone_lock);
2894         ASSERT(zone->zone_ntasks != 0);
2895         refcnt = --zone->zone_ntasks;
2896         if (refcnt > 1) {       /* Common case */
2897                 mutex_exit(&zone->zone_lock);
2898                 return;
2899         }
2900         zone_hold_locked(zone); /* so we can use the zone_t later */
2901         mutex_exit(&zone->zone_lock);
2902         if (refcnt == 1) {
2903                 /*
2904                  * See if the zone is shutting down.
2905                  */
2906                 mutex_enter(&zone_status_lock);
2907                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2908                         goto out;
2909                 }
2910
2911                 /*
2912                  * Make sure the ntasks didn't change since we
2913                  * dropped zone_lock.
2914                  */
2915                 mutex_enter(&zone->zone_lock);
2916                 if (refcnt != zone->zone_ntasks) {
2917                         mutex_exit(&zone->zone_lock);
2918                         goto out;
2919                 }
2920                 mutex_exit(&zone->zone_lock);
2921
2922                 /*
2923                  * No more user processes in the zone.  The zone is empty.
2924                  */
2925                 zone_status_set(zone, ZONE_IS_EMPTY);
2926                 goto out;
2927         }
2928
2929         ASSERT(refcnt == 0);
2930         /*
2931          * zsched has exited; the zone is dead.
2932          */
2933         zone->zone_zsched = NULL;               /* paranoia */
2934         mutex_enter(&zone_status_lock);
2935         zone_status_set(zone, ZONE_IS_DEAD);
2936 out:
2937         mutex_exit(&zone_status_lock);
2938         zone_rele(zone);
2939 }
2940
2941 zoneid_t
2942 getzoneid(void)
2943 {
2944         return (curproc->p_zone->zone_id);
2945 }
2946
2947 /*
2948  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2949  * check the validity of a zone's state.
2950  */
2951 static zone_t *
2952 zone_find_all_by_id(zoneid_t zoneid)
2953 {
2954         mod_hash_val_t hv;
2955         zone_t *zone = NULL;
2956
2957         ASSERT(MUTEX_HELD(&zonehash_lock));
2958
2959         if (mod_hash_find(zonehashbyid,
2960             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2961                 zone = (zone_t *)hv;
2962         return (zone);
2963 }
2964
2965 static zone_t *
2966 zone_find_all_by_name(char *name)
2967 {
2968         mod_hash_val_t hv;
2969         zone_t *zone = NULL;
2970
2971         ASSERT(MUTEX_HELD(&zonehash_lock));
2972
2973         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2974                 zone = (zone_t *)hv;
2975         return (zone);
2976 }
2977
2978 /*
2979  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2980  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2981  * Caller must call zone_rele() once it is done with the zone.
2982  *
2983  * The zone may begin the zone_destroy() sequence immediately after this
2984  * function returns, but may be safely used until zone_rele() is called.
2985  */
2986 zone_t *
2987 zone_find_by_id(zoneid_t zoneid)
2988 {
2989         zone_t *zone;
2990         zone_status_t status;
2991
2992         mutex_enter(&zonehash_lock);
2993         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2994                 mutex_exit(&zonehash_lock);
2995                 return (NULL);
2996         }
2997         status = zone_status_get(zone);
2998         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2999                 /*
3000                  * For all practical purposes the zone doesn't exist.
3001                  */
3002                 mutex_exit(&zonehash_lock);
3003                 return (NULL);
3004         }
3005         zone_hold(zone);
3006         mutex_exit(&zonehash_lock);
3007         return (zone);
3008 }
3009
3010 /*
3011  * Similar to zone_find_by_id, but using zone name as the key.
3012  */
3013 zone_t *
3014 zone_find_by_name(char *name)
3015 {
3016         zone_t *zone;
3017         zone_status_t status;
3018
3019         mutex_enter(&zonehash_lock);
3020         if ((zone = zone_find_all_by_name(name)) == NULL) {
3021                 mutex_exit(&zonehash_lock);
3022                 return (NULL);
3023         }
3024         status = zone_status_get(zone);
3025         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3026                 /*
3027                  * For all practical purposes the zone doesn't exist.
3028                  */
3029                 mutex_exit(&zonehash_lock);
3030                 return (NULL);
3031         }
3032         zone_hold(zone);
3033         mutex_exit(&zonehash_lock);
3034         return (zone);
3035 }
3036
3037 /*
3038  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3039  * if there is a zone "foo" rooted at /foo/root, and the path argument
3040  * is "/foo/root/proc", it will return the held zone_t corresponding to
3041  * zone "foo".
3042  *
3043  * zone_find_by_path() always returns a non-NULL value, since at the
3044  * very least every path will be contained in the global zone.
3045  *
3046  * As with the other zone_find_by_*() functions, the caller is
3047  * responsible for zone_rele()ing the return value of this function.
3048  */
3049 zone_t *
3050 zone_find_by_path(const char *path)
3051 {
3052         zone_t *zone;
3053         zone_t *zret = NULL;
3054         zone_status_t status;
3055
3056         if (path == NULL) {
3057                 /*
3058                  * Call from rootconf().
3059                  */
3060                 zone_hold(global_zone);
3061                 return (global_zone);
3062         }
3063         ASSERT(*path == '/');
3064         mutex_enter(&zonehash_lock);
3065         for (zone = list_head(&zone_active); zone != NULL;
3066             zone = list_next(&zone_active, zone)) {
3067                 if (ZONE_PATH_VISIBLE(path, zone))
3068                         zret = zone;
3069         }
3070         ASSERT(zret != NULL);
3071         status = zone_status_get(zret);
3072         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3073                 /*
3074                  * Zone practically doesn't exist.
3075                  */
3076                 zret = global_zone;
3077         }
3078         zone_hold(zret);
3079         mutex_exit(&zonehash_lock);
3080         return (zret);
3081 }
3082
3083 /*
3084  * Public interface for updating per-zone load averages.  Called once per
3085  * second.
3086  *
3087  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3088  */
3089 void
3090 zone_loadavg_update()
3091 {
3092         zone_t *zp;
3093         zone_status_t status;
3094         struct loadavg_s *lavg;
3095         hrtime_t zone_total;
3096         int i;
3097         hrtime_t hr_avg;
3098         int nrun;
3099         static int64_t f[3] = { 135, 27, 9 };
3100         int64_t q, r;
3101
3102         mutex_enter(&zonehash_lock);
3103         for (zp = list_head(&zone_active); zp != NULL;
3104             zp = list_next(&zone_active, zp)) {
3105                 mutex_enter(&zp->zone_lock);
3106
3107                 /* Skip zones that are on the way down or not yet up */
3108                 status = zone_status_get(zp);
3109                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3110                         /* For all practical purposes the zone doesn't exist. */
3111                         mutex_exit(&zp->zone_lock);
3112                         continue;
3113                 }
3114
3115                 /*
3116                  * Update the 10 second moving average data in zone_loadavg.
3117                  */
3118                 lavg = &zp->zone_loadavg;
3119
3120                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3121                 scalehrtime(&zone_total);
3122
3123                 /* The zone_total should always be increasing. */
3124                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3125                     zone_total - lavg->lg_total : 0;
3126                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3127                 /* lg_total holds the prev. 1 sec. total */
3128                 lavg->lg_total = zone_total;
3129
3130                 /*
3131                  * To simplify the calculation, we don't calculate the load avg.
3132                  * until the zone has been up for at least 10 seconds and our
3133                  * moving average is thus full.
3134                  */
3135                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3136                         lavg->lg_len++;
3137                         mutex_exit(&zp->zone_lock);
3138                         continue;
3139                 }
3140
3141                 /* Now calculate the 1min, 5min, 15 min load avg. */
3142                 hr_avg = 0;
3143                 for (i = 0; i < S_LOADAVG_SZ; i++)
3144                         hr_avg += lavg->lg_loads[i];
3145                 hr_avg = hr_avg / S_LOADAVG_SZ;
3146                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3147
3148                 /* Compute load avg. See comment in calcloadavg() */
3149                 for (i = 0; i < 3; i++) {
3150                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3151                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3152                         zp->zone_hp_avenrun[i] +=
3153                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3154
3155                         /* avenrun[] can only hold 31 bits of load avg. */
3156                         if (zp->zone_hp_avenrun[i] <
3157                             ((uint64_t)1<<(31+16-FSHIFT)))
3158                                 zp->zone_avenrun[i] = (int32_t)
3159                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3160                         else
3161                                 zp->zone_avenrun[i] = 0x7fffffff;
3162                 }
3163
3164                 mutex_exit(&zp->zone_lock);
3165         }
3166         mutex_exit(&zonehash_lock);
3167 }
3168
3169 /*
3170  * Get the number of cpus visible to this zone.  The system-wide global
3171  * 'ncpus' is returned if pools are disabled, the caller is in the
3172  * global zone, or a NULL zone argument is passed in.
3173  */
3174 int
3175 zone_ncpus_get(zone_t *zone)
3176 {
3177         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3178
3179         return (myncpus != 0 ? myncpus : ncpus);
3180 }
3181
3182 /*
3183  * Get the number of online cpus visible to this zone.  The system-wide
3184  * global 'ncpus_online' is returned if pools are disabled, the caller
3185  * is in the global zone, or a NULL zone argument is passed in.
3186  */
3187 int
3188 zone_ncpus_online_get(zone_t *zone)
3189 {
3190         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3191
3192         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3193 }
3194
3195 /*
3196  * Return the pool to which the zone is currently bound.
3197  */
3198 pool_t *
3199 zone_pool_get(zone_t *zone)
3200 {
3201         ASSERT(pool_lock_held());
3202
3203         return (zone->zone_pool);
3204 }
3205
3206 /*
3207  * Set the zone's pool pointer and update the zone's visibility to match
3208  * the resources in the new pool.
3209  */
3210 void
3211 zone_pool_set(zone_t *zone, pool_t *pool)
3212 {
3213         ASSERT(pool_lock_held());
3214         ASSERT(MUTEX_HELD(&cpu_lock));
3215
3216         zone->zone_pool = pool;
3217         zone_pset_set(zone, pool->pool_pset->pset_id);
3218 }
3219
3220 /*
3221  * Return the cached value of the id of the processor set to which the
3222  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3223  * facility is disabled.
3224  */
3225 psetid_t
3226 zone_pset_get(zone_t *zone)
3227 {
3228         ASSERT(MUTEX_HELD(&cpu_lock));
3229
3230         return (zone->zone_psetid);
3231 }
3232
3233 /*
3234  * Set the cached value of the id of the processor set to which the zone
3235  * is currently bound.  Also update the zone's visibility to match the
3236  * resources in the new processor set.
3237  */
3238 void
3239 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3240 {
3241         psetid_t oldpsetid;
3242
3243         ASSERT(MUTEX_HELD(&cpu_lock));
3244         oldpsetid = zone_pset_get(zone);
3245
3246         if (oldpsetid == newpsetid)
3247                 return;
3248         /*
3249          * Global zone sees all.
3250          */
3251         if (zone != global_zone) {
3252                 zone->zone_psetid = newpsetid;
3253                 if (newpsetid != ZONE_PS_INVAL)
3254                         pool_pset_visibility_add(newpsetid, zone);
3255                 if (oldpsetid != ZONE_PS_INVAL)
3256                         pool_pset_visibility_remove(oldpsetid, zone);
3257         }
3258         /*
3259          * Disabling pools, so we should start using the global values
3260          * for ncpus and ncpus_online.
3261          */
3262         if (newpsetid == ZONE_PS_INVAL) {
3263                 zone->zone_ncpus = 0;
3264                 zone->zone_ncpus_online = 0;
3265         }
3266 }
3267
3268 /*
3269  * Walk the list of active zones and issue the provided callback for
3270  * each of them.
3271  *
3272  * Caller must not be holding any locks that may be acquired under
3273  * zonehash_lock.  See comment at the beginning of the file for a list of
3274  * common locks and their interactions with zones.
3275  */
3276 int
3277 zone_walk(int (*cb)(zone_t *, void *), void *data)
3278 {
3279         zone_t *zone;
3280         int ret = 0;
3281         zone_status_t status;
3282
3283         mutex_enter(&zonehash_lock);
3284         for (zone = list_head(&zone_active); zone != NULL;
3285             zone = list_next(&zone_active, zone)) {
3286                 /*
3287                  * Skip zones that shouldn't be externally visible.
3288                  */
3289                 status = zone_status_get(zone);
3290                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3291                         continue;
3292                 /*
3293                  * Bail immediately if any callback invocation returns a
3294                  * non-zero value.
3295                  */
3296                 ret = (*cb)(zone, data);
3297                 if (ret != 0)
3298                         break;
3299         }
3300         mutex_exit(&zonehash_lock);
3301         return (ret);
3302 }
3303
3304 static int
3305 zone_set_root(zone_t *zone, const char *upath)
3306 {
3307         vnode_t *vp;
3308         int trycount;
3309         int error = 0;
3310         char *path;
3311         struct pathname upn, pn;
3312         size_t pathlen;
3313
3314         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3315                 return (error);
3316
3317         pn_alloc(&pn);
3318
3319         /* prevent infinite loop */
3320         trycount = 10;
3321         for (;;) {
3322                 if (--trycount <= 0) {
3323                         error = ESTALE;
3324                         goto out;
3325                 }
3326
3327                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3328                         /*
3329                          * fop_access() may cover 'vp' with a new
3330                          * filesystem, if 'vp' is an autoFS vnode.
3331                          * Get the new 'vp' if so.
3332                          */
3333                         if ((error =
3334                             fop_access(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3335                             (!vn_ismntpt(vp) ||
3336                             (error = traverse(&vp)) == 0)) {
3337                                 pathlen = pn.pn_pathlen + 2;
3338                                 path = kmem_alloc(pathlen, KM_SLEEP);
3339                                 (void) strncpy(path, pn.pn_path,
3340                                     pn.pn_pathlen + 1);
3341                                 path[pathlen - 2] = '/';
3342                                 path[pathlen - 1] = '\0';
3343                                 pn_free(&pn);
3344                                 pn_free(&upn);
3345
3346                                 /* Success! */
3347                                 break;
3348                         }
3349                         VN_RELE(vp);
3350                 }
3351                 if (error != ESTALE)
3352                         goto out;
3353         }
3354
3355         ASSERT(error == 0);
3356         zone->zone_rootvp = vp;         /* we hold a reference to vp */
3357         zone->zone_rootpath = path;
3358         zone->zone_rootpathlen = pathlen;
3359         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3360                 zone->zone_flags |= ZF_IS_SCRATCH;
3361         return (0);
3362
3363 out:
3364         pn_free(&pn);
3365         pn_free(&upn);
3366         return (error);
3367 }
3368
/*
 * True if c is one of the characters '0'-'9', 'a'-'z' or 'A'-'Z'.
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define	isalnum(c)							\
	(((c) >= '0' && (c) <= '9') ||					\
	((c) >= 'a' && (c) <= 'z') ||					\
	((c) >= 'A' && (c) <= 'Z'))
3372
3373 static int
3374 zone_set_name(zone_t *zone, const char *uname)
3375 {
3376         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3377         size_t len;
3378         int i, err;
3379
3380         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3381                 kmem_free(kname, ZONENAME_MAX);
3382                 return (err);   /* EFAULT or ENAMETOOLONG */
3383         }
3384
3385         /* must be less than ZONENAME_MAX */
3386         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3387                 kmem_free(kname, ZONENAME_MAX);
3388                 return (EINVAL);
3389         }
3390
3391         /*
3392          * Name must start with an alphanumeric and must contain only
3393          * alphanumerics, '-', '_' and '.'.
3394          */
3395         if (!isalnum(kname[0])) {
3396                 kmem_free(kname, ZONENAME_MAX);
3397                 return (EINVAL);
3398         }
3399         for (i = 1; i < len - 1; i++) {
3400                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3401                     kname[i] != '.') {
3402                         kmem_free(kname, ZONENAME_MAX);
3403                         return (EINVAL);
3404                 }
3405         }
3406
3407         zone->zone_name = kname;
3408         return (0);
3409 }
3410
3411 /*
3412  * Returns the 32-bit hostid that applies to 'zonep'.  A zone that emulates
3413  * its own hostid reports that value.  If 'zonep' is NULL or the zone has no
3414  * hostid emulation, the machine's (i.e., the global zone's) hostid is
3415  * converted from hw_serial and returned instead; that value is zero if
3416  * neither the zone nor the machine has a hostid.  HW_INVALID_HOSTID is
3417  * returned if hw_serial cannot be converted.
3418  */
3419 uint32_t
3420 zone_get_hostid(zone_t *zonep)
3421 {
3422         unsigned long serial;
3423
3424         if (zonep != NULL && zonep->zone_hostid != HW_INVALID_HOSTID)
3425                 return (zonep->zone_hostid);
3426
3427         if (ddi_strtoul(hw_serial, NULL, 10, &serial) != 0)
3428                 return (HW_INVALID_HOSTID);
3429         return ((uint32_t)serial);
3430 }
3431
3432 /*
3433  * Like thread_create(), except that the new thread is parented to the zsched
3434  * process of the caller's zone (curproc->p_zone->zone_zsched) and linked
3435  * onto that zone's kernel thread list before it is allowed to run.  The
3436  * zone hold taken here is meant to be dropped when the thread exits.
3437  */
3438 /*ARGSUSED*/
3439 kthread_t *
3440 zthread_create(
3441     caddr_t stk,
3442     size_t stksize,
3443     void (*proc)(),
3444     void *arg,
3445     size_t len,
3446     pri_t pri)
3447 {
3448         zone_t *zone = curproc->p_zone;
3449         proc_t *zsp = zone->zone_zsched;
3450         kthread_t *newt, *head;
3451
3452         zone_hold(zone);
3453
3454         /*
3455          * Nobody should be creating threads once the zone is shutting down
3456          * and has no kernel threads left; see the comment in zthread_exit().
3457          */
3458         ASSERT(!(zone->zone_kthreads == NULL &&
3459             zone_status_get(zone) >= ZONE_IS_EMPTY));
3460
3461         /* Create the thread stopped so it cannot run until setup is done. */
3462         newt = thread_create(stk, stksize, proc, arg, len, zsp, TS_STOPPED,
3463             pri);
3464         ASSERT(newt->t_forw == NULL);
3465
3466         /* Link it into the zone's circular list and make it the head. */
3467         mutex_enter(&zone_status_lock);
3468         head = zone->zone_kthreads;
3469         if (head != NULL) {
3470                 newt->t_forw = head;
3471                 newt->t_back = head->t_back;
3472                 head->t_back->t_forw = newt;
3473                 head->t_back = newt;
3474         } else {
3475                 newt->t_forw = newt;
3476                 newt->t_back = newt;
3477         }
3478         zone->zone_kthreads = newt;
3479         mutex_exit(&zone_status_lock);
3480
3481         /* Swap the thread's project reference for zsched's project. */
3482         mutex_enter(&zsp->p_lock);
3483         newt->t_proc_flag |= TP_ZTHREAD;
3484         project_rele(newt->t_proj);
3485         newt->t_proj = project_hold(zsp->p_task->tk_proj);
3486
3487         /* Setup is complete; make the thread runnable. */
3488         thread_lock(newt);
3489         newt->t_schedflag |= TS_ALLSTART;
3490         setrun_locked(newt);
3491         thread_unlock(newt);
3492         mutex_exit(&zsp->p_lock);
3493
3494         return (newt);
3495 }
3496
3497 /*
3498  * Counterpart of thread_exit() for kernel threads that were created with
3499  * zthread_create(); such threads must exit through this function.
3500  */
3501 void
3502 zthread_exit(void)
3503 {
3504         kthread_t *self = curthread;
3505         proc_t *pp = curproc;
3506         zone_t *zone = pp->p_zone;
3507
3508         mutex_enter(&zone_status_lock);
3509
3510         /*
3511          * Hand the thread back to p0.
3512          */
3513         kpreempt_disable();
3514         mutex_enter(&pp->p_lock);
3515         self->t_proc_flag &= ~TP_ZTHREAD;
3516         self->t_procp = &p0;
3517         hat_thread_exit(self);
3518         mutex_exit(&pp->p_lock);
3519         kpreempt_enable();
3520
3521         if (self->t_back != self) {
3522                 /* Other zone threads remain: just unlink ourselves. */
3523                 self->t_forw->t_back = self->t_back;
3524                 self->t_back->t_forw = self->t_forw;
3525                 if (zone->zone_kthreads == self)
3526                         zone->zone_kthreads = self->t_forw;
3527         } else {
3528                 ASSERT(self->t_forw == self);
3529                 /*
3530                  * We were the last kernel thread.  If the zone is empty,
3531                  * no further kernel threads can be created once the count
3532                  * reaches zero: a creator that is a process in the zone
3533                  * must have exited before the zone state could become
3534                  * ZONE_IS_EMPTY, and a creator that is a kernel thread in
3535                  * the zone would keep the thread count non-zero.
3536                  *
3537                  * In other words, non-zone kernel threads should not
3538                  * create zone kernel threads.
3539                  */
3540                 zone->zone_kthreads = NULL;
3541                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3542                         zone_status_set(zone, ZONE_IS_DOWN);
3543                         /*
3544                          * Any CPU caps on this zone go away with it.
3545                          */
3546                         cpucaps_zone_remove(zone);
3547                 }
3548         }
3549         mutex_exit(&zone_status_lock);
3550         zone_rele(zone);
3551         thread_exit();
3552         /* NOTREACHED */
3553 }
3554
3555 /*
3556  * Install 'vp' (with a new hold) in the directory slot '*vpp' of process
3557  * 'pp' and release whatever vnode the slot held before, if any.
3558  */
3559 static void
3560 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3561 {
3562         vnode_t *prev;
3563
3564         VN_HOLD(vp);            /* the slot keeps this reference */
3565         if (AU_AUDITING())      /* update abs cwd/root path see c2/audit.c */
3566                 audit_chdirec(vp, vpp);
3567         mutex_enter(&pp->p_lock);
3568         prev = *vpp;
3569         *vpp = vp;
3570         mutex_exit(&pp->p_lock);
3571         if (prev != NULL)
3572                 VN_RELE(prev);
3573 }
3574
3575 /*
3576  * Fill in the rctl_val_t 'rv' from its nvlist representation 'nvl'.  Only
3577  * uint64 pairs named "privilege", "limit" and "action" are accepted and all
3578  * three must be present; otherwise EINVAL is returned.
3579  */
3580 static int
3581 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3582 {
3583         boolean_t have_priv = B_FALSE;
3584         boolean_t have_limit = B_FALSE;
3585         boolean_t have_action = B_FALSE;
3586         nvpair_t *pair;
3587
3588         for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
3589             pair = nvlist_next_nvpair(nvl, pair)) {
3590                 const char *key = nvpair_name(pair);
3591                 uint64_t val;
3592
3593                 if (nvpair_type(pair) != DATA_TYPE_UINT64)
3594                         return (EINVAL);
3595                 (void) nvpair_value_uint64(pair, &val);
3596                 if (strcmp(key, "privilege") == 0) {
3597                         /* Only privileged values are allowed for now. */
3598                         if (val != RCPRIV_PRIVILEGED)
3599                                 return (EINVAL);
3600                         rv->rcv_privilege = val;
3601                         have_priv = B_TRUE;
3602                         continue;
3603                 }
3604                 if (strcmp(key, "limit") == 0) {
3605                         rv->rcv_value = val;
3606                         have_limit = B_TRUE;
3607                         continue;
3608                 }
3609                 if (strcmp(key, "action") != 0)
3610                         return (EINVAL);        /* unknown pair name */
3611                 if (val != RCTL_LOCAL_NOACTION && val != RCTL_LOCAL_DENY)
3612                         return (EINVAL);
3613                 rv->rcv_flagaction = val;
3614                 have_action = B_TRUE;
3615         }
3616
3617         if (!have_priv || !have_limit || !have_action)
3618                 return (EINVAL);
3619
3620         rv->rcv_action_signal = 0;
3621         rv->rcv_action_recipient = NULL;
3622         rv->rcv_action_recip_pid = -1;
3623         rv->rcv_firing_time = 0;
3624         return (0);
3625 }
3626
3627 /*
3628  * Non-global zone version of start_init.  Runs in the zone's init process:
3629  * records init's pid and the result of start_init_common(), then moves a
3630  * zone that is still booting to RUNNING or, if booting failed or the global
3631  * zone is shutting down, to SHUTTING_DOWN.
3632  */
3633 void
3634 zone_start_init(void)
3635 {
3636         proc_t *p = ttoproc(curthread);
3637         zone_t *z = p->p_zone;
3638         int failed;
3639
3640         ASSERT(!INGLOBALZONE(curproc));
3641
3642         /* init's pid is all ZONE_ATTR_INITPID and restart_init need */
3643         z->zone_proc_initpid = p->p_pid;
3644
3645         /*
3646          * zone_boot_err is kept so that the caller of the zone_boot syscall
3647          * can be told why booting failed.
3648          */
3649         p->p_zone->zone_boot_err = start_init_common();
3650
3651         /*
3652          * A booting zone must not become a running zone when init failed
3653          * to start or when the global zone is shutting down.  The state is
3654          * only changed if we are still booting -- we could have raced and
3655          * already be shutting down, or even further along.
3656          */
3657         mutex_enter(&zone_status_lock);
3658         failed = (z->zone_boot_err != 0 ||
3659             zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN);
3660         if (zone_status_get(z) == ZONE_IS_BOOTING) {
3661                 zone_status_set(z,
3662                     failed ? ZONE_IS_SHUTTING_DOWN : ZONE_IS_RUNNING);
3663         }
3664         mutex_exit(&zone_status_lock);
3665
3666         if (!failed) {
3667                 /* cause the process to return to userland. */
3668                 lwp_rtt();
3669                 return;
3670         }
3671
3672         /* It's gone bad, dispose of the process */
3673         if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3674                 mutex_enter(&p->p_lock);
3675                 ASSERT(p->p_flag & SEXITLWPS);
3676                 lwp_exit();
3677         }
3678 }
3679
3680 struct zsched_arg {             /* argument bundle handed to zsched() */
3681         zone_t *zone;           /* zone that this zsched instance serves */
3682         nvlist_t *nvlist;       /* rctl nvlist that zsched() applies to it */
3683 };
3684
3685 /*
3686  * Per-zone "sched" workalike, run with a struct zsched_arg as 'arg'.  The
3687  * similarity to "sched" has nothing to do with scheduling; rather, per-zone
3688  * kernel threads are parented to zsched just as regular kernel threads are
3689  * parented to sched (p0).  zsched moves itself into the zone, applies the
3690  * zone's rctls, takes the zone through INITIALIZED and READY, launches init
3691  * once the zone is BOOTING, then sleeps until the zone is DYING and exits.
3692  */
3693 static void
3694 zsched(void *arg)
3695 {
3696         struct zsched_arg *za = arg;
3697         proc_t *pp = curproc;
3698         proc_t *initp = proc_init;
3699         zone_t *zone = za->zone;
3700         cred_t *cr, *oldcred;
3701         rctl_set_t *set;
3702         rctl_alloc_gp_t *gp;
3703         contract_t *ct = NULL;
3704         task_t *tk, *oldtk;
3705         rctl_entity_p_t e;
3706         kproject_t *pj;
3707
3708         nvlist_t *nvl = za->nvlist;
3709         nvpair_t *nvp = NULL;
3710
3711         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3712         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3713         PTOU(pp)->u_argc = 0;
3714         PTOU(pp)->u_argv = (uintptr_t)NULL;
3715         PTOU(pp)->u_envp = (uintptr_t)NULL;
3716         PTOU(pp)->u_commpagep = (uintptr_t)NULL;
3717         closeall(P_FINFO(pp));
3718
3719         /*
3720          * We are this zone's "zsched" process.  As the zone isn't generally
3721          * visible yet we don't need to grab any locks before initializing its
3722          * zone_zsched pointer.
3723          */
3724         zone_hold(zone);  /* this hold is released by zone_destroy() */
3725         zone->zone_zsched = pp;
3726         mutex_enter(&pp->p_lock);
3727         pp->p_zone = zone;
3728         mutex_exit(&pp->p_lock);
3729
3730         /*
3731          * Disassociate process from its 'parent'; parent ourselves to init
3732          * (pid 1) and change other values as needed.
3733          */
3734         sess_create();
3735
3736         mutex_enter(&pidlock);
3737         proc_detach(pp);
3738         pp->p_ppid = 1;
3739         pp->p_flag |= SZONETOP;
3740         pp->p_ancpid = 1;
3741         pp->p_parent = initp;
3742         pp->p_psibling = NULL;
3743         if (initp->p_child)
3744                 initp->p_child->p_psibling = pp;
3745         pp->p_sibling = initp->p_child;
3746         initp->p_child = pp;
3747
3748         /* Decrement what newproc() incremented. */
3749         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3750         /*
3751          * Our credentials are about to become kcred-like, so we don't care
3752          * about the caller's ruid.
3753          */
3754         upcount_inc(crgetruid(kcred), zone->zone_id);
3755         mutex_exit(&pidlock);
3756
3757         /*
3758          * getting out of global zone, so decrement lwp and process counts
3759          */
3760         pj = pp->p_task->tk_proj;
3761         mutex_enter(&global_zone->zone_nlwps_lock);
3762         pj->kpj_nlwps -= pp->p_lwpcnt;
3763         global_zone->zone_nlwps -= pp->p_lwpcnt;
3764         pj->kpj_nprocs--;
3765         global_zone->zone_nprocs--;
3766         mutex_exit(&global_zone->zone_nlwps_lock);
3767
3768         /*
3769          * Decrement locked memory counts on old zone and project.
3770          */
3771         mutex_enter(&global_zone->zone_mem_lock);
3772         global_zone->zone_locked_mem -= pp->p_locked_mem;
3773         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3774         mutex_exit(&global_zone->zone_mem_lock);
3775
3776         /*
3777          * Create and join a new task in project '0' of this zone.
3778          *
3779          * We don't need to call holdlwps() since we know we're the only lwp in
3780          * this process.
3781          *
3782          * task_join() returns with p_lock held.
3783          */
3784         tk = task_create(0, zone);
3785         mutex_enter(&cpu_lock);
3786         oldtk = task_join(tk, 0);
3787
3788         pj = pp->p_task->tk_proj;
3789
3790         mutex_enter(&zone->zone_mem_lock);
3791         zone->zone_locked_mem += pp->p_locked_mem;
3792         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3793         mutex_exit(&zone->zone_mem_lock);
3794
3795         /*
3796          * add lwp and process counts to zsched's zone, and increment
3797          * project's task and process count due to the task created in
3798          * the above task_create.
3799          */
3800         mutex_enter(&zone->zone_nlwps_lock);
3801         pj->kpj_nlwps += pp->p_lwpcnt;
3802         pj->kpj_ntasks += 1;
3803         zone->zone_nlwps += pp->p_lwpcnt;
3804         pj->kpj_nprocs++;
3805         zone->zone_nprocs++;
3806         mutex_exit(&zone->zone_nlwps_lock);
3807
3808         mutex_exit(&curproc->p_lock);   /* held since task_join() */
3809         mutex_exit(&cpu_lock);
3810         task_rele(oldtk);
3811
3812         /*
3813          * The process was created by a process in the global zone, hence the
3814          * credentials are wrong.  We might as well have kcred-ish credentials.
3815          */
3816         cr = zone->zone_kcred;
3817         crhold(cr);
3818         mutex_enter(&pp->p_crlock);
3819         oldcred = pp->p_cred;
3820         pp->p_cred = cr;
3821         mutex_exit(&pp->p_crlock);
3822         crfree(oldcred);
3823
3824         /*
3825          * Hold credentials again (for thread)
3826          */
3827         crhold(cr);
3828
3829         /*
3830          * p_lwpcnt can't change since this is a kernel process.
3831          */
3832         crset(pp, cr);
3833
3834         /*
3835          * Chroot: the zone root becomes both our cwd and our root.
3836          */
3837         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3838         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3839
3840         /*
3841          * Initialize zone's rctl set.
3842          */
3843         set = rctl_set_create();
3844         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3845         mutex_enter(&pp->p_lock);
3846         e.rcep_p.zone = zone;
3847         e.rcep_t = RCENTITY_ZONE;
3848         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3849         mutex_exit(&pp->p_lock);
3850         rctl_prealloc_destroy(gp);
3851
3852         /*
3853          * Apply the rctls passed in to zone_create().  This is basically a list
3854          * assignment: all of the old values are removed and the new ones
3855          * inserted.  That is, if an empty list is passed in, all values are
3856          * removed.
3857          */
3858         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3859                 rctl_dict_entry_t *rde;
3860                 rctl_hndl_t hndl;
3861                 char *name;
3862                 nvlist_t **nvlarray;
3863                 uint_t i, nelem;
3864                 int error;      /* For ASSERT()s */
3865
3866                 name = nvpair_name(nvp);
3867                 hndl = rctl_hndl_lookup(name);
3868                 ASSERT(hndl != -1);
3869                 rde = rctl_dict_lookup_hndl(hndl);
3870                 ASSERT(rde != NULL);
3871
3872                 for (; /* ever */; ) {
3873                         rctl_val_t oval;
3874
3875                         mutex_enter(&pp->p_lock);
3876                         error = rctl_local_get(hndl, NULL, &oval, pp);
3877                         mutex_exit(&pp->p_lock);
3878                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3879                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3880                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3881                                 break;
3882                         mutex_enter(&pp->p_lock);
3883                         error = rctl_local_delete(hndl, &oval, pp);
3884                         mutex_exit(&pp->p_lock);
3885                         ASSERT(error == 0);
3886                 }
3887                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3888                 ASSERT(error == 0);
3889                 for (i = 0; i < nelem; i++) {
3890                         rctl_val_t *nvalp;
3891
3892                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3893                         error = nvlist2rctlval(nvlarray[i], nvalp);
3894                         ASSERT(error == 0);
3895                         /*
3896                          * rctl_local_insert can fail if the value being
3897                          * inserted is a duplicate; this is OK.
3898                          */
3899                         mutex_enter(&pp->p_lock);
3900                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
3901                                 kmem_cache_free(rctl_val_cache, nvalp);
3902                         mutex_exit(&pp->p_lock);
3903                 }
3904         }
3905
3906         /*
3907          * Tell the world that we're done setting up.
3908          *
3909          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3910          * and atomically set the zone's processor set visibility.  Once
3911          * we drop pool_lock() this zone will automatically get updated
3912          * to reflect any future changes to the pools configuration.
3913          *
3914          * Note that after we drop the locks below (zonehash_lock in
3915          * particular) other operations such as a zone_getattr call can
3916          * now proceed and observe the zone. That is the reason for doing a
3917          * state transition to the INITIALIZED state.
3918          */
3919         pool_lock();
3920         mutex_enter(&cpu_lock);
3921         mutex_enter(&zonehash_lock);
3922         zone_uniqid(zone);
3923         zone_zsd_configure(zone);
3924         if (pool_state == POOL_ENABLED)
3925                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
3926         mutex_enter(&zone_status_lock);
3927         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3928         zone_status_set(zone, ZONE_IS_INITIALIZED);
3929         mutex_exit(&zone_status_lock);
3930         mutex_exit(&zonehash_lock);
3931         mutex_exit(&cpu_lock);
3932         pool_unlock();
3933
3934         /* Now call the create callbacks of all ZSD keys for this zone */
3935         zsd_apply_all_keys(zsd_apply_create, zone);
3936
3937         /* The callbacks are complete. Mark ZONE_IS_READY */
3938         mutex_enter(&zone_status_lock);
3939         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3940         zone_status_set(zone, ZONE_IS_READY);
3941         mutex_exit(&zone_status_lock);
3942
3943         /*
3944          * Once we see the zone transition to the ZONE_IS_BOOTING state,
3945          * we launch init; zone_start_init() then sets the state to running.
3946          */
3947         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3948
3949         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3950                 id_t cid;
3951
3952                 /*
3953                  * Ok, this is a little complicated.  We need to grab the
3954                  * zone's pool's scheduling class ID; note that by now, we
3955                  * are already bound to a pool if we need to be (zoneadmd
3956                  * will have done that to us while we're in the READY
3957                  * state).  *But* the scheduling class for the zone's 'init'
3958                  * must be explicitly passed to newproc, which doesn't
3959                  * respect pool bindings.
3960                  *
3961                  * We hold the pool_lock across the call to newproc() to
3962                  * close the obvious race: the pool's scheduling class
3963                  * could change before we manage to create the LWP with
3964                  * classid 'cid'.
3965                  */
3966                 pool_lock();
3967                 if (zone->zone_defaultcid > 0)
3968                         cid = zone->zone_defaultcid;
3969                 else
3970                         cid = pool_get_class(zone->zone_pool);
3971                 if (cid == -1)
3972                         cid = defaultcid;
3973
3974                 /*
3975                  * If this fails, zone_boot will ultimately fail.  The
3976                  * state of the zone will be set to SHUTTING_DOWN-- userland
3977                  * will have to tear down the zone, and fail, or try again.
3978                  */
3979                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3980                     minclsyspri - 1, &ct, 0)) != 0) {
3981                         mutex_enter(&zone_status_lock);
3982                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3983                         mutex_exit(&zone_status_lock);
3984                 } else {
3985                         zone->zone_boot_time = gethrestime_sec();
3986                 }
3987
3988                 pool_unlock();
3989         }
3990
3991         /*
3992          * Wait for zone_destroy() to be called.  This is what we spend
3993          * most of our life doing.
3994          */
3995         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3996
3997         if (ct)
3998                 /*
3999                  * At this point the process contract should be empty.
4000                  * (Though if it isn't, it's not the end of the world.)
4001                  */
4002                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4003
4004         /*
4005          * Allow kcred to be freed when all referring processes
4006          * (including this one) go away.  We can't just do this in
4007          * zone_free because we need to wait for the zone_cred_ref to
4008          * drop to 0 before calling zone_free, and the existence of
4009          * zone_kcred will prevent that.  Thus, we call crfree here to
4010          * balance the crdup in zone_create.  The crhold calls earlier
4011          * in zsched will be dropped when the thread and process exit.
4012          */
4013         crfree(zone->zone_kcred);
4014         zone->zone_kcred = NULL;
4015
4016         exit(CLD_EXITED, 0);
4017 }
4018
4019 /*
4020  * Count the mounts that live underneath 'rootpath', which must end in '/'.
4021  * Used to make sure the zone doesn't "inherit" any mounts from before it
4022  * is created.  The caller must hold zonehash_lock.
4023  */
4024 static uint_t
4025 zone_mount_count(const char *rootpath)
4026 {
4027         size_t rootpathlen = strlen(rootpath);
4028         uint_t nmounts = 0;
4029         vfs_t *vfsp;
4030
4031         /*
4032          * Holding zonehash_lock prevents race conditions with
4033          * vfs_list_add()/vfs_list_remove() since we serialize with
4034          * zone_find_by_path().
4035          */
4036         ASSERT(MUTEX_HELD(&zonehash_lock));
4037         /*
4038          * The rootpath must end with a '/'.  Because of that, a mount on
4039          * the rootpath itself is intentionally not counted below.
4040          */
4041         ASSERT(rootpath[rootpathlen - 1] == '/');
4042
4043         /* Walk the circular vfs list once, starting at rootvfs. */
4044         vfs_list_read_lock();
4045         vfsp = rootvfs;
4046         for (;;) {
4047                 const char *mntpt = refstr_value(vfsp->vfs_mntpt);
4048
4049                 if (strncmp(rootpath, mntpt, rootpathlen) == 0)
4050                         nmounts++;
4051                 if ((vfsp = vfsp->vfs_next) == rootvfs)
4052                         break;
4053         }
4054         vfs_list_unlock();
4055         return (nmounts);
4056 }
4057
4058 /*
4059  * Returns B_TRUE if a zone rooted at 'rootpath' would overlap the root of
4060  * another zone: either 'rootpath' is "//" (zone_set_root()'s form of "/"),
4061  * or one of 'rootpath' and the rootpath of some non-global zone on the
4062  * zone_active list is a prefix of the other.  Caller holds zonehash_lock.
4063  */
4064 static boolean_t
4065 zone_is_nested(const char *rootpath)
4066 {
4067         size_t rootpathlen = strlen(rootpath);
4068         zone_t *zp;
4069
4070         ASSERT(MUTEX_HELD(&zonehash_lock));
4071
4072         /* zone_set_root() appended '/' and '\0', so "/" arrives as "//". */
4073         if (strcmp(rootpath, "//") == 0)
4074                 return (B_TRUE);
4075
4076         zp = list_head(&zone_active);
4077         while (zp != NULL) {
4078                 if (zp != global_zone) {
4079                         size_t zlen = strlen(zp->zone_rootpath);
4080
4081                         if (strncmp(rootpath, zp->zone_rootpath,
4082                             MIN(rootpathlen, zlen)) == 0)
4083                                 return (B_TRUE);
4084                 }
4085                 zp = list_next(&zone_active, zp);
4086         }
4087         return (B_FALSE);
4088 }
4089
4090 /* Copy the caller's privilege limit set in from userland for the zone. */
4091 static int
4092 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4093     size_t zone_privssz)
4094 {
4095         priv_set_t *kprivs;
4096
4097         /* The user buffer must be able to hold a complete priv_set_t. */
4098         if (zone_privssz < sizeof (*kprivs))
4099                 return (ENOMEM);
4100
4101         kprivs = kmem_alloc(sizeof (*kprivs), KM_SLEEP);
4102         if (copyin(zone_privs, kprivs, sizeof (*kprivs)) != 0) {
4103                 kmem_free(kprivs, sizeof (*kprivs));
4104                 return (EFAULT);
4105         }
4106         zone->zone_privset = kprivs;    /* hand the copy to the zone */
4107         return (0);
4108 }
4109
4110 /*
4111  * We make creative use of nvlists to pass in rctls from userland.  The packed
4112  * list copied in from 'ubuf' is a list of the following structures:
4113  *
4114  * (name = rctl_name, value = nvpair_list_array)
4115  *
4116  * Where each element of the nvpair_list_array is of the form:
4117  *
4118  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4119  *      (name = "limit", value = uint64_t),
4120  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4121  *
4122  * Returns 0 with the unpacked list in '*nvlp' (NULL when 'buflen' is 0), or
4123  * an errno with '*nvlp' set to NULL.  Every value of every rctl is validated.
4124  */
4125 static int
4126 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4127 {
4128         nvpair_t *nvp = NULL;
4129         nvlist_t *nvl = NULL;
4130         char *kbuf;
4131         int error;
4132         rctl_val_t rv;
4133
4134         *nvlp = NULL;
4135         if (buflen == 0)
4136                 return (0);
4137
4138         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4139                 return (ENOMEM);
4140         if (copyin(ubuf, kbuf, buflen)) {
4141                 error = EFAULT;
4142                 goto out;
4143         }
4144         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4145                 /*
4146                  * nvl may have been allocated/free'd, but the value set to
4147                  * non-NULL, so we reset it here.
4148                  */
4149                 nvl = NULL;
4150                 error = EINVAL;
4151                 goto out;
4152         }
4153         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4154                 rctl_dict_entry_t *rde;
4155                 rctl_hndl_t hndl;
4156                 nvlist_t **nvlarray;
4157                 uint_t i, nelem;
4158                 char *name;
4159
4160                 error = EINVAL;
4161                 name = nvpair_name(nvp);
4162                 if (strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 ||
4163                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY)
4164                         goto out;
4165                 if ((hndl = rctl_hndl_lookup(name)) == -1)
4166                         goto out;
4167                 rde = rctl_dict_lookup_hndl(hndl);
4168                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4169                 ASSERT(error == 0);
4170                 /* Vet every value; an empty array has nothing to vet. */
4171                 for (i = 0; i < nelem; i++) {
4172                         error = nvlist2rctlval(nvlarray[i], &rv);
4173                         if (error == 0 && rctl_invalid_value(rde, &rv))
4174                                 error = EINVAL;
4175                         if (error != 0)
4176                                 goto out;
4177                 }
4178         }
4179         error = 0;
4180         *nvlp = nvl;
4181 out:
4182         kmem_free(kbuf, buflen);
4183         if (error && nvl != NULL)
4184                 nvlist_free(nvl);
4185         return (error);
4186 }
4187
4188 /* Copy er_ext out to er_out (if non-NULL), then fail with er_error. */
4189 int
4190 zone_create_error(int er_error, int er_ext, int *er_out)
4191 {
4192         int rc = er_error;
4193
4194         if (er_out != NULL && copyout(&er_ext, er_out, sizeof (er_ext)) != 0)
4195                 rc = EFAULT;    /* the extended code could not be delivered */
4196         return (set_errno(rc));
4197 }
4198
4199 /*
4200  * Parses the comma-separated list of ZFS datasets in the 'buflen' bytes at
4201  * user address 'ubuf' into the zone's dataset list; returns 0 or an errno.
4202  * The kernel copy is NUL-terminated here in case userland's buffer is not.
4203  */
4204 static int
4205 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4206 {
4207         char *kbuf;
4208         char *dataset, *next;
4209         zone_dataset_t *zd;
4210         size_t len, kbuflen;
4211
4212         if (ubuf == NULL || buflen == 0)
4213                 return (0);
4214
4215         kbuflen = buflen + 1;           /* room for our own terminator */
4216         if (kbuflen < buflen ||         /* buflen + 1 wrapped around */
4217             (kbuf = kmem_alloc(kbuflen, KM_NOSLEEP)) == NULL)
4218                 return (ENOMEM);
4219
4220         if (copyin(ubuf, kbuf, buflen) != 0) {
4221                 kmem_free(kbuf, kbuflen);
4222                 return (EFAULT);
4223         }
4224         kbuf[buflen] = '\0';    /* copyin() does not terminate for us */
4225
4226         dataset = kbuf;
4227         for (;;) {
4228                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4229                 next = strchr(dataset, ',');
4230                 if (next == NULL)
4231                         len = strlen(dataset);
4232                 else
4233                         len = next - dataset;
4234
4235                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4236                 bcopy(dataset, zd->zd_dataset, len);
4237                 zd->zd_dataset[len] = '\0';
4238                 list_insert_head(&zone->zone_datasets, zd);
4239                 if (next == NULL)
4240                         break;
4241                 dataset = next + 1;
4242         }
4243
4244         kmem_free(kbuf, kbuflen);
4245         return (0);
4246 }
4247
4248 /*
4249  * System call to create/initialize a new zone named 'zone_name', rooted
4250  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4251  * and initialized with the zone-wide rctls described in 'rctlbuf'.
4252  *
4253  * If extended error is non-null, we may use it to return more detailed
4254  * error information.
4255  */
4256 static zoneid_t
4257 zone_create(const char *zone_name, const char *zone_root,
4258     const priv_set_t *zone_privs, size_t zone_privssz,
4259     caddr_t rctlbuf, size_t rctlbufsz,
4260     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4261     int flags)
4262 {
4263         struct zsched_arg zarg;
4264         nvlist_t *rctls = NULL;
4265         proc_t *pp = curproc;
4266         zone_t *zone, *ztmp;
4267         zoneid_t zoneid, start = GLOBAL_ZONEID;
4268         int error;
4269         int error2 = 0;
4270         char *str;
4271         cred_t *zkcr;
4272
4273         if (secpolicy_zone_config(CRED()) != 0)
4274                 return (set_errno(EPERM));
4275
4276         /* can't boot zone from within chroot environment */
4277         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4278                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4279                     extended_error));
4280         /*
4281          * As the first step of zone creation, we want to allocate a zoneid.
4282          * This allocation is complicated by the fact that netstacks use the
4283          * zoneid to determine their stackid, but netstacks themselves are
4284          * freed asynchronously with respect to zone destruction.  This means
4285          * that a netstack reference leak (or in principle, an extraordinarily
4286          * long netstack reference hold) could result in a zoneid being
4287          * allocated that in fact corresponds to a stackid from an active
4288          * (referenced) netstack -- unleashing all sorts of havoc when that
4289          * netstack is actually (re)used.  (In the abstract, we might wish a
4290          * zoneid to not be deallocated until its last referencing netstack
4291          * has been released, but netstacks lack a backpointer into their
4292          * referencing zone -- and changing them to have such a pointer would
4293          * be substantial, to put it euphemistically.)  To avoid this, we
4294          * detect this condition on allocation: if we have allocated a zoneid
4295          * that corresponds to a netstack that's still in use, we warn about
4296          * it (as it is much more likely to be a reference leak than an actual
4297          * netstack reference), free it, and allocate another.  That these
4298          * identifers are allocated out of an ID space assures that we won't
4299          * see the identifier we just allocated.
4300          */
4301         for (;;) {
4302                 zoneid = id_alloc(zoneid_space);
4303
4304                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4305                         break;
4306
4307                 id_free(zoneid_space, zoneid);
4308
4309                 if (start == GLOBAL_ZONEID) {
4310                         start = zoneid;
4311                 } else if (zoneid == start) {
4312                         /*
4313                          * We have managed to iterate over the entire available
4314                          * zoneid space -- there are no identifiers available,
4315                          * presumably due to some number of leaked netstack
4316                          * references.  While it's in principle possible for us
4317                          * to continue to try, it seems wiser to give up at
4318                          * this point to warn and fail explicitly with a
4319                          * distinctive error.
4320                          */
4321                         cmn_err(CE_WARN, "zone_create() failed: all available "
4322                             "zone IDs have netstacks still in use");
4323                         return (set_errno(ENFILE));
4324                 }
4325
4326                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4327                     "netstack still in use", zoneid);
4328         }
4329
4330         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4331         zone->zone_id = zoneid;
4332         zone->zone_status = ZONE_IS_UNINITIALIZED;
4333         zone->zone_pool = pool_default;
4334         zone->zone_pool_mod = gethrtime();
4335         zone->zone_psetid = ZONE_PS_INVAL;
4336         zone->zone_ncpus = 0;
4337         zone->zone_ncpus_online = 0;
4338         zone->zone_restart_init = B_TRUE;
4339         zone->zone_brand = &native_brand;
4340         zone->zone_initname = NULL;
4341         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4342         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4343         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4344         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4345         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4346             offsetof(zone_ref_t, zref_linkage));
4347         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4348             offsetof(struct zsd_entry, zsd_linkage));
4349         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4350             offsetof(zone_dataset_t, zd_linkage));
4351         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4352             offsetof(zone_dl_t, zdl_linkage));
4353         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4354
4355         if (flags & ZCF_NET_EXCL) {
4356                 zone->zone_flags |= ZF_NET_EXCL;
4357         }
4358
4359         if ((error = zone_set_name(zone, zone_name)) != 0) {
4360                 zone_free(zone);
4361                 return (zone_create_error(error, 0, extended_error));
4362         }
4363
4364         if ((error = zone_set_root(zone, zone_root)) != 0) {
4365                 zone_free(zone);
4366                 return (zone_create_error(error, 0, extended_error));
4367         }
4368         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4369                 zone_free(zone);
4370                 return (zone_create_error(error, 0, extended_error));
4371         }
4372
4373         /* initialize node name to be the same as zone name */
4374         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4375         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4376         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4377
4378         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4379         zone->zone_domain[0] = '\0';
4380         zone->zone_hostid = HW_INVALID_HOSTID;
4381         zone->zone_shares = 1;
4382         zone->zone_shmmax = 0;
4383         zone->zone_ipc.ipcq_shmmni = 0;
4384         zone->zone_ipc.ipcq_semmni = 0;
4385         zone->zone_ipc.ipcq_msgmni = 0;
4386         zone->zone_bootargs = NULL;
4387         zone->zone_fs_allowed = NULL;
4388
4389         secflags_zero(&zone0.zone_secflags.psf_lower);
4390         secflags_zero(&zone0.zone_secflags.psf_effective);
4391         secflags_zero(&zone0.zone_secflags.psf_inherit);
4392         secflags_fullset(&zone0.zone_secflags.psf_upper);
4393
4394         zone->zone_initname =
4395             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4396         (void) strcpy(zone->zone_initname, zone_default_initname);
4397         zone->zone_nlwps = 0;
4398         zone->zone_nlwps_ctl = INT_MAX;
4399         zone->zone_nprocs = 0;
4400         zone->zone_nprocs_ctl = INT_MAX;
4401         zone->zone_locked_mem = 0;
4402         zone->zone_locked_mem_ctl = UINT64_MAX;
4403         zone->zone_max_swap = 0;
4404         zone->zone_max_swap_ctl = UINT64_MAX;
4405         zone->zone_max_lofi = 0;
4406         zone->zone_max_lofi_ctl = UINT64_MAX;
4407         zone0.zone_lockedmem_kstat = NULL;
4408         zone0.zone_swapresv_kstat = NULL;
4409
4410         /*
4411          * Zsched initializes the rctls.
4412          */
4413         zone->zone_rctls = NULL;
4414
4415         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4416                 zone_free(zone);
4417                 return (zone_create_error(error, 0, extended_error));
4418         }
4419
4420         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4421                 zone_free(zone);
4422                 return (set_errno(error));
4423         }
4424
4425         /*
4426          * Stop all lwps since that's what normally happens as part of fork().
4427          * This needs to happen before we grab any locks to avoid deadlock
4428          * (another lwp in the process could be waiting for the held lock).
4429          */
4430         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4431                 zone_free(zone);
4432                 nvlist_free(rctls);
4433                 return (zone_create_error(error, 0, extended_error));
4434         }
4435
4436         if (block_mounts(zone) == 0) {
4437                 mutex_enter(&pp->p_lock);
4438                 if (curthread != pp->p_agenttp)
4439                         continuelwps(pp);
4440                 mutex_exit(&pp->p_lock);
4441                 zone_free(zone);
4442                 nvlist_free(rctls);
4443                 return (zone_create_error(error, 0, extended_error));
4444         }
4445
4446         /*
4447          * Set up credential for kernel access.  After this, any errors
4448          * should go through the dance in errout rather than calling
4449          * zone_free directly.
4450          */
4451         zone->zone_kcred = crdup(kcred);
4452         crsetzone(zone->zone_kcred, zone);
4453         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4454         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4455         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4456         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4457
4458         mutex_enter(&zonehash_lock);
4459         /*
4460          * Make sure zone doesn't already exist.
4461          */
4462         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) {
4463                 zone_status_t status;
4464
4465                 status = zone_status_get(ztmp);
4466                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4467                         error = EEXIST;
4468                 else
4469                         error = EBUSY;
4470
4471                 goto errout;
4472         }
4473
4474         /*
4475          * Don't allow zone creations which would cause one zone's rootpath to
4476          * be accessible from that of another (non-global) zone.
4477          */
4478         if (zone_is_nested(zone->zone_rootpath)) {
4479                 error = EBUSY;
4480                 goto errout;
4481         }
4482
4483         ASSERT(zonecount != 0);         /* check for leaks */
4484         if (zonecount + 1 > maxzones) {
4485                 error = ENOMEM;
4486                 goto errout;
4487         }
4488
4489         if (zone_mount_count(zone->zone_rootpath) != 0) {
4490                 error = EBUSY;
4491                 error2 = ZE_AREMOUNTS;
4492                 goto errout;
4493         }
4494
4495         /*
4496          * Zone is still incomplete, but we need to drop all locks while
4497          * zsched() initializes this zone's kernel process.  We
4498          * optimistically add the zone to the hashtable and associated
4499          * lists so a parallel zone_create() doesn't try to create the
4500          * same zone.
4501          */
4502         zonecount++;
4503         (void) mod_hash_insert(zonehashbyid,
4504             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4505             (mod_hash_val_t)(uintptr_t)zone);
4506         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4507         (void) strcpy(str, zone->zone_name);
4508         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4509             (mod_hash_val_t)(uintptr_t)zone);
4510
4511         /*
4512          * Insert into active list.  At this point there are no 'hold's
4513          * on the zone, but everyone else knows not to use it, so we can
4514          * continue to use it.  zsched() will do a zone_hold() if the
4515          * newproc() is successful.
4516          */
4517         list_insert_tail(&zone_active, zone);
4518         mutex_exit(&zonehash_lock);
4519
4520         zarg.zone = zone;
4521         zarg.nvlist = rctls;
4522         /*
4523          * The process, task, and project rctls are probably wrong;
4524          * we need an interface to get the default values of all rctls,
4525          * and initialize zsched appropriately.  I'm not sure that that
4526          * makes much of a difference, though.
4527          */
4528         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4529         if (error != 0) {
4530                 /*
4531                  * We need to undo all globally visible state.
4532                  */
4533                 mutex_enter(&zonehash_lock);
4534                 list_remove(&zone_active, zone);
4535                 (void) mod_hash_destroy(zonehashbyname,
4536                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4537                 (void) mod_hash_destroy(zonehashbyid,
4538                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4539                 ASSERT(zonecount > 1);
4540                 zonecount--;
4541                 goto errout;
4542         }
4543
4544         /*
4545          * Zone creation can't fail from now on.
4546          */
4547
4548         /*
4549          * Create zone kstats
4550          */
4551         zone_kstat_create(zone);
4552
4553         /*
4554          * Let the other lwps continue.
4555          */
4556         mutex_enter(&pp->p_lock);
4557         if (curthread != pp->p_agenttp)
4558                 continuelwps(pp);
4559         mutex_exit(&pp->p_lock);
4560
4561         /*
4562          * Wait for zsched to finish initializing the zone.
4563          */
4564         zone_status_wait(zone, ZONE_IS_READY);
4565         /*
4566          * The zone is fully visible, so we can let mounts progress.
4567          */
4568         resume_mounts(zone);
4569         nvlist_free(rctls);
4570
4571         return (zoneid);
4572
4573 errout:
4574         mutex_exit(&zonehash_lock);
4575         /*
4576          * Let the other lwps continue.
4577          */
4578         mutex_enter(&pp->p_lock);
4579         if (curthread != pp->p_agenttp)
4580                 continuelwps(pp);
4581         mutex_exit(&pp->p_lock);
4582
4583         resume_mounts(zone);
4584         nvlist_free(rctls);
4585         /*
4586          * There is currently one reference to the zone, a cred_ref from
4587          * zone_kcred.  To free the zone, we call crfree, which will call
4588          * zone_cred_rele, which will call zone_free.
4589          */
4590         ASSERT(zone->zone_cred_ref == 1);
4591         ASSERT(zone->zone_kcred->cr_ref == 1);
4592         ASSERT(zone->zone_ref == 0);
4593         zkcr = zone->zone_kcred;
4594         zone->zone_kcred = NULL;
4595         crfree(zkcr);                           /* triggers call to zone_free */
4596         return (zone_create_error(error, error2, extended_error));
4597 }
4598
4599 /*
4600  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4601  * the heavy lifting.  initname is the path to the program to launch
4602  * at the "top" of the zone; if this is NULL, we use the system default,
4603  * which is stored at zone_default_initname.
4604  */
4605 static int
4606 zone_boot(zoneid_t zoneid)
4607 {
4608         int err;
4609         zone_t *zone;
4610
4611         if (secpolicy_zone_config(CRED()) != 0)
4612                 return (set_errno(EPERM));
4613         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4614                 return (set_errno(EINVAL));
4615
4616         mutex_enter(&zonehash_lock);
4617         /*
4618          * Look for zone under hash lock to prevent races with calls to
4619          * zone_shutdown, zone_destroy, etc.
4620          */
4621         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4622                 mutex_exit(&zonehash_lock);
4623                 return (set_errno(EINVAL));
4624         }
4625
4626         mutex_enter(&zone_status_lock);
4627         if (zone_status_get(zone) != ZONE_IS_READY) {
4628                 mutex_exit(&zone_status_lock);
4629                 mutex_exit(&zonehash_lock);
4630                 return (set_errno(EINVAL));
4631         }
4632         zone_status_set(zone, ZONE_IS_BOOTING);
4633         mutex_exit(&zone_status_lock);
4634
4635         zone_hold(zone);        /* so we can use the zone_t later */
4636         mutex_exit(&zonehash_lock);
4637
4638         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4639                 zone_rele(zone);
4640                 return (set_errno(EINTR));
4641         }
4642
4643         /*
4644          * Boot (starting init) might have failed, in which case the zone
4645          * will go to the SHUTTING_DOWN state; an appropriate errno will
4646          * be placed in zone->zone_boot_err, and so we return that.
4647          */
4648         err = zone->zone_boot_err;
4649         zone_rele(zone);
4650         return (err ? set_errno(err) : 0);
4651 }
4652
4653 /*
4654  * Kills all user processes in the zone, waiting for them all to exit
4655  * before returning.
4656  */
4657 static int
4658 zone_empty(zone_t *zone)
4659 {
4660         int waitstatus;
4661
4662         /*
4663          * We need to drop zonehash_lock before killing all
4664          * processes, otherwise we'll deadlock with zone_find_*
4665          * which can be called from the exit path.
4666          */
4667         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4668         while ((waitstatus = zone_status_timedwait_sig(zone,
4669             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4670                 killall(zone->zone_id);
4671         }
4672         /*
4673          * return EINTR if we were signaled
4674          */
4675         if (waitstatus == 0)
4676                 return (EINTR);
4677         return (0);
4678 }
4679
4680 /*
4681  * This function implements the policy for zone visibility. A non-global zone
4682  * can only see itself.
4683  *
4684  * Returns true if zone attributes are viewable, false otherwise.
4685  */
4686 static boolean_t
4687 zone_list_access(zone_t *zone)
4688 {
4689
4690         if (curproc->p_zone == global_zone ||
4691             curproc->p_zone == zone) {
4692                 return (B_TRUE);
4693         } else {
4694                 return (B_FALSE);
4695         }
4696 }
4697
4698 /*
4699  * Systemcall to start the zone's halt sequence.  By the time this
4700  * function successfully returns, all user processes and kernel threads
4701  * executing in it will have exited, ZSD shutdown callbacks executed,
4702  * and the zone status set to ZONE_IS_DOWN.
4703  *
4704  * It is possible that the call will interrupt itself if the caller is the
4705  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4706  */
4707 static int
4708 zone_shutdown(zoneid_t zoneid)
4709 {
4710         int error;
4711         zone_t *zone;
4712         zone_status_t status;
4713
4714         if (secpolicy_zone_config(CRED()) != 0)
4715                 return (set_errno(EPERM));
4716         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4717                 return (set_errno(EINVAL));
4718
4719         mutex_enter(&zonehash_lock);
4720         /*
4721          * Look for zone under hash lock to prevent races with other
4722          * calls to zone_shutdown and zone_destroy.
4723          */
4724         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4725                 mutex_exit(&zonehash_lock);
4726                 return (set_errno(EINVAL));
4727         }
4728
4729         /*
4730          * We have to drop zonehash_lock before calling block_mounts.
4731          * Hold the zone so we can continue to use the zone_t.
4732          */
4733         zone_hold(zone);
4734         mutex_exit(&zonehash_lock);
4735
4736         /*
4737          * Block mounts so that VFS_MOUNT() can get an accurate view of
4738          * the zone's status with regards to ZONE_IS_SHUTTING down.
4739          *
4740          * e.g. NFS can fail the mount if it determines that the zone
4741          * has already begun the shutdown sequence.
4742          *
4743          */
4744         if (block_mounts(zone) == 0) {
4745                 zone_rele(zone);
4746                 return (set_errno(EINTR));
4747         }
4748
4749         mutex_enter(&zonehash_lock);
4750         mutex_enter(&zone_status_lock);
4751         status = zone_status_get(zone);
4752         /*
4753          * Fail if the zone isn't fully initialized yet.
4754          */
4755         if (status < ZONE_IS_READY) {
4756                 mutex_exit(&zone_status_lock);
4757                 mutex_exit(&zonehash_lock);
4758                 resume_mounts(zone);
4759                 zone_rele(zone);
4760                 return (set_errno(EINVAL));
4761         }
4762         /*
4763          * If conditions required for zone_shutdown() to return have been met,
4764          * return success.
4765          */
4766         if (status >= ZONE_IS_DOWN) {
4767                 mutex_exit(&zone_status_lock);
4768                 mutex_exit(&zonehash_lock);
4769                 resume_mounts(zone);
4770                 zone_rele(zone);
4771                 return (0);
4772         }
4773         /*
4774          * If zone_shutdown() hasn't been called before, go through the motions.
4775          * If it has, there's nothing to do but wait for the kernel threads to
4776          * drain.
4777          */
4778         if (status < ZONE_IS_EMPTY) {
4779                 uint_t ntasks;
4780
4781                 mutex_enter(&zone->zone_lock);
4782                 if ((ntasks = zone->zone_ntasks) != 1) {
4783                         /*
4784                          * There's still stuff running.
4785                          */
4786                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4787                 }
4788                 mutex_exit(&zone->zone_lock);
4789                 if (ntasks == 1) {
4790                         /*
4791                          * The only way to create another task is through
4792                          * zone_enter(), which will block until we drop
4793                          * zonehash_lock.  The zone is empty.
4794                          */
4795                         if (zone->zone_kthreads == NULL) {
4796                                 /*
4797                                  * Skip ahead to ZONE_IS_DOWN
4798                                  */
4799                                 zone_status_set(zone, ZONE_IS_DOWN);
4800                         } else {
4801                                 zone_status_set(zone, ZONE_IS_EMPTY);
4802                         }
4803                 }
4804         }
4805         mutex_exit(&zone_status_lock);
4806         mutex_exit(&zonehash_lock);
4807         resume_mounts(zone);
4808
4809         if (error = zone_empty(zone)) {
4810                 zone_rele(zone);
4811                 return (set_errno(error));
4812         }
4813         /*
4814          * After the zone status goes to ZONE_IS_DOWN this zone will no
4815          * longer be notified of changes to the pools configuration, so
4816          * in order to not end up with a stale pool pointer, we point
4817          * ourselves at the default pool and remove all resource
4818          * visibility.  This is especially important as the zone_t may
4819          * languish on the deathrow for a very long time waiting for
4820          * cred's to drain out.
4821          *
4822          * This rebinding of the zone can happen multiple times
4823          * (presumably due to interrupted or parallel systemcalls)
4824          * without any adverse effects.
4825          */
4826         if (pool_lock_intr() != 0) {
4827                 zone_rele(zone);
4828                 return (set_errno(EINTR));
4829         }
4830         if (pool_state == POOL_ENABLED) {
4831                 mutex_enter(&cpu_lock);
4832                 zone_pool_set(zone, pool_default);
4833                 /*
4834                  * The zone no longer needs to be able to see any cpus.
4835                  */
4836                 zone_pset_set(zone, ZONE_PS_INVAL);
4837                 mutex_exit(&cpu_lock);
4838         }
4839         pool_unlock();
4840
4841         /*
4842          * ZSD shutdown callbacks can be executed multiple times, hence
4843          * it is safe to not be holding any locks across this call.
4844          */
4845         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4846
4847         mutex_enter(&zone_status_lock);
4848         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4849                 zone_status_set(zone, ZONE_IS_DOWN);
4850         mutex_exit(&zone_status_lock);
4851
4852         /*
4853          * Wait for kernel threads to drain.
4854          */
4855         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4856                 zone_rele(zone);
4857                 return (set_errno(EINTR));
4858         }
4859
4860         /*
4861          * Zone can be become down/destroyable even if the above wait
4862          * returns EINTR, so any code added here may never execute.
4863          * (i.e. don't add code here)
4864          */
4865
4866         zone_rele(zone);
4867         return (0);
4868 }
4869
4870 /*
4871  * Log the specified zone's reference counts.  The caller should not be
4872  * holding the zone's zone_lock.
4873  */
4874 static void
4875 zone_log_refcounts(zone_t *zone)
4876 {
4877         char *buffer;
4878         char *buffer_position;
4879         uint32_t buffer_size;
4880         uint32_t index;
4881         uint_t ref;
4882         uint_t cred_ref;
4883
4884         /*
4885          * Construct a string representing the subsystem-specific reference
4886          * counts.  The counts are printed in ascending order by index into the
4887          * zone_t::zone_subsys_ref array.  The list will be surrounded by
4888          * square brackets [] and will only contain nonzero reference counts.
4889          *
4890          * The buffer will hold two square bracket characters plus ten digits,
4891          * one colon, one space, one comma, and some characters for a
4892          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4893          * bit integers have at most ten decimal digits.)  The last
4894          * reference count's comma is replaced by the closing square
4895          * bracket and a NULL character to terminate the string.
4896          *
4897          * NOTE: We have to grab the zone's zone_lock to create a consistent
4898          * snapshot of the zone's reference counters.
4899          *
4900          * First, figure out how much space the string buffer will need.
4901          * The buffer's size is stored in buffer_size.
4902          */
4903         buffer_size = 2;                        /* for the square brackets */
4904         mutex_enter(&zone->zone_lock);
4905         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4906         ref = zone->zone_ref;
4907         cred_ref = zone->zone_cred_ref;
4908         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4909                 if (zone->zone_subsys_ref[index] != 0)
4910                         buffer_size += strlen(zone_ref_subsys_names[index]) +
4911                             13;
4912         if (buffer_size == 2) {
4913                 /*
4914                  * No subsystems had nonzero reference counts.  Don't bother
4915                  * with allocating a buffer; just log the general-purpose and
4916                  * credential reference counts.
4917                  */
4918                 mutex_exit(&zone->zone_lock);
4919                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4920                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
4921                     "references and %u credential references are still extant",
4922                     zone->zone_name, zone->zone_id, ref, cred_ref);
4923                 return;
4924         }
4925
4926         /*
4927          * buffer_size contains the exact number of characters that the
4928          * buffer will need.  Allocate the buffer and fill it with nonzero
4929          * subsystem-specific reference counts.  Surround the results with
4930          * square brackets afterwards.
4931          */
4932         buffer = kmem_alloc(buffer_size, KM_SLEEP);
4933         buffer_position = &buffer[1];
4934         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4935                 /*
4936                  * NOTE: The DDI's version of sprintf() returns a pointer to
4937                  * the modified buffer rather than the number of bytes written
4938                  * (as in snprintf(3C)).  This is unfortunate and annoying.
4939                  * Therefore, we'll use snprintf() with INT_MAX to get the
4940                  * number of bytes written.  Using INT_MAX is safe because
4941                  * the buffer is perfectly sized for the data: we'll never
4942                  * overrun the buffer.
4943                  */
4944                 if (zone->zone_subsys_ref[index] != 0)
4945                         buffer_position += snprintf(buffer_position, INT_MAX,
4946                             "%s: %u,", zone_ref_subsys_names[index],
4947                             zone->zone_subsys_ref[index]);
4948         }
4949         mutex_exit(&zone->zone_lock);
4950         buffer[0] = '[';
4951         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4952         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4953         buffer_position[-1] = ']';
4954
4955         /*
4956          * Log the reference counts and free the message buffer.
4957          */
4958         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4959             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4960             "%u credential references are still extant %s", zone->zone_name,
4961             zone->zone_id, ref, cred_ref, buffer);
4962         kmem_free(buffer, buffer_size);
4963 }
4964
4965 /*
4966  * Systemcall entry point to finalize the zone halt process.  The caller
4967  * must have already successfully called zone_shutdown().
4968  *
4969  * Upon successful completion, the zone will have been fully destroyed:
4970  * zsched will have exited, destructor callbacks executed, and the zone
4971  * removed from the list of active zones.
4972  */
4973 static int
4974 zone_destroy(zoneid_t zoneid)
4975 {
4976         uint64_t uniqid;
4977         zone_t *zone;
4978         zone_status_t status;
4979         clock_t wait_time;
4980         boolean_t log_refcounts;
4981
4982         if (secpolicy_zone_config(CRED()) != 0)
4983                 return (set_errno(EPERM));
4984         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4985                 return (set_errno(EINVAL));
4986
4987         mutex_enter(&zonehash_lock);
4988         /*
4989          * Look for zone under hash lock to prevent races with other
4990          * calls to zone_destroy.
4991          */
4992         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4993                 mutex_exit(&zonehash_lock);
4994                 return (set_errno(EINVAL));
4995         }
4996
4997         if (zone_mount_count(zone->zone_rootpath) != 0) {
4998                 mutex_exit(&zonehash_lock);
4999                 return (set_errno(EBUSY));
5000         }
5001         mutex_enter(&zone_status_lock);
5002         status = zone_status_get(zone);
5003         if (status < ZONE_IS_DOWN) {
5004                 mutex_exit(&zone_status_lock);
5005                 mutex_exit(&zonehash_lock);
5006                 return (set_errno(EBUSY));
5007         } else if (status == ZONE_IS_DOWN) {
5008                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5009         }
5010         mutex_exit(&zone_status_lock);
5011         zone_hold(zone);
5012         mutex_exit(&zonehash_lock);
5013
5014         /*
5015          * wait for zsched to exit
5016          */
5017         zone_status_wait(zone, ZONE_IS_DEAD);
5018         zone_zsd_callbacks(zone, ZSD_DESTROY);
5019         zone->zone_netstack = NULL;
5020         uniqid = zone->zone_uniqid;
5021         zone_rele(zone);
5022         zone = NULL;    /* potentially free'd */
5023
5024         log_refcounts = B_FALSE;
5025         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5026         mutex_enter(&zonehash_lock);
5027         for (; /* ever */; ) {
5028                 boolean_t unref;
5029                 boolean_t refs_have_been_logged;
5030
5031                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5032                     zone->zone_uniqid != uniqid) {
5033                         /*
5034                          * The zone has gone away.  Necessary conditions
5035                          * are met, so we return success.
5036                          */
5037                         mutex_exit(&zonehash_lock);
5038                         return (0);
5039                 }
5040                 mutex_enter(&zone->zone_lock);
5041                 unref = ZONE_IS_UNREF(zone);
5042                 refs_have_been_logged = (zone->zone_flags &
5043                     ZF_REFCOUNTS_LOGGED);
5044                 mutex_exit(&zone->zone_lock);
5045                 if (unref) {
5046                         /*
5047                          * There is only one reference to the zone -- that
5048                          * added when the zone was added to the hashtables --
5049                          * and things will remain this way until we drop
5050                          * zonehash_lock... we can go ahead and cleanup the
5051                          * zone.
5052                          */
5053                         break;
5054                 }
5055
5056                 /*
5057                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5058                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5059                  * some zone's general-purpose reference count reaches one.
5060                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5061                  * on zone_destroy_cv, then log the zone's reference counts and
5062                  * continue to wait for zone_rele() and zone_cred_rele().
5063                  */
5064                 if (!refs_have_been_logged) {
5065                         if (!log_refcounts) {
5066                                 /*
5067                                  * This thread hasn't timed out waiting on
5068                                  * zone_destroy_cv yet.  Wait wait_time clock
5069                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5070                                  * seconds) for the zone's references to clear.
5071                                  */
5072                                 ASSERT(wait_time > 0);
5073                                 wait_time = cv_reltimedwait_sig(
5074                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5075                                     TR_SEC);
5076                                 if (wait_time > 0) {
5077                                         /*
5078                                          * A thread in zone_rele() or
5079                                          * zone_cred_rele() signaled
5080                                          * zone_destroy_cv before this thread's
5081                                          * wait timed out.  The zone might have
5082                                          * only one reference left; find out!
5083                                          */
5084                                         continue;
5085                                 } else if (wait_time == 0) {
5086                                         /* The thread's process was signaled. */
5087                                         mutex_exit(&zonehash_lock);
5088                                         return (set_errno(EINTR));
5089                                 }
5090
5091                                 /*
5092                                  * The thread timed out while waiting on
5093                                  * zone_destroy_cv.  Even though the thread
5094                                  * timed out, it has to check whether another
5095                                  * thread woke up from zone_destroy_cv and
5096                                  * destroyed the zone.
5097                                  *
5098                                  * If the zone still exists and has more than
5099                                  * one unreleased general-purpose reference,
5100                                  * then log the zone's reference counts.
5101                                  */
5102                                 log_refcounts = B_TRUE;
5103                                 continue;
5104                         }
5105
5106                         /*
5107                          * The thread already timed out on zone_destroy_cv while
5108                          * waiting for subsystems to release the zone's last
5109                          * general-purpose references.  Log the zone's reference
5110                          * counts and wait indefinitely on zone_destroy_cv.
5111                          */
5112                         zone_log_refcounts(zone);
5113                 }
5114                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5115                         /* The thread's process was signaled. */
5116                         mutex_exit(&zonehash_lock);
5117                         return (set_errno(EINTR));
5118                 }
5119         }
5120
5121         /*
5122          * Remove CPU cap for this zone now since we're not going to
5123          * fail below this point.
5124          */
5125         cpucaps_zone_remove(zone);
5126
5127         /* Get rid of the zone's kstats */
5128         zone_kstat_delete(zone);
5129
5130         /* remove the pfexecd doors */
5131         if (zone->zone_pfexecd != NULL) {
5132                 klpd_freelist(&zone->zone_pfexecd);
5133                 zone->zone_pfexecd = NULL;
5134         }
5135
5136         /* free brand specific data */
5137         if (ZONE_IS_BRANDED(zone))
5138                 ZBROP(zone)->b_free_brand_data(zone);
5139
5140         /* Say goodbye to brand framework. */
5141         brand_unregister_zone(zone->zone_brand);
5142
5143         /*
5144          * It is now safe to let the zone be recreated; remove it from the
5145          * lists.  The memory will not be freed until the last cred
5146          * reference goes away.
5147          */
5148         ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5149         zonecount--;
5150         /* remove from active list and hash tables */
5151         list_remove(&zone_active, zone);
5152         (void) mod_hash_destroy(zonehashbyname,
5153             (mod_hash_key_t)zone->zone_name);
5154         (void) mod_hash_destroy(zonehashbyid,
5155             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5156         mutex_exit(&zonehash_lock);
5157
5158         /*
5159          * Release the root vnode; we're not using it anymore.  Nor should any
5160          * other thread that might access it exist.
5161          */
5162         if (zone->zone_rootvp != NULL) {
5163                 VN_RELE(zone->zone_rootvp);
5164                 zone->zone_rootvp = NULL;
5165         }
5166
5167         /* add to deathrow list */
5168         mutex_enter(&zone_deathrow_lock);
5169         list_insert_tail(&zone_deathrow, zone);
5170         mutex_exit(&zone_deathrow_lock);
5171
5172         /*
5173          * Drop last reference (which was added by zsched()), this will
5174          * free the zone unless there are outstanding cred references.
5175          */
5176         zone_rele(zone);
5177         return (0);
5178 }
5179
5180 /*
5181  * Systemcall entry point for zone_getattr(2).
5182  */
5183 static ssize_t
5184 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5185 {
5186         size_t size;
5187         int error = 0, err;
5188         zone_t *zone;
5189         char *zonepath;
5190         char *outstr;
5191         zone_status_t zone_status;
5192         pid_t initpid;
5193         boolean_t global = (curzone == global_zone);
5194         boolean_t inzone = (curzone->zone_id == zoneid);
5195         ushort_t flags;
5196         zone_net_data_t *zbuf;
5197
5198         mutex_enter(&zonehash_lock);
5199         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5200                 mutex_exit(&zonehash_lock);
5201                 return (set_errno(EINVAL));
5202         }
5203         zone_status = zone_status_get(zone);
5204         if (zone_status < ZONE_IS_INITIALIZED) {
5205                 mutex_exit(&zonehash_lock);
5206                 return (set_errno(EINVAL));
5207         }
5208         zone_hold(zone);
5209         mutex_exit(&zonehash_lock);
5210
5211         /*
5212          * If not in the global zone, don't show information about other zones.
5213          */
5214         if (!zone_list_access(zone)) {
5215                 zone_rele(zone);
5216                 return (set_errno(EINVAL));
5217         }
5218
5219         switch (attr) {
5220         case ZONE_ATTR_ROOT:
5221                 if (global) {
5222                         /*
5223                          * Copy the path to trim the trailing "/" (except for
5224                          * the global zone).
5225                          */
5226                         if (zone != global_zone)
5227                                 size = zone->zone_rootpathlen - 1;
5228                         else
5229                                 size = zone->zone_rootpathlen;
5230                         zonepath = kmem_alloc(size, KM_SLEEP);
5231                         bcopy(zone->zone_rootpath, zonepath, size);
5232                         zonepath[size - 1] = '\0';
5233                 } else {
5234                         if (inzone) {
5235                                 /*
5236                                  * Caller is not in the global zone.  if the
5237                                  * query is on the current zone just return
5238                                  * faked-up path for current zone.
5239                                  */
5240                                 zonepath = "/";
5241                                 size = 2;
5242                         } else {
5243                                 /*
5244                                  * Return related path for current zone.
5245                                  */
5246                                 int prefix_len = strlen(zone_prefix);
5247                                 int zname_len = strlen(zone->zone_name);
5248
5249                                 size = prefix_len + zname_len + 1;
5250                                 zonepath = kmem_alloc(size, KM_SLEEP);
5251                                 bcopy(zone_prefix, zonepath, prefix_len);
5252                                 bcopy(zone->zone_name, zonepath +
5253                                     prefix_len, zname_len);
5254                                 zonepath[size - 1] = '\0';
5255                         }
5256                 }
5257                 if (bufsize > size)
5258                         bufsize = size;
5259                 if (buf != NULL) {
5260                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5261                         if (err != 0 && err != ENAMETOOLONG)
5262                                 error = EFAULT;
5263                 }
5264                 if (global)
5265                         kmem_free(zonepath, size);
5266                 break;
5267
5268         case ZONE_ATTR_NAME:
5269                 size = strlen(zone->zone_name) + 1;
5270                 if (bufsize > size)
5271                         bufsize = size;
5272                 if (buf != NULL) {
5273                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5274                         if (err != 0 && err != ENAMETOOLONG)
5275                                 error = EFAULT;
5276                 }
5277                 break;
5278
5279         case ZONE_ATTR_STATUS:
5280                 /*
5281                  * Since we're not holding zonehash_lock, the zone status
5282                  * may be anything; leave it up to userland to sort it out.
5283                  */
5284                 size = sizeof (zone_status);
5285                 if (bufsize > size)
5286                         bufsize = size;
5287                 zone_status = zone_status_get(zone);
5288                 if (buf != NULL &&
5289                     copyout(&zone_status, buf, bufsize) != 0)
5290                         error = EFAULT;
5291                 break;
5292         case ZONE_ATTR_FLAGS:
5293                 size = sizeof (zone->zone_flags);
5294                 if (bufsize > size)
5295                         bufsize = size;
5296                 flags = zone->zone_flags;
5297                 if (buf != NULL &&
5298                     copyout(&flags, buf, bufsize) != 0)
5299                         error = EFAULT;
5300                 break;
5301         case ZONE_ATTR_PRIVSET:
5302                 size = sizeof (priv_set_t);
5303                 if (bufsize > size)
5304                         bufsize = size;
5305                 if (buf != NULL &&
5306                     copyout(zone->zone_privset, buf, bufsize) != 0)
5307                         error = EFAULT;
5308                 break;
5309         case ZONE_ATTR_UNIQID:
5310                 size = sizeof (zone->zone_uniqid);
5311                 if (bufsize > size)
5312                         bufsize = size;
5313                 if (buf != NULL &&
5314                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5315                         error = EFAULT;
5316                 break;
5317         case ZONE_ATTR_POOLID:
5318                 {
5319                         pool_t *pool;
5320                         poolid_t poolid;
5321
5322                         if (pool_lock_intr() != 0) {
5323                                 error = EINTR;
5324                                 break;
5325                         }
5326                         pool = zone_pool_get(zone);
5327                         poolid = pool->pool_id;
5328                         pool_unlock();
5329                         size = sizeof (poolid);
5330                         if (bufsize > size)
5331                                 bufsize = size;
5332                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5333                                 error = EFAULT;
5334                 }
5335                 break;
5336         case ZONE_ATTR_INITPID:
5337                 size = sizeof (initpid);
5338                 if (bufsize > size)
5339                         bufsize = size;
5340                 initpid = zone->zone_proc_initpid;
5341                 if (initpid == -1) {
5342                         error = ESRCH;
5343                         break;
5344                 }
5345                 if (buf != NULL &&
5346                     copyout(&initpid, buf, bufsize) != 0)
5347                         error = EFAULT;
5348                 break;
5349         case ZONE_ATTR_BRAND:
5350                 size = strlen(zone->zone_brand->b_name) + 1;
5351
5352                 if (bufsize > size)
5353                         bufsize = size;
5354                 if (buf != NULL) {
5355                         err = copyoutstr(zone->zone_brand->b_name, buf,
5356                             bufsize, NULL);
5357                         if (err != 0 && err != ENAMETOOLONG)
5358                                 error = EFAULT;
5359                 }
5360                 break;
5361         case ZONE_ATTR_INITNAME:
5362                 size = strlen(zone->zone_initname) + 1;
5363                 if (bufsize > size)
5364                         bufsize = size;
5365                 if (buf != NULL) {
5366                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5367                             NULL);
5368                         if (err != 0 && err != ENAMETOOLONG)
5369                                 error = EFAULT;
5370                 }
5371                 break;
5372         case ZONE_ATTR_BOOTARGS:
5373                 if (zone->zone_bootargs == NULL)
5374                         outstr = "";
5375                 else
5376                         outstr = zone->zone_bootargs;
5377                 size = strlen(outstr) + 1;
5378                 if (bufsize > size)
5379                         bufsize = size;
5380                 if (buf != NULL) {
5381                         err = copyoutstr(outstr, buf, bufsize, NULL);
5382                         if (err != 0 && err != ENAMETOOLONG)
5383                                 error = EFAULT;
5384                 }
5385                 break;
5386         case ZONE_ATTR_PHYS_MCAP:
5387                 size = sizeof (zone->zone_phys_mcap);
5388                 if (bufsize > size)
5389                         bufsize = size;
5390                 if (buf != NULL &&
5391                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5392                         error = EFAULT;
5393                 break;
5394         case ZONE_ATTR_SCHED_CLASS:
5395                 mutex_enter(&class_lock);
5396
5397                 if (zone->zone_defaultcid >= loaded_classes)
5398                         outstr = "";
5399                 else
5400                         outstr = sclass[zone->zone_defaultcid].cl_name;
5401                 size = strlen(outstr) + 1;
5402                 if (bufsize > size)
5403                         bufsize = size;
5404                 if (buf != NULL) {
5405                         err = copyoutstr(outstr, buf, bufsize, NULL);
5406                         if (err != 0 && err != ENAMETOOLONG)
5407                                 error = EFAULT;
5408                 }
5409
5410                 mutex_exit(&class_lock);
5411                 break;
5412         case ZONE_ATTR_HOSTID:
5413                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5414                     bufsize == sizeof (zone->zone_hostid)) {
5415                         size = sizeof (zone->zone_hostid);
5416                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5417                             bufsize) != 0)
5418                                 error = EFAULT;
5419                 } else {
5420                         error = EINVAL;
5421                 }
5422                 break;
5423         case ZONE_ATTR_FS_ALLOWED:
5424                 if (zone->zone_fs_allowed == NULL)
5425                         outstr = "";
5426                 else
5427                         outstr = zone->zone_fs_allowed;
5428                 size = strlen(outstr) + 1;
5429                 if (bufsize > size)
5430                         bufsize = size;
5431                 if (buf != NULL) {
5432                         err = copyoutstr(outstr, buf, bufsize, NULL);
5433                         if (err != 0 && err != ENAMETOOLONG)
5434                                 error = EFAULT;
5435                 }
5436                 break;
5437         case ZONE_ATTR_SECFLAGS:
5438                 size = sizeof (zone->zone_secflags);
5439                 if (bufsize > size)
5440                         bufsize = size;
5441                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5442                         error = EFAULT;
5443                 break;
5444         case ZONE_ATTR_NETWORK:
5445                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5446                 if (copyin(buf, zbuf, bufsize) != 0) {
5447                         error = EFAULT;
5448                 } else {
5449                         error = zone_get_network(zoneid, zbuf);
5450                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5451                                 error = EFAULT;
5452                 }
5453                 kmem_free(zbuf, bufsize);
5454                 break;
5455         default:
5456                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5457                         size = bufsize;
5458                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5459                 } else {
5460                         error = EINVAL;
5461                 }
5462         }
5463         zone_rele(zone);
5464
5465         if (error)
5466                 return (set_errno(error));
5467         return ((ssize_t)size);
5468 }
5469
5470 /*
5471  * Systemcall entry point for zone_setattr(2).
5472  */
5473 /*ARGSUSED*/
5474 static int
5475 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5476 {
5477         zone_t *zone;
5478         zone_status_t zone_status;
5479         int err = -1;
5480         zone_net_data_t *zbuf;
5481
5482         if (secpolicy_zone_config(CRED()) != 0)
5483                 return (set_errno(EPERM));
5484
5485         /*
5486          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5487          * global zone.
5488          */
5489         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5490                 return (set_errno(EINVAL));
5491         }
5492
5493         mutex_enter(&zonehash_lock);
5494         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5495                 mutex_exit(&zonehash_lock);
5496                 return (set_errno(EINVAL));
5497         }
5498         zone_hold(zone);
5499         mutex_exit(&zonehash_lock);
5500
5501         /*
5502          * At present most attributes can only be set on non-running,
5503          * non-global zones.
5504          */
5505         zone_status = zone_status_get(zone);
5506         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5507                 err = EINVAL;
5508                 goto done;
5509         }
5510
5511         switch (attr) {
5512         case ZONE_ATTR_INITNAME:
5513                 err = zone_set_initname(zone, (const char *)buf);
5514                 break;
5515         case ZONE_ATTR_INITNORESTART:
5516                 zone->zone_restart_init = B_FALSE;
5517                 err = 0;
5518                 break;
5519         case ZONE_ATTR_BOOTARGS:
5520                 err = zone_set_bootargs(zone, (const char *)buf);
5521                 break;
5522         case ZONE_ATTR_BRAND:
5523                 err = zone_set_brand(zone, (const char *)buf);
5524                 break;
5525         case ZONE_ATTR_FS_ALLOWED:
5526                 err = zone_set_fs_allowed(zone, (const char *)buf);
5527                 break;
5528         case ZONE_ATTR_SECFLAGS:
5529                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5530                 break;
5531         case ZONE_ATTR_PHYS_MCAP:
5532                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5533                 break;
5534         case ZONE_ATTR_SCHED_CLASS:
5535                 err = zone_set_sched_class(zone, (const char *)buf);
5536                 break;
5537         case ZONE_ATTR_HOSTID:
5538                 if (bufsize == sizeof (zone->zone_hostid)) {
5539                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5540                                 err = 0;
5541                         else
5542                                 err = EFAULT;
5543                 } else {
5544                         err = EINVAL;
5545                 }
5546                 break;
5547         case ZONE_ATTR_NETWORK:
5548                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5549                         err = EINVAL;
5550                         break;
5551                 }
5552                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5553                 if (copyin(buf, zbuf, bufsize) != 0) {
5554                         kmem_free(zbuf, bufsize);
5555                         err = EFAULT;
5556                         break;
5557                 }
5558                 err = zone_set_network(zoneid, zbuf);
5559                 kmem_free(zbuf, bufsize);
5560                 break;
5561         default:
5562                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5563                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5564                 else
5565                         err = EINVAL;
5566         }
5567
5568 done:
5569         zone_rele(zone);
5570         ASSERT(err != -1);
5571         return (err != 0 ? set_errno(err) : 0);
5572 }
5573
5574 /*
5575  * Return zero if the process has at least one vnode mapped in to its
5576  * address space which shouldn't be allowed to change zones.
5577  *
5578  * Also return zero if the process has any shared mappings which reserve
5579  * swap.  This is because the counting for zone.max-swap does not allow swap
5580  * reservation to be shared between zones.  zone swap reservation is counted
5581  * on zone->zone_max_swap.
5582  */
5583 static int
5584 as_can_change_zones(void)
5585 {
5586         proc_t *pp = curproc;
5587         struct seg *seg;
5588         struct as *as = pp->p_as;
5589         vnode_t *vp;
5590         int allow = 1;
5591
5592         ASSERT(pp->p_as != &kas);
5593         AS_LOCK_ENTER(as, RW_READER);
5594         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5595
5596                 /*
5597                  * Cannot enter zone with shared anon memory which
5598                  * reserves swap.  See comment above.
5599                  */
5600                 if (seg_can_change_zones(seg) == B_FALSE) {
5601                         allow = 0;
5602                         break;
5603                 }
5604                 /*
5605                  * if we can't get a backing vnode for this segment then skip
5606                  * it.
5607                  */
5608                 vp = NULL;
5609                 if (segop_getvp(seg, seg->s_base, &vp) != 0 || vp == NULL)
5610                         continue;
5611                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5612                         allow = 0;
5613                         break;
5614                 }
5615         }
5616         AS_LOCK_EXIT(as);
5617         return (allow);
5618 }
5619
5620 /*
5621  * Count swap reserved by curproc's address space
5622  */
5623 static size_t
5624 as_swresv(void)
5625 {
5626         proc_t *pp = curproc;
5627         struct seg *seg;
5628         struct as *as = pp->p_as;
5629         size_t swap = 0;
5630
5631         ASSERT(pp->p_as != &kas);
5632         ASSERT(AS_WRITE_HELD(as));
5633         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5634                 swap += seg_swresv(seg);
5635
5636         return (swap);
5637 }
5638
5639 /*
5640  * Systemcall entry point for zone_enter().
5641  *
5642  * The current process is injected into said zone.  In the process
5643  * it will change its project membership, privileges, rootdir/cwd,
5644  * zone-wide rctls, and pool association to match those of the zone.
5645  *
5646  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5647  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5648  * enter a zone that is "ready" or "running".
5649  */
5650 static int
5651 zone_enter(zoneid_t zoneid)
5652 {
5653         zone_t *zone;
5654         vnode_t *vp;
5655         proc_t *pp = curproc;
5656         contract_t *ct;
5657         cont_process_t *ctp;
5658         task_t *tk, *oldtk;
5659         kproject_t *zone_proj0;
5660         cred_t *cr, *newcr;
5661         pool_t *oldpool, *newpool;
5662         sess_t *sp;
5663         uid_t uid;
5664         zone_status_t status;
5665         int err = 0;
5666         rctl_entity_p_t e;
5667         size_t swap;
5668         kthread_id_t t;
5669
5670         if (secpolicy_zone_config(CRED()) != 0)
5671                 return (set_errno(EPERM));
5672         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5673                 return (set_errno(EINVAL));
5674
5675         /*
5676          * Stop all lwps so we don't need to hold a lock to look at
5677          * curproc->p_zone.  This needs to happen before we grab any
5678          * locks to avoid deadlock (another lwp in the process could
5679          * be waiting for the held lock).
5680          */
5681         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5682                 return (set_errno(EINTR));
5683
5684         /*
5685          * Make sure we're not changing zones with files open or mapped in
5686          * to our address space which shouldn't be changing zones.
5687          */
5688         if (!files_can_change_zones()) {
5689                 err = EBADF;
5690                 goto out;
5691         }
5692         if (!as_can_change_zones()) {
5693                 err = EFAULT;
5694                 goto out;
5695         }
5696
5697         mutex_enter(&zonehash_lock);
5698         if (pp->p_zone != global_zone) {
5699                 mutex_exit(&zonehash_lock);
5700                 err = EINVAL;
5701                 goto out;
5702         }
5703
5704         zone = zone_find_all_by_id(zoneid);
5705         if (zone == NULL) {
5706                 mutex_exit(&zonehash_lock);
5707                 err = EINVAL;
5708                 goto out;
5709         }
5710
5711         /*
5712          * To prevent processes in a zone from holding contracts on
5713          * extrazonal resources, and to avoid process contract
5714          * memberships which span zones, contract holders and processes
5715          * which aren't the sole members of their encapsulating process
5716          * contracts are not allowed to zone_enter.
5717          */
5718         ctp = pp->p_ct_process;
5719         ct = &ctp->conp_contract;
5720         mutex_enter(&ct->ct_lock);
5721         mutex_enter(&pp->p_lock);
5722         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5723                 mutex_exit(&pp->p_lock);
5724                 mutex_exit(&ct->ct_lock);
5725                 mutex_exit(&zonehash_lock);
5726                 err = EINVAL;
5727                 goto out;
5728         }
5729
5730         /*
5731          * Moreover, we don't allow processes whose encapsulating
5732          * process contracts have inherited extrazonal contracts.
5733          * While it would be easier to eliminate all process contracts
5734          * with inherited contracts, we need to be able to give a
5735          * restarted init (or other zone-penetrating process) its
5736          * predecessor's contracts.
5737          */
5738         if (ctp->conp_ninherited != 0) {
5739                 contract_t *next;
5740                 for (next = list_head(&ctp->conp_inherited); next;
5741                     next = list_next(&ctp->conp_inherited, next)) {
5742                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5743                                 mutex_exit(&pp->p_lock);
5744                                 mutex_exit(&ct->ct_lock);
5745                                 mutex_exit(&zonehash_lock);
5746                                 err = EINVAL;
5747                                 goto out;
5748                         }
5749                 }
5750         }
5751
5752         mutex_exit(&pp->p_lock);
5753         mutex_exit(&ct->ct_lock);
5754
5755         status = zone_status_get(zone);
5756         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5757                 /*
5758                  * Can't join
5759                  */
5760                 mutex_exit(&zonehash_lock);
5761                 err = EINVAL;
5762                 goto out;
5763         }
5764
5765         /*
5766          * Make sure new priv set is within the permitted set for caller
5767          */
5768         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5769                 mutex_exit(&zonehash_lock);
5770                 err = EPERM;
5771                 goto out;
5772         }
5773         /*
5774          * We want to momentarily drop zonehash_lock while we optimistically
5775          * bind curproc to the pool it should be running in.  This is safe
5776          * since the zone can't disappear (we have a hold on it).
5777          */
5778         zone_hold(zone);
5779         mutex_exit(&zonehash_lock);
5780
5781         /*
5782          * Grab pool_lock to keep the pools configuration from changing
5783          * and to stop ourselves from getting rebound to another pool
5784          * until we join the zone.
5785          */
5786         if (pool_lock_intr() != 0) {
5787                 zone_rele(zone);
5788                 err = EINTR;
5789                 goto out;
5790         }
5791         ASSERT(secpolicy_pool(CRED()) == 0);
5792         /*
5793          * Bind ourselves to the pool currently associated with the zone.
5794          */
5795         oldpool = curproc->p_pool;
5796         newpool = zone_pool_get(zone);
5797         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5798             (err = pool_do_bind(newpool, P_PID, P_MYID,
5799             POOL_BIND_ALL)) != 0) {
5800                 pool_unlock();
5801                 zone_rele(zone);
5802                 goto out;
5803         }
5804
5805         /*
5806          * Grab cpu_lock now; we'll need it later when we call
5807          * task_join().
5808          */
5809         mutex_enter(&cpu_lock);
5810         mutex_enter(&zonehash_lock);
5811         /*
5812          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5813          */
5814         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5815                 /*
5816                  * Can't join anymore.
5817                  */
5818                 mutex_exit(&zonehash_lock);
5819                 mutex_exit(&cpu_lock);
5820                 if (pool_state == POOL_ENABLED &&
5821                     newpool != oldpool)
5822                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5823                             POOL_BIND_ALL);
5824                 pool_unlock();
5825                 zone_rele(zone);
5826                 err = EINVAL;
5827                 goto out;
5828         }
5829
5830         /*
5831          * a_lock must be held while transfering locked memory and swap
5832          * reservation from the global zone to the non global zone because
5833          * asynchronous faults on the processes' address space can lock
5834          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5835          * segments respectively.
5836          */
5837         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5838         swap = as_swresv();
5839         mutex_enter(&pp->p_lock);
5840         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5841         /* verify that we do not exceed and task or lwp limits */
5842         mutex_enter(&zone->zone_nlwps_lock);
5843         /* add new lwps to zone and zone's proj0 */
5844         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5845         zone->zone_nlwps += pp->p_lwpcnt;
5846         /* add 1 task to zone's proj0 */
5847         zone_proj0->kpj_ntasks += 1;
5848
5849         zone_proj0->kpj_nprocs++;
5850         zone->zone_nprocs++;
5851         mutex_exit(&zone->zone_nlwps_lock);
5852
5853         mutex_enter(&zone->zone_mem_lock);
5854         zone->zone_locked_mem += pp->p_locked_mem;
5855         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5856         zone->zone_max_swap += swap;
5857         mutex_exit(&zone->zone_mem_lock);
5858
5859         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5860         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5861         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5862
5863         /* remove lwps and process from proc's old zone and old project */
5864         mutex_enter(&pp->p_zone->zone_nlwps_lock);
5865         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5866         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5867         pp->p_task->tk_proj->kpj_nprocs--;
5868         pp->p_zone->zone_nprocs--;
5869         mutex_exit(&pp->p_zone->zone_nlwps_lock);
5870
5871         mutex_enter(&pp->p_zone->zone_mem_lock);
5872         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5873         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5874         pp->p_zone->zone_max_swap -= swap;
5875         mutex_exit(&pp->p_zone->zone_mem_lock);
5876
5877         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5878         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5879         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5880
5881         pp->p_flag |= SZONETOP;
5882         pp->p_zone = zone;
5883         mutex_exit(&pp->p_lock);
5884         AS_LOCK_EXIT(pp->p_as);
5885
5886         /*
5887          * Joining the zone cannot fail from now on.
5888          *
5889          * This means that a lot of the following code can be commonized and
5890          * shared with zsched().
5891          */
5892
5893         /*
5894          * If the process contract fmri was inherited, we need to
5895          * flag this so that any contract status will not leak
5896          * extra zone information, svc_fmri in this case
5897          */
5898         if (ctp->conp_svc_ctid != ct->ct_id) {
5899                 mutex_enter(&ct->ct_lock);
5900                 ctp->conp_svc_zone_enter = ct->ct_id;
5901                 mutex_exit(&ct->ct_lock);
5902         }
5903
5904         /*
5905          * Reset the encapsulating process contract's zone.
5906          */
5907         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5908         contract_setzuniqid(ct, zone->zone_uniqid);
5909
5910         /*
5911          * Create a new task and associate the process with the project keyed
5912          * by (projid,zoneid).
5913          *
5914          * We might as well be in project 0; the global zone's projid doesn't
5915          * make much sense in a zone anyhow.
5916          *
5917          * This also increments zone_ntasks, and returns with p_lock held.
5918          */
5919         tk = task_create(0, zone);
5920         oldtk = task_join(tk, 0);
5921         mutex_exit(&cpu_lock);
5922
5923         /*
5924          * call RCTLOP_SET functions on this proc
5925          */
5926         e.rcep_p.zone = zone;
5927         e.rcep_t = RCENTITY_ZONE;
5928         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5929             RCD_CALLBACK);
5930         mutex_exit(&pp->p_lock);
5931
5932         /*
5933          * We don't need to hold any of zsched's locks here; not only do we know
5934          * the process and zone aren't going away, we know its session isn't
5935          * changing either.
5936          *
5937          * By joining zsched's session here, we mimic the behavior in the
5938          * global zone of init's sid being the pid of sched.  We extend this
5939          * to all zlogin-like zone_enter()'ing processes as well.
5940          */
5941         mutex_enter(&pidlock);
5942         sp = zone->zone_zsched->p_sessp;
5943         sess_hold(zone->zone_zsched);
5944         mutex_enter(&pp->p_lock);
5945         pgexit(pp);
5946         sess_rele(pp->p_sessp, B_TRUE);
5947         pp->p_sessp = sp;
5948         pgjoin(pp, zone->zone_zsched->p_pidp);
5949
5950         /*
5951          * If any threads are scheduled to be placed on zone wait queue they
5952          * should abandon the idea since the wait queue is changing.
5953          * We need to be holding pidlock & p_lock to do this.
5954          */
5955         if ((t = pp->p_tlist) != NULL) {
5956                 do {
5957                         thread_lock(t);
5958                         /*
5959                          * Kick this thread so that it doesn't sit
5960                          * on a wrong wait queue.
5961                          */
5962                         if (ISWAITING(t))
5963                                 setrun_locked(t);
5964
5965                         if (t->t_schedflag & TS_ANYWAITQ)
5966                                 t->t_schedflag &= ~ TS_ANYWAITQ;
5967
5968                         thread_unlock(t);
5969                 } while ((t = t->t_forw) != pp->p_tlist);
5970         }
5971
5972         /*
5973          * If there is a default scheduling class for the zone and it is not
5974          * the class we are currently in, change all of the threads in the
5975          * process to the new class.  We need to be holding pidlock & p_lock
5976          * when we call parmsset so this is a good place to do it.
5977          */
5978         if (zone->zone_defaultcid > 0 &&
5979             zone->zone_defaultcid != curthread->t_cid) {
5980                 pcparms_t pcparms;
5981
5982                 pcparms.pc_cid = zone->zone_defaultcid;
5983                 pcparms.pc_clparms[0] = 0;
5984
5985                 /*
5986                  * If setting the class fails, we still want to enter the zone.
5987                  */
5988                 if ((t = pp->p_tlist) != NULL) {
5989                         do {
5990                                 (void) parmsset(&pcparms, t);
5991                         } while ((t = t->t_forw) != pp->p_tlist);
5992                 }
5993         }
5994
5995         mutex_exit(&pp->p_lock);
5996         mutex_exit(&pidlock);
5997
5998         mutex_exit(&zonehash_lock);
5999         /*
6000          * We're firmly in the zone; let pools progress.
6001          */
6002         pool_unlock();
6003         task_rele(oldtk);
6004         /*
6005          * We don't need to retain a hold on the zone since we already
6006          * incremented zone_ntasks, so the zone isn't going anywhere.
6007          */
6008         zone_rele(zone);
6009
6010         /*
6011          * Chroot
6012          */
6013         vp = zone->zone_rootvp;
6014         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6015         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6016
6017         /*
6018          * Change process security flags.  Note that the _effective_ flags
6019          * cannot change
6020          */
6021         secflags_copy(&pp->p_secflags.psf_lower,
6022             &zone->zone_secflags.psf_lower);
6023         secflags_copy(&pp->p_secflags.psf_upper,
6024             &zone->zone_secflags.psf_upper);
6025         secflags_copy(&pp->p_secflags.psf_inherit,
6026             &zone->zone_secflags.psf_inherit);
6027
6028         /*
6029          * Change process credentials
6030          */
6031         newcr = cralloc();
6032         mutex_enter(&pp->p_crlock);
6033         cr = pp->p_cred;
6034         crcopy_to(cr, newcr);
6035         crsetzone(newcr, zone);
6036         pp->p_cred = newcr;
6037
6038         /*
6039          * Restrict all process privilege sets to zone limit
6040          */
6041         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6042         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6043         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6044         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6045         mutex_exit(&pp->p_crlock);
6046         crset(pp, newcr);
6047
6048         /*
6049          * Adjust upcount to reflect zone entry.
6050          */
6051         uid = crgetruid(newcr);
6052         mutex_enter(&pidlock);
6053         upcount_dec(uid, GLOBAL_ZONEID);
6054         upcount_inc(uid, zoneid);
6055         mutex_exit(&pidlock);
6056
6057         /*
6058          * Set up core file path and content.
6059          */
6060         set_core_defaults();
6061
6062 out:
6063         /*
6064          * Let the other lwps continue.
6065          */
6066         mutex_enter(&pp->p_lock);
6067         if (curthread != pp->p_agenttp)
6068                 continuelwps(pp);
6069         mutex_exit(&pp->p_lock);
6070
6071         return (err != 0 ? set_errno(err) : 0);
6072 }
6073
6074 /*
6075  * Systemcall entry point for zone_list(2).
6076  *
6077  * Processes running in a (non-global) zone only see themselves.
6078  */
6079 static int
6080 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6081 {
6082         zoneid_t *zoneids;
6083         zone_t *zone, *myzone;
6084         uint_t user_nzones, real_nzones;
6085         uint_t domi_nzones;
6086         int error;
6087
6088         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6089                 return (set_errno(EFAULT));
6090
6091         myzone = curproc->p_zone;
6092         if (myzone != global_zone) {
6093                 /* just return current zone */
6094                 real_nzones = domi_nzones = 1;
6095                 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6096                 zoneids[0] = myzone->zone_id;
6097         } else {
6098                 mutex_enter(&zonehash_lock);
6099                 real_nzones = zonecount;
6100                 domi_nzones = 0;
6101                 if (real_nzones > 0) {
6102                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6103                             KM_SLEEP);
6104                         for (zone = list_head(&zone_active); zone != NULL;
6105                             zone = list_next(&zone_active, zone))
6106                                 zoneids[domi_nzones++] = zone->zone_id;
6107                         ASSERT(domi_nzones == real_nzones);
6108                 }
6109                 mutex_exit(&zonehash_lock);
6110         }
6111
6112         /*
6113          * If user has allocated space for fewer entries than we found, then
6114          * return only up to their limit.  Either way, tell them exactly how
6115          * many we found.
6116          */
6117         if (domi_nzones < user_nzones)
6118                 user_nzones = domi_nzones;
6119         error = 0;
6120         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6121                 error = EFAULT;
6122         } else if (zoneidlist != NULL && user_nzones != 0) {
6123                 if (copyout(zoneids, zoneidlist,
6124                     user_nzones * sizeof (zoneid_t)) != 0)
6125                         error = EFAULT;
6126         }
6127
6128         if (real_nzones > 0)
6129                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6130
6131         if (error != 0)
6132                 return (set_errno(error));
6133         else
6134                 return (0);
6135 }
6136
6137 /*
6138  * Systemcall entry point for zone_lookup(2).
6139  *
6140  * Non-global zones are only able to see themselves.
6141  */
6142 static zoneid_t
6143 zone_lookup(const char *zone_name)
6144 {
6145         char *kname;
6146         zone_t *zone;
6147         zoneid_t zoneid;
6148         int err;
6149
6150         if (zone_name == NULL) {
6151                 /* return caller's zone id */
6152                 return (getzoneid());
6153         }
6154
6155         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6156         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6157                 kmem_free(kname, ZONENAME_MAX);
6158                 return (set_errno(err));
6159         }
6160
6161         mutex_enter(&zonehash_lock);
6162         zone = zone_find_all_by_name(kname);
6163         kmem_free(kname, ZONENAME_MAX);
6164         /* In a non-global zone, can only lookup global and own name. */
6165         if (zone == NULL ||
6166             zone_status_get(zone) < ZONE_IS_READY ||
6167             !zone_list_access(zone)) {
6168                 mutex_exit(&zonehash_lock);
6169                 return (set_errno(EINVAL));
6170         } else {
6171                 zoneid = zone->zone_id;
6172                 mutex_exit(&zonehash_lock);
6173                 return (zoneid);
6174         }
6175 }
6176
6177 static int
6178 zone_version(int *version_arg)
6179 {
6180         int version = ZONE_SYSCALL_API_VERSION;
6181
6182         if (copyout(&version, version_arg, sizeof (int)) != 0)
6183                 return (set_errno(EFAULT));
6184         return (0);
6185 }
6186
6187 /* ARGSUSED */
6188 long
6189 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6190 {
6191         zone_def zs;
6192         int err;
6193
6194         switch (cmd) {
6195         case ZONE_CREATE:
6196                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6197                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6198                                 return (set_errno(EFAULT));
6199                         }
6200                 } else {
6201 #ifdef _SYSCALL32_IMPL
6202                         zone_def32 zs32;
6203
6204                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6205                                 return (set_errno(EFAULT));
6206                         }
6207                         zs.zone_name =
6208                             (const char *)(unsigned long)zs32.zone_name;
6209                         zs.zone_root =
6210                             (const char *)(unsigned long)zs32.zone_root;
6211                         zs.zone_privs =
6212                             (const struct priv_set *)
6213                             (unsigned long)zs32.zone_privs;
6214                         zs.zone_privssz = zs32.zone_privssz;
6215                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6216                         zs.rctlbufsz = zs32.rctlbufsz;
6217                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6218                         zs.zfsbufsz = zs32.zfsbufsz;
6219                         zs.extended_error =
6220                             (int *)(unsigned long)zs32.extended_error;
6221                         zs.flags = zs32.flags;
6222 #else
6223                         panic("get_udatamodel() returned bogus result\n");
6224 #endif
6225                 }
6226
6227                 return (zone_create(zs.zone_name, zs.zone_root,
6228                     zs.zone_privs, zs.zone_privssz,
6229                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6230                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6231                     zs.extended_error, zs.flags));
6232         case ZONE_BOOT:
6233                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6234         case ZONE_DESTROY:
6235                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6236         case ZONE_GETATTR:
6237                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6238                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6239         case ZONE_SETATTR:
6240                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6241                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6242         case ZONE_ENTER:
6243                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6244         case ZONE_LIST:
6245                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6246         case ZONE_SHUTDOWN:
6247                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6248         case ZONE_LOOKUP:
6249                 return (zone_lookup((const char *)arg1));
6250         case ZONE_VERSION:
6251                 return (zone_version((int *)arg1));
6252         case ZONE_ADD_DATALINK:
6253                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6254                     (datalink_id_t)(uintptr_t)arg2));
6255         case ZONE_DEL_DATALINK:
6256                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6257                     (datalink_id_t)(uintptr_t)arg2));
6258         case ZONE_CHECK_DATALINK: {
6259                 zoneid_t        zoneid;
6260                 boolean_t       need_copyout;
6261
6262                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6263                         return (EFAULT);
6264                 need_copyout = (zoneid == ALL_ZONES);
6265                 err = zone_check_datalink(&zoneid,
6266                     (datalink_id_t)(uintptr_t)arg2);
6267                 if (err == 0 && need_copyout) {
6268                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6269                                 err = EFAULT;
6270                 }
6271                 return (err == 0 ? 0 : set_errno(err));
6272         }
6273         case ZONE_LIST_DATALINK:
6274                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6275                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6276         default:
6277                 return (set_errno(EINVAL));
6278         }
6279 }
6280
6281 struct zarg {
6282         zone_t *zone;
6283         zone_cmd_arg_t arg;
6284 };
6285
6286 static int
6287 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6288 {
6289         char *buf;
6290         size_t buflen;
6291         int error;
6292
6293         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6294         buf = kmem_alloc(buflen, KM_SLEEP);
6295         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6296         error = door_ki_open(buf, doorp);
6297         kmem_free(buf, buflen);
6298         return (error);
6299 }
6300
6301 static void
6302 zone_release_door(door_handle_t *doorp)
6303 {
6304         door_ki_rele(*doorp);
6305         *doorp = NULL;
6306 }
6307
6308 static void
6309 zone_ki_call_zoneadmd(struct zarg *zargp)
6310 {
6311         door_handle_t door = NULL;
6312         door_arg_t darg, save_arg;
6313         char *zone_name;
6314         size_t zone_namelen;
6315         zoneid_t zoneid;
6316         zone_t *zone;
6317         zone_cmd_arg_t arg;
6318         uint64_t uniqid;
6319         size_t size;
6320         int error;
6321         int retry;
6322
6323         zone = zargp->zone;
6324         arg = zargp->arg;
6325         kmem_free(zargp, sizeof (*zargp));
6326
6327         zone_namelen = strlen(zone->zone_name) + 1;
6328         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6329         bcopy(zone->zone_name, zone_name, zone_namelen);
6330         zoneid = zone->zone_id;
6331         uniqid = zone->zone_uniqid;
6332         /*
6333          * zoneadmd may be down, but at least we can empty out the zone.
6334          * We can ignore the return value of zone_empty() since we're called
6335          * from a kernel thread and know we won't be delivered any signals.
6336          */
6337         ASSERT(curproc == &p0);
6338         (void) zone_empty(zone);
6339         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6340         zone_rele(zone);
6341
6342         size = sizeof (arg);
6343         darg.rbuf = (char *)&arg;
6344         darg.data_ptr = (char *)&arg;
6345         darg.rsize = size;
6346         darg.data_size = size;
6347         darg.desc_ptr = NULL;
6348         darg.desc_num = 0;
6349
6350         save_arg = darg;
6351         /*
6352          * Since we're not holding a reference to the zone, any number of
6353          * things can go wrong, including the zone disappearing before we get a
6354          * chance to talk to zoneadmd.
6355          */
6356         for (retry = 0; /* forever */; retry++) {
6357                 if (door == NULL &&
6358                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6359                         goto next;
6360                 }
6361                 ASSERT(door != NULL);
6362
6363                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6364                     SIZE_MAX, 0)) == 0) {
6365                         break;
6366                 }
6367                 switch (error) {
6368                 case EINTR:
6369                         /* FALLTHROUGH */
6370                 case EAGAIN:    /* process may be forking */
6371                         /*
6372                          * Back off for a bit
6373                          */
6374                         break;
6375                 case EBADF:
6376                         zone_release_door(&door);
6377                         if (zone_lookup_door(zone_name, &door) != 0) {
6378                                 /*
6379                                  * zoneadmd may be dead, but it may come back to
6380                                  * life later.
6381                                  */
6382                                 break;
6383                         }
6384                         break;
6385                 default:
6386                         cmn_err(CE_WARN,
6387                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6388                             error);
6389                         goto out;
6390                 }
6391 next:
6392                 /*
6393                  * If this isn't the same zone_t that we originally had in mind,
6394                  * then this is the same as if two kadmin requests come in at
6395                  * the same time: the first one wins.  This means we lose, so we
6396                  * bail.
6397                  */
6398                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6399                         /*
6400                          * Problem is solved.
6401                          */
6402                         break;
6403                 }
6404                 if (zone->zone_uniqid != uniqid) {
6405                         /*
6406                          * zoneid recycled
6407                          */
6408                         zone_rele(zone);
6409                         break;
6410                 }
6411                 /*
6412                  * We could zone_status_timedwait(), but there doesn't seem to
6413                  * be much point in doing that (plus, it would mean that
6414                  * zone_free() isn't called until this thread exits).
6415                  */
6416                 zone_rele(zone);
6417                 ddi_sleep(1);
6418                 darg = save_arg;
6419         }
6420 out:
6421         if (door != NULL) {
6422                 zone_release_door(&door);
6423         }
6424         kmem_free(zone_name, zone_namelen);
6425         thread_exit();
6426 }
6427
6428 /*
6429  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6430  * kadmin().  The caller is a process in the zone.
6431  *
6432  * In order to shutdown the zone, we will hand off control to zoneadmd
6433  * (running in the global zone) via a door.  We do a half-hearted job at
6434  * killing all processes in the zone, create a kernel thread to contact
6435  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6436  * a form of generation number used to let zoneadmd (as well as
6437  * zone_destroy()) know exactly which zone they're re talking about.
6438  */
6439 int
6440 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6441 {
6442         struct zarg *zargp;
6443         zone_cmd_t zcmd;
6444         zone_t *zone;
6445
6446         zone = curproc->p_zone;
6447         ASSERT(getzoneid() != GLOBAL_ZONEID);
6448
6449         switch (cmd) {
6450         case A_SHUTDOWN:
6451                 switch (fcn) {
6452                 case AD_HALT:
6453                 case AD_POWEROFF:
6454                         zcmd = Z_HALT;
6455                         break;
6456                 case AD_BOOT:
6457                         zcmd = Z_REBOOT;
6458                         break;
6459                 case AD_IBOOT:
6460                 case AD_SBOOT:
6461                 case AD_SIBOOT:
6462                 case AD_NOSYNC:
6463                         return (ENOTSUP);
6464                 default:
6465                         return (EINVAL);
6466                 }
6467                 break;
6468         case A_REBOOT:
6469                 zcmd = Z_REBOOT;
6470                 break;
6471         case A_FTRACE:
6472         case A_REMOUNT:
6473         case A_FREEZE:
6474         case A_DUMP:
6475         case A_CONFIG:
6476                 return (ENOTSUP);
6477         default:
6478                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6479                 return (EINVAL);
6480         }
6481
6482         if (secpolicy_zone_admin(credp, B_FALSE))
6483                 return (EPERM);
6484         mutex_enter(&zone_status_lock);
6485
6486         /*
6487          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6488          * is in the zone.
6489          */
6490         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6491         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6492                 /*
6493                  * This zone is already on its way down.
6494                  */
6495                 mutex_exit(&zone_status_lock);
6496                 return (0);
6497         }
6498         /*
6499          * Prevent future zone_enter()s
6500          */
6501         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6502         mutex_exit(&zone_status_lock);
6503
6504         /*
6505          * Kill everyone now and call zoneadmd later.
6506          * zone_ki_call_zoneadmd() will do a more thorough job of this
6507          * later.
6508          */
6509         killall(zone->zone_id);
6510         /*
6511          * Now, create the thread to contact zoneadmd and do the rest of the
6512          * work.  This thread can't be created in our zone otherwise
6513          * zone_destroy() would deadlock.
6514          */
6515         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6516         zargp->arg.cmd = zcmd;
6517         zargp->arg.uniqid = zone->zone_uniqid;
6518         zargp->zone = zone;
6519         (void) strcpy(zargp->arg.locale, "C");
6520         /* mdep was already copied in for us by uadmin */
6521         if (mdep != NULL)
6522                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6523                     sizeof (zargp->arg.bootbuf));
6524         zone_hold(zone);
6525
6526         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6527             TS_RUN, minclsyspri);
6528         exit(CLD_EXITED, 0);
6529
6530         return (EINVAL);
6531 }
6532
6533 /*
6534  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6535  * status to ZONE_IS_SHUTTING_DOWN.
6536  *
6537  * This function also shuts down all running zones to ensure that they won't
6538  * fork new processes.
6539  */
6540 void
6541 zone_shutdown_global(void)
6542 {
6543         zone_t *current_zonep;
6544
6545         ASSERT(INGLOBALZONE(curproc));
6546         mutex_enter(&zonehash_lock);
6547         mutex_enter(&zone_status_lock);
6548
6549         /* Modify the global zone's status first. */
6550         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6551         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6552
6553         /*
6554          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6555          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6556          * could cause assertions to fail (e.g., assertions about a zone's
6557          * state during initialization, readying, or booting) or produce races.
6558          * We'll let threads continue to initialize and ready new zones: they'll
6559          * fail to boot the new zones when they see that the global zone is
6560          * shutting down.
6561          */
6562         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6563             current_zonep = list_next(&zone_active, current_zonep)) {
6564                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6565                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6566         }
6567         mutex_exit(&zone_status_lock);
6568         mutex_exit(&zonehash_lock);
6569 }
6570
6571 /*
6572  * Returns true if the named dataset is visible in the current zone.
6573  * The 'write' parameter is set to 1 if the dataset is also writable.
6574  */
6575 int
6576 zone_dataset_visible(const char *dataset, int *write)
6577 {
6578         static int zfstype = -1;
6579         zone_dataset_t *zd;
6580         size_t len;
6581         zone_t *zone = curproc->p_zone;
6582         const char *name = NULL;
6583         vfs_t *vfsp = NULL;
6584
6585         if (dataset[0] == '\0')
6586                 return (0);
6587
6588         /*
6589          * Walk the list once, looking for datasets which match exactly, or
6590          * specify a dataset underneath an exported dataset.  If found, return
6591          * true and note that it is writable.
6592          */
6593         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6594             zd = list_next(&zone->zone_datasets, zd)) {
6595
6596                 len = strlen(zd->zd_dataset);
6597                 if (strlen(dataset) >= len &&
6598                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6599                     (dataset[len] == '\0' || dataset[len] == '/' ||
6600                     dataset[len] == '@')) {
6601                         if (write)
6602                                 *write = 1;
6603                         return (1);
6604                 }
6605         }
6606
6607         /*
6608          * Walk the list a second time, searching for datasets which are parents
6609          * of exported datasets.  These should be visible, but read-only.
6610          *
6611          * Note that we also have to support forms such as 'pool/dataset/', with
6612          * a trailing slash.
6613          */
6614         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6615             zd = list_next(&zone->zone_datasets, zd)) {
6616
6617                 len = strlen(dataset);
6618                 if (dataset[len - 1] == '/')
6619                         len--;  /* Ignore trailing slash */
6620                 if (len < strlen(zd->zd_dataset) &&
6621                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6622                     zd->zd_dataset[len] == '/') {
6623                         if (write)
6624                                 *write = 0;
6625                         return (1);
6626                 }
6627         }
6628
6629         /*
6630          * We reach here if the given dataset is not found in the zone_dataset
6631          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6632          * instead of delegation. For this we search for the dataset in the
6633          * zone_vfslist of this zone. If found, return true and note that it is
6634          * not writable.
6635          */
6636
6637         /*
6638          * Initialize zfstype if it is not initialized yet.
6639          */
6640         if (zfstype == -1) {
6641                 struct vfssw *vswp = vfs_getvfssw("zfs");
6642                 zfstype = vswp - vfssw;
6643                 vfs_unrefvfssw(vswp);
6644         }
6645
6646         vfs_list_read_lock();
6647         vfsp = zone->zone_vfslist;
6648         do {
6649                 ASSERT(vfsp);
6650                 if (vfsp->vfs_fstype == zfstype) {
6651                         name = refstr_value(vfsp->vfs_resource);
6652
6653                         /*
6654                          * Check if we have an exact match.
6655                          */
6656                         if (strcmp(dataset, name) == 0) {
6657                                 vfs_list_unlock();
6658                                 if (write)
6659                                         *write = 0;
6660                                 return (1);
6661                         }
6662                         /*
6663                          * We need to check if we are looking for parents of
6664                          * a dataset. These should be visible, but read-only.
6665                          */
6666                         len = strlen(dataset);
6667                         if (dataset[len - 1] == '/')
6668                                 len--;
6669
6670                         if (len < strlen(name) &&
6671                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6672                                 vfs_list_unlock();
6673                                 if (write)
6674                                         *write = 0;
6675                                 return (1);
6676                         }
6677                 }
6678                 vfsp = vfsp->vfs_zone_next;
6679         } while (vfsp != zone->zone_vfslist);
6680
6681         vfs_list_unlock();
6682         return (0);
6683 }
6684
6685 /*
6686  * zone_find_by_any_path() -
6687  *
6688  * kernel-private routine similar to zone_find_by_path(), but which
6689  * effectively compares against zone paths rather than zonerootpath
6690  * (i.e., the last component of zonerootpaths, which should be "root/",
6691  * are not compared.)  This is done in order to accurately identify all
6692  * paths, whether zone-visible or not, including those which are parallel
6693  * to /root/, such as /dev/, /home/, etc...
6694  *
6695  * If the specified path does not fall under any zone path then global
6696  * zone is returned.
6697  *
6698  * The treat_abs parameter indicates whether the path should be treated as
6699  * an absolute path although it does not begin with "/".  (This supports
6700  * nfs mount syntax such as host:any/path.)
6701  *
6702  * The caller is responsible for zone_rele of the returned zone.
6703  */
6704 zone_t *
6705 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6706 {
6707         zone_t *zone;
6708         int path_offset = 0;
6709
6710         if (path == NULL) {
6711                 zone_hold(global_zone);
6712                 return (global_zone);
6713         }
6714
6715         if (*path != '/') {
6716                 ASSERT(treat_abs);
6717                 path_offset = 1;
6718         }
6719
6720         mutex_enter(&zonehash_lock);
6721         for (zone = list_head(&zone_active); zone != NULL;
6722             zone = list_next(&zone_active, zone)) {
6723                 char    *c;
6724                 size_t  pathlen;
6725                 char *rootpath_start;
6726
6727                 if (zone == global_zone)        /* skip global zone */
6728                         continue;
6729
6730                 /* scan backwards to find start of last component */
6731                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6732                 do {
6733                         c--;
6734                 } while (*c != '/');
6735
6736                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6737                 rootpath_start = (zone->zone_rootpath + path_offset);
6738                 if (strncmp(path, rootpath_start, pathlen) == 0)
6739                         break;
6740         }
6741         if (zone == NULL)
6742                 zone = global_zone;
6743         zone_hold(zone);
6744         mutex_exit(&zonehash_lock);
6745         return (zone);
6746 }
6747
6748 /*
6749  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6750  * zone_dl_t pointer if found, and NULL otherwise.
6751  */
6752 static zone_dl_t *
6753 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6754 {
6755         zone_dl_t *zdl;
6756
6757         ASSERT(mutex_owned(&zone->zone_lock));
6758         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6759             zdl = list_next(&zone->zone_dl_list, zdl)) {
6760                 if (zdl->zdl_id == linkid)
6761                         break;
6762         }
6763         return (zdl);
6764 }
6765
6766 static boolean_t
6767 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6768 {
6769         boolean_t exists;
6770
6771         mutex_enter(&zone->zone_lock);
6772         exists = (zone_find_dl(zone, linkid) != NULL);
6773         mutex_exit(&zone->zone_lock);
6774         return (exists);
6775 }
6776
6777 /*
6778  * Add an data link name for the zone.
6779  */
6780 static int
6781 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6782 {
6783         zone_dl_t *zdl;
6784         zone_t *zone;
6785         zone_t *thiszone;
6786
6787         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6788                 return (set_errno(ENXIO));
6789
6790         /* Verify that the datalink ID doesn't already belong to a zone. */
6791         mutex_enter(&zonehash_lock);
6792         for (zone = list_head(&zone_active); zone != NULL;
6793             zone = list_next(&zone_active, zone)) {
6794                 if (zone_dl_exists(zone, linkid)) {
6795                         mutex_exit(&zonehash_lock);
6796                         zone_rele(thiszone);
6797                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6798                 }
6799         }
6800
6801         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6802         zdl->zdl_id = linkid;
6803         zdl->zdl_net = NULL;
6804         mutex_enter(&thiszone->zone_lock);
6805         list_insert_head(&thiszone->zone_dl_list, zdl);
6806         mutex_exit(&thiszone->zone_lock);
6807         mutex_exit(&zonehash_lock);
6808         zone_rele(thiszone);
6809         return (0);
6810 }
6811
6812 static int
6813 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6814 {
6815         zone_dl_t *zdl;
6816         zone_t *zone;
6817         int err = 0;
6818
6819         if ((zone = zone_find_by_id(zoneid)) == NULL)
6820                 return (set_errno(EINVAL));
6821
6822         mutex_enter(&zone->zone_lock);
6823         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6824                 err = ENXIO;
6825         } else {
6826                 list_remove(&zone->zone_dl_list, zdl);
6827                 nvlist_free(zdl->zdl_net);
6828                 kmem_free(zdl, sizeof (zone_dl_t));
6829         }
6830         mutex_exit(&zone->zone_lock);
6831         zone_rele(zone);
6832         return (err == 0 ? 0 : set_errno(err));
6833 }
6834
6835 /*
6836  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6837  * the linkid.  Otherwise we just check if the specified zoneidp has been
6838  * assigned the supplied linkid.
6839  */
6840 int
6841 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6842 {
6843         zone_t *zone;
6844         int err = ENXIO;
6845
6846         if (*zoneidp != ALL_ZONES) {
6847                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6848                         if (zone_dl_exists(zone, linkid))
6849                                 err = 0;
6850                         zone_rele(zone);
6851                 }
6852                 return (err);
6853         }
6854
6855         mutex_enter(&zonehash_lock);
6856         for (zone = list_head(&zone_active); zone != NULL;
6857             zone = list_next(&zone_active, zone)) {
6858                 if (zone_dl_exists(zone, linkid)) {
6859                         *zoneidp = zone->zone_id;
6860                         err = 0;
6861                         break;
6862                 }
6863         }
6864         mutex_exit(&zonehash_lock);
6865         return (err);
6866 }
6867
6868 /*
6869  * Get the list of datalink IDs assigned to a zone.
6870  *
6871  * On input, *nump is the number of datalink IDs that can fit in the supplied
6872  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6873  * that were placed in the array if the array was large enough, or to the
6874  * number of datalink IDs that the function needs to place in the array if the
6875  * array is too small.
6876  */
6877 static int
6878 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6879 {
6880         uint_t num, dlcount;
6881         zone_t *zone;
6882         zone_dl_t *zdl;
6883         datalink_id_t *idptr = idarray;
6884
6885         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6886                 return (set_errno(EFAULT));
6887         if ((zone = zone_find_by_id(zoneid)) == NULL)
6888                 return (set_errno(ENXIO));
6889
6890         num = 0;
6891         mutex_enter(&zone->zone_lock);
6892         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6893             zdl = list_next(&zone->zone_dl_list, zdl)) {
6894                 /*
6895                  * If the list is bigger than what the caller supplied, just
6896                  * count, don't do copyout.
6897                  */
6898                 if (++num > dlcount)
6899                         continue;
6900                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6901                         mutex_exit(&zone->zone_lock);
6902                         zone_rele(zone);
6903                         return (set_errno(EFAULT));
6904                 }
6905                 idptr++;
6906         }
6907         mutex_exit(&zone->zone_lock);
6908         zone_rele(zone);
6909
6910         /* Increased or decreased, caller should be notified. */
6911         if (num != dlcount) {
6912                 if (copyout(&num, nump, sizeof (num)) != 0)
6913                         return (set_errno(EFAULT));
6914         }
6915         return (0);
6916 }
6917
6918 /*
6919  * Public interface for looking up a zone by zoneid. It's a customized version
6920  * for netstack_zone_create(). It can only be called from the zsd create
6921  * callbacks, since it doesn't have reference on the zone structure hence if
6922  * it is called elsewhere the zone could disappear after the zonehash_lock
6923  * is dropped.
6924  *
6925  * Furthermore it
6926  * 1. Doesn't check the status of the zone.
6927  * 2. It will be called even before zone_init is called, in that case the
6928  *    address of zone0 is returned directly, and netstack_zone_create()
6929  *    will only assign a value to zone0.zone_netstack, won't break anything.
6930  * 3. Returns without the zone being held.
6931  */
6932 zone_t *
6933 zone_find_by_id_nolock(zoneid_t zoneid)
6934 {
6935         zone_t *zone;
6936
6937         mutex_enter(&zonehash_lock);
6938         if (zonehashbyid == NULL)
6939                 zone = &zone0;
6940         else
6941                 zone = zone_find_all_by_id(zoneid);
6942         mutex_exit(&zonehash_lock);
6943         return (zone);
6944 }
6945
6946 /*
6947  * Walk the datalinks for a given zone
6948  */
6949 int
6950 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6951     void *data)
6952 {
6953         zone_t          *zone;
6954         zone_dl_t       *zdl;
6955         datalink_id_t   *idarray;
6956         uint_t          idcount = 0;
6957         int             i, ret = 0;
6958
6959         if ((zone = zone_find_by_id(zoneid)) == NULL)
6960                 return (ENOENT);
6961
6962         /*
6963          * We first build an array of linkid's so that we can walk these and
6964          * execute the callback with the zone_lock dropped.
6965          */
6966         mutex_enter(&zone->zone_lock);
6967         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6968             zdl = list_next(&zone->zone_dl_list, zdl)) {
6969                 idcount++;
6970         }
6971
6972         if (idcount == 0) {
6973                 mutex_exit(&zone->zone_lock);
6974                 zone_rele(zone);
6975                 return (0);
6976         }
6977
6978         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6979         if (idarray == NULL) {
6980                 mutex_exit(&zone->zone_lock);
6981                 zone_rele(zone);
6982                 return (ENOMEM);
6983         }
6984
6985         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6986             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
6987                 idarray[i] = zdl->zdl_id;
6988         }
6989
6990         mutex_exit(&zone->zone_lock);
6991
6992         for (i = 0; i < idcount && ret == 0; i++) {
6993                 if ((ret = (*cb)(idarray[i], data)) != 0)
6994                         break;
6995         }
6996
6997         zone_rele(zone);
6998         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
6999         return (ret);
7000 }
7001
7002 static char *
7003 zone_net_type2name(int type)
7004 {
7005         switch (type) {
7006         case ZONE_NETWORK_ADDRESS:
7007                 return (ZONE_NET_ADDRNAME);
7008         case ZONE_NETWORK_DEFROUTER:
7009                 return (ZONE_NET_RTRNAME);
7010         default:
7011                 return (NULL);
7012         }
7013 }
7014
7015 static int
7016 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7017 {
7018         zone_t *zone;
7019         zone_dl_t *zdl;
7020         nvlist_t *nvl;
7021         int err = 0;
7022         uint8_t *new = NULL;
7023         char *nvname;
7024         int bufsize;
7025         datalink_id_t linkid = znbuf->zn_linkid;
7026
7027         if (secpolicy_zone_config(CRED()) != 0)
7028                 return (set_errno(EPERM));
7029
7030         if (zoneid == GLOBAL_ZONEID)
7031                 return (set_errno(EINVAL));
7032
7033         nvname = zone_net_type2name(znbuf->zn_type);
7034         bufsize = znbuf->zn_len;
7035         new = znbuf->zn_val;
7036         if (nvname == NULL)
7037                 return (set_errno(EINVAL));
7038
7039         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7040                 return (set_errno(EINVAL));
7041         }
7042
7043         mutex_enter(&zone->zone_lock);
7044         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7045                 err = ENXIO;
7046                 goto done;
7047         }
7048         if ((nvl = zdl->zdl_net) == NULL) {
7049                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7050                         err = ENOMEM;
7051                         goto done;
7052                 } else {
7053                         zdl->zdl_net = nvl;
7054                 }
7055         }
7056         if (nvlist_exists(nvl, nvname)) {
7057                 err = EINVAL;
7058                 goto done;
7059         }
7060         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7061         ASSERT(err == 0);
7062 done:
7063         mutex_exit(&zone->zone_lock);
7064         zone_rele(zone);
7065         if (err != 0)
7066                 return (set_errno(err));
7067         else
7068                 return (0);
7069 }
7070
7071 static int
7072 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7073 {
7074         zone_t *zone;
7075         zone_dl_t *zdl;
7076         nvlist_t *nvl;
7077         uint8_t *ptr;
7078         uint_t psize;
7079         int err = 0;
7080         char *nvname;
7081         int bufsize;
7082         void *buf;
7083         datalink_id_t linkid = znbuf->zn_linkid;
7084
7085         if (zoneid == GLOBAL_ZONEID)
7086                 return (set_errno(EINVAL));
7087
7088         nvname = zone_net_type2name(znbuf->zn_type);
7089         bufsize = znbuf->zn_len;
7090         buf = znbuf->zn_val;
7091
7092         if (nvname == NULL)
7093                 return (set_errno(EINVAL));
7094         if ((zone = zone_find_by_id(zoneid)) == NULL)
7095                 return (set_errno(EINVAL));
7096
7097         mutex_enter(&zone->zone_lock);
7098         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7099                 err = ENXIO;
7100                 goto done;
7101         }
7102         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7103                 err = ENOENT;
7104                 goto done;
7105         }
7106         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7107         ASSERT(err == 0);
7108
7109         if (psize > bufsize) {
7110                 err = ENOBUFS;
7111                 goto done;
7112         }
7113         znbuf->zn_len = psize;
7114         bcopy(ptr, buf, psize);
7115 done:
7116         mutex_exit(&zone->zone_lock);
7117         zone_rele(zone);
7118         if (err != 0)
7119                 return (set_errno(err));
7120         else
7121                 return (0);
7122 }