kernel/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27  */
  28
  29 /*
  30  * Zones
  31  *
  32  *   A zone is a named collection of processes, namespace constraints,
  33  *   and other system resources which comprise a secure and manageable
  34  *   application containment facility.
  35  *
  36  *   Zones (represented by the reference counted zone_t) are tracked in
  37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38  *   (zoneid_t) are used to track zone association.  Zone IDs are
  39  *   dynamically generated when the zone is created; if a persistent
  40  *   identifier is needed (core files, accounting logs, audit trail,
  41  *   etc.), the zone name should be used.
  42  *
  43  *
  44  *   Global Zone:
  45  *
  46  *   The global zone (zoneid 0) is automatically associated with all
  47  *   system resources that have not been bound to a user-created zone.
  48  *   This means that even systems where zones are not in active use
  49  *   have a global zone, and all processes, mounts, etc. are
  50  *   associated with that zone.  The global zone is generally
  51  *   unconstrained in terms of privileges and access, though the usual
  52  *   credential and privilege based restrictions apply.
  53  *
  54  *
  55  *   Zone States:
  56  *
  57  *   The states a zone may be in, and the transitions between them, are
  58  *   as follows:
  59  *
  60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61  *   initialized zone is added to the list of active zones on the system but
  62  *   isn't accessible.
  63  *
  64  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65  *   not yet completed. Not possible to enter the zone, but attributes can
  66  *   be retrieved.
  67  *
  68  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70  *   executed.  A zone remains in this state until it transitions into
  71  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72  *
  73  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  74  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75  *   state.
  76  *
  77  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78  *   successfully started init.   A zone remains in this state until
  79  *   zone_shutdown() is called.
  80  *
  81  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82  *   killing all processes running in the zone. The zone remains
  83  *   in this state until there are no more user processes running in the zone.
  84  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85  *   Since zone_shutdown() is restartable, it may be called successfully
  86  *   multiple times for the same zone_t.  Setting of the zone's state to
  87  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88  *   the zone's status without worrying about it being a moving target.
  89  *
  90  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91  *   are no more user processes in the zone.  The zone remains in this
  92  *   state until there are no more kernel threads associated with the
  93  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94  *   fail.
  95  *
  96  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98  *   join the zone or create kernel threads therein.
  99  *
 100  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  *   return NULL from now on.
 103  *
 104  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  *   processes or threads doing work on behalf of the zone.  The zone is
 106  *   removed from the list of active zones.  zone_destroy() returns, and
 107  *   the zone can be recreated.
 108  *
 109  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  *   callbacks are executed, and all memory associated with the zone is
 111  *   freed.
 112  *
 113  *   Threads can wait for the zone to enter a requested state by using
 114  *   zone_status_wait() or zone_status_timedwait() with the desired
 115  *   state passed in as an argument.  Zone state transitions are
 116  *   uni-directional; it is not possible to move back to an earlier state.
 117  *
 118  *
 119  *   Zone-Specific Data:
 120  *
 121  *   Subsystems needing to maintain zone-specific data can store that
 122  *   data using the ZSD mechanism.  This provides a zone-specific data
 123  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 124  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 125  *   to register callbacks to be invoked when a zone is created, shut
 126  *   down, or destroyed.  This can be used to initialize zone-specific
 127  *   data for new zones and to clean up when zones go away.
 128  *
 129  *
 130  *   Data Structures:
 131  *
 132  *   The per-zone structure (zone_t) is reference counted, and freed
 133  *   when all references are released.  zone_hold and zone_rele can be
 134  *   used to adjust the reference count.  In addition, reference counts
 135  *   associated with the cred_t structure are tracked separately using
 136  *   zone_cred_hold and zone_cred_rele.
 137  *
 138  *   Pointers to active zone_t's are stored in two hash tables; one
 139  *   for searching by id, the other for searching by name.  Lookups
 140  *   can be performed on either basis, using zone_find_by_id and
 141  *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  *   held, so zone_rele should be called when the pointer is no longer
 143  *   needed.  Zones can also be searched by path; zone_find_by_path
 144  *   returns the zone with which a path name is associated (global
 145  *   zone if the path is not within some other zone's file system
 146  *   hierarchy).  This currently requires iterating through each zone,
 147  *   so it is slower than an id or name search via a hash table.
 148  *
 149  *
 150  *   Locking:
 151  *
 152  *   zonehash_lock: This is a top-level global lock used to protect the
 153  *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  *       while this lock is held.
 155  *   zone_status_lock: This is a global lock protecting zone state.
 156  *       Zones cannot change state while this lock is held.  It also
 157  *       protects the list of kernel threads associated with a zone.
 158  *   zone_lock: This is a per-zone lock used to protect several fields of
 159  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  *       this lock means that the zone cannot go away.
 161  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  *       related to the zone.max-lwps rctl.
 163  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  *       currently just max_lofi
 167  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  *       list (a list of zones in the ZONE_IS_DEAD state).
 170  *
 171  *   Ordering requirements:
 172  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  *
 175  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  *
 179  *   Blocking memory allocations are permitted while holding any of the
 180  *   zone locks.
 181  *
 182  *
 183  *   System Call Interface:
 184  *
 185  *   The zone subsystem can be managed and queried from user level with
 186  *   the following system calls (all subcodes of the primary "zone"
 187  *   system call):
 188  *   - zone_create: creates a zone with selected attributes (name,
 189  *     root path, privileges, resource controls, ZFS datasets)
 190  *   - zone_enter: allows the current process to enter a zone
 191  *   - zone_getattr: reports attributes of a zone
 192  *   - zone_setattr: set attributes of a zone
 193  *   - zone_boot: set 'init' running for the zone
 194  *   - zone_list: lists all zones active in the system
 195  *   - zone_lookup: looks up zone id based on name
 196  *   - zone_shutdown: initiates shutdown process (see states above)
 197  *   - zone_destroy: completes shutdown process (see states above)
 198  *
 199  */
 200
 201 #include <sys/priv_impl.h>
 202 #include <sys/cred.h>
 203 #include <c2/audit.h>
 204 #include <sys/debug.h>
 205 #include <sys/file.h>
 206 #include <sys/kmem.h>
 207 #include <sys/kstat.h>
 208 #include <sys/mutex.h>
 209 #include <sys/note.h>
 210 #include <sys/pathname.h>
 211 #include <sys/proc.h>
 212 #include <sys/project.h>
 213 #include <sys/sysevent.h>
 214 #include <sys/task.h>
 215 #include <sys/systm.h>
 216 #include <sys/types.h>
 217 #include <sys/utsname.h>
 218 #include <sys/vnode.h>
 219 #include <sys/vfs.h>
 220 #include <sys/systeminfo.h>
 221 #include <sys/policy.h>
 222 #include <sys/cred_impl.h>
 223 #include <sys/contract_impl.h>
 224 #include <sys/contract/process_impl.h>
 225 #include <sys/class.h>
 226 #include <sys/pool.h>
 227 #include <sys/pool_pset.h>
 228 #include <sys/pset.h>
 229 #include <sys/strlog.h>
 230 #include <sys/sysmacros.h>
 231 #include <sys/callb.h>
 232 #include <sys/vmparam.h>
 233 #include <sys/corectl.h>
 234 #include <sys/ipc_impl.h>
 235 #include <sys/klpd.h>
 236
 237 #include <sys/door.h>
 238 #include <sys/cpuvar.h>
 239 #include <sys/sdt.h>
 240
 241 #include <sys/uadmin.h>
 242 #include <sys/session.h>
 243 #include <sys/cmn_err.h>
 244 #include <sys/modhash.h>
 245 #include <sys/sunddi.h>
 246 #include <sys/nvpair.h>
 247 #include <sys/rctl.h>
 248 #include <sys/fss.h>
 249 #include <sys/brand.h>
 250 #include <sys/zone.h>
 251 #include <net/if.h>
 252 #include <sys/cpucaps.h>
 253 #include <vm/seg.h>
 254 #include <sys/mac.h>
 255
/*
 * ZONE_DESTROY_TIMEOUT_SECS is the number of seconds that threads waiting
 * for subsystems to release a zone's general-purpose references will wait
 * before they log the zone's reference counts.
 *
 * Choosing a value is a trade-off:
 *   - It shouldn't be so small that reference counts are unnecessarily
 *     reported for zones whose references are slowly released.
 *   - It shouldn't be so large that users reboot their systems out of
 *     frustration over hung zones before the system logs the zones'
 *     reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60
 266
/*
 * Element of the list of data link IDs which are accessible from the zone.
 */
typedef struct zone_dl {
        /* ID of the data link this entry describes */
        datalink_id_t   zdl_id;
        /* NOTE(review): presumably per-link network data; confirm with users */
        nvlist_t        *zdl_net;
        /* linkage into the list that holds this entry */
        list_node_t     zdl_linkage;
} zone_dl_t;
 273
/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 * (The "Locking" notes at the top of this file describe it as the global
 * lock protecting zone state.)
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
/* zsd_key_lock protects zsd_keyval and zsd_registered_keys below. */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

/* NOTE(review): presumably the size used for the zone hash tables; confirm. */
int zone_hash_size = 256;
/* Hash tables of active zones: one searched by name, the other by id. */
static mod_hash_t *zonehashbyname, *zonehashbyid;
/* Top-level global lock protecting the zone hash tables and lists. */
static kmutex_t zonehash_lock;
/* NOTE(review): presumably the number of zones currently tracked; confirm. */
static uint_t zonecount;
/* NOTE(review): presumably the ID space zone IDs are allocated from; confirm. */
static id_space_t *zoneid_space;
 304
/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock (zone_deathrow_lock) to avoid
 * lock ordering problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;
 335
/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details, which is why
 * several kernel states share one visible state.
 *
 * The trailing comment on each entry names the kernel state it stands for.
 * NOTE(review): presumably indexed by the kernel zone status value, so the
 * entry order must follow that enumeration -- confirm against sys/zone.h.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};
 354
/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).  The trailing comment on each entry names the
 * enumerator it corresponds to.
 * NOTE(review): presumably indexed by zone_ref_subsys_t, so entries must stay
 * in the same order as that enumeration -- confirm against sys/zone.h.
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};
 368
/*
 * Resource control (rctl) handles, one per rc_zone_* name below.
 * NOTE(review): presumably each is filled in when the matching zone-level
 * rctl is registered elsewhere in this file -- confirm.
 *
 * These aren't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
 383
/*
 * NOTE(review): presumably the init path used when a zone has none
 * configured -- confirm against the users of zone_default_initname.
 */
const char * const zone_default_initname = "/sbin/init";
/* NOTE(review): path prefix whose use is not visible here; confirm. */
static char * const zone_prefix = "/zone/";

/* Forward declarations (definitions are expected later in this file). */
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

/*
 * Signature shared by the zsd_apply_create/shutdown/destroy functions
 * declared below; zone_key_create() and zone_key_delete() hand them to
 * zsd_apply_all_zones() together with the key to operate on.
 * NOTE(review): the meaning of the kmutex_t * and boolean_t arguments is not
 * visible in this part of the file -- see the definitions.
 */
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);
 405
/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version history:
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
 424
 425 /*
 426  * Certain filesystems (such as NFS and autofs) need to know which zone
 427  * the mount is being placed in.  Because of this, we need to be able to
 428  * ensure that a zone isn't in the process of being created/destroyed such
 429  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 430  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 431  * mount list. Since a zone can't reside on an NFS file system, we don't
 432  * have to worry about the zonepath itself.
 433  *
 434  * The following functions: block_mounts()/resume_mounts() and
 435  * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  * layer (respectively) to synchronize zone state transitions and new
 437  * mounts within a zone. This synchronization is on a per-zone basis, so
 438  * activity for one zone will not interfere with activity for another zone.
 439  *
 440  * The semantics are like a reader-reader lock such that there may
 441  * either be multiple mounts (or zone state transitions, if that weren't
 442  * serialized by zonehash_lock) in progress at the same time, but not
 443  * both.
 444  *
 445  * We use cv's so the user can ctrl-C out of the operation if it's
 446  * taking too long.
 447  *
 448  * The semantics are such that there is unfair bias towards the
 449  * "current" operation.  This means that zone halt may starve if
 450  * there is a rapid succession of new mounts coming in to the zone.
 451  */
 452 /*
 453  * Prevent new mounts from progressing to the point of calling
 454  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 455  * them to complete.
 456  */
 457 static int
 458 block_mounts(zone_t *zp)
 459 {
 460         int retval = 0;
 461
 462         /*
 463          * Since it may block for a long time, block_mounts() shouldn't be
 464          * called with zonehash_lock held.
 465          */
 466         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 467         mutex_enter(&zp->zone_mount_lock);
 468         while (zp->zone_mounts_in_progress > 0) {
 469                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 470                         goto signaled;
 471         }
 472         /*
 473          * A negative value of mounts_in_progress indicates that mounts
 474          * have been blocked by (-mounts_in_progress) different callers
 475          * (remotely possible if two threads enter zone_shutdown at the same
 476          * time).
 477          */
 478         zp->zone_mounts_in_progress--;
 479         retval = 1;
 480 signaled:
 481         mutex_exit(&zp->zone_mount_lock);
 482         return (retval);
 483 }
 484
 485 /*
 486  * The VFS layer may progress with new mounts as far as we're concerned.
 487  * Allow them to progress if we were the last obstacle.
 488  */
 489 static void
 490 resume_mounts(zone_t *zp)
 491 {
 492         mutex_enter(&zp->zone_mount_lock);
 493         if (++zp->zone_mounts_in_progress == 0)
 494                 cv_broadcast(&zp->zone_mount_cv);
 495         mutex_exit(&zp->zone_mount_lock);
 496 }
 497
 498 /*
 499  * The VFS layer is busy with a mount; this zone should wait until all
 500  * of its mounts are completed to progress.
 501  */
 502 void
 503 mount_in_progress(zone_t *zp)
 504 {
 505         mutex_enter(&zp->zone_mount_lock);
 506         while (zp->zone_mounts_in_progress < 0)
 507                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 508         zp->zone_mounts_in_progress++;
 509         mutex_exit(&zp->zone_mount_lock);
 510 }
 511
 512 /*
 513  * VFS is done with one mount; wake up any waiting block_mounts()
 514  * callers if this is the last mount.
 515  */
 516 void
 517 mount_completed(zone_t *zp)
 518 {
 519         mutex_enter(&zp->zone_mount_lock);
 520         if (--zp->zone_mounts_in_progress == 0)
 521                 cv_broadcast(&zp->zone_mount_cv);
 522         mutex_exit(&zp->zone_mount_lock);
 523 }
 524
 525 /*
 526  * ZSD routines.
 527  *
 528  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  * defined by the pthread_key_create() and related interfaces.
 530  *
 531  * Kernel subsystems may register one or more data items and/or
 532  * callbacks to be executed when a zone is created, shutdown, or
 533  * destroyed.
 534  *
 535  * Unlike the thread counterpart, destructor callbacks will be executed
 536  * even if the data pointer is NULL and/or there are no constructor
 537  * callbacks, so it is the responsibility of such callbacks to check for
 538  * NULL data values if necessary.
 539  *
 540  * The locking strategy and overall picture is as follows:
 541  *
 542  * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  * holding that lock all the existing zones are marked as
 545  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  * zone_zsd list (protected by zone_lock). The global list is updated first
 547  * (under zsd_key_lock) to make sure that newly created zones use the
 548  * most recent list of keys. Then under zonehash_lock we walk the zones
 549  * and mark them.  Similar locking is used in zone_key_delete().
 550  *
 551  * The actual create, shutdown, and destroy callbacks are done without
 552  * holding any lock. And zsd_flags are used to ensure that the operations
 553  * completed so that when zone_key_create (and zone_create) is done, as well as
 554  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 555  * are completed.
 556  *
 557  * When new zones are created constructor callbacks for all registered ZSD
 558  * entries will be called. That also uses the above two phases of marking
 559  * what needs to be done, and then running the callbacks without holding
 560  * any locks.
 561  *
 562  * The framework does not provide any locking around zone_getspecific() and
 563  * zone_setspecific() apart from that needed for internal consistency, so
 564  * callers interested in atomic "test-and-set" semantics will need to provide
 565  * their own locking.
 566  */
 567
 568 /*
 569  * Helper function to find the zsd_entry associated with the key in the
 570  * given list.
 571  */
 572 static struct zsd_entry *
 573 zsd_find(list_t *l, zone_key_t key)
 574 {
 575         struct zsd_entry *zsd;
 576
 577         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 578                 if (zsd->zsd_key == key) {
 579                         return (zsd);
 580                 }
 581         }
 582         return (NULL);
 583 }
 584
 585 /*
 586  * Helper function to find the zsd_entry associated with the key in the
 587  * given list. Move it to the front of the list.
 588  */
 589 static struct zsd_entry *
 590 zsd_find_mru(list_t *l, zone_key_t key)
 591 {
 592         struct zsd_entry *zsd;
 593
 594         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 595                 if (zsd->zsd_key == key) {
 596                         /*
 597                          * Move to head of list to keep list in MRU order.
 598                          */
 599                         if (zsd != list_head(l)) {
 600                                 list_remove(l, zsd);
 601                                 list_insert_head(l, zsd);
 602                         }
 603                         return (zsd);
 604                 }
 605         }
 606         return (NULL);
 607 }
 608
 609 void
 610 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612 {
 613         struct zsd_entry *zsdp;
 614         struct zsd_entry *t;
 615         struct zone *zone;
 616         zone_key_t  key;
 617
 618         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619         zsdp->zsd_data = NULL;
 620         zsdp->zsd_create = create;
 621         zsdp->zsd_shutdown = shutdown;
 622         zsdp->zsd_destroy = destroy;
 623
 624         /*
 625          * Insert in global list of callbacks. Makes future zone creations
 626          * see it.
 627          */
 628         mutex_enter(&zsd_key_lock);
 629         key = zsdp->zsd_key = ++zsd_keyval;
 630         ASSERT(zsd_keyval != 0);
 631         list_insert_tail(&zsd_registered_keys, zsdp);
 632         mutex_exit(&zsd_key_lock);
 633
 634         /*
 635          * Insert for all existing zones and mark them as needing
 636          * a create callback.
 637          */
 638         mutex_enter(&zonehash_lock);    /* stop the world */
 639         for (zone = list_head(&zone_active); zone != NULL;
 640             zone = list_next(&zone_active, zone)) {
 641                 zone_status_t status;
 642
 643                 mutex_enter(&zone->zone_lock);
 644
 645                 /* Skip zones that are on the way down or not yet up */
 646                 status = zone_status_get(zone);
 647                 if (status >= ZONE_IS_DOWN ||
 648                     status == ZONE_IS_UNINITIALIZED) {
 649                         mutex_exit(&zone->zone_lock);
 650                         continue;
 651                 }
 652
 653                 t = zsd_find_mru(&zone->zone_zsd, key);
 654                 if (t != NULL) {
 655                         /*
 656                          * A zsd_configure already inserted it after
 657                          * we dropped zsd_key_lock above.
 658                          */
 659                         mutex_exit(&zone->zone_lock);
 660                         continue;
 661                 }
 662                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663                 t->zsd_key = key;
 664                 t->zsd_create = create;
 665                 t->zsd_shutdown = shutdown;
 666                 t->zsd_destroy = destroy;
 667                 if (create != NULL) {
 668                         t->zsd_flags = ZSD_CREATE_NEEDED;
 669                         DTRACE_PROBE2(zsd__create__needed,
 670                             zone_t *, zone, zone_key_t, key);
 671                 }
 672                 list_insert_tail(&zone->zone_zsd, t);
 673                 mutex_exit(&zone->zone_lock);
 674         }
 675         mutex_exit(&zonehash_lock);
 676
 677         if (create != NULL) {
 678                 /* Now call the create callback for this key */
 679                 zsd_apply_all_zones(zsd_apply_create, key);
 680         }
 681         /*
 682          * It is safe for consumers to use the key now, make it
 683          * globally visible. Specifically zone_getspecific() will
 684          * always successfully return the zone specific data associated
 685          * with the key.
 686          */
 687         *keyp = key;
 688
 689 }
 690
 691 /*
 692  * Function called when a module is being unloaded, or otherwise wishes
 693  * to unregister its ZSD key and callbacks.
 694  *
 695  * Remove from the global list and determine the functions that need to
 696  * be called under a global lock. Then call the functions without
 697  * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  * functions need to access the zone_zsd entries to find zsd_data etc.)
 699  */
 700 int
 701 zone_key_delete(zone_key_t key)
 702 {
 703         struct zsd_entry *zsdp = NULL;
 704         zone_t *zone;
 705
 706         mutex_enter(&zsd_key_lock);
 707         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708         if (zsdp == NULL) {
 709                 mutex_exit(&zsd_key_lock);
 710                 return (-1);
 711         }
 712         list_remove(&zsd_registered_keys, zsdp);
 713         mutex_exit(&zsd_key_lock);
 714
 715         mutex_enter(&zonehash_lock);
 716         for (zone = list_head(&zone_active); zone != NULL;
 717             zone = list_next(&zone_active, zone)) {
 718                 struct zsd_entry *del;
 719
 720                 mutex_enter(&zone->zone_lock);
 721                 del = zsd_find_mru(&zone->zone_zsd, key);
 722                 if (del == NULL) {
 723                         /*
 724                          * Somebody else got here first e.g the zone going
 725                          * away.
 726                          */
 727                         mutex_exit(&zone->zone_lock);
 728                         continue;
 729                 }
 730                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732                 if (del->zsd_shutdown != NULL &&
 733                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735                         DTRACE_PROBE2(zsd__shutdown__needed,
 736                             zone_t *, zone, zone_key_t, key);
 737                 }
 738                 if (del->zsd_destroy != NULL &&
 739                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741                         DTRACE_PROBE2(zsd__destroy__needed,
 742                             zone_t *, zone, zone_key_t, key);
 743                 }
 744                 mutex_exit(&zone->zone_lock);
 745         }
 746         mutex_exit(&zonehash_lock);
 747         kmem_free(zsdp, sizeof (*zsdp));
 748
 749         /* Now call the shutdown and destroy callback for this key */
 750         zsd_apply_all_zones(zsd_apply_shutdown, key);
 751         zsd_apply_all_zones(zsd_apply_destroy, key);
 752
 753         /* Now we can free up the zsdp structures in each zone */
 754         mutex_enter(&zonehash_lock);
 755         for (zone = list_head(&zone_active); zone != NULL;
 756             zone = list_next(&zone_active, zone)) {
 757                 struct zsd_entry *del;
 758
 759                 mutex_enter(&zone->zone_lock);
 760                 del = zsd_find(&zone->zone_zsd, key);
 761                 if (del != NULL) {
 762                         list_remove(&zone->zone_zsd, del);
 763                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764                         kmem_free(del, sizeof (*del));
 765                 }
 766                 mutex_exit(&zone->zone_lock);
 767         }
 768         mutex_exit(&zonehash_lock);
 769
 770         return (0);
 771 }
 772
 773 /*
 774  * ZSD counterpart of pthread_setspecific().
 775  *
 776  * Since all zsd callbacks, including those with no create function,
 777  * have an entry in zone_zsd, if the key is registered it is part of
 778  * the zone_zsd list.
 779  * Return an error if the key wasn't registerd.
 780  */
 781 int
 782 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783 {
 784         struct zsd_entry *t;
 785
 786         mutex_enter(&zone->zone_lock);
 787         t = zsd_find_mru(&zone->zone_zsd, key);
 788         if (t != NULL) {
 789                 /*
 790                  * Replace old value with new
 791                  */
 792                 t->zsd_data = (void *)data;
 793                 mutex_exit(&zone->zone_lock);
 794                 return (0);
 795         }
 796         mutex_exit(&zone->zone_lock);
 797         return (-1);
 798 }
 799
 800 /*
 801  * ZSD counterpart of pthread_getspecific().
 802  */
 803 void *
 804 zone_getspecific(zone_key_t key, zone_t *zone)
 805 {
 806         struct zsd_entry *t;
 807         void *data;
 808
 809         mutex_enter(&zone->zone_lock);
 810         t = zsd_find_mru(&zone->zone_zsd, key);
 811         data = (t == NULL ? NULL : t->zsd_data);
 812         mutex_exit(&zone->zone_lock);
 813         return (data);
 814 }
 815
 816 /*
 817  * Function used to initialize a zone's list of ZSD callbacks and data
 818  * when the zone is being created.  The callbacks are initialized from
 819  * the template list (zsd_registered_keys). The constructor callback is
 820  * executed later (once the zone exists and with locks dropped).
 821  */
 822 static void
 823 zone_zsd_configure(zone_t *zone)
 824 {
 825         struct zsd_entry *zsdp;
 826         struct zsd_entry *t;
 827
 828         ASSERT(MUTEX_HELD(&zonehash_lock));
 829         ASSERT(list_head(&zone->zone_zsd) == NULL);
 830         mutex_enter(&zone->zone_lock);
 831         mutex_enter(&zsd_key_lock);
 832         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834                 /*
 835                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836                  * should not have added anything to it.
 837                  */
 838                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839
 840                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841                 t->zsd_key = zsdp->zsd_key;
 842                 t->zsd_create = zsdp->zsd_create;
 843                 t->zsd_shutdown = zsdp->zsd_shutdown;
 844                 t->zsd_destroy = zsdp->zsd_destroy;
 845                 if (zsdp->zsd_create != NULL) {
 846                         t->zsd_flags = ZSD_CREATE_NEEDED;
 847                         DTRACE_PROBE2(zsd__create__needed,
 848                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849                 }
 850                 list_insert_tail(&zone->zone_zsd, t);
 851         }
 852         mutex_exit(&zsd_key_lock);
 853         mutex_exit(&zone->zone_lock);
 854 }
 855
 856 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857
 858 /*
 859  * Helper function to execute shutdown or destructor callbacks.
 860  */
 861 static void
 862 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863 {
 864         struct zsd_entry *t;
 865
 866         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869
 870         /*
 871          * Run the callback solely based on what is registered for the zone
 872          * in zone_zsd. The global list can change independently of this
 873          * as keys are registered and unregistered and we don't register new
 874          * callbacks for a zone that is in the process of going away.
 875          */
 876         mutex_enter(&zone->zone_lock);
 877         for (t = list_head(&zone->zone_zsd); t != NULL;
 878             t = list_next(&zone->zone_zsd, t)) {
 879                 zone_key_t key = t->zsd_key;
 880
 881                 /* Skip if no callbacks registered */
 882
 883                 if (ct == ZSD_SHUTDOWN) {
 884                         if (t->zsd_shutdown != NULL &&
 885                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887                                 DTRACE_PROBE2(zsd__shutdown__needed,
 888                                     zone_t *, zone, zone_key_t, key);
 889                         }
 890                 } else {
 891                         if (t->zsd_destroy != NULL &&
 892                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894                                 DTRACE_PROBE2(zsd__destroy__needed,
 895                                     zone_t *, zone, zone_key_t, key);
 896                         }
 897                 }
 898         }
 899         mutex_exit(&zone->zone_lock);
 900
 901         /* Now call the shutdown and destroy callback for this key */
 902         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903         zsd_apply_all_keys(zsd_apply_destroy, zone);
 904
 905 }
 906
 907 /*
 908  * Called when the zone is going away; free ZSD-related memory, and
 909  * destroy the zone_zsd list.
 910  */
 911 static void
 912 zone_free_zsd(zone_t *zone)
 913 {
 914         struct zsd_entry *t, *next;
 915
 916         /*
 917          * Free all the zsd_entry's we had on this zone.
 918          */
 919         mutex_enter(&zone->zone_lock);
 920         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921                 next = list_next(&zone->zone_zsd, t);
 922                 list_remove(&zone->zone_zsd, t);
 923                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924                 kmem_free(t, sizeof (*t));
 925         }
 926         list_destroy(&zone->zone_zsd);
 927         mutex_exit(&zone->zone_lock);
 928
 929 }
 930
 931 /*
 932  * Apply a function to all zones for particular key value.
 933  *
 934  * The applyfn has to drop zonehash_lock if it does some work, and
 935  * then reacquire it before it returns.
 936  * When the lock is dropped we don't follow list_next even
 937  * if it is possible to do so without any hazards. This is
 938  * because we want the design to allow for the list of zones
 939  * to change in any arbitrary way during the time the
 940  * lock was dropped.
 941  *
 942  * It is safe to restart the loop at list_head since the applyfn
 943  * changes the zsd_flags as it does work, so a subsequent
 944  * pass through will have no effect in applyfn, hence the loop will terminate
 945  * in at worst O(N^2).
 946  */
 947 static void
 948 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949 {
 950         zone_t *zone;
 951
 952         mutex_enter(&zonehash_lock);
 953         zone = list_head(&zone_active);
 954         while (zone != NULL) {
 955                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956                         /* Lock dropped - restart at head */
 957                         zone = list_head(&zone_active);
 958                 } else {
 959                         zone = list_next(&zone_active, zone);
 960                 }
 961         }
 962         mutex_exit(&zonehash_lock);
 963 }
 964
 965 /*
 966  * Apply a function to all keys for a particular zone.
 967  *
 968  * The applyfn has to drop zonehash_lock if it does some work, and
 969  * then reacquire it before it returns.
 970  * When the lock is dropped we don't follow list_next even
 971  * if it is possible to do so without any hazards. This is
 972  * because we want the design to allow for the list of zsd callbacks
 973  * to change in any arbitrary way during the time the
 974  * lock was dropped.
 975  *
 976  * It is safe to restart the loop at list_head since the applyfn
 977  * changes the zsd_flags as it does work, so a subsequent
 978  * pass through will have no effect in applyfn, hence the loop will terminate
 979  * in at worst O(N^2).
 980  */
 981 static void
 982 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983 {
 984         struct zsd_entry *t;
 985
 986         mutex_enter(&zone->zone_lock);
 987         t = list_head(&zone->zone_zsd);
 988         while (t != NULL) {
 989                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990                         /* Lock dropped - restart at head */
 991                         t = list_head(&zone->zone_zsd);
 992                 } else {
 993                         t = list_next(&zone->zone_zsd, t);
 994                 }
 995         }
 996         mutex_exit(&zone->zone_lock);
 997 }
 998
 999 /*
1000  * Call the create function for the zone and key if CREATE_NEEDED
1001  * is set.
1002  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003  * we wait for that thread to complete so that we can ensure that
1004  * all the callbacks are done when we've looped over all zones/keys.
1005  *
1006  * When we call the create function, we drop the global held by the
1007  * caller, and return true to tell the caller it needs to re-evalute the
1008  * state.
1009  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010  * remains held on exit.
1011  */
1012 static boolean_t
1013 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014     zone_t *zone, zone_key_t key)
1015 {
1016         void *result;
1017         struct zsd_entry *t;
1018         boolean_t dropped;
1019
1020         if (lockp != NULL) {
1021                 ASSERT(MUTEX_HELD(lockp));
1022         }
1023         if (zone_lock_held) {
1024                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1025         } else {
1026                 mutex_enter(&zone->zone_lock);
1027         }
1028
1029         t = zsd_find(&zone->zone_zsd, key);
1030         if (t == NULL) {
1031                 /*
1032                  * Somebody else got here first e.g the zone going
1033                  * away.
1034                  */
1035                 if (!zone_lock_held)
1036                         mutex_exit(&zone->zone_lock);
1037                 return (B_FALSE);
1038         }
1039         dropped = B_FALSE;
1040         if (zsd_wait_for_inprogress(zone, t, lockp))
1041                 dropped = B_TRUE;
1042
1043         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046                 DTRACE_PROBE2(zsd__create__inprogress,
1047                     zone_t *, zone, zone_key_t, key);
1048                 mutex_exit(&zone->zone_lock);
1049                 if (lockp != NULL)
1050                         mutex_exit(lockp);
1051
1052                 dropped = B_TRUE;
1053                 ASSERT(t->zsd_create != NULL);
1054                 DTRACE_PROBE2(zsd__create__start,
1055                     zone_t *, zone, zone_key_t, key);
1056
1057                 result = (*t->zsd_create)(zone->zone_id);
1058
1059                 DTRACE_PROBE2(zsd__create__end,
1060                     zone_t *, zone, voidn *, result);
1061
1062                 ASSERT(result != NULL);
1063                 if (lockp != NULL)
1064                         mutex_enter(lockp);
1065                 mutex_enter(&zone->zone_lock);
1066                 t->zsd_data = result;
1067                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069                 cv_broadcast(&t->zsd_cv);
1070                 DTRACE_PROBE2(zsd__create__completed,
1071                     zone_t *, zone, zone_key_t, key);
1072         }
1073         if (!zone_lock_held)
1074                 mutex_exit(&zone->zone_lock);
1075         return (dropped);
1076 }
1077
1078 /*
1079  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080  * is set.
1081  * If some other thread gets here first and sets *_INPROGRESS, then
1082  * we wait for that thread to complete so that we can ensure that
1083  * all the callbacks are done when we've looped over all zones/keys.
1084  *
1085  * When we call the shutdown function, we drop the global held by the
1086  * caller, and return true to tell the caller it needs to re-evalute the
1087  * state.
1088  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089  * remains held on exit.
1090  */
1091 static boolean_t
1092 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093     zone_t *zone, zone_key_t key)
1094 {
1095         struct zsd_entry *t;
1096         void *data;
1097         boolean_t dropped;
1098
1099         if (lockp != NULL) {
1100                 ASSERT(MUTEX_HELD(lockp));
1101         }
1102         if (zone_lock_held) {
1103                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1104         } else {
1105                 mutex_enter(&zone->zone_lock);
1106         }
1107
1108         t = zsd_find(&zone->zone_zsd, key);
1109         if (t == NULL) {
1110                 /*
1111                  * Somebody else got here first e.g the zone going
1112                  * away.
1113                  */
1114                 if (!zone_lock_held)
1115                         mutex_exit(&zone->zone_lock);
1116                 return (B_FALSE);
1117         }
1118         dropped = B_FALSE;
1119         if (zsd_wait_for_creator(zone, t, lockp))
1120                 dropped = B_TRUE;
1121
1122         if (zsd_wait_for_inprogress(zone, t, lockp))
1123                 dropped = B_TRUE;
1124
1125         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1129                     zone_t *, zone, zone_key_t, key);
1130                 mutex_exit(&zone->zone_lock);
1131                 if (lockp != NULL)
1132                         mutex_exit(lockp);
1133                 dropped = B_TRUE;
1134
1135                 ASSERT(t->zsd_shutdown != NULL);
1136                 data = t->zsd_data;
1137
1138                 DTRACE_PROBE2(zsd__shutdown__start,
1139                     zone_t *, zone, zone_key_t, key);
1140
1141                 (t->zsd_shutdown)(zone->zone_id, data);
1142                 DTRACE_PROBE2(zsd__shutdown__end,
1143                     zone_t *, zone, zone_key_t, key);
1144
1145                 if (lockp != NULL)
1146                         mutex_enter(lockp);
1147                 mutex_enter(&zone->zone_lock);
1148                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150                 cv_broadcast(&t->zsd_cv);
1151                 DTRACE_PROBE2(zsd__shutdown__completed,
1152                     zone_t *, zone, zone_key_t, key);
1153         }
1154         if (!zone_lock_held)
1155                 mutex_exit(&zone->zone_lock);
1156         return (dropped);
1157 }
1158
1159 /*
1160  * Call the destroy function for the zone and key if DESTROY_NEEDED
1161  * is set.
1162  * If some other thread gets here first and sets *_INPROGRESS, then
1163  * we wait for that thread to complete so that we can ensure that
1164  * all the callbacks are done when we've looped over all zones/keys.
1165  *
1166  * When we call the destroy function, we drop the global held by the
1167  * caller, and return true to tell the caller it needs to re-evalute the
1168  * state.
1169  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170  * remains held on exit.
1171  */
1172 static boolean_t
1173 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174     zone_t *zone, zone_key_t key)
1175 {
1176         struct zsd_entry *t;
1177         void *data;
1178         boolean_t dropped;
1179
1180         if (lockp != NULL) {
1181                 ASSERT(MUTEX_HELD(lockp));
1182         }
1183         if (zone_lock_held) {
1184                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1185         } else {
1186                 mutex_enter(&zone->zone_lock);
1187         }
1188
1189         t = zsd_find(&zone->zone_zsd, key);
1190         if (t == NULL) {
1191                 /*
1192                  * Somebody else got here first e.g the zone going
1193                  * away.
1194                  */
1195                 if (!zone_lock_held)
1196                         mutex_exit(&zone->zone_lock);
1197                 return (B_FALSE);
1198         }
1199         dropped = B_FALSE;
1200         if (zsd_wait_for_creator(zone, t, lockp))
1201                 dropped = B_TRUE;
1202
1203         if (zsd_wait_for_inprogress(zone, t, lockp))
1204                 dropped = B_TRUE;
1205
1206         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209                 DTRACE_PROBE2(zsd__destroy__inprogress,
1210                     zone_t *, zone, zone_key_t, key);
1211                 mutex_exit(&zone->zone_lock);
1212                 if (lockp != NULL)
1213                         mutex_exit(lockp);
1214                 dropped = B_TRUE;
1215
1216                 ASSERT(t->zsd_destroy != NULL);
1217                 data = t->zsd_data;
1218                 DTRACE_PROBE2(zsd__destroy__start,
1219                     zone_t *, zone, zone_key_t, key);
1220
1221                 (t->zsd_destroy)(zone->zone_id, data);
1222                 DTRACE_PROBE2(zsd__destroy__end,
1223                     zone_t *, zone, zone_key_t, key);
1224
1225                 if (lockp != NULL)
1226                         mutex_enter(lockp);
1227                 mutex_enter(&zone->zone_lock);
1228                 t->zsd_data = NULL;
1229                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231                 cv_broadcast(&t->zsd_cv);
1232                 DTRACE_PROBE2(zsd__destroy__completed,
1233                     zone_t *, zone, zone_key_t, key);
1234         }
1235         if (!zone_lock_held)
1236                 mutex_exit(&zone->zone_lock);
1237         return (dropped);
1238 }
1239
1240 /*
1241  * Wait for any CREATE_NEEDED flag to be cleared.
1242  * Returns true if lockp was temporarily dropped while waiting.
1243  */
1244 static boolean_t
1245 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 {
1247         boolean_t dropped = B_FALSE;
1248
1249         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250                 DTRACE_PROBE2(zsd__wait__for__creator,
1251                     zone_t *, zone, struct zsd_entry *, t);
1252                 if (lockp != NULL) {
1253                         dropped = B_TRUE;
1254                         mutex_exit(lockp);
1255                 }
1256                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1257                 if (lockp != NULL) {
1258                         /* First drop zone_lock to preserve order */
1259                         mutex_exit(&zone->zone_lock);
1260                         mutex_enter(lockp);
1261                         mutex_enter(&zone->zone_lock);
1262                 }
1263         }
1264         return (dropped);
1265 }
1266
1267 /*
1268  * Wait for any INPROGRESS flag to be cleared.
1269  * Returns true if lockp was temporarily dropped while waiting.
1270  */
1271 static boolean_t
1272 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 {
1274         boolean_t dropped = B_FALSE;
1275
1276         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1278                     zone_t *, zone, struct zsd_entry *, t);
1279                 if (lockp != NULL) {
1280                         dropped = B_TRUE;
1281                         mutex_exit(lockp);
1282                 }
1283                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1284                 if (lockp != NULL) {
1285                         /* First drop zone_lock to preserve order */
1286                         mutex_exit(&zone->zone_lock);
1287                         mutex_enter(lockp);
1288                         mutex_enter(&zone->zone_lock);
1289                 }
1290         }
1291         return (dropped);
1292 }
1293
1294 /*
1295  * Frees memory associated with the zone dataset list.
1296  */
1297 static void
1298 zone_free_datasets(zone_t *zone)
1299 {
1300         zone_dataset_t *t, *next;
1301
1302         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303                 next = list_next(&zone->zone_datasets, t);
1304                 list_remove(&zone->zone_datasets, t);
1305                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306                 kmem_free(t, sizeof (*t));
1307         }
1308         list_destroy(&zone->zone_datasets);
1309 }
1310
1311 /*
1312  * zone.cpu-shares resource control support.
1313  */
1314 /*ARGSUSED*/
1315 static rctl_qty_t
1316 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 {
1318         ASSERT(MUTEX_HELD(&p->p_lock));
1319         return (p->p_zone->zone_shares);
1320 }
1321
1322 /*ARGSUSED*/
1323 static int
1324 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325     rctl_qty_t nv)
1326 {
1327         ASSERT(MUTEX_HELD(&p->p_lock));
1328         ASSERT(e->rcep_t == RCENTITY_ZONE);
1329         if (e->rcep_p.zone == NULL)
1330                 return (0);
1331
1332         e->rcep_p.zone->zone_shares = nv;
1333         return (0);
1334 }
1335
1336 static rctl_ops_t zone_cpu_shares_ops = {
1337         rcop_no_action,
1338         zone_cpu_shares_usage,
1339         zone_cpu_shares_set,
1340         rcop_no_test
1341 };
1342
1343 /*
1344  * zone.cpu-cap resource control support.
1345  */
1346 /*ARGSUSED*/
1347 static rctl_qty_t
1348 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 {
1350         ASSERT(MUTEX_HELD(&p->p_lock));
1351         return (cpucaps_zone_get(p->p_zone));
1352 }
1353
1354 /*ARGSUSED*/
1355 static int
1356 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357     rctl_qty_t nv)
1358 {
1359         zone_t *zone = e->rcep_p.zone;
1360
1361         ASSERT(MUTEX_HELD(&p->p_lock));
1362         ASSERT(e->rcep_t == RCENTITY_ZONE);
1363
1364         if (zone == NULL)
1365                 return (0);
1366
1367         /*
1368          * set cap to the new value.
1369          */
1370         return (cpucaps_zone_set(zone, nv));
1371 }
1372
1373 static rctl_ops_t zone_cpu_cap_ops = {
1374         rcop_no_action,
1375         zone_cpu_cap_get,
1376         zone_cpu_cap_set,
1377         rcop_no_test
1378 };
1379
1380 /*ARGSUSED*/
1381 static rctl_qty_t
1382 zone_lwps_usage(rctl_t *r, proc_t *p)
1383 {
1384         rctl_qty_t nlwps;
1385         zone_t *zone = p->p_zone;
1386
1387         ASSERT(MUTEX_HELD(&p->p_lock));
1388
1389         mutex_enter(&zone->zone_nlwps_lock);
1390         nlwps = zone->zone_nlwps;
1391         mutex_exit(&zone->zone_nlwps_lock);
1392
1393         return (nlwps);
1394 }
1395
1396 /*ARGSUSED*/
1397 static int
1398 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399     rctl_qty_t incr, uint_t flags)
1400 {
1401         rctl_qty_t nlwps;
1402
1403         ASSERT(MUTEX_HELD(&p->p_lock));
1404         ASSERT(e->rcep_t == RCENTITY_ZONE);
1405         if (e->rcep_p.zone == NULL)
1406                 return (0);
1407         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408         nlwps = e->rcep_p.zone->zone_nlwps;
1409
1410         if (nlwps + incr > rcntl->rcv_value)
1411                 return (1);
1412
1413         return (0);
1414 }
1415
1416 /*ARGSUSED*/
1417 static int
1418 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 {
1420         ASSERT(MUTEX_HELD(&p->p_lock));
1421         ASSERT(e->rcep_t == RCENTITY_ZONE);
1422         if (e->rcep_p.zone == NULL)
1423                 return (0);
1424         e->rcep_p.zone->zone_nlwps_ctl = nv;
1425         return (0);
1426 }
1427
1428 static rctl_ops_t zone_lwps_ops = {
1429         rcop_no_action,
1430         zone_lwps_usage,
1431         zone_lwps_set,
1432         zone_lwps_test,
1433 };
1434
1435 /*ARGSUSED*/
1436 static rctl_qty_t
1437 zone_procs_usage(rctl_t *r, proc_t *p)
1438 {
1439         rctl_qty_t nprocs;
1440         zone_t *zone = p->p_zone;
1441
1442         ASSERT(MUTEX_HELD(&p->p_lock));
1443
1444         mutex_enter(&zone->zone_nlwps_lock);
1445         nprocs = zone->zone_nprocs;
1446         mutex_exit(&zone->zone_nlwps_lock);
1447
1448         return (nprocs);
1449 }
1450
1451 /*ARGSUSED*/
1452 static int
1453 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454     rctl_qty_t incr, uint_t flags)
1455 {
1456         rctl_qty_t nprocs;
1457
1458         ASSERT(MUTEX_HELD(&p->p_lock));
1459         ASSERT(e->rcep_t == RCENTITY_ZONE);
1460         if (e->rcep_p.zone == NULL)
1461                 return (0);
1462         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463         nprocs = e->rcep_p.zone->zone_nprocs;
1464
1465         if (nprocs + incr > rcntl->rcv_value)
1466                 return (1);
1467
1468         return (0);
1469 }
1470
1471 /*ARGSUSED*/
1472 static int
1473 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 {
1475         ASSERT(MUTEX_HELD(&p->p_lock));
1476         ASSERT(e->rcep_t == RCENTITY_ZONE);
1477         if (e->rcep_p.zone == NULL)
1478                 return (0);
1479         e->rcep_p.zone->zone_nprocs_ctl = nv;
1480         return (0);
1481 }
1482
1483 static rctl_ops_t zone_procs_ops = {
1484         rcop_no_action,
1485         zone_procs_usage,
1486         zone_procs_set,
1487         zone_procs_test,
1488 };
1489
1490 /*ARGSUSED*/
1491 static rctl_qty_t
1492 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 {
1494         ASSERT(MUTEX_HELD(&p->p_lock));
1495         return (p->p_zone->zone_shmmax);
1496 }
1497
1498 /*ARGSUSED*/
1499 static int
1500 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501     rctl_qty_t incr, uint_t flags)
1502 {
1503         rctl_qty_t v;
1504         ASSERT(MUTEX_HELD(&p->p_lock));
1505         ASSERT(e->rcep_t == RCENTITY_ZONE);
1506         v = e->rcep_p.zone->zone_shmmax + incr;
1507         if (v > rval->rcv_value)
1508                 return (1);
1509         return (0);
1510 }
1511
1512 static rctl_ops_t zone_shmmax_ops = {
1513         rcop_no_action,
1514         zone_shmmax_usage,
1515         rcop_no_set,
1516         zone_shmmax_test
1517 };
1518
1519 /*ARGSUSED*/
1520 static rctl_qty_t
1521 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 {
1523         ASSERT(MUTEX_HELD(&p->p_lock));
1524         return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 }
1526
1527 /*ARGSUSED*/
1528 static int
1529 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t v;
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536         if (v > rval->rcv_value)
1537                 return (1);
1538         return (0);
1539 }
1540
1541 static rctl_ops_t zone_shmmni_ops = {
1542         rcop_no_action,
1543         zone_shmmni_usage,
1544         rcop_no_set,
1545         zone_shmmni_test
1546 };
1547
1548 /*ARGSUSED*/
1549 static rctl_qty_t
1550 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 {
1552         ASSERT(MUTEX_HELD(&p->p_lock));
1553         return (p->p_zone->zone_ipc.ipcq_semmni);
1554 }
1555
1556 /*ARGSUSED*/
1557 static int
1558 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559     rctl_qty_t incr, uint_t flags)
1560 {
1561         rctl_qty_t v;
1562         ASSERT(MUTEX_HELD(&p->p_lock));
1563         ASSERT(e->rcep_t == RCENTITY_ZONE);
1564         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565         if (v > rval->rcv_value)
1566                 return (1);
1567         return (0);
1568 }
1569
1570 static rctl_ops_t zone_semmni_ops = {
1571         rcop_no_action,
1572         zone_semmni_usage,
1573         rcop_no_set,
1574         zone_semmni_test
1575 };
1576
1577 /*ARGSUSED*/
1578 static rctl_qty_t
1579 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 {
1581         ASSERT(MUTEX_HELD(&p->p_lock));
1582         return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 }
1584
1585 /*ARGSUSED*/
1586 static int
1587 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588     rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t v;
1591         ASSERT(MUTEX_HELD(&p->p_lock));
1592         ASSERT(e->rcep_t == RCENTITY_ZONE);
1593         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594         if (v > rval->rcv_value)
1595                 return (1);
1596         return (0);
1597 }
1598
1599 static rctl_ops_t zone_msgmni_ops = {
1600         rcop_no_action,
1601         zone_msgmni_usage,
1602         rcop_no_set,
1603         zone_msgmni_test
1604 };
1605
1606 /*ARGSUSED*/
1607 static rctl_qty_t
1608 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 {
1610         rctl_qty_t q;
1611         ASSERT(MUTEX_HELD(&p->p_lock));
1612         mutex_enter(&p->p_zone->zone_mem_lock);
1613         q = p->p_zone->zone_locked_mem;
1614         mutex_exit(&p->p_zone->zone_mem_lock);
1615         return (q);
1616 }
1617
1618 /*ARGSUSED*/
1619 static int
1620 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 {
1623         rctl_qty_t q;
1624         zone_t *z;
1625
1626         z = e->rcep_p.zone;
1627         ASSERT(MUTEX_HELD(&p->p_lock));
1628         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629         q = z->zone_locked_mem;
1630         if (q + incr > rcntl->rcv_value)
1631                 return (1);
1632         return (0);
1633 }
1634
1635 /*ARGSUSED*/
1636 static int
1637 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638     rctl_qty_t nv)
1639 {
1640         ASSERT(MUTEX_HELD(&p->p_lock));
1641         ASSERT(e->rcep_t == RCENTITY_ZONE);
1642         if (e->rcep_p.zone == NULL)
1643                 return (0);
1644         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645         return (0);
1646 }
1647
/*
 * Callback table for the zone locked-memory resource control; it is passed
 * to rctl_register() for "zone.max-locked-memory" in zone_init().  The
 * action slot uses the generic rcop_no_action handler.
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,
        zone_locked_mem_set,
        zone_locked_mem_test
};
1654
1655 /*ARGSUSED*/
1656 static rctl_qty_t
1657 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 {
1659         rctl_qty_t q;
1660         zone_t *z = p->p_zone;
1661
1662         ASSERT(MUTEX_HELD(&p->p_lock));
1663         mutex_enter(&z->zone_mem_lock);
1664         q = z->zone_max_swap;
1665         mutex_exit(&z->zone_mem_lock);
1666         return (q);
1667 }
1668
1669 /*ARGSUSED*/
1670 static int
1671 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 {
1674         rctl_qty_t q;
1675         zone_t *z;
1676
1677         z = e->rcep_p.zone;
1678         ASSERT(MUTEX_HELD(&p->p_lock));
1679         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680         q = z->zone_max_swap;
1681         if (q + incr > rcntl->rcv_value)
1682                 return (1);
1683         return (0);
1684 }
1685
1686 /*ARGSUSED*/
1687 static int
1688 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689     rctl_qty_t nv)
1690 {
1691         ASSERT(MUTEX_HELD(&p->p_lock));
1692         ASSERT(e->rcep_t == RCENTITY_ZONE);
1693         if (e->rcep_p.zone == NULL)
1694                 return (0);
1695         e->rcep_p.zone->zone_max_swap_ctl = nv;
1696         return (0);
1697 }
1698
/*
 * Callback table for the zone swap resource control; it is passed to
 * rctl_register() for "zone.max-swap" in zone_init().  The action slot
 * uses the generic rcop_no_action handler.
 */
static rctl_ops_t zone_max_swap_ops = {
        rcop_no_action,
        zone_max_swap_usage,
        zone_max_swap_set,
        zone_max_swap_test
};
1705
1706 /*ARGSUSED*/
1707 static rctl_qty_t
1708 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 {
1710         rctl_qty_t q;
1711         zone_t *z = p->p_zone;
1712
1713         ASSERT(MUTEX_HELD(&p->p_lock));
1714         mutex_enter(&z->zone_rctl_lock);
1715         q = z->zone_max_lofi;
1716         mutex_exit(&z->zone_rctl_lock);
1717         return (q);
1718 }
1719
1720 /*ARGSUSED*/
1721 static int
1722 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 {
1725         rctl_qty_t q;
1726         zone_t *z;
1727
1728         z = e->rcep_p.zone;
1729         ASSERT(MUTEX_HELD(&p->p_lock));
1730         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731         q = z->zone_max_lofi;
1732         if (q + incr > rcntl->rcv_value)
1733                 return (1);
1734         return (0);
1735 }
1736
1737 /*ARGSUSED*/
1738 static int
1739 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740     rctl_qty_t nv)
1741 {
1742         ASSERT(MUTEX_HELD(&p->p_lock));
1743         ASSERT(e->rcep_t == RCENTITY_ZONE);
1744         if (e->rcep_p.zone == NULL)
1745                 return (0);
1746         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747         return (0);
1748 }
1749
/*
 * Callback table for the zone lofi resource control; it is passed to
 * rctl_register() for "zone.max-lofi" in zone_init().  The action slot
 * uses the generic rcop_no_action handler.
 */
static rctl_ops_t zone_max_lofi_ops = {
        rcop_no_action,
        zone_max_lofi_usage,
        zone_max_lofi_set,
        zone_max_lofi_test
};
1756
1757 /*
1758  * Helper function to brand the zone with a unique ID.
1759  */
1760 static void
1761 zone_uniqid(zone_t *zone)
1762 {
1763         static uint64_t uniqid = 0;
1764
1765         ASSERT(MUTEX_HELD(&zonehash_lock));
1766         zone->zone_uniqid = uniqid++;
1767 }
1768
1769 /*
1770  * Returns a held pointer to the "kcred" for the specified zone.
1771  */
1772 struct cred *
1773 zone_get_kcred(zoneid_t zoneid)
1774 {
1775         zone_t *zone;
1776         cred_t *cr;
1777
1778         if ((zone = zone_find_by_id(zoneid)) == NULL)
1779                 return (NULL);
1780         cr = zone->zone_kcred;
1781         crhold(cr);
1782         zone_rele(zone);
1783         return (cr);
1784 }
1785
1786 static int
1787 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 {
1789         zone_t *zone = ksp->ks_private;
1790         zone_kstat_t *zk = ksp->ks_data;
1791
1792         if (rw == KSTAT_WRITE)
1793                 return (EACCES);
1794
1795         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797         return (0);
1798 }
1799
1800 static int
1801 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 {
1803         zone_t *zone = ksp->ks_private;
1804         zone_kstat_t *zk = ksp->ks_data;
1805
1806         if (rw == KSTAT_WRITE)
1807                 return (EACCES);
1808
1809         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811         return (0);
1812 }
1813
1814 static int
1815 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 {
1817         zone_t *zone = ksp->ks_private;
1818         zone_kstat_t *zk = ksp->ks_data;
1819
1820         if (rw == KSTAT_WRITE)
1821                 return (EACCES);
1822
1823         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825         return (0);
1826 }
1827
1828 static kstat_t *
1829 zone_kstat_create_common(zone_t *zone, char *name,
1830     int (*updatefunc) (kstat_t *, int))
1831 {
1832         kstat_t *ksp;
1833         zone_kstat_t *zk;
1834
1835         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837             KSTAT_FLAG_VIRTUAL);
1838
1839         if (ksp == NULL)
1840                 return (NULL);
1841
1842         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848         ksp->ks_update = updatefunc;
1849         ksp->ks_private = zone;
1850         kstat_install(ksp);
1851         return (ksp);
1852 }
1853
1854
1855 static int
1856 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 {
1858         zone_t *zone = ksp->ks_private;
1859         zone_mcap_kstat_t *zmp = ksp->ks_data;
1860
1861         if (rw == KSTAT_WRITE)
1862                 return (EACCES);
1863
1864         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869
1870         return (0);
1871 }
1872
1873 static kstat_t *
1874 zone_mcap_kstat_create(zone_t *zone)
1875 {
1876         kstat_t *ksp;
1877         zone_mcap_kstat_t *zmp;
1878
1879         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883                 return (NULL);
1884
1885         if (zone->zone_id != GLOBAL_ZONEID)
1886                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1887
1888         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890         ksp->ks_lock = &zone->zone_mcap_lock;
1891         zone->zone_mcap_stats = zmp;
1892
1893         /* The kstat "name" field is not large enough for a full zonename */
1894         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901             KSTAT_DATA_UINT64);
1902
1903         ksp->ks_update = zone_mcap_kstat_update;
1904         ksp->ks_private = zone;
1905
1906         kstat_install(ksp);
1907         return (ksp);
1908 }
1909
1910 static int
1911 zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 {
1913         zone_t *zone = ksp->ks_private;
1914         zone_misc_kstat_t *zmp = ksp->ks_data;
1915         hrtime_t tmp;
1916
1917         if (rw == KSTAT_WRITE)
1918                 return (EACCES);
1919
1920         tmp = zone->zone_utime;
1921         scalehrtime(&tmp);
1922         zmp->zm_utime.value.ui64 = tmp;
1923         tmp = zone->zone_stime;
1924         scalehrtime(&tmp);
1925         zmp->zm_stime.value.ui64 = tmp;
1926         tmp = zone->zone_wtime;
1927         scalehrtime(&tmp);
1928         zmp->zm_wtime.value.ui64 = tmp;
1929
1930         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1931         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1932         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1933
1934         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1935         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1936         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1937         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1938
1939         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1940
1941         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1942         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1943
1944         return (0);
1945 }
1946
1947 static kstat_t *
1948 zone_misc_kstat_create(zone_t *zone)
1949 {
1950         kstat_t *ksp;
1951         zone_misc_kstat_t *zmp;
1952
1953         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1954             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1955             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1956             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1957                 return (NULL);
1958
1959         if (zone->zone_id != GLOBAL_ZONEID)
1960                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1961
1962         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1963         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1964         ksp->ks_lock = &zone->zone_misc_lock;
1965         zone->zone_misc_stats = zmp;
1966
1967         /* The kstat "name" field is not large enough for a full zonename */
1968         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1969         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1970         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1971         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1972         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1973         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1974         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1975         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1976             KSTAT_DATA_UINT32);
1977         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1978         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1979             KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1982         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1983             KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1985         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1986
1987         ksp->ks_update = zone_misc_kstat_update;
1988         ksp->ks_private = zone;
1989
1990         kstat_install(ksp);
1991         return (ksp);
1992 }
1993
1994 static void
1995 zone_kstat_create(zone_t *zone)
1996 {
1997         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1998             "lockedmem", zone_lockedmem_kstat_update);
1999         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2000             "swapresv", zone_swapresv_kstat_update);
2001         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2002             "nprocs", zone_nprocs_kstat_update);
2003
2004         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2005                 zone->zone_mcap_stats = kmem_zalloc(
2006                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2007         }
2008
2009         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2010                 zone->zone_misc_stats = kmem_zalloc(
2011                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2012         }
2013 }
2014
2015 static void
2016 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2017 {
2018         void *data;
2019
2020         if (*pkstat != NULL) {
2021                 data = (*pkstat)->ks_data;
2022                 kstat_delete(*pkstat);
2023                 kmem_free(data, datasz);
2024                 *pkstat = NULL;
2025         }
2026 }
2027
2028 static void
2029 zone_kstat_delete(zone_t *zone)
2030 {
2031         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2032             sizeof (zone_kstat_t));
2033         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2034             sizeof (zone_kstat_t));
2035         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2036             sizeof (zone_kstat_t));
2037         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2038             sizeof (zone_mcap_kstat_t));
2039         zone_kstat_delete_common(&zone->zone_misc_ksp,
2040             sizeof (zone_misc_kstat_t));
2041 }
2042
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 *
 * Besides the ZSD key list, this sets up zonehash_lock, zsd_key_lock and
 * the zone_active / zone_deathrow lists, places zone0 on zone_active, and
 * links p0 and zone0 to each other.
 */
void
zone_zsd_init(void)
{
        /* Global locks and lists used by the zones framework. */
        mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
            offsetof(struct zsd_entry, zsd_linkage));
        list_create(&zone_active, sizeof (zone_t),
            offsetof(zone_t, zone_linkage));
        list_create(&zone_deathrow, sizeof (zone_t),
            offsetof(zone_t, zone_linkage));

        /* Locks embedded in zone0. */
        mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Resource accounting for zone0: usage counters start at zero and
         * the matching *_ctl limits start at the largest value of their
         * type.
         */
        zone0.zone_shares = 1;
        zone0.zone_nlwps = 0;
        zone0.zone_nlwps_ctl = INT_MAX;
        zone0.zone_nprocs = 0;
        zone0.zone_nprocs_ctl = INT_MAX;
        zone0.zone_locked_mem = 0;
        zone0.zone_locked_mem_ctl = UINT64_MAX;
        /* zone_max_swap is only checked here, not assigned. */
        ASSERT(zone0.zone_max_swap == 0);
        zone0.zone_max_swap_ctl = UINT64_MAX;
        zone0.zone_max_lofi = 0;
        zone0.zone_max_lofi_ctl = UINT64_MAX;
        zone0.zone_shmmax = 0;
        zone0.zone_ipc.ipcq_shmmni = 0;
        zone0.zone_ipc.ipcq_semmni = 0;
        zone0.zone_ipc.ipcq_msgmni = 0;

        /*
         * Identity and state of the global zone.  Unlike the strings that
         * zone_free() releases for other zones, the strings assigned here
         * are literals or pre-existing globals.
         */
        zone0.zone_name = GLOBAL_ZONENAME;
        zone0.zone_nodename = utsname.nodename;
        zone0.zone_domain = srpc_domain;
        zone0.zone_hostid = HW_INVALID_HOSTID;
        zone0.zone_fs_allowed = NULL;
        psecflags_default(&zone0.zone_secflags);
        zone0.zone_ref = 1;
        zone0.zone_id = GLOBAL_ZONEID;
        zone0.zone_status = ZONE_IS_RUNNING;
        zone0.zone_rootpath = "/";
        /* presumably strlen("/") + 1, i.e. including the NUL -- confirm */
        zone0.zone_rootpathlen = 2;
        zone0.zone_psetid = ZONE_PS_INVAL;
        zone0.zone_ncpus = 0;
        zone0.zone_ncpus_online = 0;
        zone0.zone_proc_initpid = 1;
        zone0.zone_initname = initname;
        /* The rctl kstats for zone0 are created later, in zone_init(). */
        zone0.zone_lockedmem_kstat = NULL;
        zone0.zone_swapresv_kstat = NULL;
        zone0.zone_nprocs_kstat = NULL;

        zone0.zone_stime = 0;
        zone0.zone_utime = 0;
        zone0.zone_wtime = 0;

        list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
            offsetof(zone_ref_t, zref_linkage));
        list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
            offsetof(struct zsd_entry, zsd_linkage));
        list_insert_head(&zone_active, &zone0);

        /*
         * The root filesystem is not mounted yet, so zone_rootvp cannot be set
         * to anything meaningful.  It is assigned to be 'rootdir' in
         * vfs_mountroot().
         */
        zone0.zone_rootvp = NULL;
        zone0.zone_vfslist = NULL;
        zone0.zone_bootargs = initargs;
        zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
        /*
         * The global zone has all privileges
         */
        priv_fillset(zone0.zone_privset);
        /*
         * Add p0 to the global zone
         */
        zone0.zone_zsched = &p0;
        p0.p_zone = &zone0;
}
2129
/*
 * Called by main() to initialize the zones framework.
 *
 * Must run in the context of p0.  In order, this: creates the zone ID
 * space, registers the zone-level resource controls, builds the rctl set
 * for zone0, creates zone0's kstats, creates the by-id and by-name zone
 * hashes and enters zone0 into them, publishes zone0 as "global_zone", and
 * binds the sysevent channel used for zone status notifications.  Failure
 * to bind that channel panics the system.
 */
void
zone_init(void)
{
        rctl_dict_entry_t *rde;
        rctl_val_t *dval;
        rctl_set_t *set;
        rctl_alloc_gp_t *gp;
        rctl_entity_p_t e;
        int res;

        ASSERT(curproc == &p0);

        /*
         * Create ID space for zone IDs.  ID 0 is reserved for the
         * global zone.
         */
        zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

        /*
         * Initialize generic zone resource controls, if any.
         */
        rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
            RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
            FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

        rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
            RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
            RCTL_GLOBAL_INFINITE,
            MAXCAP, MAXCAP, &zone_cpu_cap_ops);

        rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
            RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
            INT_MAX, INT_MAX, &zone_lwps_ops);

        rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
            RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
            INT_MAX, INT_MAX, &zone_procs_ops);

        /*
         * System V IPC resource controls
         */
        rc_zone_msgmni = rctl_register("zone.max-msg-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

        rc_zone_semmni = rctl_register("zone.max-sem-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

        rc_zone_shmmni = rctl_register("zone.max-shm-ids",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

        rc_zone_shmmax = rctl_register("zone.max-shm-memory",
            RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
            RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

        /*
         * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
         * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
         */
        dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
        bzero(dval, sizeof (rctl_val_t));
        dval->rcv_value = 1;
        dval->rcv_privilege = RCPRIV_PRIVILEGED;
        dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
        dval->rcv_action_recip_pid = -1;

        rde = rctl_dict_lookup("zone.cpu-shares");
        (void) rctl_val_list_insert(&rde->rcd_default_value, dval);

        /*
         * Memory, swap and lofi resource controls.  All three deny the
         * operation when the limit would be exceeded.
         */
        rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_locked_mem_ops);

        rc_zone_max_swap = rctl_register("zone.max-swap",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_max_swap_ops);

        rc_zone_max_lofi = rctl_register("zone.max-lofi",
            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
            &zone_max_lofi_ops);

        /*
         * Initialize the ``global zone''.
         */
        set = rctl_set_create();
        gp = rctl_set_init_prealloc(RCENTITY_ZONE);
        /*
         * p0.p_lock is held across rctl_set_init() and the update of the
         * zone0 counters; the set and the prealloc group are obtained
         * before the lock is taken.
         */
        mutex_enter(&p0.p_lock);
        e.rcep_p.zone = &zone0;
        e.rcep_t = RCENTITY_ZONE;
        zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
            gp);

        zone0.zone_nlwps = p0.p_lwpcnt;
        zone0.zone_nprocs = 1;
        zone0.zone_ntasks = 1;
        mutex_exit(&p0.p_lock);
        zone0.zone_restart_init = B_TRUE;
        zone0.zone_brand = &native_brand;
        rctl_prealloc_destroy(gp);
        /*
         * pool_default hasn't been initialized yet, so we let pool_init()
         * take care of making sure the global zone is in the default pool.
         */

        /*
         * Initialize global zone kstats
         */
        zone_kstat_create(&zone0);

        /*
         * Initialise the lock for the database structure used by mntfs.
         */
        rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);

        /*
         * Assign zone0 its unique ID, then create both zone hashes and
         * enter zone0 into them, all under zonehash_lock.
         */
        mutex_enter(&zonehash_lock);
        zone_uniqid(&zone0);
        ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

        zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
            mod_hash_null_valdtor);
        zonehashbyname = mod_hash_create_strhash("zone_by_name",
            zone_hash_size, mod_hash_null_valdtor);
        zonecount = 1;

        (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
            (mod_hash_val_t)&zone0);
        (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
            (mod_hash_val_t)&zone0);
        mutex_exit(&zonehash_lock);

        /*
         * We avoid setting zone_kcred until now, since kcred is initialized
         * sometime after zone_zsd_init() and before zone_init().
         */
        zone0.zone_kcred = kcred;
        /*
         * The global zone is fully initialized (except for zone_rootvp which
         * will be set when the root filesystem is mounted).
         */
        global_zone = &zone0;

        /*
         * Setup an event channel to send zone status change notifications on
         */
        res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
            EVCH_CREAT);

        if (res)
                panic("Sysevent_evc_bind failed during zone setup.\n");

}
2291
/*
 * Release every resource still attached to 'zone' and free the zone_t
 * itself.
 *
 * This must never be called for the global zone.  The zone must have no
 * remaining tasks, LWPs, processes, cred references or zone_ref_list
 * entries, its kcred must already be gone, and its status must be either
 * ZONE_IS_DEAD or ZONE_IS_UNINITIALIZED.  A dead zone is also removed from
 * the deathrow list here.  Optional members are released only when set.
 */
static void
zone_free(zone_t *zone)
{
        ASSERT(zone != global_zone);
        ASSERT(zone->zone_ntasks == 0);
        ASSERT(zone->zone_nlwps == 0);
        ASSERT(zone->zone_nprocs == 0);
        ASSERT(zone->zone_cred_ref == 0);
        ASSERT(zone->zone_kcred == NULL);
        ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
            zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
        ASSERT(list_is_empty(&zone->zone_ref_list));

        /*
         * Remove any zone caps.
         */
        cpucaps_zone_remove(zone);

        ASSERT(zone->zone_cpucap == NULL);

        /* remove from deathrow list */
        if (zone_status_get(zone) == ZONE_IS_DEAD) {
                ASSERT(zone->zone_ref == 0);
                mutex_enter(&zone_deathrow_lock);
                list_remove(&zone_deathrow, zone);
                mutex_exit(&zone_deathrow_lock);
        }

        list_destroy(&zone->zone_ref_list);
        zone_free_zsd(zone);
        zone_free_datasets(zone);
        list_destroy(&zone->zone_dl_list);

        if (zone->zone_rootvp != NULL)
                VN_RELE(zone->zone_rootvp);
        /*
         * The buffers below are freed with the sizes they are expected to
         * have been allocated with: zone_rootpathlen for the root path,
         * ZONENAME_MAX for the name and _SYS_NMLN for the node and domain
         * names.
         */
        if (zone->zone_rootpath)
                kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
        if (zone->zone_name != NULL)
                kmem_free(zone->zone_name, ZONENAME_MAX);
        if (zone->zone_nodename != NULL)
                kmem_free(zone->zone_nodename, _SYS_NMLN);
        if (zone->zone_domain != NULL)
                kmem_free(zone->zone_domain, _SYS_NMLN);
        if (zone->zone_privset != NULL)
                kmem_free(zone->zone_privset, sizeof (priv_set_t));
        if (zone->zone_rctls != NULL)
                rctl_set_free(zone->zone_rctls);
        /* These strings are released with strfree(). */
        if (zone->zone_bootargs != NULL)
                strfree(zone->zone_bootargs);
        if (zone->zone_initname != NULL)
                strfree(zone->zone_initname);
        if (zone->zone_fs_allowed != NULL)
                strfree(zone->zone_fs_allowed);
        if (zone->zone_pfexecd != NULL)
                klpd_freelist(&zone->zone_pfexecd);
        /* Return the zone ID to the ID space. */
        id_free(zoneid_space, zone->zone_id);
        mutex_destroy(&zone->zone_lock);
        cv_destroy(&zone->zone_cv);
        rw_destroy(&zone->zone_mntfs_db_lock);
        kmem_free(zone, sizeof (zone_t));
}
2353
2354 /*
2355  * See block comment at the top of this file for information about zone
2356  * status values.
2357  */
2358 /*
2359  * Convenience function for setting zone status.
2360  */
2361 static void
2362 zone_status_set(zone_t *zone, zone_status_t status)
2363 {
2364
2365         nvlist_t *nvl = NULL;
2366         ASSERT(MUTEX_HELD(&zone_status_lock));
2367         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2368             status >= zone_status_get(zone));
2369
2370         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2371             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2372             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2373             zone_status_table[status]) ||
2374             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2375             zone_status_table[zone->zone_status]) ||
2376             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2377             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2378             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2379             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2380 #ifdef DEBUG
2381                 (void) printf(
2382                     "Failed to allocate and send zone state change event.\n");
2383 #endif
2384         }
2385         nvlist_free(nvl);
2386
2387         zone->zone_status = status;
2388
2389         cv_broadcast(&zone->zone_cv);
2390 }
2391
2392 /*
2393  * Public function to retrieve the zone status.  The zone status may
2394  * change after it is retrieved.
2395  */
2396 zone_status_t
2397 zone_status_get(zone_t *zone)
2398 {
2399         return (zone->zone_status);
2400 }
2401
2402 static int
2403 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2404 {
2405         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2406         int err = 0;
2407
2408         ASSERT(zone != global_zone);
2409         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2410                 goto done;      /* EFAULT or ENAMETOOLONG */
2411
2412         if (zone->zone_bootargs != NULL)
2413                 strfree(zone->zone_bootargs);
2414
2415         zone->zone_bootargs = strdup(buf);
2416
2417 done:
2418         kmem_free(buf, BOOTARGS_MAX);
2419         return (err);
2420 }
2421
2422 static int
2423 zone_set_brand(zone_t *zone, const char *brand)
2424 {
2425         struct brand_attr *attrp;
2426         brand_t *bp;
2427
2428         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2429         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2430                 kmem_free(attrp, sizeof (struct brand_attr));
2431                 return (EFAULT);
2432         }
2433
2434         bp = brand_register_zone(attrp);
2435         kmem_free(attrp, sizeof (struct brand_attr));
2436         if (bp == NULL)
2437                 return (EINVAL);
2438
2439         /*
2440          * This is the only place where a zone can change it's brand.
2441          * We already need to hold zone_status_lock to check the zone
2442          * status, so we'll just use that lock to serialize zone
2443          * branding requests as well.
2444          */
2445         mutex_enter(&zone_status_lock);
2446
2447         /* Re-Branding is not allowed and the zone can't be booted yet */
2448         if ((ZONE_IS_BRANDED(zone)) ||
2449             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2450                 mutex_exit(&zone_status_lock);
2451                 brand_unregister_zone(bp);
2452                 return (EINVAL);
2453         }
2454
2455         /* set up the brand specific data */
2456         zone->zone_brand = bp;
2457         ZBROP(zone)->b_init_brand_data(zone);
2458
2459         mutex_exit(&zone_status_lock);
2460         return (0);
2461 }
2462
2463 static int
2464 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2465 {
2466         int err = 0;
2467         psecflags_t psf;
2468
2469         ASSERT(zone != global_zone);
2470
2471         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2472                 return (err);
2473
2474         if (zone_status_get(zone) > ZONE_IS_READY)
2475                 return (EINVAL);
2476
2477         if (!psecflags_validate(&psf))
2478                 return (EINVAL);
2479
2480         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2481
2482         /* Set security flags on the zone's zsched */
2483         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2484             sizeof (zone->zone_zsched->p_secflags));
2485
2486         return (0);
2487 }
2488
2489 static int
2490 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2491 {
2492         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2493         int err = 0;
2494
2495         ASSERT(zone != global_zone);
2496         if ((err = copyinstr(zone_fs_allowed, buf,
2497             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2498                 goto done;
2499
2500         if (zone->zone_fs_allowed != NULL)
2501                 strfree(zone->zone_fs_allowed);
2502
2503         zone->zone_fs_allowed = strdup(buf);
2504
2505 done:
2506         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2507         return (err);
2508 }
2509
2510 static int
2511 zone_set_initname(zone_t *zone, const char *zone_initname)
2512 {
2513         char initname[INITNAME_SZ];
2514         size_t len;
2515         int err = 0;
2516
2517         ASSERT(zone != global_zone);
2518         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2519                 return (err);   /* EFAULT or ENAMETOOLONG */
2520
2521         if (zone->zone_initname != NULL)
2522                 strfree(zone->zone_initname);
2523
2524         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2525         (void) strcpy(zone->zone_initname, initname);
2526         return (0);
2527 }
2528
2529 static int
2530 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2531 {
2532         uint64_t mcap;
2533         int err = 0;
2534
2535         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2536                 zone->zone_phys_mcap = mcap;
2537
2538         return (err);
2539 }
2540
2541 static int
2542 zone_set_sched_class(zone_t *zone, const char *new_class)
2543 {
2544         char sched_class[PC_CLNMSZ];
2545         id_t classid;
2546         int err;
2547
2548         ASSERT(zone != global_zone);
2549         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2550                 return (err);   /* EFAULT or ENAMETOOLONG */
2551
2552         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2553                 return (set_errno(EINVAL));
2554         zone->zone_defaultcid = classid;
2555         ASSERT(zone->zone_defaultcid > 0 &&
2556             zone->zone_defaultcid < loaded_classes);
2557
2558         return (0);
2559 }
2560
2561 /*
2562  * Block indefinitely waiting for (zone_status >= status)
2563  */
2564 void
2565 zone_status_wait(zone_t *zone, zone_status_t status)
2566 {
2567         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2568
2569         mutex_enter(&zone_status_lock);
2570         while (zone->zone_status < status) {
2571                 cv_wait(&zone->zone_cv, &zone_status_lock);
2572         }
2573         mutex_exit(&zone_status_lock);
2574 }
2575
2576 /*
2577  * Private CPR-safe version of zone_status_wait().
2578  */
2579 static void
2580 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2581 {
2582         callb_cpr_t cprinfo;
2583
2584         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2585
2586         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2587             str);
2588         mutex_enter(&zone_status_lock);
2589         while (zone->zone_status < status) {
2590                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2591                 cv_wait(&zone->zone_cv, &zone_status_lock);
2592                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2593         }
2594         /*
2595          * zone_status_lock is implicitly released by the following.
2596          */
2597         CALLB_CPR_EXIT(&cprinfo);
2598 }
2599
2600 /*
2601  * Block until zone enters requested state or signal is received.  Return (0)
2602  * if signaled, non-zero otherwise.
2603  */
2604 int
2605 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2606 {
2607         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2608
2609         mutex_enter(&zone_status_lock);
2610         while (zone->zone_status < status) {
2611                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2612                         mutex_exit(&zone_status_lock);
2613                         return (0);
2614                 }
2615         }
2616         mutex_exit(&zone_status_lock);
2617         return (1);
2618 }
2619
2620 /*
2621  * Block until the zone enters the requested state or the timeout expires,
2622  * whichever happens first.  Return (-1) if operation timed out, time remaining
2623  * otherwise.
2624  */
2625 clock_t
2626 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2627 {
2628         clock_t timeleft = 0;
2629
2630         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2631
2632         mutex_enter(&zone_status_lock);
2633         while (zone->zone_status < status && timeleft != -1) {
2634                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2635         }
2636         mutex_exit(&zone_status_lock);
2637         return (timeleft);
2638 }
2639
2640 /*
2641  * Block until the zone enters the requested state, the current process is
2642  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2643  * operation timed out, 0 if signaled, time remaining otherwise.
2644  */
2645 clock_t
2646 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2647 {
2648         clock_t timeleft = tim - ddi_get_lbolt();
2649
2650         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2651
2652         mutex_enter(&zone_status_lock);
2653         while (zone->zone_status < status) {
2654                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2655                     tim);
2656                 if (timeleft <= 0)
2657                         break;
2658         }
2659         mutex_exit(&zone_status_lock);
2660         return (timeleft);
2661 }
2662
2663 /*
2664  * Zones have two reference counts: one for references from credential
2665  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2666  * This is so we can allow a zone to be rebooted while there are still
2667  * outstanding cred references, since certain drivers cache dblks (which
2668  * implicitly results in cached creds).  We wait for zone_ref to drop to
2669  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2670  * later freed when the zone_cred_ref drops to 0, though nothing other
2671  * than the zone id and privilege set should be accessed once the zone
2672  * is "dead".
2673  *
2674  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2675  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2676  * to 0.  This can be useful to flush out other sources of cached creds
2677  * that may be less innocuous than the driver case.
2678  *
2679  * Zones also provide a tracked reference counting mechanism in which zone
2680  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2681  * debuggers determine the sources of leaked zone references.  See
2682  * zone_hold_ref() and zone_rele_ref() below for more information.
2683  */
2684
/*
 * Debugging flag, off by default.  When non-zero, ZONE_IS_UNREF() also
 * requires zone_cred_ref to have dropped to 0.
 */
int zone_wait_for_cred = 0;
2686
2687 static void
2688 zone_hold_locked(zone_t *z)
2689 {
2690         ASSERT(MUTEX_HELD(&z->zone_lock));
2691         z->zone_ref++;
2692         ASSERT(z->zone_ref != 0);
2693 }
2694
2695 /*
2696  * Increment the specified zone's reference count.  The zone's zone_t structure
2697  * will not be freed as long as the zone's reference count is nonzero.
2698  * Decrement the zone's reference count via zone_rele().
2699  *
2700  * NOTE: This function should only be used to hold zones for short periods of
2701  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2702  */
2703 void
2704 zone_hold(zone_t *z)
2705 {
2706         mutex_enter(&z->zone_lock);
2707         zone_hold_locked(z);
2708         mutex_exit(&z->zone_lock);
2709 }
2710
/*
 * A zone is ready to be destroyed once its non-cred reference count is
 * down to 1 and, if zone_wait_for_cred is set, its cred reference count
 * is down to 0 as well.
 */
#define ZONE_IS_UNREF(zone)     \
        ((zone)->zone_ref == 1 && \
            (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
2718
2719 /*
2720  * Common zone reference release function invoked by zone_rele() and
2721  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2722  * zone's subsystem-specific reference counters are not affected by the
2723  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2724  * removed from the specified zone's reference list.  ref must be non-NULL iff
2725  * subsys is not ZONE_REF_NUM_SUBSYS.
2726  */
2727 static void
2728 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2729 {
2730         boolean_t wakeup;
2731
2732         mutex_enter(&z->zone_lock);
2733         ASSERT(z->zone_ref != 0);
2734         z->zone_ref--;
2735         if (subsys != ZONE_REF_NUM_SUBSYS) {
2736                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2737                 z->zone_subsys_ref[subsys]--;
2738                 list_remove(&z->zone_ref_list, ref);
2739         }
2740         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2741                 /* no more refs, free the structure */
2742                 mutex_exit(&z->zone_lock);
2743                 zone_free(z);
2744                 return;
2745         }
2746         /* signal zone_destroy so the zone can finish halting */
2747         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2748         mutex_exit(&z->zone_lock);
2749
2750         if (wakeup) {
2751                 /*
2752                  * Grabbing zonehash_lock here effectively synchronizes with
2753                  * zone_destroy() to avoid missed signals.
2754                  */
2755                 mutex_enter(&zonehash_lock);
2756                 cv_broadcast(&zone_destroy_cv);
2757                 mutex_exit(&zonehash_lock);
2758         }
2759 }
2760
2761 /*
2762  * Decrement the specified zone's reference count.  The specified zone will
2763  * cease to exist after this function returns if the reference count drops to
2764  * zero.  This function should be paired with zone_hold().
2765  */
2766 void
2767 zone_rele(zone_t *z)
2768 {
2769         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2770 }
2771
2772 /*
2773  * Initialize a zone reference structure.  This function must be invoked for
2774  * a reference structure before the structure is passed to zone_hold_ref().
2775  */
2776 void
2777 zone_init_ref(zone_ref_t *ref)
2778 {
2779         ref->zref_zone = NULL;
2780         list_link_init(&ref->zref_linkage);
2781 }
2782
2783 /*
2784  * Acquire a reference to zone z.  The caller must specify the
2785  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2786  * zone_ref_t structure will represent a reference to the specified zone.  Use
2787  * zone_rele_ref() to release the reference.
2788  *
2789  * The referenced zone_t structure will not be freed as long as the zone_t's
2790  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2791  * references.
2792  *
2793  * NOTE: The zone_ref_t structure must be initialized before it is used.
2794  * See zone_init_ref() above.
2795  */
2796 void
2797 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2798 {
2799         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2800
2801         /*
2802          * Prevent consumers from reusing a reference structure before
2803          * releasing it.
2804          */
2805         VERIFY(ref->zref_zone == NULL);
2806
2807         ref->zref_zone = z;
2808         mutex_enter(&z->zone_lock);
2809         zone_hold_locked(z);
2810         z->zone_subsys_ref[subsys]++;
2811         ASSERT(z->zone_subsys_ref[subsys] != 0);
2812         list_insert_head(&z->zone_ref_list, ref);
2813         mutex_exit(&z->zone_lock);
2814 }
2815
2816 /*
2817  * Release the zone reference represented by the specified zone_ref_t.
2818  * The reference is invalid after it's released; however, the zone_ref_t
2819  * structure can be reused without having to invoke zone_init_ref().
2820  * subsys should be the same value that was passed to zone_hold_ref()
2821  * when the reference was acquired.
2822  */
2823 void
2824 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2825 {
2826         zone_rele_common(ref->zref_zone, ref, subsys);
2827
2828         /*
2829          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2830          * when consumers dereference the reference.  This helps us catch
2831          * consumers who use released references.  Furthermore, this lets
2832          * consumers reuse the zone_ref_t structure without having to
2833          * invoke zone_init_ref().
2834          */
2835         ref->zref_zone = NULL;
2836 }
2837
2838 void
2839 zone_cred_hold(zone_t *z)
2840 {
2841         mutex_enter(&z->zone_lock);
2842         z->zone_cred_ref++;
2843         ASSERT(z->zone_cred_ref != 0);
2844         mutex_exit(&z->zone_lock);
2845 }
2846
2847 void
2848 zone_cred_rele(zone_t *z)
2849 {
2850         boolean_t wakeup;
2851
2852         mutex_enter(&z->zone_lock);
2853         ASSERT(z->zone_cred_ref != 0);
2854         z->zone_cred_ref--;
2855         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2856                 /* no more refs, free the structure */
2857                 mutex_exit(&z->zone_lock);
2858                 zone_free(z);
2859                 return;
2860         }
2861         /*
2862          * If zone_destroy is waiting for the cred references to drain
2863          * out, and they have, signal it.
2864          */
2865         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2866             zone_status_get(z) >= ZONE_IS_DEAD);
2867         mutex_exit(&z->zone_lock);
2868
2869         if (wakeup) {
2870                 /*
2871                  * Grabbing zonehash_lock here effectively synchronizes with
2872                  * zone_destroy() to avoid missed signals.
2873                  */
2874                 mutex_enter(&zonehash_lock);
2875                 cv_broadcast(&zone_destroy_cv);
2876                 mutex_exit(&zonehash_lock);
2877         }
2878 }
2879
2880 void
2881 zone_task_hold(zone_t *z)
2882 {
2883         mutex_enter(&z->zone_lock);
2884         z->zone_ntasks++;
2885         ASSERT(z->zone_ntasks != 0);
2886         mutex_exit(&z->zone_lock);
2887 }
2888
2889 void
2890 zone_task_rele(zone_t *zone)
2891 {
2892         uint_t refcnt;
2893
2894         mutex_enter(&zone->zone_lock);
2895         ASSERT(zone->zone_ntasks != 0);
2896         refcnt = --zone->zone_ntasks;
2897         if (refcnt > 1) {       /* Common case */
2898                 mutex_exit(&zone->zone_lock);
2899                 return;
2900         }
2901         zone_hold_locked(zone); /* so we can use the zone_t later */
2902         mutex_exit(&zone->zone_lock);
2903         if (refcnt == 1) {
2904                 /*
2905                  * See if the zone is shutting down.
2906                  */
2907                 mutex_enter(&zone_status_lock);
2908                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2909                         goto out;
2910                 }
2911
2912                 /*
2913                  * Make sure the ntasks didn't change since we
2914                  * dropped zone_lock.
2915                  */
2916                 mutex_enter(&zone->zone_lock);
2917                 if (refcnt != zone->zone_ntasks) {
2918                         mutex_exit(&zone->zone_lock);
2919                         goto out;
2920                 }
2921                 mutex_exit(&zone->zone_lock);
2922
2923                 /*
2924                  * No more user processes in the zone.  The zone is empty.
2925                  */
2926                 zone_status_set(zone, ZONE_IS_EMPTY);
2927                 goto out;
2928         }
2929
2930         ASSERT(refcnt == 0);
2931         /*
2932          * zsched has exited; the zone is dead.
2933          */
2934         zone->zone_zsched = NULL;               /* paranoia */
2935         mutex_enter(&zone_status_lock);
2936         zone_status_set(zone, ZONE_IS_DEAD);
2937 out:
2938         mutex_exit(&zone_status_lock);
2939         zone_rele(zone);
2940 }
2941
2942 zoneid_t
2943 getzoneid(void)
2944 {
2945         return (curproc->p_zone->zone_id);
2946 }
2947
2948 /*
2949  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2950  * check the validity of a zone's state.
2951  */
2952 static zone_t *
2953 zone_find_all_by_id(zoneid_t zoneid)
2954 {
2955         mod_hash_val_t hv;
2956         zone_t *zone = NULL;
2957
2958         ASSERT(MUTEX_HELD(&zonehash_lock));
2959
2960         if (mod_hash_find(zonehashbyid,
2961             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2962                 zone = (zone_t *)hv;
2963         return (zone);
2964 }
2965
2966 static zone_t *
2967 zone_find_all_by_name(char *name)
2968 {
2969         mod_hash_val_t hv;
2970         zone_t *zone = NULL;
2971
2972         ASSERT(MUTEX_HELD(&zonehash_lock));
2973
2974         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2975                 zone = (zone_t *)hv;
2976         return (zone);
2977 }
2978
2979 /*
2980  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2981  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2982  * Caller must call zone_rele() once it is done with the zone.
2983  *
2984  * The zone may begin the zone_destroy() sequence immediately after this
2985  * function returns, but may be safely used until zone_rele() is called.
2986  */
2987 zone_t *
2988 zone_find_by_id(zoneid_t zoneid)
2989 {
2990         zone_t *zone;
2991         zone_status_t status;
2992
2993         mutex_enter(&zonehash_lock);
2994         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2995                 mutex_exit(&zonehash_lock);
2996                 return (NULL);
2997         }
2998         status = zone_status_get(zone);
2999         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3000                 /*
3001                  * For all practical purposes the zone doesn't exist.
3002                  */
3003                 mutex_exit(&zonehash_lock);
3004                 return (NULL);
3005         }
3006         zone_hold(zone);
3007         mutex_exit(&zonehash_lock);
3008         return (zone);
3009 }
3010
3011 /*
3012  * Similar to zone_find_by_id, but using zone name as the key.
3013  */
3014 zone_t *
3015 zone_find_by_name(char *name)
3016 {
3017         zone_t *zone;
3018         zone_status_t status;
3019
3020         mutex_enter(&zonehash_lock);
3021         if ((zone = zone_find_all_by_name(name)) == NULL) {
3022                 mutex_exit(&zonehash_lock);
3023                 return (NULL);
3024         }
3025         status = zone_status_get(zone);
3026         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3027                 /*
3028                  * For all practical purposes the zone doesn't exist.
3029                  */
3030                 mutex_exit(&zonehash_lock);
3031                 return (NULL);
3032         }
3033         zone_hold(zone);
3034         mutex_exit(&zonehash_lock);
3035         return (zone);
3036 }
3037
3038 /*
3039  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3040  * if there is a zone "foo" rooted at /foo/root, and the path argument
3041  * is "/foo/root/proc", it will return the held zone_t corresponding to
3042  * zone "foo".
3043  *
3044  * zone_find_by_path() always returns a non-NULL value, since at the
3045  * very least every path will be contained in the global zone.
3046  *
3047  * As with the other zone_find_by_*() functions, the caller is
3048  * responsible for zone_rele()ing the return value of this function.
3049  */
3050 zone_t *
3051 zone_find_by_path(const char *path)
3052 {
3053         zone_t *zone;
3054         zone_t *zret = NULL;
3055         zone_status_t status;
3056
3057         if (path == NULL) {
3058                 /*
3059                  * Call from rootconf().
3060                  */
3061                 zone_hold(global_zone);
3062                 return (global_zone);
3063         }
3064         ASSERT(*path == '/');
3065         mutex_enter(&zonehash_lock);
3066         for (zone = list_head(&zone_active); zone != NULL;
3067             zone = list_next(&zone_active, zone)) {
3068                 if (ZONE_PATH_VISIBLE(path, zone))
3069                         zret = zone;
3070         }
3071         ASSERT(zret != NULL);
3072         status = zone_status_get(zret);
3073         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3074                 /*
3075                  * Zone practically doesn't exist.
3076                  */
3077                 zret = global_zone;
3078         }
3079         zone_hold(zret);
3080         mutex_exit(&zonehash_lock);
3081         return (zret);
3082 }
3083
3084 /*
3085  * Public interface for updating per-zone load averages.  Called once per
3086  * second.
3087  *
3088  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3089  */
3090 void
3091 zone_loadavg_update()
3092 {
3093         zone_t *zp;
3094         zone_status_t status;
3095         struct loadavg_s *lavg;
3096         hrtime_t zone_total;
3097         int i;
3098         hrtime_t hr_avg;
3099         int nrun;
3100         static int64_t f[3] = { 135, 27, 9 };
3101         int64_t q, r;
3102
3103         mutex_enter(&zonehash_lock);
3104         for (zp = list_head(&zone_active); zp != NULL;
3105             zp = list_next(&zone_active, zp)) {
3106                 mutex_enter(&zp->zone_lock);
3107
3108                 /* Skip zones that are on the way down or not yet up */
3109                 status = zone_status_get(zp);
3110                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3111                         /* For all practical purposes the zone doesn't exist. */
3112                         mutex_exit(&zp->zone_lock);
3113                         continue;
3114                 }
3115
3116                 /*
3117                  * Update the 10 second moving average data in zone_loadavg.
3118                  */
3119                 lavg = &zp->zone_loadavg;
3120
3121                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3122                 scalehrtime(&zone_total);
3123
3124                 /* The zone_total should always be increasing. */
3125                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3126                     zone_total - lavg->lg_total : 0;
3127                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3128                 /* lg_total holds the prev. 1 sec. total */
3129                 lavg->lg_total = zone_total;
3130
3131                 /*
3132                  * To simplify the calculation, we don't calculate the load avg.
3133                  * until the zone has been up for at least 10 seconds and our
3134                  * moving average is thus full.
3135                  */
3136                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3137                         lavg->lg_len++;
3138                         mutex_exit(&zp->zone_lock);
3139                         continue;
3140                 }
3141
3142                 /* Now calculate the 1min, 5min, 15 min load avg. */
3143                 hr_avg = 0;
3144                 for (i = 0; i < S_LOADAVG_SZ; i++)
3145                         hr_avg += lavg->lg_loads[i];
3146                 hr_avg = hr_avg / S_LOADAVG_SZ;
3147                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3148
3149                 /* Compute load avg. See comment in calcloadavg() */
3150                 for (i = 0; i < 3; i++) {
3151                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3152                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3153                         zp->zone_hp_avenrun[i] +=
3154                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3155
3156                         /* avenrun[] can only hold 31 bits of load avg. */
3157                         if (zp->zone_hp_avenrun[i] <
3158                             ((uint64_t)1<<(31+16-FSHIFT)))
3159                                 zp->zone_avenrun[i] = (int32_t)
3160                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3161                         else
3162                                 zp->zone_avenrun[i] = 0x7fffffff;
3163                 }
3164
3165                 mutex_exit(&zp->zone_lock);
3166         }
3167         mutex_exit(&zonehash_lock);
3168 }
3169
3170 /*
3171  * Get the number of cpus visible to this zone.  The system-wide global
3172  * 'ncpus' is returned if pools are disabled, the caller is in the
3173  * global zone, or a NULL zone argument is passed in.
3174  */
3175 int
3176 zone_ncpus_get(zone_t *zone)
3177 {
3178         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3179
3180         return (myncpus != 0 ? myncpus : ncpus);
3181 }
3182
3183 /*
3184  * Get the number of online cpus visible to this zone.  The system-wide
3185  * global 'ncpus_online' is returned if pools are disabled, the caller
3186  * is in the global zone, or a NULL zone argument is passed in.
3187  */
3188 int
3189 zone_ncpus_online_get(zone_t *zone)
3190 {
3191         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3192
3193         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3194 }
3195
3196 /*
3197  * Return the pool to which the zone is currently bound.
3198  */
3199 pool_t *
3200 zone_pool_get(zone_t *zone)
3201 {
3202         ASSERT(pool_lock_held());
3203
3204         return (zone->zone_pool);
3205 }
3206
3207 /*
3208  * Set the zone's pool pointer and update the zone's visibility to match
3209  * the resources in the new pool.
3210  */
3211 void
3212 zone_pool_set(zone_t *zone, pool_t *pool)
3213 {
3214         ASSERT(pool_lock_held());
3215         ASSERT(MUTEX_HELD(&cpu_lock));
3216
3217         zone->zone_pool = pool;
3218         zone_pset_set(zone, pool->pool_pset->pset_id);
3219 }
3220
3221 /*
3222  * Return the cached value of the id of the processor set to which the
3223  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3224  * facility is disabled.
3225  */
3226 psetid_t
3227 zone_pset_get(zone_t *zone)
3228 {
3229         ASSERT(MUTEX_HELD(&cpu_lock));
3230
3231         return (zone->zone_psetid);
3232 }
3233
3234 /*
3235  * Set the cached value of the id of the processor set to which the zone
3236  * is currently bound.  Also update the zone's visibility to match the
3237  * resources in the new processor set.
3238  */
3239 void
3240 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3241 {
3242         psetid_t oldpsetid;
3243
3244         ASSERT(MUTEX_HELD(&cpu_lock));
3245         oldpsetid = zone_pset_get(zone);
3246
3247         if (oldpsetid == newpsetid)
3248                 return;
3249         /*
3250          * Global zone sees all.
3251          */
3252         if (zone != global_zone) {
3253                 zone->zone_psetid = newpsetid;
3254                 if (newpsetid != ZONE_PS_INVAL)
3255                         pool_pset_visibility_add(newpsetid, zone);
3256                 if (oldpsetid != ZONE_PS_INVAL)
3257                         pool_pset_visibility_remove(oldpsetid, zone);
3258         }
3259         /*
3260          * Disabling pools, so we should start using the global values
3261          * for ncpus and ncpus_online.
3262          */
3263         if (newpsetid == ZONE_PS_INVAL) {
3264                 zone->zone_ncpus = 0;
3265                 zone->zone_ncpus_online = 0;
3266         }
3267 }
3268
3269 /*
3270  * Walk the list of active zones and issue the provided callback for
3271  * each of them.
3272  *
3273  * Caller must not be holding any locks that may be acquired under
3274  * zonehash_lock.  See comment at the beginning of the file for a list of
3275  * common locks and their interactions with zones.
3276  */
3277 int
3278 zone_walk(int (*cb)(zone_t *, void *), void *data)
3279 {
3280         zone_t *zone;
3281         int ret = 0;
3282         zone_status_t status;
3283
3284         mutex_enter(&zonehash_lock);
3285         for (zone = list_head(&zone_active); zone != NULL;
3286             zone = list_next(&zone_active, zone)) {
3287                 /*
3288                  * Skip zones that shouldn't be externally visible.
3289                  */
3290                 status = zone_status_get(zone);
3291                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3292                         continue;
3293                 /*
3294                  * Bail immediately if any callback invocation returns a
3295                  * non-zero value.
3296                  */
3297                 ret = (*cb)(zone, data);
3298                 if (ret != 0)
3299                         break;
3300         }
3301         mutex_exit(&zonehash_lock);
3302         return (ret);
3303 }
3304
3305 static int
3306 zone_set_root(zone_t *zone, const char *upath)
3307 {
3308         vnode_t *vp;
3309         int trycount;
3310         int error = 0;
3311         char *path;
3312         struct pathname upn, pn;
3313         size_t pathlen;
3314
3315         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3316                 return (error);
3317
3318         pn_alloc(&pn);
3319
3320         /* prevent infinite loop */
3321         trycount = 10;
3322         for (;;) {
3323                 if (--trycount <= 0) {
3324                         error = ESTALE;
3325                         goto out;
3326                 }
3327
3328                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3329                         /*
3330                          * fop_access() may cover 'vp' with a new
3331                          * filesystem, if 'vp' is an autoFS vnode.
3332                          * Get the new 'vp' if so.
3333                          */
3334                         if ((error =
3335                             fop_access(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3336                             (!vn_ismntpt(vp) ||
3337                             (error = traverse(&vp)) == 0)) {
3338                                 pathlen = pn.pn_pathlen + 2;
3339                                 path = kmem_alloc(pathlen, KM_SLEEP);
3340                                 (void) strncpy(path, pn.pn_path,
3341                                     pn.pn_pathlen + 1);
3342                                 path[pathlen - 2] = '/';
3343                                 path[pathlen - 1] = '\0';
3344                                 pn_free(&pn);
3345                                 pn_free(&upn);
3346
3347                                 /* Success! */
3348                                 break;
3349                         }
3350                         VN_RELE(vp);
3351                 }
3352                 if (error != ESTALE)
3353                         goto out;
3354         }
3355
3356         ASSERT(error == 0);
3357         zone->zone_rootvp = vp;         /* we hold a reference to vp */
3358         zone->zone_rootpath = path;
3359         zone->zone_rootpathlen = pathlen;
3360         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3361                 zone->zone_flags |= ZF_IS_SCRATCH;
3362         return (0);
3363
3364 out:
3365         pn_free(&pn);
3366         pn_free(&upn);
3367         return (error);
3368 }
3369
/*
 * True (1) if c is an ASCII digit or an ASCII letter of either case,
 * false (0) otherwise.  The argument may be evaluated more than once.
 */
#define isalnum(c)      (!((c) < '0' || (c) > '9') || \
                        !((c) < 'a' || (c) > 'z') || \
                        !((c) < 'A' || (c) > 'Z'))
3373
3374 static int
3375 zone_set_name(zone_t *zone, const char *uname)
3376 {
3377         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3378         size_t len;
3379         int i, err;
3380
3381         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3382                 kmem_free(kname, ZONENAME_MAX);
3383                 return (err);   /* EFAULT or ENAMETOOLONG */
3384         }
3385
3386         /* must be less than ZONENAME_MAX */
3387         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3388                 kmem_free(kname, ZONENAME_MAX);
3389                 return (EINVAL);
3390         }
3391
3392         /*
3393          * Name must start with an alphanumeric and must contain only
3394          * alphanumerics, '-', '_' and '.'.
3395          */
3396         if (!isalnum(kname[0])) {
3397                 kmem_free(kname, ZONENAME_MAX);
3398                 return (EINVAL);
3399         }
3400         for (i = 1; i < len - 1; i++) {
3401                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3402                     kname[i] != '.') {
3403                         kmem_free(kname, ZONENAME_MAX);
3404                         return (EINVAL);
3405                 }
3406         }
3407
3408         zone->zone_name = kname;
3409         return (0);
3410 }
3411
3412 /*
3413  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3414  * is NULL or it points to a zone with no hostid emulation, then the machine's
3415  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3416  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3417  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3418  * hostid and the machine's hostid is invalid.
3419  */
3420 uint32_t
3421 zone_get_hostid(zone_t *zonep)
3422 {
3423         unsigned long machine_hostid;
3424
3425         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3426                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3427                         return (HW_INVALID_HOSTID);
3428                 return ((uint32_t)machine_hostid);
3429         }
3430         return (zonep->zone_hostid);
3431 }
3432
3433 /*
3434  * Similar to thread_create(), but makes sure the thread is in the appropriate
3435  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3436  */
3437 /*ARGSUSED*/
3438 kthread_t *
3439 zthread_create(
3440     caddr_t stk,
3441     size_t stksize,
3442     void (*proc)(),
3443     void *arg,
3444     size_t len,
3445     pri_t pri)
3446 {
3447         kthread_t *t;
3448         zone_t *zone = curproc->p_zone;
3449         proc_t *pp = zone->zone_zsched;
3450
3451         zone_hold(zone);        /* Reference to be dropped when thread exits */
3452
3453         /*
3454          * No-one should be trying to create threads if the zone is shutting
3455          * down and there aren't any kernel threads around.  See comment
3456          * in zthread_exit().
3457          */
3458         ASSERT(!(zone->zone_kthreads == NULL &&
3459             zone_status_get(zone) >= ZONE_IS_EMPTY));
3460         /*
3461          * Create a thread, but don't let it run until we've finished setting
3462          * things up.
3463          */
3464         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3465         ASSERT(t->t_forw == NULL);
3466         mutex_enter(&zone_status_lock);
3467         if (zone->zone_kthreads == NULL) {
3468                 t->t_forw = t->t_back = t;
3469         } else {
3470                 kthread_t *tx = zone->zone_kthreads;
3471
3472                 t->t_forw = tx;
3473                 t->t_back = tx->t_back;
3474                 tx->t_back->t_forw = t;
3475                 tx->t_back = t;
3476         }
3477         zone->zone_kthreads = t;
3478         mutex_exit(&zone_status_lock);
3479
3480         mutex_enter(&pp->p_lock);
3481         t->t_proc_flag |= TP_ZTHREAD;
3482         project_rele(t->t_proj);
3483         t->t_proj = project_hold(pp->p_task->tk_proj);
3484
3485         /*
3486          * Setup complete, let it run.
3487          */
3488         thread_lock(t);
3489         t->t_schedflag |= TS_ALLSTART;
3490         setrun_locked(t);
3491         thread_unlock(t);
3492
3493         mutex_exit(&pp->p_lock);
3494
3495         return (t);
3496 }
3497
3498 /*
3499  * Similar to thread_exit().  Must be called by threads created via
3500  * zthread_exit().
3501  */
3502 void
3503 zthread_exit(void)
3504 {
3505         kthread_t *t = curthread;
3506         proc_t *pp = curproc;
3507         zone_t *zone = pp->p_zone;
3508
3509         mutex_enter(&zone_status_lock);
3510
3511         /*
3512          * Reparent to p0
3513          */
3514         kpreempt_disable();
3515         mutex_enter(&pp->p_lock);
3516         t->t_proc_flag &= ~TP_ZTHREAD;
3517         t->t_procp = &p0;
3518         hat_thread_exit(t);
3519         mutex_exit(&pp->p_lock);
3520         kpreempt_enable();
3521
3522         if (t->t_back == t) {
3523                 ASSERT(t->t_forw == t);
3524                 /*
3525                  * If the zone is empty, once the thread count
3526                  * goes to zero no further kernel threads can be
3527                  * created.  This is because if the creator is a process
3528                  * in the zone, then it must have exited before the zone
3529                  * state could be set to ZONE_IS_EMPTY.
3530                  * Otherwise, if the creator is a kernel thread in the
3531                  * zone, the thread count is non-zero.
3532                  *
3533                  * This really means that non-zone kernel threads should
3534                  * not create zone kernel threads.
3535                  */
3536                 zone->zone_kthreads = NULL;
3537                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3538                         zone_status_set(zone, ZONE_IS_DOWN);
3539                         /*
3540                          * Remove any CPU caps on this zone.
3541                          */
3542                         cpucaps_zone_remove(zone);
3543                 }
3544         } else {
3545                 t->t_forw->t_back = t->t_back;
3546                 t->t_back->t_forw = t->t_forw;
3547                 if (zone->zone_kthreads == t)
3548                         zone->zone_kthreads = t->t_forw;
3549         }
3550         mutex_exit(&zone_status_lock);
3551         zone_rele(zone);
3552         thread_exit();
3553         /* NOTREACHED */
3554 }
3555
3556 static void
3557 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3558 {
3559         vnode_t *oldvp;
3560
3561         /* we're going to hold a reference here to the directory */
3562         VN_HOLD(vp);
3563
3564         /* update abs cwd/root path see c2/audit.c */
3565         if (AU_AUDITING())
3566                 audit_chdirec(vp, vpp);
3567
3568         mutex_enter(&pp->p_lock);
3569         oldvp = *vpp;
3570         *vpp = vp;
3571         mutex_exit(&pp->p_lock);
3572         if (oldvp != NULL)
3573                 VN_RELE(oldvp);
3574 }
3575
3576 /*
3577  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3578  */
3579 static int
3580 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3581 {
3582         nvpair_t *nvp = NULL;
3583         boolean_t priv_set = B_FALSE;
3584         boolean_t limit_set = B_FALSE;
3585         boolean_t action_set = B_FALSE;
3586
3587         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3588                 const char *name;
3589                 uint64_t ui64;
3590
3591                 name = nvpair_name(nvp);
3592                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3593                         return (EINVAL);
3594                 (void) nvpair_value_uint64(nvp, &ui64);
3595                 if (strcmp(name, "privilege") == 0) {
3596                         /*
3597                          * Currently only privileged values are allowed, but
3598                          * this may change in the future.
3599                          */
3600                         if (ui64 != RCPRIV_PRIVILEGED)
3601                                 return (EINVAL);
3602                         rv->rcv_privilege = ui64;
3603                         priv_set = B_TRUE;
3604                 } else if (strcmp(name, "limit") == 0) {
3605                         rv->rcv_value = ui64;
3606                         limit_set = B_TRUE;
3607                 } else if (strcmp(name, "action") == 0) {
3608                         if (ui64 != RCTL_LOCAL_NOACTION &&
3609                             ui64 != RCTL_LOCAL_DENY)
3610                                 return (EINVAL);
3611                         rv->rcv_flagaction = ui64;
3612                         action_set = B_TRUE;
3613                 } else {
3614                         return (EINVAL);
3615                 }
3616         }
3617
3618         if (!(priv_set && limit_set && action_set))
3619                 return (EINVAL);
3620         rv->rcv_action_signal = 0;
3621         rv->rcv_action_recipient = NULL;
3622         rv->rcv_action_recip_pid = -1;
3623         rv->rcv_firing_time = 0;
3624
3625         return (0);
3626 }
3627
3628 /*
3629  * Non-global zone version of start_init.
3630  */
3631 void
3632 zone_start_init(void)
3633 {
3634         proc_t *p = ttoproc(curthread);
3635         zone_t *z = p->p_zone;
3636
3637         ASSERT(!INGLOBALZONE(curproc));
3638
3639         /*
3640          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3641          * storing just the pid of init is sufficient.
3642          */
3643         z->zone_proc_initpid = p->p_pid;
3644
3645         /*
3646          * We maintain zone_boot_err so that we can return the cause of the
3647          * failure back to the caller of the zone_boot syscall.
3648          */
3649         p->p_zone->zone_boot_err = start_init_common();
3650
3651         /*
3652          * We will prevent booting zones from becoming running zones if the
3653          * global zone is shutting down.
3654          */
3655         mutex_enter(&zone_status_lock);
3656         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3657             ZONE_IS_SHUTTING_DOWN) {
3658                 /*
3659                  * Make sure we are still in the booting state-- we could have
3660                  * raced and already be shutting down, or even further along.
3661                  */
3662                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3663                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3664                 }
3665                 mutex_exit(&zone_status_lock);
3666                 /* It's gone bad, dispose of the process */
3667                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3668                         mutex_enter(&p->p_lock);
3669                         ASSERT(p->p_flag & SEXITLWPS);
3670                         lwp_exit();
3671                 }
3672         } else {
3673                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3674                         zone_status_set(z, ZONE_IS_RUNNING);
3675                 mutex_exit(&zone_status_lock);
3676                 /* cause the process to return to userland. */
3677                 lwp_rtt();
3678         }
3679 }
3680
3681 struct zsched_arg {             /* argument block read by zsched() */
3682         zone_t *zone;           /* zone the zsched process will serve */
3683         nvlist_t *nvlist;       /* rctl values zsched() applies to the zone */
3684 };
3685
3686 /*
3687  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3688  * anything to do with scheduling, but rather with the fact that
3689  * per-zone kernel threads are parented to zsched, just like regular
3690  * kernel threads are parented to sched (p0).
3691  *
3692  * zsched is also responsible for launching init for the zone.
      *
      * 'arg' points to a struct zsched_arg naming the zone this process will
      * serve and the nvlist of rctl values to apply to it.  The function
      * attaches the calling process to that zone, moves its accounting out of
      * the global zone, applies the rctls, takes the zone through
      * ZONE_IS_INITIALIZED and ZONE_IS_READY, waits for ZONE_IS_BOOTING to
      * start init, then waits for ZONE_IS_DYING and finishes by calling
      * exit().
3693  */
3694 static void
3695 zsched(void *arg)
3696 {
3697         struct zsched_arg *za = arg;
3698         proc_t *pp = curproc;
3699         proc_t *initp = proc_init;
3700         zone_t *zone = za->zone;
3701         cred_t *cr, *oldcred;
3702         rctl_set_t *set;
3703         rctl_alloc_gp_t *gp;
3704         contract_t *ct = NULL;  /* handed to newproc(); abandoned before exit */
3705         task_t *tk, *oldtk;
3706         rctl_entity_p_t e;
3707         kproject_t *pj;
3708
3709         nvlist_t *nvl = za->nvlist;
3710         nvpair_t *nvp = NULL;
3711
             /* Present ourselves as "zsched" with no arguments or open files. */
3712         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3713         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3714         PTOU(pp)->u_argc = 0;
3715         PTOU(pp)->u_argv = (uintptr_t)NULL;
3716         PTOU(pp)->u_envp = (uintptr_t)NULL;
3717         PTOU(pp)->u_commpagep = (uintptr_t)NULL;
3718         closeall(P_FINFO(pp));
3719
3720         /*
3721          * We are this zone's "zsched" process.  As the zone isn't generally
3722          * visible yet we don't need to grab any locks before initializing its
3723          * zone_zsched pointer.
3724          */
3725         zone_hold(zone);  /* this hold is released by zone_destroy() */
3726         zone->zone_zsched = pp;
3727         mutex_enter(&pp->p_lock);
3728         pp->p_zone = zone;
3729         mutex_exit(&pp->p_lock);
3730
3731         /*
3732          * Disassociate process from its 'parent'; parent ourselves to init
3733          * (pid 1) and change other values as needed.
3734          */
3735         sess_create();
3736
3737         mutex_enter(&pidlock);
3738         proc_detach(pp);
3739         pp->p_ppid = 1;
3740         pp->p_flag |= SZONETOP;
3741         pp->p_ancpid = 1;
3742         pp->p_parent = initp;
             /* Link ourselves in at the head of init's list of children. */
3743         pp->p_psibling = NULL;
3744         if (initp->p_child)
3745                 initp->p_child->p_psibling = pp;
3746         pp->p_sibling = initp->p_child;
3747         initp->p_child = pp;
3748
3749         /* Decrement what newproc() incremented. */
3750         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3751         /*
3752          * Our credentials are about to become kcred-like, so we don't care
3753          * about the caller's ruid.
3754          */
3755         upcount_inc(crgetruid(kcred), zone->zone_id);
3756         mutex_exit(&pidlock);
3757
3758         /*
3759          * getting out of global zone, so decrement lwp and process counts
3760          */
3761         pj = pp->p_task->tk_proj;
3762         mutex_enter(&global_zone->zone_nlwps_lock);
3763         pj->kpj_nlwps -= pp->p_lwpcnt;
3764         global_zone->zone_nlwps -= pp->p_lwpcnt;
3765         pj->kpj_nprocs--;
3766         global_zone->zone_nprocs--;
3767         mutex_exit(&global_zone->zone_nlwps_lock);
3768
3769         /*
3770          * Decrement locked memory counts on old zone and project.
3771          */
3772         mutex_enter(&global_zone->zone_mem_lock);
3773         global_zone->zone_locked_mem -= pp->p_locked_mem;
3774         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3775         mutex_exit(&global_zone->zone_mem_lock);
3776
3777         /*
3778          * Create and join a new task in project '0' of this zone.
3779          *
3780          * We don't need to call holdlwps() since we know we're the only lwp in
3781          * this process.
3782          *
3783          * task_join() returns with p_lock held.
3784          */
3785         tk = task_create(0, zone);
3786         mutex_enter(&cpu_lock);
3787         oldtk = task_join(tk, 0);
3788
             /* Re-read the project: p_task was changed by the join above. */
3789         pj = pp->p_task->tk_proj;
3790
3791         mutex_enter(&zone->zone_mem_lock);
3792         zone->zone_locked_mem += pp->p_locked_mem;
3793         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3794         mutex_exit(&zone->zone_mem_lock);
3795
3796         /*
3797          * add lwp and process counts to zsched's zone, and increment
3798          * project's task and process count due to the task created in
3799          * the above task_create.
3800          */
3801         mutex_enter(&zone->zone_nlwps_lock);
3802         pj->kpj_nlwps += pp->p_lwpcnt;
3803         pj->kpj_ntasks += 1;
3804         zone->zone_nlwps += pp->p_lwpcnt;
3805         pj->kpj_nprocs++;
3806         zone->zone_nprocs++;
3807         mutex_exit(&zone->zone_nlwps_lock);
3808
             /* Drop the p_lock that task_join() returned holding, then cpu_lock. */
3809         mutex_exit(&curproc->p_lock);
3810         mutex_exit(&cpu_lock);
3811         task_rele(oldtk);
3812
3813         /*
3814          * The process was created by a process in the global zone, hence the
3815          * credentials are wrong.  We might as well have kcred-ish credentials.
3816          */
3817         cr = zone->zone_kcred;
3818         crhold(cr);
3819         mutex_enter(&pp->p_crlock);
3820         oldcred = pp->p_cred;
3821         pp->p_cred = cr;
3822         mutex_exit(&pp->p_crlock);
3823         crfree(oldcred);
3824
3825         /*
3826          * Hold credentials again (for thread)
3827          */
3828         crhold(cr);
3829
3830         /*
3831          * p_lwpcnt can't change since this is a kernel process.
3832          */
3833         crset(pp, cr);
3834
3835         /*
3836          * Chroot
              *
              * Both the current directory and the root directory are pointed at
              * the zone's root vnode.
3837          */
3838         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3839         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3840
3841         /*
3842          * Initialize zone's rctl set.
3843          */
3844         set = rctl_set_create();
3845         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3846         mutex_enter(&pp->p_lock);
3847         e.rcep_p.zone = zone;
3848         e.rcep_t = RCENTITY_ZONE;
3849         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3850         mutex_exit(&pp->p_lock);
3851         rctl_prealloc_destroy(gp);
3852
3853         /*
3854          * Apply the rctls passed in to zone_create().  This is basically a list
3855          * assignment: all of the old values are removed and the new ones
3856          * inserted.  That is, if an empty list is passed in, all values are
3857          * removed.
3858          */
3859         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3860                 rctl_dict_entry_t *rde;
3861                 rctl_hndl_t hndl;
3862                 char *name;
3863                 nvlist_t **nvlarray;
3864                 uint_t i, nelem;
3865                 int error;      /* For ASSERT()s */
3866
3867                 name = nvpair_name(nvp);
3868                 hndl = rctl_hndl_lookup(name);
3869                 ASSERT(hndl != -1);
3870                 rde = rctl_dict_lookup_hndl(hndl);
3871                 ASSERT(rde != NULL);
3872
                     /*
                      * Delete the first local value repeatedly until the first
                      * value found is the RCPRIV_SYSTEM one.
                      */
3873                 for (; /* ever */; ) {
3874                         rctl_val_t oval;
3875
3876                         mutex_enter(&pp->p_lock);
3877                         error = rctl_local_get(hndl, NULL, &oval, pp);
3878                         mutex_exit(&pp->p_lock);
3879                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3880                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3881                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3882                                 break;
3883                         mutex_enter(&pp->p_lock);
3884                         error = rctl_local_delete(hndl, &oval, pp);
3885                         mutex_exit(&pp->p_lock);
3886                         ASSERT(error == 0);
3887                 }
                     /* Now insert each value supplied for this rctl. */
3888                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3889                 ASSERT(error == 0);
3890                 for (i = 0; i < nelem; i++) {
3891                         rctl_val_t *nvalp;
3892
3893                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3894                         error = nvlist2rctlval(nvlarray[i], nvalp);
3895                         ASSERT(error == 0);
3896                         /*
3897                          * rctl_local_insert can fail if the value being
3898                          * inserted is a duplicate; this is OK.
3899                          */
3900                         mutex_enter(&pp->p_lock);
3901                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
3902                                 kmem_cache_free(rctl_val_cache, nvalp);
3903                         mutex_exit(&pp->p_lock);
3904                 }
3905         }
3906
3907         /*
3908          * Tell the world that we're done setting up.
3909          *
3910          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3911          * and atomically set the zone's processor set visibility.  Once
3912          * we drop pool_lock() this zone will automatically get updated
3913          * to reflect any future changes to the pools configuration.
3914          *
3915          * Note that after we drop the locks below (zonehash_lock in
3916          * particular) other operations such as a zone_getattr call can
3917          * now proceed and observe the zone. That is the reason for doing a
3918          * state transition to the INITIALIZED state.
3919          */
3920         pool_lock();
3921         mutex_enter(&cpu_lock);
3922         mutex_enter(&zonehash_lock);
3923         zone_uniqid(zone);
3924         zone_zsd_configure(zone);
3925         if (pool_state == POOL_ENABLED)
3926                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
3927         mutex_enter(&zone_status_lock);
3928         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3929         zone_status_set(zone, ZONE_IS_INITIALIZED);
3930         mutex_exit(&zone_status_lock);
3931         mutex_exit(&zonehash_lock);
3932         mutex_exit(&cpu_lock);
3933         pool_unlock();
3934
3935         /* Now call the create callbacks for all keys */
3936         zsd_apply_all_keys(zsd_apply_create, zone);
3937
3938         /* The callbacks are complete. Mark ZONE_IS_READY */
3939         mutex_enter(&zone_status_lock);
3940         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3941         zone_status_set(zone, ZONE_IS_READY);
3942         mutex_exit(&zone_status_lock);
3943
3944         /*
3945          * Once we see the zone transition to the ZONE_IS_BOOTING state,
3946          * we launch init, and set the state to running.
3947          */
3948         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3949
             /*
              * The status is re-checked because, presumably, the wait above can
              * also end with the zone already past ZONE_IS_BOOTING; init is
              * only launched for a zone that really is booting.
              */
3950         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3951                 id_t cid;
3952
3953                 /*
3954                  * Ok, this is a little complicated.  We need to grab the
3955                  * zone's pool's scheduling class ID; note that by now, we
3956                  * are already bound to a pool if we need to be (zoneadmd
3957                  * will have done that to us while we're in the READY
3958                  * state).  *But* the scheduling class for the zone's 'init'
3959                  * must be explicitly passed to newproc, which doesn't
3960                  * respect pool bindings.
3961                  *
3962                  * We hold the pool_lock across the call to newproc() to
3963                  * close the obvious race: the pool's scheduling class
3964                  * could change before we manage to create the LWP with
3965                  * classid 'cid'.
3966                  */
3967                 pool_lock();
                     /* A zone-specific default class takes precedence. */
3968                 if (zone->zone_defaultcid > 0)
3969                         cid = zone->zone_defaultcid;
3970                 else
3971                         cid = pool_get_class(zone->zone_pool);
3972                 if (cid == -1)
3973                         cid = defaultcid;
3974
3975                 /*
3976                  * If this fails, zone_boot will ultimately fail.  The
3977                  * state of the zone will be set to SHUTTING_DOWN-- userland
3978                  * will have to tear down the zone, and fail, or try again.
3979                  */
3980                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3981                     minclsyspri - 1, &ct, 0)) != 0) {
3982                         mutex_enter(&zone_status_lock);
3983                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3984                         mutex_exit(&zone_status_lock);
3985                 } else {
3986                         zone->zone_boot_time = gethrestime_sec();
3987                 }
3988
3989                 pool_unlock();
3990         }
3991
3992         /*
3993          * Wait for zone_destroy() to be called.  This is what we spend
3994          * most of our life doing.
3995          */
3996         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3997
             /* ct is still NULL if init was never launched. */
3998         if (ct)
3999                 /*
4000                  * At this point the process contract should be empty.
4001                  * (Though if it isn't, it's not the end of the world.)
4002                  */
4003                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4004
4005         /*
4006          * Allow kcred to be freed when all referring processes
4007          * (including this one) go away.  We can't just do this in
4008          * zone_free because we need to wait for the zone_cred_ref to
4009          * drop to 0 before calling zone_free, and the existence of
4010          * zone_kcred will prevent that.  Thus, we call crfree here to
4011          * balance the crdup in zone_create.  The crhold calls earlier
4012          * in zsched will be dropped when the thread and process exit.
4013          */
4014         crfree(zone->zone_kcred);
4015         zone->zone_kcred = NULL;
4016
4017         exit(CLD_EXITED, 0);
4018 }
4019
4020 /*
4021  * Helper function to determine if there are any submounts of the
4022  * provided path.  Used to make sure the zone doesn't "inherit" any
4023  * mounts from before it is created.
4024  */
4025 static uint_t
4026 zone_mount_count(const char *rootpath)
4027 {
4028         vfs_t *vfsp;
4029         uint_t count = 0;
4030         size_t rootpathlen = strlen(rootpath);
4031
4032         /*
4033          * Holding zonehash_lock prevents race conditions with
4034          * vfs_list_add()/vfs_list_remove() since we serialize with
4035          * zone_find_by_path().
4036          */
4037         ASSERT(MUTEX_HELD(&zonehash_lock));
4038         /*
4039          * The rootpath must end with a '/'
4040          */
4041         ASSERT(rootpath[rootpathlen - 1] == '/');
4042
4043         /*
4044          * This intentionally does not count the rootpath itself if that
4045          * happens to be a mount point.
4046          */
4047         vfs_list_read_lock();
4048         vfsp = rootvfs;
4049         do {
4050                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4051                     rootpathlen) == 0)
4052                         count++;
4053                 vfsp = vfsp->vfs_next;
4054         } while (vfsp != rootvfs);
4055         vfs_list_unlock();
4056         return (count);
4057 }
4058
4059 /*
4060  * Helper function to make sure that a zone created on 'rootpath'
4061  * wouldn't end up containing other zones' rootpaths.
4062  */
4063 static boolean_t
4064 zone_is_nested(const char *rootpath)
4065 {
4066         zone_t *zone;
4067         size_t rootpathlen = strlen(rootpath);
4068         size_t len;
4069
4070         ASSERT(MUTEX_HELD(&zonehash_lock));
4071
4072         /*
4073          * zone_set_root() appended '/' and '\0' at the end of rootpath
4074          */
4075         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4076             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4077                 return (B_TRUE);
4078
4079         for (zone = list_head(&zone_active); zone != NULL;
4080             zone = list_next(&zone_active, zone)) {
4081                 if (zone == global_zone)
4082                         continue;
4083                 len = strlen(zone->zone_rootpath);
4084                 if (strncmp(rootpath, zone->zone_rootpath,
4085                     MIN(rootpathlen, len)) == 0)
4086                         return (B_TRUE);
4087         }
4088         return (B_FALSE);
4089 }
4090
4091 static int
4092 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4093     size_t zone_privssz)
4094 {
4095         priv_set_t *privs;
4096
4097         if (zone_privssz < sizeof (priv_set_t))
4098                 return (ENOMEM);
4099
4100         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4101
4102         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4103                 kmem_free(privs, sizeof (priv_set_t));
4104                 return (EFAULT);
4105         }
4106
4107         zone->zone_privset = privs;
4108         return (0);
4109 }
4110
4111 /*
4112  * We make creative use of nvlists to pass in rctls from userland.  The list is
4113  * a list of the following structures:
4114  *
4115  * (name = rctl_name, value = nvpair_list_array)
4116  *
4117  * Where each element of the nvpair_list_array is of the form:
4118  *
4119  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4120  *      (name = "limit", value = uint64_t),
4121  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4122  */
4123 static int
4124 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4125 {
4126         nvpair_t *nvp = NULL;
4127         nvlist_t *nvl = NULL;
4128         char *kbuf;
4129         int error;
4130         rctl_val_t rv;
4131
4132         *nvlp = NULL;
4133
4134         if (buflen == 0)
4135                 return (0);
4136
4137         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4138                 return (ENOMEM);
4139         if (copyin(ubuf, kbuf, buflen)) {
4140                 error = EFAULT;
4141                 goto out;
4142         }
4143         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4144                 /*
4145                  * nvl may have been allocated/free'd, but the value set to
4146                  * non-NULL, so we reset it here.
4147                  */
4148                 nvl = NULL;
4149                 error = EINVAL;
4150                 goto out;
4151         }
4152         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4153                 rctl_dict_entry_t *rde;
4154                 rctl_hndl_t hndl;
4155                 nvlist_t **nvlarray;
4156                 uint_t i, nelem;
4157                 char *name;
4158
4159                 error = EINVAL;
4160                 name = nvpair_name(nvp);
4161                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4162                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4163                         goto out;
4164                 }
4165                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4166                         goto out;
4167                 }
4168                 rde = rctl_dict_lookup_hndl(hndl);
4169                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4170                 ASSERT(error == 0);
4171                 for (i = 0; i < nelem; i++) {
4172                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4173                                 goto out;
4174                 }
4175                 if (rctl_invalid_value(rde, &rv)) {
4176                         error = EINVAL;
4177                         goto out;
4178                 }
4179         }
4180         error = 0;
4181         *nvlp = nvl;
4182 out:
4183         kmem_free(kbuf, buflen);
4184         if (error && nvl != NULL)
4185                 nvlist_free(nvl);
4186         return (error);
4187 }
4188
4189 int
4190 zone_create_error(int er_error, int er_ext, int *er_out)
4191 {
4192         if (er_out != NULL) {
4193                 if (copyout(&er_ext, er_out, sizeof (int))) {
4194                         return (set_errno(EFAULT));
4195                 }
4196         }
4197         return (set_errno(er_error));
4198 }
4199
4200 /*
4201  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4202  */
4203 static int
4204 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4205 {
4206         char *kbuf;
4207         char *dataset, *next;
4208         zone_dataset_t *zd;
4209         size_t len;
4210
4211         if (ubuf == NULL || buflen == 0)
4212                 return (0);
4213
4214         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4215                 return (ENOMEM);
4216
4217         if (copyin(ubuf, kbuf, buflen) != 0) {
4218                 kmem_free(kbuf, buflen);
4219                 return (EFAULT);
4220         }
4221
4222         dataset = next = kbuf;
4223         for (;;) {
4224                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4225
4226                 next = strchr(dataset, ',');
4227
4228                 if (next == NULL)
4229                         len = strlen(dataset);
4230                 else
4231                         len = next - dataset;
4232
4233                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4234                 bcopy(dataset, zd->zd_dataset, len);
4235                 zd->zd_dataset[len] = '\0';
4236
4237                 list_insert_head(&zone->zone_datasets, zd);
4238
4239                 if (next == NULL)
4240                         break;
4241
4242                 dataset = next + 1;
4243         }
4244
4245         kmem_free(kbuf, buflen);
4246         return (0);
4247 }
4248
4249 /*
4250  * System call to create/initialize a new zone named 'zone_name', rooted
4251  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4252  * and initialized with the zone-wide rctls described in 'rctlbuf'.
4253  *
4254  * If extended error is non-null, we may use it to return more detailed
4255  * error information.
4256  */
4257 static zoneid_t
4258 zone_create(const char *zone_name, const char *zone_root,
4259     const priv_set_t *zone_privs, size_t zone_privssz,
4260     caddr_t rctlbuf, size_t rctlbufsz,
4261     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4262     int flags)
4263 {
4264         struct zsched_arg zarg;
4265         nvlist_t *rctls = NULL;
4266         proc_t *pp = curproc;
4267         zone_t *zone, *ztmp;
4268         zoneid_t zoneid, start = GLOBAL_ZONEID;
4269         int error;
4270         int error2 = 0;
4271         char *str;
4272         cred_t *zkcr;
4273
4274         if (secpolicy_zone_config(CRED()) != 0)
4275                 return (set_errno(EPERM));
4276
4277         /* can't boot zone from within chroot environment */
4278         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4279                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4280                     extended_error));
4281         /*
4282          * As the first step of zone creation, we want to allocate a zoneid.
4283          * This allocation is complicated by the fact that netstacks use the
4284          * zoneid to determine their stackid, but netstacks themselves are
4285          * freed asynchronously with respect to zone destruction.  This means
4286          * that a netstack reference leak (or in principle, an extraordinarily
4287          * long netstack reference hold) could result in a zoneid being
4288          * allocated that in fact corresponds to a stackid from an active
4289          * (referenced) netstack -- unleashing all sorts of havoc when that
4290          * netstack is actually (re)used.  (In the abstract, we might wish a
4291          * zoneid to not be deallocated until its last referencing netstack
4292          * has been released, but netstacks lack a backpointer into their
4293          * referencing zone -- and changing them to have such a pointer would
4294          * be substantial, to put it euphemistically.)  To avoid this, we
4295          * detect this condition on allocation: if we have allocated a zoneid
4296          * that corresponds to a netstack that's still in use, we warn about
4297          * it (as it is much more likely to be a reference leak than an actual
4298          * netstack reference), free it, and allocate another.  That these
4299          * identifers are allocated out of an ID space assures that we won't
4300          * see the identifier we just allocated.
4301          */
4302         for (;;) {
4303                 zoneid = id_alloc(zoneid_space);
4304
4305                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4306                         break;
4307
4308                 id_free(zoneid_space, zoneid);
4309
4310                 if (start == GLOBAL_ZONEID) {
4311                         start = zoneid;
4312                 } else if (zoneid == start) {
4313                         /*
4314                          * We have managed to iterate over the entire available
4315                          * zoneid space -- there are no identifiers available,
4316                          * presumably due to some number of leaked netstack
4317                          * references.  While it's in principle possible for us
4318                          * to continue to try, it seems wiser to give up at
4319                          * this point to warn and fail explicitly with a
4320                          * distinctive error.
4321                          */
4322                         cmn_err(CE_WARN, "zone_create() failed: all available "
4323                             "zone IDs have netstacks still in use");
4324                         return (set_errno(ENFILE));
4325                 }
4326
4327                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4328                     "netstack still in use", zoneid);
4329         }
4330
4331         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4332         zone->zone_id = zoneid;
4333         zone->zone_status = ZONE_IS_UNINITIALIZED;
4334         zone->zone_pool = pool_default;
4335         zone->zone_pool_mod = gethrtime();
4336         zone->zone_psetid = ZONE_PS_INVAL;
4337         zone->zone_ncpus = 0;
4338         zone->zone_ncpus_online = 0;
4339         zone->zone_restart_init = B_TRUE;
4340         zone->zone_brand = &native_brand;
4341         zone->zone_initname = NULL;
4342         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4343         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4344         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4345         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4346         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4347             offsetof(zone_ref_t, zref_linkage));
4348         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4349             offsetof(struct zsd_entry, zsd_linkage));
4350         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4351             offsetof(zone_dataset_t, zd_linkage));
4352         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4353             offsetof(zone_dl_t, zdl_linkage));
4354         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4355
4356         if (flags & ZCF_NET_EXCL) {
4357                 zone->zone_flags |= ZF_NET_EXCL;
4358         }
4359
4360         if ((error = zone_set_name(zone, zone_name)) != 0) {
4361                 zone_free(zone);
4362                 return (zone_create_error(error, 0, extended_error));
4363         }
4364
4365         if ((error = zone_set_root(zone, zone_root)) != 0) {
4366                 zone_free(zone);
4367                 return (zone_create_error(error, 0, extended_error));
4368         }
4369         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4370                 zone_free(zone);
4371                 return (zone_create_error(error, 0, extended_error));
4372         }
4373
4374         /* initialize node name to be the same as zone name */
4375         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4376         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4377         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4378
4379         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4380         zone->zone_domain[0] = '\0';
4381         zone->zone_hostid = HW_INVALID_HOSTID;
4382         zone->zone_shares = 1;
4383         zone->zone_shmmax = 0;
4384         zone->zone_ipc.ipcq_shmmni = 0;
4385         zone->zone_ipc.ipcq_semmni = 0;
4386         zone->zone_ipc.ipcq_msgmni = 0;
4387         zone->zone_bootargs = NULL;
4388         zone->zone_fs_allowed = NULL;
4389
4390         secflags_zero(&zone0.zone_secflags.psf_lower);
4391         secflags_zero(&zone0.zone_secflags.psf_effective);
4392         secflags_zero(&zone0.zone_secflags.psf_inherit);
4393         secflags_fullset(&zone0.zone_secflags.psf_upper);
4394
4395         zone->zone_initname =
4396             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4397         (void) strcpy(zone->zone_initname, zone_default_initname);
4398         zone->zone_nlwps = 0;
4399         zone->zone_nlwps_ctl = INT_MAX;
4400         zone->zone_nprocs = 0;
4401         zone->zone_nprocs_ctl = INT_MAX;
4402         zone->zone_locked_mem = 0;
4403         zone->zone_locked_mem_ctl = UINT64_MAX;
4404         zone->zone_max_swap = 0;
4405         zone->zone_max_swap_ctl = UINT64_MAX;
4406         zone->zone_max_lofi = 0;
4407         zone->zone_max_lofi_ctl = UINT64_MAX;
4408         zone0.zone_lockedmem_kstat = NULL;
4409         zone0.zone_swapresv_kstat = NULL;
4410
4411         /*
4412          * Zsched initializes the rctls.
4413          */
4414         zone->zone_rctls = NULL;
4415
4416         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4417                 zone_free(zone);
4418                 return (zone_create_error(error, 0, extended_error));
4419         }
4420
4421         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4422                 zone_free(zone);
4423                 return (set_errno(error));
4424         }
4425
4426         /*
4427          * Stop all lwps since that's what normally happens as part of fork().
4428          * This needs to happen before we grab any locks to avoid deadlock
4429          * (another lwp in the process could be waiting for the held lock).
4430          */
4431         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4432                 zone_free(zone);
4433                 nvlist_free(rctls);
4434                 return (zone_create_error(error, 0, extended_error));
4435         }
4436
4437         if (block_mounts(zone) == 0) {
4438                 mutex_enter(&pp->p_lock);
4439                 if (curthread != pp->p_agenttp)
4440                         continuelwps(pp);
4441                 mutex_exit(&pp->p_lock);
4442                 zone_free(zone);
4443                 nvlist_free(rctls);
4444                 return (zone_create_error(error, 0, extended_error));
4445         }
4446
4447         /*
4448          * Set up credential for kernel access.  After this, any errors
4449          * should go through the dance in errout rather than calling
4450          * zone_free directly.
4451          */
4452         zone->zone_kcred = crdup(kcred);
4453         crsetzone(zone->zone_kcred, zone);
4454         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4455         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4456         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4457         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4458
4459         mutex_enter(&zonehash_lock);
4460         /*
4461          * Make sure zone doesn't already exist.
4462          */
4463         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) {
4464                 zone_status_t status;
4465
4466                 status = zone_status_get(ztmp);
4467                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4468                         error = EEXIST;
4469                 else
4470                         error = EBUSY;
4471
4472                 goto errout;
4473         }
4474
4475         /*
4476          * Don't allow zone creations which would cause one zone's rootpath to
4477          * be accessible from that of another (non-global) zone.
4478          */
4479         if (zone_is_nested(zone->zone_rootpath)) {
4480                 error = EBUSY;
4481                 goto errout;
4482         }
4483
4484         ASSERT(zonecount != 0);         /* check for leaks */
4485         if (zonecount + 1 > maxzones) {
4486                 error = ENOMEM;
4487                 goto errout;
4488         }
4489
4490         if (zone_mount_count(zone->zone_rootpath) != 0) {
4491                 error = EBUSY;
4492                 error2 = ZE_AREMOUNTS;
4493                 goto errout;
4494         }
4495
4496         /*
4497          * Zone is still incomplete, but we need to drop all locks while
4498          * zsched() initializes this zone's kernel process.  We
4499          * optimistically add the zone to the hashtable and associated
4500          * lists so a parallel zone_create() doesn't try to create the
4501          * same zone.
4502          */
4503         zonecount++;
4504         (void) mod_hash_insert(zonehashbyid,
4505             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4506             (mod_hash_val_t)(uintptr_t)zone);
4507         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4508         (void) strcpy(str, zone->zone_name);
4509         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4510             (mod_hash_val_t)(uintptr_t)zone);
4511
4512         /*
4513          * Insert into active list.  At this point there are no 'hold's
4514          * on the zone, but everyone else knows not to use it, so we can
4515          * continue to use it.  zsched() will do a zone_hold() if the
4516          * newproc() is successful.
4517          */
4518         list_insert_tail(&zone_active, zone);
4519         mutex_exit(&zonehash_lock);
4520
4521         zarg.zone = zone;
4522         zarg.nvlist = rctls;
4523         /*
4524          * The process, task, and project rctls are probably wrong;
4525          * we need an interface to get the default values of all rctls,
4526          * and initialize zsched appropriately.  I'm not sure that that
4527          * makes much of a difference, though.
4528          */
4529         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4530         if (error != 0) {
4531                 /*
4532                  * We need to undo all globally visible state.
4533                  */
4534                 mutex_enter(&zonehash_lock);
4535                 list_remove(&zone_active, zone);
4536                 (void) mod_hash_destroy(zonehashbyname,
4537                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4538                 (void) mod_hash_destroy(zonehashbyid,
4539                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4540                 ASSERT(zonecount > 1);
4541                 zonecount--;
4542                 goto errout;
4543         }
4544
4545         /*
4546          * Zone creation can't fail from now on.
4547          */
4548
4549         /*
4550          * Create zone kstats
4551          */
4552         zone_kstat_create(zone);
4553
4554         /*
4555          * Let the other lwps continue.
4556          */
4557         mutex_enter(&pp->p_lock);
4558         if (curthread != pp->p_agenttp)
4559                 continuelwps(pp);
4560         mutex_exit(&pp->p_lock);
4561
4562         /*
4563          * Wait for zsched to finish initializing the zone.
4564          */
4565         zone_status_wait(zone, ZONE_IS_READY);
4566         /*
4567          * The zone is fully visible, so we can let mounts progress.
4568          */
4569         resume_mounts(zone);
4570         nvlist_free(rctls);
4571
4572         return (zoneid);
4573
4574 errout:
4575         mutex_exit(&zonehash_lock);
4576         /*
4577          * Let the other lwps continue.
4578          */
4579         mutex_enter(&pp->p_lock);
4580         if (curthread != pp->p_agenttp)
4581                 continuelwps(pp);
4582         mutex_exit(&pp->p_lock);
4583
4584         resume_mounts(zone);
4585         nvlist_free(rctls);
4586         /*
4587          * There is currently one reference to the zone, a cred_ref from
4588          * zone_kcred.  To free the zone, we call crfree, which will call
4589          * zone_cred_rele, which will call zone_free.
4590          */
4591         ASSERT(zone->zone_cred_ref == 1);
4592         ASSERT(zone->zone_kcred->cr_ref == 1);
4593         ASSERT(zone->zone_ref == 0);
4594         zkcr = zone->zone_kcred;
4595         zone->zone_kcred = NULL;
4596         crfree(zkcr);                           /* triggers call to zone_free */
4597         return (zone_create_error(error, error2, extended_error));
4598 }
4599
4600 /*
4601  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4602  * the heavy lifting.  initname is the path to the program to launch
4603  * at the "top" of the zone; if this is NULL, we use the system default,
4604  * which is stored at zone_default_initname.
4605  */
4606 static int
4607 zone_boot(zoneid_t zoneid)
4608 {
4609         int err;
4610         zone_t *zone;
4611
4612         if (secpolicy_zone_config(CRED()) != 0)
4613                 return (set_errno(EPERM));
4614         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4615                 return (set_errno(EINVAL));
4616
4617         mutex_enter(&zonehash_lock);
4618         /*
4619          * Look for zone under hash lock to prevent races with calls to
4620          * zone_shutdown, zone_destroy, etc.
4621          */
4622         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4623                 mutex_exit(&zonehash_lock);
4624                 return (set_errno(EINVAL));
4625         }
4626
4627         mutex_enter(&zone_status_lock);
4628         if (zone_status_get(zone) != ZONE_IS_READY) {
4629                 mutex_exit(&zone_status_lock);
4630                 mutex_exit(&zonehash_lock);
4631                 return (set_errno(EINVAL));
4632         }
4633         zone_status_set(zone, ZONE_IS_BOOTING);
4634         mutex_exit(&zone_status_lock);
4635
4636         zone_hold(zone);        /* so we can use the zone_t later */
4637         mutex_exit(&zonehash_lock);
4638
4639         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4640                 zone_rele(zone);
4641                 return (set_errno(EINTR));
4642         }
4643
4644         /*
4645          * Boot (starting init) might have failed, in which case the zone
4646          * will go to the SHUTTING_DOWN state; an appropriate errno will
4647          * be placed in zone->zone_boot_err, and so we return that.
4648          */
4649         err = zone->zone_boot_err;
4650         zone_rele(zone);
4651         return (err ? set_errno(err) : 0);
4652 }
4653
4654 /*
4655  * Kills all user processes in the zone, waiting for them all to exit
4656  * before returning.
4657  */
4658 static int
4659 zone_empty(zone_t *zone)
4660 {
4661         int waitstatus;
4662
4663         /*
4664          * We need to drop zonehash_lock before killing all
4665          * processes, otherwise we'll deadlock with zone_find_*
4666          * which can be called from the exit path.
4667          */
4668         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4669         while ((waitstatus = zone_status_timedwait_sig(zone,
4670             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4671                 killall(zone->zone_id);
4672         }
4673         /*
4674          * return EINTR if we were signaled
4675          */
4676         if (waitstatus == 0)
4677                 return (EINTR);
4678         return (0);
4679 }
4680
4681 /*
4682  * This function implements the policy for zone visibility. A non-global zone
4683  * can only see itself.
4684  *
4685  * Returns true if zone attributes are viewable, false otherwise.
4686  */
4687 static boolean_t
4688 zone_list_access(zone_t *zone)
4689 {
4690
4691         if (curproc->p_zone == global_zone ||
4692             curproc->p_zone == zone) {
4693                 return (B_TRUE);
4694         } else {
4695                 return (B_FALSE);
4696         }
4697 }
4698
4699 /*
4700  * Systemcall to start the zone's halt sequence.  By the time this
4701  * function successfully returns, all user processes and kernel threads
4702  * executing in it will have exited, ZSD shutdown callbacks executed,
4703  * and the zone status set to ZONE_IS_DOWN.
4704  *
4705  * It is possible that the call will interrupt itself if the caller is the
4706  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4707  */
4708 static int
4709 zone_shutdown(zoneid_t zoneid)
4710 {
4711         int error;
4712         zone_t *zone;
4713         zone_status_t status;
4714
4715         if (secpolicy_zone_config(CRED()) != 0)
4716                 return (set_errno(EPERM));
4717         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4718                 return (set_errno(EINVAL));
4719
4720         mutex_enter(&zonehash_lock);
4721         /*
4722          * Look for zone under hash lock to prevent races with other
4723          * calls to zone_shutdown and zone_destroy.
4724          */
4725         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4726                 mutex_exit(&zonehash_lock);
4727                 return (set_errno(EINVAL));
4728         }
4729
4730         /*
4731          * We have to drop zonehash_lock before calling block_mounts.
4732          * Hold the zone so we can continue to use the zone_t.
4733          */
4734         zone_hold(zone);
4735         mutex_exit(&zonehash_lock);
4736
4737         /*
4738          * Block mounts so that VFS_MOUNT() can get an accurate view of
4739          * the zone's status with regards to ZONE_IS_SHUTTING down.
4740          *
4741          * e.g. NFS can fail the mount if it determines that the zone
4742          * has already begun the shutdown sequence.
4743          *
4744          */
4745         if (block_mounts(zone) == 0) {
4746                 zone_rele(zone);
4747                 return (set_errno(EINTR));
4748         }
4749
4750         mutex_enter(&zonehash_lock);
4751         mutex_enter(&zone_status_lock);
4752         status = zone_status_get(zone);
4753         /*
4754          * Fail if the zone isn't fully initialized yet.
4755          */
4756         if (status < ZONE_IS_READY) {
4757                 mutex_exit(&zone_status_lock);
4758                 mutex_exit(&zonehash_lock);
4759                 resume_mounts(zone);
4760                 zone_rele(zone);
4761                 return (set_errno(EINVAL));
4762         }
4763         /*
4764          * If conditions required for zone_shutdown() to return have been met,
4765          * return success.
4766          */
4767         if (status >= ZONE_IS_DOWN) {
4768                 mutex_exit(&zone_status_lock);
4769                 mutex_exit(&zonehash_lock);
4770                 resume_mounts(zone);
4771                 zone_rele(zone);
4772                 return (0);
4773         }
4774         /*
4775          * If zone_shutdown() hasn't been called before, go through the motions.
4776          * If it has, there's nothing to do but wait for the kernel threads to
4777          * drain.
4778          */
4779         if (status < ZONE_IS_EMPTY) {
4780                 uint_t ntasks;
4781
4782                 mutex_enter(&zone->zone_lock);
4783                 if ((ntasks = zone->zone_ntasks) != 1) {
4784                         /*
4785                          * There's still stuff running.
4786                          */
4787                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4788                 }
4789                 mutex_exit(&zone->zone_lock);
4790                 if (ntasks == 1) {
4791                         /*
4792                          * The only way to create another task is through
4793                          * zone_enter(), which will block until we drop
4794                          * zonehash_lock.  The zone is empty.
4795                          */
4796                         if (zone->zone_kthreads == NULL) {
4797                                 /*
4798                                  * Skip ahead to ZONE_IS_DOWN
4799                                  */
4800                                 zone_status_set(zone, ZONE_IS_DOWN);
4801                         } else {
4802                                 zone_status_set(zone, ZONE_IS_EMPTY);
4803                         }
4804                 }
4805         }
4806         mutex_exit(&zone_status_lock);
4807         mutex_exit(&zonehash_lock);
4808         resume_mounts(zone);
4809
4810         if (error = zone_empty(zone)) {
4811                 zone_rele(zone);
4812                 return (set_errno(error));
4813         }
4814         /*
4815          * After the zone status goes to ZONE_IS_DOWN this zone will no
4816          * longer be notified of changes to the pools configuration, so
4817          * in order to not end up with a stale pool pointer, we point
4818          * ourselves at the default pool and remove all resource
4819          * visibility.  This is especially important as the zone_t may
4820          * languish on the deathrow for a very long time waiting for
4821          * cred's to drain out.
4822          *
4823          * This rebinding of the zone can happen multiple times
4824          * (presumably due to interrupted or parallel systemcalls)
4825          * without any adverse effects.
4826          */
4827         if (pool_lock_intr() != 0) {
4828                 zone_rele(zone);
4829                 return (set_errno(EINTR));
4830         }
4831         if (pool_state == POOL_ENABLED) {
4832                 mutex_enter(&cpu_lock);
4833                 zone_pool_set(zone, pool_default);
4834                 /*
4835                  * The zone no longer needs to be able to see any cpus.
4836                  */
4837                 zone_pset_set(zone, ZONE_PS_INVAL);
4838                 mutex_exit(&cpu_lock);
4839         }
4840         pool_unlock();
4841
4842         /*
4843          * ZSD shutdown callbacks can be executed multiple times, hence
4844          * it is safe to not be holding any locks across this call.
4845          */
4846         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4847
4848         mutex_enter(&zone_status_lock);
4849         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4850                 zone_status_set(zone, ZONE_IS_DOWN);
4851         mutex_exit(&zone_status_lock);
4852
4853         /*
4854          * Wait for kernel threads to drain.
4855          */
4856         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4857                 zone_rele(zone);
4858                 return (set_errno(EINTR));
4859         }
4860
4861         /*
4862          * Zone can be become down/destroyable even if the above wait
4863          * returns EINTR, so any code added here may never execute.
4864          * (i.e. don't add code here)
4865          */
4866
4867         zone_rele(zone);
4868         return (0);
4869 }
4870
4871 /*
4872  * Log the specified zone's reference counts.  The caller should not be
4873  * holding the zone's zone_lock.
4874  */
4875 static void
4876 zone_log_refcounts(zone_t *zone)
4877 {
4878         char *buffer;
4879         char *buffer_position;
4880         uint32_t buffer_size;
4881         uint32_t index;
4882         uint_t ref;
4883         uint_t cred_ref;
4884
4885         /*
4886          * Construct a string representing the subsystem-specific reference
4887          * counts.  The counts are printed in ascending order by index into the
4888          * zone_t::zone_subsys_ref array.  The list will be surrounded by
4889          * square brackets [] and will only contain nonzero reference counts.
4890          *
4891          * The buffer will hold two square bracket characters plus ten digits,
4892          * one colon, one space, one comma, and some characters for a
4893          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4894          * bit integers have at most ten decimal digits.)  The last
4895          * reference count's comma is replaced by the closing square
4896          * bracket and a NULL character to terminate the string.
4897          *
4898          * NOTE: We have to grab the zone's zone_lock to create a consistent
4899          * snapshot of the zone's reference counters.
4900          *
4901          * First, figure out how much space the string buffer will need.
4902          * The buffer's size is stored in buffer_size.
4903          */
4904         buffer_size = 2;                        /* for the square brackets */
4905         mutex_enter(&zone->zone_lock);
4906         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4907         ref = zone->zone_ref;
4908         cred_ref = zone->zone_cred_ref;
4909         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4910                 if (zone->zone_subsys_ref[index] != 0)
4911                         buffer_size += strlen(zone_ref_subsys_names[index]) +
4912                             13;
4913         if (buffer_size == 2) {
4914                 /*
4915                  * No subsystems had nonzero reference counts.  Don't bother
4916                  * with allocating a buffer; just log the general-purpose and
4917                  * credential reference counts.
4918                  */
4919                 mutex_exit(&zone->zone_lock);
4920                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4921                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
4922                     "references and %u credential references are still extant",
4923                     zone->zone_name, zone->zone_id, ref, cred_ref);
4924                 return;
4925         }
4926
4927         /*
4928          * buffer_size contains the exact number of characters that the
4929          * buffer will need.  Allocate the buffer and fill it with nonzero
4930          * subsystem-specific reference counts.  Surround the results with
4931          * square brackets afterwards.
4932          */
4933         buffer = kmem_alloc(buffer_size, KM_SLEEP);
4934         buffer_position = &buffer[1];
4935         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4936                 /*
4937                  * NOTE: The DDI's version of sprintf() returns a pointer to
4938                  * the modified buffer rather than the number of bytes written
4939                  * (as in snprintf(3C)).  This is unfortunate and annoying.
4940                  * Therefore, we'll use snprintf() with INT_MAX to get the
4941                  * number of bytes written.  Using INT_MAX is safe because
4942                  * the buffer is perfectly sized for the data: we'll never
4943                  * overrun the buffer.
4944                  */
4945                 if (zone->zone_subsys_ref[index] != 0)
4946                         buffer_position += snprintf(buffer_position, INT_MAX,
4947                             "%s: %u,", zone_ref_subsys_names[index],
4948                             zone->zone_subsys_ref[index]);
4949         }
4950         mutex_exit(&zone->zone_lock);
4951         buffer[0] = '[';
4952         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4953         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4954         buffer_position[-1] = ']';
4955
4956         /*
4957          * Log the reference counts and free the message buffer.
4958          */
4959         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4960             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4961             "%u credential references are still extant %s", zone->zone_name,
4962             zone->zone_id, ref, cred_ref, buffer);
4963         kmem_free(buffer, buffer_size);
4964 }
4965
4966 /*
4967  * Systemcall entry point to finalize the zone halt process.  The caller
4968  * must have already successfully called zone_shutdown().
4969  *
4970  * Upon successful completion, the zone will have been fully destroyed:
4971  * zsched will have exited, destructor callbacks executed, and the zone
4972  * removed from the list of active zones.
4973  */
4974 static int
4975 zone_destroy(zoneid_t zoneid)
4976 {
4977         uint64_t uniqid;
4978         zone_t *zone;
4979         zone_status_t status;
4980         clock_t wait_time;
4981         boolean_t log_refcounts;
4982
4983         if (secpolicy_zone_config(CRED()) != 0)
4984                 return (set_errno(EPERM));
4985         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4986                 return (set_errno(EINVAL));
4987
4988         mutex_enter(&zonehash_lock);
4989         /*
4990          * Look for zone under hash lock to prevent races with other
4991          * calls to zone_destroy.
4992          */
4993         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4994                 mutex_exit(&zonehash_lock);
4995                 return (set_errno(EINVAL));
4996         }
4997
4998         if (zone_mount_count(zone->zone_rootpath) != 0) {
4999                 mutex_exit(&zonehash_lock);
5000                 return (set_errno(EBUSY));
5001         }
5002         mutex_enter(&zone_status_lock);
5003         status = zone_status_get(zone);
5004         if (status < ZONE_IS_DOWN) {
5005                 mutex_exit(&zone_status_lock);
5006                 mutex_exit(&zonehash_lock);
5007                 return (set_errno(EBUSY));
5008         } else if (status == ZONE_IS_DOWN) {
5009                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5010         }
5011         mutex_exit(&zone_status_lock);
5012         zone_hold(zone);
5013         mutex_exit(&zonehash_lock);
5014
5015         /*
5016          * wait for zsched to exit
5017          */
5018         zone_status_wait(zone, ZONE_IS_DEAD);
5019         zone_zsd_callbacks(zone, ZSD_DESTROY);
5020         zone->zone_netstack = NULL;
5021         uniqid = zone->zone_uniqid;
5022         zone_rele(zone);
5023         zone = NULL;    /* potentially free'd */
5024
5025         log_refcounts = B_FALSE;
5026         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5027         mutex_enter(&zonehash_lock);
5028         for (; /* ever */; ) {
5029                 boolean_t unref;
5030                 boolean_t refs_have_been_logged;
5031
5032                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5033                     zone->zone_uniqid != uniqid) {
5034                         /*
5035                          * The zone has gone away.  Necessary conditions
5036                          * are met, so we return success.
5037                          */
5038                         mutex_exit(&zonehash_lock);
5039                         return (0);
5040                 }
5041                 mutex_enter(&zone->zone_lock);
5042                 unref = ZONE_IS_UNREF(zone);
5043                 refs_have_been_logged = (zone->zone_flags &
5044                     ZF_REFCOUNTS_LOGGED);
5045                 mutex_exit(&zone->zone_lock);
5046                 if (unref) {
5047                         /*
5048                          * There is only one reference to the zone -- that
5049                          * added when the zone was added to the hashtables --
5050                          * and things will remain this way until we drop
5051                          * zonehash_lock... we can go ahead and cleanup the
5052                          * zone.
5053                          */
5054                         break;
5055                 }
5056
5057                 /*
5058                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5059                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5060                  * some zone's general-purpose reference count reaches one.
5061                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5062                  * on zone_destroy_cv, then log the zone's reference counts and
5063                  * continue to wait for zone_rele() and zone_cred_rele().
5064                  */
5065                 if (!refs_have_been_logged) {
5066                         if (!log_refcounts) {
5067                                 /*
5068                                  * This thread hasn't timed out waiting on
5069                                  * zone_destroy_cv yet.  Wait wait_time clock
5070                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5071                                  * seconds) for the zone's references to clear.
5072                                  */
5073                                 ASSERT(wait_time > 0);
5074                                 wait_time = cv_reltimedwait_sig(
5075                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5076                                     TR_SEC);
5077                                 if (wait_time > 0) {
5078                                         /*
5079                                          * A thread in zone_rele() or
5080                                          * zone_cred_rele() signaled
5081                                          * zone_destroy_cv before this thread's
5082                                          * wait timed out.  The zone might have
5083                                          * only one reference left; find out!
5084                                          */
5085                                         continue;
5086                                 } else if (wait_time == 0) {
5087                                         /* The thread's process was signaled. */
5088                                         mutex_exit(&zonehash_lock);
5089                                         return (set_errno(EINTR));
5090                                 }
5091
5092                                 /*
5093                                  * The thread timed out while waiting on
5094                                  * zone_destroy_cv.  Even though the thread
5095                                  * timed out, it has to check whether another
5096                                  * thread woke up from zone_destroy_cv and
5097                                  * destroyed the zone.
5098                                  *
5099                                  * If the zone still exists and has more than
5100                                  * one unreleased general-purpose reference,
5101                                  * then log the zone's reference counts.
5102                                  */
5103                                 log_refcounts = B_TRUE;
5104                                 continue;
5105                         }
5106
5107                         /*
5108                          * The thread already timed out on zone_destroy_cv while
5109                          * waiting for subsystems to release the zone's last
5110                          * general-purpose references.  Log the zone's reference
5111                          * counts and wait indefinitely on zone_destroy_cv.
5112                          */
5113                         zone_log_refcounts(zone);
5114                 }
5115                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5116                         /* The thread's process was signaled. */
5117                         mutex_exit(&zonehash_lock);
5118                         return (set_errno(EINTR));
5119                 }
5120         }
5121
5122         /*
5123          * Remove CPU cap for this zone now since we're not going to
5124          * fail below this point.
5125          */
5126         cpucaps_zone_remove(zone);
5127
5128         /* Get rid of the zone's kstats */
5129         zone_kstat_delete(zone);
5130
5131         /* remove the pfexecd doors */
5132         if (zone->zone_pfexecd != NULL) {
5133                 klpd_freelist(&zone->zone_pfexecd);
5134                 zone->zone_pfexecd = NULL;
5135         }
5136
5137         /* free brand specific data */
5138         if (ZONE_IS_BRANDED(zone))
5139                 ZBROP(zone)->b_free_brand_data(zone);
5140
5141         /* Say goodbye to brand framework. */
5142         brand_unregister_zone(zone->zone_brand);
5143
5144         /*
5145          * It is now safe to let the zone be recreated; remove it from the
5146          * lists.  The memory will not be freed until the last cred
5147          * reference goes away.
5148          */
5149         ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5150         zonecount--;
5151         /* remove from active list and hash tables */
5152         list_remove(&zone_active, zone);
5153         (void) mod_hash_destroy(zonehashbyname,
5154             (mod_hash_key_t)zone->zone_name);
5155         (void) mod_hash_destroy(zonehashbyid,
5156             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5157         mutex_exit(&zonehash_lock);
5158
5159         /*
5160          * Release the root vnode; we're not using it anymore.  Nor should any
5161          * other thread that might access it exist.
5162          */
5163         if (zone->zone_rootvp != NULL) {
5164                 VN_RELE(zone->zone_rootvp);
5165                 zone->zone_rootvp = NULL;
5166         }
5167
5168         /* add to deathrow list */
5169         mutex_enter(&zone_deathrow_lock);
5170         list_insert_tail(&zone_deathrow, zone);
5171         mutex_exit(&zone_deathrow_lock);
5172
5173         /*
5174          * Drop last reference (which was added by zsched()), this will
5175          * free the zone unless there are outstanding cred references.
5176          */
5177         zone_rele(zone);
5178         return (0);
5179 }
5180
5181 /*
5182  * Systemcall entry point for zone_getattr(2).
5183  */
5184 static ssize_t
5185 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5186 {
5187         size_t size;
5188         int error = 0, err;
5189         zone_t *zone;
5190         char *zonepath;
5191         char *outstr;
5192         zone_status_t zone_status;
5193         pid_t initpid;
5194         boolean_t global = (curzone == global_zone);
5195         boolean_t inzone = (curzone->zone_id == zoneid);
5196         ushort_t flags;
5197         zone_net_data_t *zbuf;
5198
5199         mutex_enter(&zonehash_lock);
5200         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5201                 mutex_exit(&zonehash_lock);
5202                 return (set_errno(EINVAL));
5203         }
5204         zone_status = zone_status_get(zone);
5205         if (zone_status < ZONE_IS_INITIALIZED) {
5206                 mutex_exit(&zonehash_lock);
5207                 return (set_errno(EINVAL));
5208         }
5209         zone_hold(zone);
5210         mutex_exit(&zonehash_lock);
5211
5212         /*
5213          * If not in the global zone, don't show information about other zones.
5214          */
5215         if (!zone_list_access(zone)) {
5216                 zone_rele(zone);
5217                 return (set_errno(EINVAL));
5218         }
5219
5220         switch (attr) {
5221         case ZONE_ATTR_ROOT:
5222                 if (global) {
5223                         /*
5224                          * Copy the path to trim the trailing "/" (except for
5225                          * the global zone).
5226                          */
5227                         if (zone != global_zone)
5228                                 size = zone->zone_rootpathlen - 1;
5229                         else
5230                                 size = zone->zone_rootpathlen;
5231                         zonepath = kmem_alloc(size, KM_SLEEP);
5232                         bcopy(zone->zone_rootpath, zonepath, size);
5233                         zonepath[size - 1] = '\0';
5234                 } else {
5235                         if (inzone) {
5236                                 /*
5237                                  * Caller is not in the global zone.  if the
5238                                  * query is on the current zone just return
5239                                  * faked-up path for current zone.
5240                                  */
5241                                 zonepath = "/";
5242                                 size = 2;
5243                         } else {
5244                                 /*
5245                                  * Return related path for current zone.
5246                                  */
5247                                 int prefix_len = strlen(zone_prefix);
5248                                 int zname_len = strlen(zone->zone_name);
5249
5250                                 size = prefix_len + zname_len + 1;
5251                                 zonepath = kmem_alloc(size, KM_SLEEP);
5252                                 bcopy(zone_prefix, zonepath, prefix_len);
5253                                 bcopy(zone->zone_name, zonepath +
5254                                     prefix_len, zname_len);
5255                                 zonepath[size - 1] = '\0';
5256                         }
5257                 }
5258                 if (bufsize > size)
5259                         bufsize = size;
5260                 if (buf != NULL) {
5261                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5262                         if (err != 0 && err != ENAMETOOLONG)
5263                                 error = EFAULT;
5264                 }
5265                 if (global)
5266                         kmem_free(zonepath, size);
5267                 break;
5268
5269         case ZONE_ATTR_NAME:
5270                 size = strlen(zone->zone_name) + 1;
5271                 if (bufsize > size)
5272                         bufsize = size;
5273                 if (buf != NULL) {
5274                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5275                         if (err != 0 && err != ENAMETOOLONG)
5276                                 error = EFAULT;
5277                 }
5278                 break;
5279
5280         case ZONE_ATTR_STATUS:
5281                 /*
5282                  * Since we're not holding zonehash_lock, the zone status
5283                  * may be anything; leave it up to userland to sort it out.
5284                  */
5285                 size = sizeof (zone_status);
5286                 if (bufsize > size)
5287                         bufsize = size;
5288                 zone_status = zone_status_get(zone);
5289                 if (buf != NULL &&
5290                     copyout(&zone_status, buf, bufsize) != 0)
5291                         error = EFAULT;
5292                 break;
5293         case ZONE_ATTR_FLAGS:
5294                 size = sizeof (zone->zone_flags);
5295                 if (bufsize > size)
5296                         bufsize = size;
5297                 flags = zone->zone_flags;
5298                 if (buf != NULL &&
5299                     copyout(&flags, buf, bufsize) != 0)
5300                         error = EFAULT;
5301                 break;
5302         case ZONE_ATTR_PRIVSET:
5303                 size = sizeof (priv_set_t);
5304                 if (bufsize > size)
5305                         bufsize = size;
5306                 if (buf != NULL &&
5307                     copyout(zone->zone_privset, buf, bufsize) != 0)
5308                         error = EFAULT;
5309                 break;
5310         case ZONE_ATTR_UNIQID:
5311                 size = sizeof (zone->zone_uniqid);
5312                 if (bufsize > size)
5313                         bufsize = size;
5314                 if (buf != NULL &&
5315                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5316                         error = EFAULT;
5317                 break;
5318         case ZONE_ATTR_POOLID:
5319                 {
5320                         pool_t *pool;
5321                         poolid_t poolid;
5322
5323                         if (pool_lock_intr() != 0) {
5324                                 error = EINTR;
5325                                 break;
5326                         }
5327                         pool = zone_pool_get(zone);
5328                         poolid = pool->pool_id;
5329                         pool_unlock();
5330                         size = sizeof (poolid);
5331                         if (bufsize > size)
5332                                 bufsize = size;
5333                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5334                                 error = EFAULT;
5335                 }
5336                 break;
5337         case ZONE_ATTR_INITPID:
5338                 size = sizeof (initpid);
5339                 if (bufsize > size)
5340                         bufsize = size;
5341                 initpid = zone->zone_proc_initpid;
5342                 if (initpid == -1) {
5343                         error = ESRCH;
5344                         break;
5345                 }
5346                 if (buf != NULL &&
5347                     copyout(&initpid, buf, bufsize) != 0)
5348                         error = EFAULT;
5349                 break;
5350         case ZONE_ATTR_BRAND:
5351                 size = strlen(zone->zone_brand->b_name) + 1;
5352
5353                 if (bufsize > size)
5354                         bufsize = size;
5355                 if (buf != NULL) {
5356                         err = copyoutstr(zone->zone_brand->b_name, buf,
5357                             bufsize, NULL);
5358                         if (err != 0 && err != ENAMETOOLONG)
5359                                 error = EFAULT;
5360                 }
5361                 break;
5362         case ZONE_ATTR_INITNAME:
5363                 size = strlen(zone->zone_initname) + 1;
5364                 if (bufsize > size)
5365                         bufsize = size;
5366                 if (buf != NULL) {
5367                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5368                             NULL);
5369                         if (err != 0 && err != ENAMETOOLONG)
5370                                 error = EFAULT;
5371                 }
5372                 break;
5373         case ZONE_ATTR_BOOTARGS:
5374                 if (zone->zone_bootargs == NULL)
5375                         outstr = "";
5376                 else
5377                         outstr = zone->zone_bootargs;
5378                 size = strlen(outstr) + 1;
5379                 if (bufsize > size)
5380                         bufsize = size;
5381                 if (buf != NULL) {
5382                         err = copyoutstr(outstr, buf, bufsize, NULL);
5383                         if (err != 0 && err != ENAMETOOLONG)
5384                                 error = EFAULT;
5385                 }
5386                 break;
5387         case ZONE_ATTR_PHYS_MCAP:
5388                 size = sizeof (zone->zone_phys_mcap);
5389                 if (bufsize > size)
5390                         bufsize = size;
5391                 if (buf != NULL &&
5392                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5393                         error = EFAULT;
5394                 break;
5395         case ZONE_ATTR_SCHED_CLASS:
5396                 mutex_enter(&class_lock);
5397
5398                 if (zone->zone_defaultcid >= loaded_classes)
5399                         outstr = "";
5400                 else
5401                         outstr = sclass[zone->zone_defaultcid].cl_name;
5402                 size = strlen(outstr) + 1;
5403                 if (bufsize > size)
5404                         bufsize = size;
5405                 if (buf != NULL) {
5406                         err = copyoutstr(outstr, buf, bufsize, NULL);
5407                         if (err != 0 && err != ENAMETOOLONG)
5408                                 error = EFAULT;
5409                 }
5410
5411                 mutex_exit(&class_lock);
5412                 break;
5413         case ZONE_ATTR_HOSTID:
5414                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5415                     bufsize == sizeof (zone->zone_hostid)) {
5416                         size = sizeof (zone->zone_hostid);
5417                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5418                             bufsize) != 0)
5419                                 error = EFAULT;
5420                 } else {
5421                         error = EINVAL;
5422                 }
5423                 break;
5424         case ZONE_ATTR_FS_ALLOWED:
5425                 if (zone->zone_fs_allowed == NULL)
5426                         outstr = "";
5427                 else
5428                         outstr = zone->zone_fs_allowed;
5429                 size = strlen(outstr) + 1;
5430                 if (bufsize > size)
5431                         bufsize = size;
5432                 if (buf != NULL) {
5433                         err = copyoutstr(outstr, buf, bufsize, NULL);
5434                         if (err != 0 && err != ENAMETOOLONG)
5435                                 error = EFAULT;
5436                 }
5437                 break;
5438         case ZONE_ATTR_SECFLAGS:
5439                 size = sizeof (zone->zone_secflags);
5440                 if (bufsize > size)
5441                         bufsize = size;
5442                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5443                         error = EFAULT;
5444                 break;
5445         case ZONE_ATTR_NETWORK:
5446                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5447                 size = bufsize;
5448                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5449                 if (copyin(buf, zbuf, bufsize) != 0) {
5450                         error = EFAULT;
5451                 } else {
5452                         error = zone_get_network(zoneid, zbuf);
5453                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5454                                 error = EFAULT;
5455                 }
5456                 kmem_free(zbuf, bufsize);
5457                 break;
5458         default:
5459                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5460                         size = bufsize;
5461                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5462                 } else {
5463                         error = EINVAL;
5464                 }
5465         }
5466         zone_rele(zone);
5467
5468         if (error)
5469                 return (set_errno(error));
5470         return ((ssize_t)size);
5471 }
5472
5473 /*
5474  * Systemcall entry point for zone_setattr(2).
5475  */
5476 /*ARGSUSED*/
5477 static int
5478 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5479 {
5480         zone_t *zone;
5481         zone_status_t zone_status;
5482         int err = -1;
5483         zone_net_data_t *zbuf;
5484
5485         if (secpolicy_zone_config(CRED()) != 0)
5486                 return (set_errno(EPERM));
5487
5488         /*
5489          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5490          * global zone.
5491          */
5492         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5493                 return (set_errno(EINVAL));
5494         }
5495
5496         mutex_enter(&zonehash_lock);
5497         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5498                 mutex_exit(&zonehash_lock);
5499                 return (set_errno(EINVAL));
5500         }
5501         zone_hold(zone);
5502         mutex_exit(&zonehash_lock);
5503
5504         /*
5505          * At present most attributes can only be set on non-running,
5506          * non-global zones.
5507          */
5508         zone_status = zone_status_get(zone);
5509         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5510                 err = EINVAL;
5511                 goto done;
5512         }
5513
5514         switch (attr) {
5515         case ZONE_ATTR_INITNAME:
5516                 err = zone_set_initname(zone, (const char *)buf);
5517                 break;
5518         case ZONE_ATTR_INITNORESTART:
5519                 zone->zone_restart_init = B_FALSE;
5520                 err = 0;
5521                 break;
5522         case ZONE_ATTR_BOOTARGS:
5523                 err = zone_set_bootargs(zone, (const char *)buf);
5524                 break;
5525         case ZONE_ATTR_BRAND:
5526                 err = zone_set_brand(zone, (const char *)buf);
5527                 break;
5528         case ZONE_ATTR_FS_ALLOWED:
5529                 err = zone_set_fs_allowed(zone, (const char *)buf);
5530                 break;
5531         case ZONE_ATTR_SECFLAGS:
5532                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5533                 break;
5534         case ZONE_ATTR_PHYS_MCAP:
5535                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5536                 break;
5537         case ZONE_ATTR_SCHED_CLASS:
5538                 err = zone_set_sched_class(zone, (const char *)buf);
5539                 break;
5540         case ZONE_ATTR_HOSTID:
5541                 if (bufsize == sizeof (zone->zone_hostid)) {
5542                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5543                                 err = 0;
5544                         else
5545                                 err = EFAULT;
5546                 } else {
5547                         err = EINVAL;
5548                 }
5549                 break;
5550         case ZONE_ATTR_NETWORK:
5551                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5552                         err = EINVAL;
5553                         break;
5554                 }
5555                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5556                 if (copyin(buf, zbuf, bufsize) != 0) {
5557                         kmem_free(zbuf, bufsize);
5558                         err = EFAULT;
5559                         break;
5560                 }
5561                 err = zone_set_network(zoneid, zbuf);
5562                 kmem_free(zbuf, bufsize);
5563                 break;
5564         default:
5565                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5566                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5567                 else
5568                         err = EINVAL;
5569         }
5570
5571 done:
5572         zone_rele(zone);
5573         ASSERT(err != -1);
5574         return (err != 0 ? set_errno(err) : 0);
5575 }
5576
5577 /*
5578  * Return zero if the process has at least one vnode mapped in to its
5579  * address space which shouldn't be allowed to change zones.
5580  *
5581  * Also return zero if the process has any shared mappings which reserve
5582  * swap.  This is because the counting for zone.max-swap does not allow swap
5583  * reservation to be shared between zones.  zone swap reservation is counted
5584  * on zone->zone_max_swap.
5585  */
5586 static int
5587 as_can_change_zones(void)
5588 {
5589         proc_t *pp = curproc;
5590         struct seg *seg;
5591         struct as *as = pp->p_as;
5592         vnode_t *vp;
5593         int allow = 1;
5594
5595         ASSERT(pp->p_as != &kas);
5596         AS_LOCK_ENTER(as, RW_READER);
5597         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5598
5599                 /*
5600                  * Cannot enter zone with shared anon memory which
5601                  * reserves swap.  See comment above.
5602                  */
5603                 if (seg_can_change_zones(seg) == B_FALSE) {
5604                         allow = 0;
5605                         break;
5606                 }
5607                 /*
5608                  * if we can't get a backing vnode for this segment then skip
5609                  * it.
5610                  */
5611                 vp = NULL;
5612                 if (segop_getvp(seg, seg->s_base, &vp) != 0 || vp == NULL)
5613                         continue;
5614                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5615                         allow = 0;
5616                         break;
5617                 }
5618         }
5619         AS_LOCK_EXIT(as);
5620         return (allow);
5621 }
5622
5623 /*
5624  * Count swap reserved by curproc's address space
5625  */
5626 static size_t
5627 as_swresv(void)
5628 {
5629         proc_t *pp = curproc;
5630         struct seg *seg;
5631         struct as *as = pp->p_as;
5632         size_t swap = 0;
5633
5634         ASSERT(pp->p_as != &kas);
5635         ASSERT(AS_WRITE_HELD(as));
5636         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5637                 swap += seg_swresv(seg);
5638
5639         return (swap);
5640 }
5641
5642 /*
5643  * Systemcall entry point for zone_enter().
5644  *
5645  * The current process is injected into said zone.  In the process
5646  * it will change its project membership, privileges, rootdir/cwd,
5647  * zone-wide rctls, and pool association to match those of the zone.
5648  *
5649  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5650  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5651  * enter a zone that is "ready" or "running".
5652  */
5653 static int
5654 zone_enter(zoneid_t zoneid)
5655 {
5656         zone_t *zone;
5657         vnode_t *vp;
5658         proc_t *pp = curproc;
5659         contract_t *ct;
5660         cont_process_t *ctp;
5661         task_t *tk, *oldtk;
5662         kproject_t *zone_proj0;
5663         cred_t *cr, *newcr;
5664         pool_t *oldpool, *newpool;
5665         sess_t *sp;
5666         uid_t uid;
5667         zone_status_t status;
5668         int err = 0;
5669         rctl_entity_p_t e;
5670         size_t swap;
5671         kthread_id_t t;
5672
5673         if (secpolicy_zone_config(CRED()) != 0)
5674                 return (set_errno(EPERM));
5675         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5676                 return (set_errno(EINVAL));
5677
5678         /*
5679          * Stop all lwps so we don't need to hold a lock to look at
5680          * curproc->p_zone.  This needs to happen before we grab any
5681          * locks to avoid deadlock (another lwp in the process could
5682          * be waiting for the held lock).
5683          */
5684         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5685                 return (set_errno(EINTR));
5686
5687         /*
5688          * Make sure we're not changing zones with files open or mapped in
5689          * to our address space which shouldn't be changing zones.
5690          */
5691         if (!files_can_change_zones()) {
5692                 err = EBADF;
5693                 goto out;
5694         }
5695         if (!as_can_change_zones()) {
5696                 err = EFAULT;
5697                 goto out;
5698         }
5699
5700         mutex_enter(&zonehash_lock);
5701         if (pp->p_zone != global_zone) {
5702                 mutex_exit(&zonehash_lock);
5703                 err = EINVAL;
5704                 goto out;
5705         }
5706
5707         zone = zone_find_all_by_id(zoneid);
5708         if (zone == NULL) {
5709                 mutex_exit(&zonehash_lock);
5710                 err = EINVAL;
5711                 goto out;
5712         }
5713
5714         /*
5715          * To prevent processes in a zone from holding contracts on
5716          * extrazonal resources, and to avoid process contract
5717          * memberships which span zones, contract holders and processes
5718          * which aren't the sole members of their encapsulating process
5719          * contracts are not allowed to zone_enter.
5720          */
5721         ctp = pp->p_ct_process;
5722         ct = &ctp->conp_contract;
5723         mutex_enter(&ct->ct_lock);
5724         mutex_enter(&pp->p_lock);
5725         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5726                 mutex_exit(&pp->p_lock);
5727                 mutex_exit(&ct->ct_lock);
5728                 mutex_exit(&zonehash_lock);
5729                 err = EINVAL;
5730                 goto out;
5731         }
5732
5733         /*
5734          * Moreover, we don't allow processes whose encapsulating
5735          * process contracts have inherited extrazonal contracts.
5736          * While it would be easier to eliminate all process contracts
5737          * with inherited contracts, we need to be able to give a
5738          * restarted init (or other zone-penetrating process) its
5739          * predecessor's contracts.
5740          */
5741         if (ctp->conp_ninherited != 0) {
5742                 contract_t *next;
5743                 for (next = list_head(&ctp->conp_inherited); next;
5744                     next = list_next(&ctp->conp_inherited, next)) {
5745                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5746                                 mutex_exit(&pp->p_lock);
5747                                 mutex_exit(&ct->ct_lock);
5748                                 mutex_exit(&zonehash_lock);
5749                                 err = EINVAL;
5750                                 goto out;
5751                         }
5752                 }
5753         }
5754
5755         mutex_exit(&pp->p_lock);
5756         mutex_exit(&ct->ct_lock);
5757
5758         status = zone_status_get(zone);
5759         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5760                 /*
5761                  * Can't join
5762                  */
5763                 mutex_exit(&zonehash_lock);
5764                 err = EINVAL;
5765                 goto out;
5766         }
5767
5768         /*
5769          * Make sure new priv set is within the permitted set for caller
5770          */
5771         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5772                 mutex_exit(&zonehash_lock);
5773                 err = EPERM;
5774                 goto out;
5775         }
5776         /*
5777          * We want to momentarily drop zonehash_lock while we optimistically
5778          * bind curproc to the pool it should be running in.  This is safe
5779          * since the zone can't disappear (we have a hold on it).
5780          */
5781         zone_hold(zone);
5782         mutex_exit(&zonehash_lock);
5783
5784         /*
5785          * Grab pool_lock to keep the pools configuration from changing
5786          * and to stop ourselves from getting rebound to another pool
5787          * until we join the zone.
5788          */
5789         if (pool_lock_intr() != 0) {
5790                 zone_rele(zone);
5791                 err = EINTR;
5792                 goto out;
5793         }
5794         ASSERT(secpolicy_pool(CRED()) == 0);
5795         /*
5796          * Bind ourselves to the pool currently associated with the zone.
5797          */
5798         oldpool = curproc->p_pool;
5799         newpool = zone_pool_get(zone);
5800         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5801             (err = pool_do_bind(newpool, P_PID, P_MYID,
5802             POOL_BIND_ALL)) != 0) {
5803                 pool_unlock();
5804                 zone_rele(zone);
5805                 goto out;
5806         }
5807
5808         /*
5809          * Grab cpu_lock now; we'll need it later when we call
5810          * task_join().
5811          */
5812         mutex_enter(&cpu_lock);
5813         mutex_enter(&zonehash_lock);
5814         /*
5815          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5816          */
5817         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5818                 /*
5819                  * Can't join anymore.
5820                  */
5821                 mutex_exit(&zonehash_lock);
5822                 mutex_exit(&cpu_lock);
5823                 if (pool_state == POOL_ENABLED &&
5824                     newpool != oldpool)
5825                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5826                             POOL_BIND_ALL);
5827                 pool_unlock();
5828                 zone_rele(zone);
5829                 err = EINVAL;
5830                 goto out;
5831         }
5832
5833         /*
5834          * a_lock must be held while transferring locked memory and swap
5835          * reservation from the global zone to the non global zone because
5836          * asynchronous faults on the processes' address space can lock
5837          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5838          * segments respectively.
5839          */
5840         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
5841         swap = as_swresv();
5842         mutex_enter(&pp->p_lock);
5843         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5844         /* verify that we do not exceed any task or lwp limits */
5845         mutex_enter(&zone->zone_nlwps_lock);
5846         /* add new lwps to zone and zone's proj0 */
5847         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5848         zone->zone_nlwps += pp->p_lwpcnt;
5849         /* add 1 task to zone's proj0 */
5850         zone_proj0->kpj_ntasks += 1;
5851
5852         zone_proj0->kpj_nprocs++;
5853         zone->zone_nprocs++;
5854         mutex_exit(&zone->zone_nlwps_lock);
5855
5856         mutex_enter(&zone->zone_mem_lock);
5857         zone->zone_locked_mem += pp->p_locked_mem;
5858         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5859         zone->zone_max_swap += swap;
5860         mutex_exit(&zone->zone_mem_lock);
5861
5862         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5863         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5864         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5865
5866         /* remove lwps and process from proc's old zone and old project */
5867         mutex_enter(&pp->p_zone->zone_nlwps_lock);
5868         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5869         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5870         pp->p_task->tk_proj->kpj_nprocs--;
5871         pp->p_zone->zone_nprocs--;
5872         mutex_exit(&pp->p_zone->zone_nlwps_lock);
5873
5874         mutex_enter(&pp->p_zone->zone_mem_lock);
5875         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5876         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5877         pp->p_zone->zone_max_swap -= swap;
5878         mutex_exit(&pp->p_zone->zone_mem_lock);
5879
5880         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5881         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5882         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5883
5884         pp->p_flag |= SZONETOP;
5885         pp->p_zone = zone;
5886         mutex_exit(&pp->p_lock);
5887         AS_LOCK_EXIT(pp->p_as);
5888
5889         /*
5890          * Joining the zone cannot fail from now on.
5891          *
5892          * This means that a lot of the following code can be commonized and
5893          * shared with zsched().
5894          */
5895
5896         /*
5897          * If the process contract fmri was inherited, we need to
5898          * flag this so that any contract status will not leak
5899          * extra zone information, svc_fmri in this case
5900          */
5901         if (ctp->conp_svc_ctid != ct->ct_id) {
5902                 mutex_enter(&ct->ct_lock);
5903                 ctp->conp_svc_zone_enter = ct->ct_id;
5904                 mutex_exit(&ct->ct_lock);
5905         }
5906
5907         /*
5908          * Reset the encapsulating process contract's zone.
5909          */
5910         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5911         contract_setzuniqid(ct, zone->zone_uniqid);
5912
5913         /*
5914          * Create a new task and associate the process with the project keyed
5915          * by (projid,zoneid).
5916          *
5917          * We might as well be in project 0; the global zone's projid doesn't
5918          * make much sense in a zone anyhow.
5919          *
5920          * This also increments zone_ntasks, and returns with p_lock held.
5921          */
5922         tk = task_create(0, zone);
5923         oldtk = task_join(tk, 0);
5924         mutex_exit(&cpu_lock);
5925
5926         /*
5927          * call RCTLOP_SET functions on this proc
5928          */
5929         e.rcep_p.zone = zone;
5930         e.rcep_t = RCENTITY_ZONE;
5931         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5932             RCD_CALLBACK);
5933         mutex_exit(&pp->p_lock);
5934
5935         /*
5936          * We don't need to hold any of zsched's locks here; not only do we know
5937          * the process and zone aren't going away, we know its session isn't
5938          * changing either.
5939          *
5940          * By joining zsched's session here, we mimic the behavior in the
5941          * global zone of init's sid being the pid of sched.  We extend this
5942          * to all zlogin-like zone_enter()'ing processes as well.
5943          */
5944         mutex_enter(&pidlock);
5945         sp = zone->zone_zsched->p_sessp;
5946         sess_hold(zone->zone_zsched);
5947         mutex_enter(&pp->p_lock);
5948         pgexit(pp);
5949         sess_rele(pp->p_sessp, B_TRUE);
5950         pp->p_sessp = sp;
5951         pgjoin(pp, zone->zone_zsched->p_pidp);
5952
5953         /*
5954          * If any threads are scheduled to be placed on zone wait queue they
5955          * should abandon the idea since the wait queue is changing.
5956          * We need to be holding pidlock & p_lock to do this.
5957          */
5958         if ((t = pp->p_tlist) != NULL) {
5959                 do {
5960                         thread_lock(t);
5961                         /*
5962                          * Kick this thread so that it doesn't sit
5963                          * on a wrong wait queue.
5964                          */
5965                         if (ISWAITING(t))
5966                                 setrun_locked(t);
5967
5968                         if (t->t_schedflag & TS_ANYWAITQ)
5969                                 t->t_schedflag &= ~ TS_ANYWAITQ;
5970
5971                         thread_unlock(t);
5972                 } while ((t = t->t_forw) != pp->p_tlist);
5973         }
5974
5975         /*
5976          * If there is a default scheduling class for the zone and it is not
5977          * the class we are currently in, change all of the threads in the
5978          * process to the new class.  We need to be holding pidlock & p_lock
5979          * when we call parmsset so this is a good place to do it.
5980          */
5981         if (zone->zone_defaultcid > 0 &&
5982             zone->zone_defaultcid != curthread->t_cid) {
5983                 pcparms_t pcparms;
5984
5985                 pcparms.pc_cid = zone->zone_defaultcid;
5986                 pcparms.pc_clparms[0] = 0;
5987
5988                 /*
5989                  * If setting the class fails, we still want to enter the zone.
5990                  */
5991                 if ((t = pp->p_tlist) != NULL) {
5992                         do {
5993                                 (void) parmsset(&pcparms, t);
5994                         } while ((t = t->t_forw) != pp->p_tlist);
5995                 }
5996         }
5997
5998         mutex_exit(&pp->p_lock);
5999         mutex_exit(&pidlock);
6000
6001         mutex_exit(&zonehash_lock);
6002         /*
6003          * We're firmly in the zone; let pools progress.
6004          */
6005         pool_unlock();
6006         task_rele(oldtk);
6007         /*
6008          * We don't need to retain a hold on the zone since we already
6009          * incremented zone_ntasks, so the zone isn't going anywhere.
6010          */
6011         zone_rele(zone);
6012
6013         /*
6014          * Chroot
6015          */
6016         vp = zone->zone_rootvp;
6017         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6018         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6019
6020         /*
6021          * Change process security flags.  Note that the _effective_ flags
6022          * cannot change
6023          */
6024         secflags_copy(&pp->p_secflags.psf_lower,
6025             &zone->zone_secflags.psf_lower);
6026         secflags_copy(&pp->p_secflags.psf_upper,
6027             &zone->zone_secflags.psf_upper);
6028         secflags_copy(&pp->p_secflags.psf_inherit,
6029             &zone->zone_secflags.psf_inherit);
6030
6031         /*
6032          * Change process credentials
6033          */
6034         newcr = cralloc();
6035         mutex_enter(&pp->p_crlock);
6036         cr = pp->p_cred;
6037         crcopy_to(cr, newcr);
6038         crsetzone(newcr, zone);
6039         pp->p_cred = newcr;
6040
6041         /*
6042          * Restrict all process privilege sets to zone limit
6043          */
6044         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6045         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6046         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6047         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6048         mutex_exit(&pp->p_crlock);
6049         crset(pp, newcr);
6050
6051         /*
6052          * Adjust upcount to reflect zone entry.
6053          */
6054         uid = crgetruid(newcr);
6055         mutex_enter(&pidlock);
6056         upcount_dec(uid, GLOBAL_ZONEID);
6057         upcount_inc(uid, zoneid);
6058         mutex_exit(&pidlock);
6059
6060         /*
6061          * Set up core file path and content.
6062          */
6063         set_core_defaults();
6064
6065 out:
6066         /*
6067          * Let the other lwps continue.
6068          */
6069         mutex_enter(&pp->p_lock);
6070         if (curthread != pp->p_agenttp)
6071                 continuelwps(pp);
6072         mutex_exit(&pp->p_lock);
6073
6074         return (err != 0 ? set_errno(err) : 0);
6075 }
6076
6077 /*
6078  * Systemcall entry point for zone_list(2).
6079  *
6080  * Processes running in a (non-global) zone only see themselves.
6081  */
6082 static int
6083 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6084 {
6085         zoneid_t *zoneids;
6086         zone_t *zone, *myzone;
6087         uint_t user_nzones, real_nzones;
6088         uint_t domi_nzones;
6089         int error;
6090
6091         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6092                 return (set_errno(EFAULT));
6093
6094         myzone = curproc->p_zone;
6095         if (myzone != global_zone) {
6096                 /* just return current zone */
6097                 real_nzones = domi_nzones = 1;
6098                 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6099                 zoneids[0] = myzone->zone_id;
6100         } else {
6101                 mutex_enter(&zonehash_lock);
6102                 real_nzones = zonecount;
6103                 domi_nzones = 0;
6104                 if (real_nzones > 0) {
6105                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6106                             KM_SLEEP);
6107                         for (zone = list_head(&zone_active); zone != NULL;
6108                             zone = list_next(&zone_active, zone))
6109                                 zoneids[domi_nzones++] = zone->zone_id;
6110                         ASSERT(domi_nzones == real_nzones);
6111                 }
6112                 mutex_exit(&zonehash_lock);
6113         }
6114
6115         /*
6116          * If user has allocated space for fewer entries than we found, then
6117          * return only up to their limit.  Either way, tell them exactly how
6118          * many we found.
6119          */
6120         if (domi_nzones < user_nzones)
6121                 user_nzones = domi_nzones;
6122         error = 0;
6123         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6124                 error = EFAULT;
6125         } else if (zoneidlist != NULL && user_nzones != 0) {
6126                 if (copyout(zoneids, zoneidlist,
6127                     user_nzones * sizeof (zoneid_t)) != 0)
6128                         error = EFAULT;
6129         }
6130
6131         if (real_nzones > 0)
6132                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6133
6134         if (error != 0)
6135                 return (set_errno(error));
6136         else
6137                 return (0);
6138 }
6139
6140 /*
6141  * Systemcall entry point for zone_lookup(2).
6142  *
6143  * Non-global zones are only able to see themselves.
6144  */
6145 static zoneid_t
6146 zone_lookup(const char *zone_name)
6147 {
6148         char *kname;
6149         zone_t *zone;
6150         zoneid_t zoneid;
6151         int err;
6152
6153         if (zone_name == NULL) {
6154                 /* return caller's zone id */
6155                 return (getzoneid());
6156         }
6157
6158         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6159         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6160                 kmem_free(kname, ZONENAME_MAX);
6161                 return (set_errno(err));
6162         }
6163
6164         mutex_enter(&zonehash_lock);
6165         zone = zone_find_all_by_name(kname);
6166         kmem_free(kname, ZONENAME_MAX);
6167         /* In a non-global zone, can only lookup global and own name. */
6168         if (zone == NULL ||
6169             zone_status_get(zone) < ZONE_IS_READY ||
6170             !zone_list_access(zone)) {
6171                 mutex_exit(&zonehash_lock);
6172                 return (set_errno(EINVAL));
6173         } else {
6174                 zoneid = zone->zone_id;
6175                 mutex_exit(&zonehash_lock);
6176                 return (zoneid);
6177         }
6178 }
6179
6180 static int
6181 zone_version(int *version_arg)
6182 {
6183         int version = ZONE_SYSCALL_API_VERSION;
6184
6185         if (copyout(&version, version_arg, sizeof (int)) != 0)
6186                 return (set_errno(EFAULT));
6187         return (0);
6188 }
6189
6190 /* ARGSUSED */
6191 long
6192 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6193 {
6194         zone_def zs;
6195         int err;
6196
6197         switch (cmd) {
6198         case ZONE_CREATE:
6199                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6200                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6201                                 return (set_errno(EFAULT));
6202                         }
6203                 } else {
6204 #ifdef _SYSCALL32_IMPL
6205                         zone_def32 zs32;
6206
6207                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6208                                 return (set_errno(EFAULT));
6209                         }
6210                         zs.zone_name =
6211                             (const char *)(unsigned long)zs32.zone_name;
6212                         zs.zone_root =
6213                             (const char *)(unsigned long)zs32.zone_root;
6214                         zs.zone_privs =
6215                             (const struct priv_set *)
6216                             (unsigned long)zs32.zone_privs;
6217                         zs.zone_privssz = zs32.zone_privssz;
6218                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6219                         zs.rctlbufsz = zs32.rctlbufsz;
6220                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6221                         zs.zfsbufsz = zs32.zfsbufsz;
6222                         zs.extended_error =
6223                             (int *)(unsigned long)zs32.extended_error;
6224                         zs.flags = zs32.flags;
6225 #else
6226                         panic("get_udatamodel() returned bogus result\n");
6227 #endif
6228                 }
6229
6230                 return (zone_create(zs.zone_name, zs.zone_root,
6231                     zs.zone_privs, zs.zone_privssz,
6232                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6233                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6234                     zs.extended_error, zs.flags));
6235         case ZONE_BOOT:
6236                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6237         case ZONE_DESTROY:
6238                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6239         case ZONE_GETATTR:
6240                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6241                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6242         case ZONE_SETATTR:
6243                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6244                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6245         case ZONE_ENTER:
6246                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6247         case ZONE_LIST:
6248                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6249         case ZONE_SHUTDOWN:
6250                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6251         case ZONE_LOOKUP:
6252                 return (zone_lookup((const char *)arg1));
6253         case ZONE_VERSION:
6254                 return (zone_version((int *)arg1));
6255         case ZONE_ADD_DATALINK:
6256                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6257                     (datalink_id_t)(uintptr_t)arg2));
6258         case ZONE_DEL_DATALINK:
6259                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6260                     (datalink_id_t)(uintptr_t)arg2));
6261         case ZONE_CHECK_DATALINK: {
6262                 zoneid_t        zoneid;
6263                 boolean_t       need_copyout;
6264
6265                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6266                         return (EFAULT);
6267                 need_copyout = (zoneid == ALL_ZONES);
6268                 err = zone_check_datalink(&zoneid,
6269                     (datalink_id_t)(uintptr_t)arg2);
6270                 if (err == 0 && need_copyout) {
6271                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6272                                 err = EFAULT;
6273                 }
6274                 return (err == 0 ? 0 : set_errno(err));
6275         }
6276         case ZONE_LIST_DATALINK:
6277                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6278                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6279         default:
6280                 return (set_errno(EINVAL));
6281         }
6282 }
6283
6284 struct zarg {
6285         zone_t *zone;
6286         zone_cmd_arg_t arg;
6287 };
6288
6289 static int
6290 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6291 {
6292         char *buf;
6293         size_t buflen;
6294         int error;
6295
6296         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6297         buf = kmem_alloc(buflen, KM_SLEEP);
6298         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6299         error = door_ki_open(buf, doorp);
6300         kmem_free(buf, buflen);
6301         return (error);
6302 }
6303
6304 static void
6305 zone_release_door(door_handle_t *doorp)
6306 {
6307         door_ki_rele(*doorp);
6308         *doorp = NULL;
6309 }
6310
6311 static void
6312 zone_ki_call_zoneadmd(struct zarg *zargp)
6313 {
6314         door_handle_t door = NULL;
6315         door_arg_t darg, save_arg;
6316         char *zone_name;
6317         size_t zone_namelen;
6318         zoneid_t zoneid;
6319         zone_t *zone;
6320         zone_cmd_arg_t arg;
6321         uint64_t uniqid;
6322         size_t size;
6323         int error;
6324         int retry;
6325
6326         zone = zargp->zone;
6327         arg = zargp->arg;
6328         kmem_free(zargp, sizeof (*zargp));
6329
6330         zone_namelen = strlen(zone->zone_name) + 1;
6331         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6332         bcopy(zone->zone_name, zone_name, zone_namelen);
6333         zoneid = zone->zone_id;
6334         uniqid = zone->zone_uniqid;
6335         /*
6336          * zoneadmd may be down, but at least we can empty out the zone.
6337          * We can ignore the return value of zone_empty() since we're called
6338          * from a kernel thread and know we won't be delivered any signals.
6339          */
6340         ASSERT(curproc == &p0);
6341         (void) zone_empty(zone);
6342         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6343         zone_rele(zone);
6344
6345         size = sizeof (arg);
6346         darg.rbuf = (char *)&arg;
6347         darg.data_ptr = (char *)&arg;
6348         darg.rsize = size;
6349         darg.data_size = size;
6350         darg.desc_ptr = NULL;
6351         darg.desc_num = 0;
6352
6353         save_arg = darg;
6354         /*
6355          * Since we're not holding a reference to the zone, any number of
6356          * things can go wrong, including the zone disappearing before we get a
6357          * chance to talk to zoneadmd.
6358          */
6359         for (retry = 0; /* forever */; retry++) {
6360                 if (door == NULL &&
6361                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6362                         goto next;
6363                 }
6364                 ASSERT(door != NULL);
6365
6366                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6367                     SIZE_MAX, 0)) == 0) {
6368                         break;
6369                 }
6370                 switch (error) {
6371                 case EINTR:
6372                         /* FALLTHROUGH */
6373                 case EAGAIN:    /* process may be forking */
6374                         /*
6375                          * Back off for a bit
6376                          */
6377                         break;
6378                 case EBADF:
6379                         zone_release_door(&door);
6380                         if (zone_lookup_door(zone_name, &door) != 0) {
6381                                 /*
6382                                  * zoneadmd may be dead, but it may come back to
6383                                  * life later.
6384                                  */
6385                                 break;
6386                         }
6387                         break;
6388                 default:
6389                         cmn_err(CE_WARN,
6390                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6391                             error);
6392                         goto out;
6393                 }
6394 next:
6395                 /*
6396                  * If this isn't the same zone_t that we originally had in mind,
6397                  * then this is the same as if two kadmin requests come in at
6398                  * the same time: the first one wins.  This means we lose, so we
6399                  * bail.
6400                  */
6401                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6402                         /*
6403                          * Problem is solved.
6404                          */
6405                         break;
6406                 }
6407                 if (zone->zone_uniqid != uniqid) {
6408                         /*
6409                          * zoneid recycled
6410                          */
6411                         zone_rele(zone);
6412                         break;
6413                 }
6414                 /*
6415                  * We could zone_status_timedwait(), but there doesn't seem to
6416                  * be much point in doing that (plus, it would mean that
6417                  * zone_free() isn't called until this thread exits).
6418                  */
6419                 zone_rele(zone);
6420                 ddi_sleep(1);
6421                 darg = save_arg;
6422         }
6423 out:
6424         if (door != NULL) {
6425                 zone_release_door(&door);
6426         }
6427         kmem_free(zone_name, zone_namelen);
6428         thread_exit();
6429 }
6430
6431 /*
6432  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6433  * kadmin().  The caller is a process in the zone.
6434  *
6435  * In order to shutdown the zone, we will hand off control to zoneadmd
6436  * (running in the global zone) via a door.  We do a half-hearted job at
6437  * killing all processes in the zone, create a kernel thread to contact
6438  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6439  * a form of generation number used to let zoneadmd (as well as
6440  * zone_destroy()) know exactly which zone they're re talking about.
6441  */
6442 int
6443 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6444 {
6445         struct zarg *zargp;
6446         zone_cmd_t zcmd;
6447         zone_t *zone;
6448
6449         zone = curproc->p_zone;
6450         ASSERT(getzoneid() != GLOBAL_ZONEID);
6451
6452         switch (cmd) {
6453         case A_SHUTDOWN:
6454                 switch (fcn) {
6455                 case AD_HALT:
6456                 case AD_POWEROFF:
6457                         zcmd = Z_HALT;
6458                         break;
6459                 case AD_BOOT:
6460                         zcmd = Z_REBOOT;
6461                         break;
6462                 case AD_IBOOT:
6463                 case AD_SBOOT:
6464                 case AD_SIBOOT:
6465                 case AD_NOSYNC:
6466                         return (ENOTSUP);
6467                 default:
6468                         return (EINVAL);
6469                 }
6470                 break;
6471         case A_REBOOT:
6472                 zcmd = Z_REBOOT;
6473                 break;
6474         case A_FTRACE:
6475         case A_REMOUNT:
6476         case A_FREEZE:
6477         case A_DUMP:
6478         case A_CONFIG:
6479                 return (ENOTSUP);
6480         default:
6481                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6482                 return (EINVAL);
6483         }
6484
6485         if (secpolicy_zone_admin(credp, B_FALSE))
6486                 return (EPERM);
6487         mutex_enter(&zone_status_lock);
6488
6489         /*
6490          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6491          * is in the zone.
6492          */
6493         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6494         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6495                 /*
6496                  * This zone is already on its way down.
6497                  */
6498                 mutex_exit(&zone_status_lock);
6499                 return (0);
6500         }
6501         /*
6502          * Prevent future zone_enter()s
6503          */
6504         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6505         mutex_exit(&zone_status_lock);
6506
6507         /*
6508          * Kill everyone now and call zoneadmd later.
6509          * zone_ki_call_zoneadmd() will do a more thorough job of this
6510          * later.
6511          */
6512         killall(zone->zone_id);
6513         /*
6514          * Now, create the thread to contact zoneadmd and do the rest of the
6515          * work.  This thread can't be created in our zone otherwise
6516          * zone_destroy() would deadlock.
6517          */
6518         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6519         zargp->arg.cmd = zcmd;
6520         zargp->arg.uniqid = zone->zone_uniqid;
6521         zargp->zone = zone;
6522         (void) strcpy(zargp->arg.locale, "C");
6523         /* mdep was already copied in for us by uadmin */
6524         if (mdep != NULL)
6525                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6526                     sizeof (zargp->arg.bootbuf));
6527         zone_hold(zone);
6528
6529         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6530             TS_RUN, minclsyspri);
6531         exit(CLD_EXITED, 0);
6532
6533         return (EINVAL);
6534 }
6535
6536 /*
6537  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6538  * status to ZONE_IS_SHUTTING_DOWN.
6539  *
6540  * This function also shuts down all running zones to ensure that they won't
6541  * fork new processes.
6542  */
6543 void
6544 zone_shutdown_global(void)
6545 {
6546         zone_t *current_zonep;
6547
6548         ASSERT(INGLOBALZONE(curproc));
6549         mutex_enter(&zonehash_lock);
6550         mutex_enter(&zone_status_lock);
6551
6552         /* Modify the global zone's status first. */
6553         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6554         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6555
6556         /*
6557          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6558          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6559          * could cause assertions to fail (e.g., assertions about a zone's
6560          * state during initialization, readying, or booting) or produce races.
6561          * We'll let threads continue to initialize and ready new zones: they'll
6562          * fail to boot the new zones when they see that the global zone is
6563          * shutting down.
6564          */
6565         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6566             current_zonep = list_next(&zone_active, current_zonep)) {
6567                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6568                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6569         }
6570         mutex_exit(&zone_status_lock);
6571         mutex_exit(&zonehash_lock);
6572 }
6573
6574 /*
6575  * Returns true if the named dataset is visible in the current zone.
6576  * The 'write' parameter is set to 1 if the dataset is also writable.
6577  */
6578 int
6579 zone_dataset_visible(const char *dataset, int *write)
6580 {
6581         static int zfstype = -1;
6582         zone_dataset_t *zd;
6583         size_t len;
6584         zone_t *zone = curproc->p_zone;
6585         const char *name = NULL;
6586         vfs_t *vfsp = NULL;
6587
6588         if (dataset[0] == '\0')
6589                 return (0);
6590
6591         /*
6592          * Walk the list once, looking for datasets which match exactly, or
6593          * specify a dataset underneath an exported dataset.  If found, return
6594          * true and note that it is writable.
6595          */
6596         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6597             zd = list_next(&zone->zone_datasets, zd)) {
6598
6599                 len = strlen(zd->zd_dataset);
6600                 if (strlen(dataset) >= len &&
6601                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6602                     (dataset[len] == '\0' || dataset[len] == '/' ||
6603                     dataset[len] == '@')) {
6604                         if (write)
6605                                 *write = 1;
6606                         return (1);
6607                 }
6608         }
6609
6610         /*
6611          * Walk the list a second time, searching for datasets which are parents
6612          * of exported datasets.  These should be visible, but read-only.
6613          *
6614          * Note that we also have to support forms such as 'pool/dataset/', with
6615          * a trailing slash.
6616          */
6617         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6618             zd = list_next(&zone->zone_datasets, zd)) {
6619
6620                 len = strlen(dataset);
6621                 if (dataset[len - 1] == '/')
6622                         len--;  /* Ignore trailing slash */
6623                 if (len < strlen(zd->zd_dataset) &&
6624                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6625                     zd->zd_dataset[len] == '/') {
6626                         if (write)
6627                                 *write = 0;
6628                         return (1);
6629                 }
6630         }
6631
6632         /*
6633          * We reach here if the given dataset is not found in the zone_dataset
6634          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6635          * instead of delegation. For this we search for the dataset in the
6636          * zone_vfslist of this zone. If found, return true and note that it is
6637          * not writable.
6638          */
6639
6640         /*
6641          * Initialize zfstype if it is not initialized yet.
6642          */
6643         if (zfstype == -1) {
6644                 struct vfssw *vswp = vfs_getvfssw("zfs");
6645                 zfstype = vswp - vfssw;
6646                 vfs_unrefvfssw(vswp);
6647         }
6648
6649         vfs_list_read_lock();
6650         vfsp = zone->zone_vfslist;
6651         do {
6652                 ASSERT(vfsp);
6653                 if (vfsp->vfs_fstype == zfstype) {
6654                         name = refstr_value(vfsp->vfs_resource);
6655
6656                         /*
6657                          * Check if we have an exact match.
6658                          */
6659                         if (strcmp(dataset, name) == 0) {
6660                                 vfs_list_unlock();
6661                                 if (write)
6662                                         *write = 0;
6663                                 return (1);
6664                         }
6665                         /*
6666                          * We need to check if we are looking for parents of
6667                          * a dataset. These should be visible, but read-only.
6668                          */
6669                         len = strlen(dataset);
6670                         if (dataset[len - 1] == '/')
6671                                 len--;
6672
6673                         if (len < strlen(name) &&
6674                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6675                                 vfs_list_unlock();
6676                                 if (write)
6677                                         *write = 0;
6678                                 return (1);
6679                         }
6680                 }
6681                 vfsp = vfsp->vfs_zone_next;
6682         } while (vfsp != zone->zone_vfslist);
6683
6684         vfs_list_unlock();
6685         return (0);
6686 }
6687
6688 /*
6689  * zone_find_by_any_path() -
6690  *
6691  * kernel-private routine similar to zone_find_by_path(), but which
6692  * effectively compares against zone paths rather than zonerootpath
6693  * (i.e., the last component of zonerootpaths, which should be "root/",
6694  * are not compared.)  This is done in order to accurately identify all
6695  * paths, whether zone-visible or not, including those which are parallel
6696  * to /root/, such as /dev/, /home/, etc...
6697  *
6698  * If the specified path does not fall under any zone path then global
6699  * zone is returned.
6700  *
6701  * The treat_abs parameter indicates whether the path should be treated as
6702  * an absolute path although it does not begin with "/".  (This supports
6703  * nfs mount syntax such as host:any/path.)
6704  *
6705  * The caller is responsible for zone_rele of the returned zone.
6706  */
6707 zone_t *
6708 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6709 {
6710         zone_t *zone;
6711         int path_offset = 0;
6712
6713         if (path == NULL) {
6714                 zone_hold(global_zone);
6715                 return (global_zone);
6716         }
6717
6718         if (*path != '/') {
6719                 ASSERT(treat_abs);
6720                 path_offset = 1;
6721         }
6722
6723         mutex_enter(&zonehash_lock);
6724         for (zone = list_head(&zone_active); zone != NULL;
6725             zone = list_next(&zone_active, zone)) {
6726                 char    *c;
6727                 size_t  pathlen;
6728                 char *rootpath_start;
6729
6730                 if (zone == global_zone)        /* skip global zone */
6731                         continue;
6732
6733                 /* scan backwards to find start of last component */
6734                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6735                 do {
6736                         c--;
6737                 } while (*c != '/');
6738
6739                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6740                 rootpath_start = (zone->zone_rootpath + path_offset);
6741                 if (strncmp(path, rootpath_start, pathlen) == 0)
6742                         break;
6743         }
6744         if (zone == NULL)
6745                 zone = global_zone;
6746         zone_hold(zone);
6747         mutex_exit(&zonehash_lock);
6748         return (zone);
6749 }
6750
6751 /*
6752  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6753  * zone_dl_t pointer if found, and NULL otherwise.
6754  */
6755 static zone_dl_t *
6756 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6757 {
6758         zone_dl_t *zdl;
6759
6760         ASSERT(mutex_owned(&zone->zone_lock));
6761         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6762             zdl = list_next(&zone->zone_dl_list, zdl)) {
6763                 if (zdl->zdl_id == linkid)
6764                         break;
6765         }
6766         return (zdl);
6767 }
6768
6769 static boolean_t
6770 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6771 {
6772         boolean_t exists;
6773
6774         mutex_enter(&zone->zone_lock);
6775         exists = (zone_find_dl(zone, linkid) != NULL);
6776         mutex_exit(&zone->zone_lock);
6777         return (exists);
6778 }
6779
6780 /*
6781  * Add an data link name for the zone.
6782  */
6783 static int
6784 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6785 {
6786         zone_dl_t *zdl;
6787         zone_t *zone;
6788         zone_t *thiszone;
6789
6790         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6791                 return (set_errno(ENXIO));
6792
6793         /* Verify that the datalink ID doesn't already belong to a zone. */
6794         mutex_enter(&zonehash_lock);
6795         for (zone = list_head(&zone_active); zone != NULL;
6796             zone = list_next(&zone_active, zone)) {
6797                 if (zone_dl_exists(zone, linkid)) {
6798                         mutex_exit(&zonehash_lock);
6799                         zone_rele(thiszone);
6800                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6801                 }
6802         }
6803
6804         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6805         zdl->zdl_id = linkid;
6806         zdl->zdl_net = NULL;
6807         mutex_enter(&thiszone->zone_lock);
6808         list_insert_head(&thiszone->zone_dl_list, zdl);
6809         mutex_exit(&thiszone->zone_lock);
6810         mutex_exit(&zonehash_lock);
6811         zone_rele(thiszone);
6812         return (0);
6813 }
6814
6815 static int
6816 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6817 {
6818         zone_dl_t *zdl;
6819         zone_t *zone;
6820         int err = 0;
6821
6822         if ((zone = zone_find_by_id(zoneid)) == NULL)
6823                 return (set_errno(EINVAL));
6824
6825         mutex_enter(&zone->zone_lock);
6826         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6827                 err = ENXIO;
6828         } else {
6829                 list_remove(&zone->zone_dl_list, zdl);
6830                 nvlist_free(zdl->zdl_net);
6831                 kmem_free(zdl, sizeof (zone_dl_t));
6832         }
6833         mutex_exit(&zone->zone_lock);
6834         zone_rele(zone);
6835         return (err == 0 ? 0 : set_errno(err));
6836 }
6837
6838 /*
6839  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6840  * the linkid.  Otherwise we just check if the specified zoneidp has been
6841  * assigned the supplied linkid.
6842  */
6843 int
6844 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6845 {
6846         zone_t *zone;
6847         int err = ENXIO;
6848
6849         if (*zoneidp != ALL_ZONES) {
6850                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6851                         if (zone_dl_exists(zone, linkid))
6852                                 err = 0;
6853                         zone_rele(zone);
6854                 }
6855                 return (err);
6856         }
6857
6858         mutex_enter(&zonehash_lock);
6859         for (zone = list_head(&zone_active); zone != NULL;
6860             zone = list_next(&zone_active, zone)) {
6861                 if (zone_dl_exists(zone, linkid)) {
6862                         *zoneidp = zone->zone_id;
6863                         err = 0;
6864                         break;
6865                 }
6866         }
6867         mutex_exit(&zonehash_lock);
6868         return (err);
6869 }
6870
6871 /*
6872  * Get the list of datalink IDs assigned to a zone.
6873  *
6874  * On input, *nump is the number of datalink IDs that can fit in the supplied
6875  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6876  * that were placed in the array if the array was large enough, or to the
6877  * number of datalink IDs that the function needs to place in the array if the
6878  * array is too small.
6879  */
6880 static int
6881 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6882 {
6883         uint_t num, dlcount;
6884         zone_t *zone;
6885         zone_dl_t *zdl;
6886         datalink_id_t *idptr = idarray;
6887
6888         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6889                 return (set_errno(EFAULT));
6890         if ((zone = zone_find_by_id(zoneid)) == NULL)
6891                 return (set_errno(ENXIO));
6892
6893         num = 0;
6894         mutex_enter(&zone->zone_lock);
6895         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6896             zdl = list_next(&zone->zone_dl_list, zdl)) {
6897                 /*
6898                  * If the list is bigger than what the caller supplied, just
6899                  * count, don't do copyout.
6900                  */
6901                 if (++num > dlcount)
6902                         continue;
6903                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6904                         mutex_exit(&zone->zone_lock);
6905                         zone_rele(zone);
6906                         return (set_errno(EFAULT));
6907                 }
6908                 idptr++;
6909         }
6910         mutex_exit(&zone->zone_lock);
6911         zone_rele(zone);
6912
6913         /* Increased or decreased, caller should be notified. */
6914         if (num != dlcount) {
6915                 if (copyout(&num, nump, sizeof (num)) != 0)
6916                         return (set_errno(EFAULT));
6917         }
6918         return (0);
6919 }
6920
6921 /*
6922  * Public interface for looking up a zone by zoneid. It's a customized version
6923  * for netstack_zone_create(). It can only be called from the zsd create
6924  * callbacks, since it doesn't have reference on the zone structure hence if
6925  * it is called elsewhere the zone could disappear after the zonehash_lock
6926  * is dropped.
6927  *
6928  * Furthermore it
6929  * 1. Doesn't check the status of the zone.
6930  * 2. It will be called even before zone_init is called, in that case the
6931  *    address of zone0 is returned directly, and netstack_zone_create()
6932  *    will only assign a value to zone0.zone_netstack, won't break anything.
6933  * 3. Returns without the zone being held.
6934  */
6935 zone_t *
6936 zone_find_by_id_nolock(zoneid_t zoneid)
6937 {
6938         zone_t *zone;
6939
6940         mutex_enter(&zonehash_lock);
6941         if (zonehashbyid == NULL)
6942                 zone = &zone0;
6943         else
6944                 zone = zone_find_all_by_id(zoneid);
6945         mutex_exit(&zonehash_lock);
6946         return (zone);
6947 }
6948
6949 /*
6950  * Walk the datalinks for a given zone
6951  */
6952 int
6953 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6954     void *data)
6955 {
6956         zone_t          *zone;
6957         zone_dl_t       *zdl;
6958         datalink_id_t   *idarray;
6959         uint_t          idcount = 0;
6960         int             i, ret = 0;
6961
6962         if ((zone = zone_find_by_id(zoneid)) == NULL)
6963                 return (ENOENT);
6964
6965         /*
6966          * We first build an array of linkid's so that we can walk these and
6967          * execute the callback with the zone_lock dropped.
6968          */
6969         mutex_enter(&zone->zone_lock);
6970         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6971             zdl = list_next(&zone->zone_dl_list, zdl)) {
6972                 idcount++;
6973         }
6974
6975         if (idcount == 0) {
6976                 mutex_exit(&zone->zone_lock);
6977                 zone_rele(zone);
6978                 return (0);
6979         }
6980
6981         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6982         if (idarray == NULL) {
6983                 mutex_exit(&zone->zone_lock);
6984                 zone_rele(zone);
6985                 return (ENOMEM);
6986         }
6987
6988         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6989             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
6990                 idarray[i] = zdl->zdl_id;
6991         }
6992
6993         mutex_exit(&zone->zone_lock);
6994
6995         for (i = 0; i < idcount && ret == 0; i++) {
6996                 if ((ret = (*cb)(idarray[i], data)) != 0)
6997                         break;
6998         }
6999
7000         zone_rele(zone);
7001         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7002         return (ret);
7003 }
7004
7005 static char *
7006 zone_net_type2name(int type)
7007 {
7008         switch (type) {
7009         case ZONE_NETWORK_ADDRESS:
7010                 return (ZONE_NET_ADDRNAME);
7011         case ZONE_NETWORK_DEFROUTER:
7012                 return (ZONE_NET_RTRNAME);
7013         default:
7014                 return (NULL);
7015         }
7016 }
7017
7018 static int
7019 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7020 {
7021         zone_t *zone;
7022         zone_dl_t *zdl;
7023         nvlist_t *nvl;
7024         int err = 0;
7025         uint8_t *new = NULL;
7026         char *nvname;
7027         int bufsize;
7028         datalink_id_t linkid = znbuf->zn_linkid;
7029
7030         if (secpolicy_zone_config(CRED()) != 0)
7031                 return (set_errno(EPERM));
7032
7033         if (zoneid == GLOBAL_ZONEID)
7034                 return (set_errno(EINVAL));
7035
7036         nvname = zone_net_type2name(znbuf->zn_type);
7037         bufsize = znbuf->zn_len;
7038         new = znbuf->zn_val;
7039         if (nvname == NULL)
7040                 return (set_errno(EINVAL));
7041
7042         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7043                 return (set_errno(EINVAL));
7044         }
7045
7046         mutex_enter(&zone->zone_lock);
7047         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7048                 err = ENXIO;
7049                 goto done;
7050         }
7051         if ((nvl = zdl->zdl_net) == NULL) {
7052                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7053                         err = ENOMEM;
7054                         goto done;
7055                 } else {
7056                         zdl->zdl_net = nvl;
7057                 }
7058         }
7059         if (nvlist_exists(nvl, nvname)) {
7060                 err = EINVAL;
7061                 goto done;
7062         }
7063         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7064         ASSERT(err == 0);
7065 done:
7066         mutex_exit(&zone->zone_lock);
7067         zone_rele(zone);
7068         if (err != 0)
7069                 return (set_errno(err));
7070         else
7071                 return (0);
7072 }
7073
7074 static int
7075 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7076 {
7077         zone_t *zone;
7078         zone_dl_t *zdl;
7079         nvlist_t *nvl;
7080         uint8_t *ptr;
7081         uint_t psize;
7082         int err = 0;
7083         char *nvname;
7084         int bufsize;
7085         void *buf;
7086         datalink_id_t linkid = znbuf->zn_linkid;
7087
7088         if (zoneid == GLOBAL_ZONEID)
7089                 return (set_errno(EINVAL));
7090
7091         nvname = zone_net_type2name(znbuf->zn_type);
7092         bufsize = znbuf->zn_len;
7093         buf = znbuf->zn_val;
7094
7095         if (nvname == NULL)
7096                 return (set_errno(EINVAL));
7097         if ((zone = zone_find_by_id(zoneid)) == NULL)
7098                 return (set_errno(EINVAL));
7099
7100         mutex_enter(&zone->zone_lock);
7101         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7102                 err = ENXIO;
7103                 goto done;
7104         }
7105         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7106                 err = ENOENT;
7107                 goto done;
7108         }
7109         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7110         ASSERT(err == 0);
7111
7112         if (psize > bufsize) {
7113                 err = ENOBUFS;
7114                 goto done;
7115         }
7116         znbuf->zn_len = psize;
7117         bcopy(ptr, buf, psize);
7118 done:
7119         mutex_exit(&zone->zone_lock);
7120         zone_rele(zone);
7121         if (err != 0)
7122                 return (set_errno(err));
7123         else
7124                 return (0);
7125 }