usr/src/uts/common/os/zone.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27
  28 /*
  29  * Zones
  30  *
  31  *   A zone is a named collection of processes, namespace constraints,
  32  *   and other system resources which comprise a secure and manageable
  33  *   application containment facility.
  34  *
  35  *   Zones (represented by the reference counted zone_t) are tracked in
  36  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  37  *   (zoneid_t) are used to track zone association.  Zone IDs are
  38  *   dynamically generated when the zone is created; if a persistent
  39  *   identifier is needed (core files, accounting logs, audit trail,
  40  *   etc.), the zone name should be used.
  41  *
  42  *
  43  *   Global Zone:
  44  *
  45  *   The global zone (zoneid 0) is automatically associated with all
  46  *   system resources that have not been bound to a user-created zone.
  47  *   This means that even systems where zones are not in active use
  48  *   have a global zone, and all processes, mounts, etc. are
  49  *   associated with that zone.  The global zone is generally
  50  *   unconstrained in terms of privileges and access, though the usual
  51  *   credential and privilege based restrictions apply.
  52  *
  53  *
  54  *   Zone States:
  55  *
  56  *   The states a zone may be in, and the transitions between them,
  57  *   are as follows:
  58  *
  59  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  60  *   initialized zone is added to the list of active zones on the system but
  61  *   isn't accessible.
  62  *
  63  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  64  *   not yet completed. Not possible to enter the zone, but attributes can
  65  *   be retrieved.
  66  *
  67  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  68  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  69  *   executed.  A zone remains in this state until it transitions into
  70  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  71  *
  72  *   ZONE_IS_BOOTING: in this shortlived-state, zsched attempts to start
  73  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  74  *   state.
  75  *
  76  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  77  *   successfully started init.   A zone remains in this state until
  78  *   zone_shutdown() is called.
  79  *
  80  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  81  *   killing all processes running in the zone. The zone remains
  82  *   in this state until there are no more user processes running in the zone.
  83  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  84  *   Since zone_shutdown() is restartable, it may be called successfully
  85  *   multiple times for the same zone_t.  Setting of the zone's state to
  86  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  87  *   the zone's status without worrying about it being a moving target.
  88  *
  89  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  90  *   are no more user processes in the zone.  The zone remains in this
  91  *   state until there are no more kernel threads associated with the
  92  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  93  *   fail.
  94  *
  95  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  96  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  97  *   join the zone or create kernel threads therein.
  98  *
  99  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 100  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 101  *   return NULL from now on.
 102  *
 103  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 104  *   processes or threads doing work on behalf of the zone.  The zone is
 105  *   removed from the list of active zones.  zone_destroy() returns, and
 106  *   the zone can be recreated.
 107  *
 108  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 109  *   callbacks are executed, and all memory associated with the zone is
 110  *   freed.
 111  *
 112  *   Threads can wait for the zone to enter a requested state by using
 113  *   zone_status_wait() or zone_status_timedwait() with the desired
 114  *   state passed in as an argument.  Zone state transitions are
 115  *   uni-directional; it is not possible to move back to an earlier state.
 116  *
 117  *
 118  *   Zone-Specific Data:
 119  *
 120  *   Subsystems needing to maintain zone-specific data can store that
 121  *   data using the ZSD mechanism.  This provides a zone-specific data
 122  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 123  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 124  *   to register callbacks to be invoked when a zone is created, shut
 125  *   down, or destroyed.  This can be used to initialize zone-specific
 126  *   data for new zones and to clean up when zones go away.
 127  *
 128  *
 129  *   Data Structures:
 130  *
 131  *   The per-zone structure (zone_t) is reference counted, and freed
 132  *   when all references are released.  zone_hold and zone_rele can be
 133  *   used to adjust the reference count.  In addition, reference counts
 134  *   associated with the cred_t structure are tracked separately using
 135  *   zone_cred_hold and zone_cred_rele.
 136  *
 137  *   Pointers to active zone_t's are stored in two hash tables; one
 138  *   for searching by id, the other for searching by name.  Lookups
 139  *   can be performed on either basis, using zone_find_by_id and
 140  *   zone_find_by_name.  Both return zone_t pointers with the zone
 141  *   held, so zone_rele should be called when the pointer is no longer
 142  *   needed.  Zones can also be searched by path; zone_find_by_path
 143  *   returns the zone with which a path name is associated (global
 144  *   zone if the path is not within some other zone's file system
 145  *   hierarchy).  This currently requires iterating through each zone,
 146  *   so it is slower than an id or name search via a hash table.
 147  *
 148  *
 149  *   Locking:
 150  *
 151  *   zonehash_lock: This is a top-level global lock used to protect the
 152  *       zone hash tables and lists.  Zones cannot be created or destroyed
 153  *       while this lock is held.
 154  *   zone_status_lock: This is a global lock protecting zone state.
 155  *       Zones cannot change state while this lock is held.  It also
 156  *       protects the list of kernel threads associated with a zone.
 157  *   zone_lock: This is a per-zone lock used to protect several fields of
 158  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 159  *       this lock means that the zone cannot go away.
 160  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 161  *       related to the zone.max-lwps rctl.
 162  *   zone_mem_lock: This is a per-zone lock used to protect the fields
 163  *       related to the zone.max-locked-memory and zone.max-swap rctls.
 164  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 165  *       currently just max_lofi
 166  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 167  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 168  *       list (a list of zones in the ZONE_IS_DEAD state).
 169  *
 170  *   Ordering requirements:
 171  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 172  *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 173  *
 174  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 175  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 176  *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 177  *
 178  *   Blocking memory allocations are permitted while holding any of the
 179  *   zone locks.
 180  *
 181  *
 182  *   System Call Interface:
 183  *
 184  *   The zone subsystem can be managed and queried from user level with
 185  *   the following system calls (all subcodes of the primary "zone"
 186  *   system call):
 187  *   - zone_create: creates a zone with selected attributes (name,
 188  *     root path, privileges, resource controls, ZFS datasets)
 189  *   - zone_enter: allows the current process to enter a zone
 190  *   - zone_getattr: reports attributes of a zone
 191  *   - zone_setattr: set attributes of a zone
 192  *   - zone_boot: set 'init' running for the zone
 193  *   - zone_list: lists all zones active in the system
 194  *   - zone_lookup: looks up zone id based on name
 195  *   - zone_shutdown: initiates shutdown process (see states above)
 196  *   - zone_destroy: completes shutdown process (see states above)
 197  *
 198  */
 199
 200 #include <sys/priv_impl.h>
 201 #include <sys/cred.h>
 202 #include <c2/audit.h>
 203 #include <sys/debug.h>
 204 #include <sys/file.h>
 205 #include <sys/kmem.h>
 206 #include <sys/kstat.h>
 207 #include <sys/mutex.h>
 208 #include <sys/note.h>
 209 #include <sys/pathname.h>
 210 #include <sys/proc.h>
 211 #include <sys/project.h>
 212 #include <sys/sysevent.h>
 213 #include <sys/task.h>
 214 #include <sys/systm.h>
 215 #include <sys/types.h>
 216 #include <sys/utsname.h>
 217 #include <sys/vnode.h>
 218 #include <sys/vfs.h>
 219 #include <sys/systeminfo.h>
 220 #include <sys/policy.h>
 221 #include <sys/cred_impl.h>
 222 #include <sys/contract_impl.h>
 223 #include <sys/contract/process_impl.h>
 224 #include <sys/class.h>
 225 #include <sys/pool.h>
 226 #include <sys/pool_pset.h>
 227 #include <sys/pset.h>
 228 #include <sys/strlog.h>
 229 #include <sys/sysmacros.h>
 230 #include <sys/callb.h>
 231 #include <sys/vmparam.h>
 232 #include <sys/corectl.h>
 233 #include <sys/ipc_impl.h>
 234 #include <sys/klpd.h>
 235
 236 #include <sys/door.h>
 237 #include <sys/cpuvar.h>
 238 #include <sys/sdt.h>
 239
 240 #include <sys/uadmin.h>
 241 #include <sys/session.h>
 242 #include <sys/cmn_err.h>
 243 #include <sys/modhash.h>
 244 #include <sys/sunddi.h>
 245 #include <sys/nvpair.h>
 246 #include <sys/rctl.h>
 247 #include <sys/fss.h>
 248 #include <sys/brand.h>
 249 #include <sys/zone.h>
 250 #include <net/if.h>
 251 #include <sys/cpucaps.h>
 252 #include <vm/seg.h>
 253 #include <sys/mac.h>
 254
 255 /*
 256  * This constant specifies the number of seconds that threads waiting for
 257  * subsystems to release a zone's general-purpose references will wait before
 258  * they log the zone's reference counts.  The constant's value shouldn't
 259  * be so small that reference counts are unnecessarily reported for zones
 260  * whose references are slowly released.  On the other hand, it shouldn't be so
 261  * large that users reboot their systems out of frustration over hung zones
 262  * before the system logs the zones' reference counts.
 263  */
 264 #define ZONE_DESTROY_TIMEOUT_SECS       60
 265
 266 /* List of data link IDs which are accessible from the zone */
 267 typedef struct zone_dl {
 268         datalink_id_t   zdl_id;
 269         nvlist_t        *zdl_net;
 270         list_node_t     zdl_linkage;
 271 } zone_dl_t;
 272
 273 /*
 274  * cv used to signal that all references to the zone have been released.  This
 275  * needs to be global since there may be multiple waiters, and the first to
 276  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 277  */
 278 static kcondvar_t zone_destroy_cv;
 279 /*
 280  * Lock used to serialize access to zone_cv.  This could have been per-zone,
 281  * but then we'd need another lock for zone_destroy_cv, and why bother?
 282  */
 283 static kmutex_t zone_status_lock;
 284
 285 /*
 286  * ZSD-related global variables.
 287  */
 288 static kmutex_t zsd_key_lock;   /* protects the following two */
 289 /*
 290  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 291  */
 292 static zone_key_t zsd_keyval = 0;
 293 /*
 294  * Global list of registered keys.  We use this when a new zone is created.
 295  */
 296 static list_t zsd_registered_keys;
 297
 298 int zone_hash_size = 256;
 299 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 300 static kmutex_t zonehash_lock;
 301 static uint_t zonecount;
 302 static id_space_t *zoneid_space;
 303
 304 /*
 305  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 306  * kernel proper runs, and which manages all other zones.
 307  *
 308  * Although not declared as static, the variable "zone0" should not be used
 309  * except for by code that needs to reference the global zone early on in boot,
 310  * before it is fully initialized.  All other consumers should use
 311  * 'global_zone'.
 312  */
 313 zone_t zone0;
 314 zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 315
 316 /*
 317  * List of active zones, protected by zonehash_lock.
 318  */
 319 static list_t zone_active;
 320
 321 /*
 322  * List of destroyed zones that still have outstanding cred references.
 323  * Used for debugging.  Uses a separate lock to avoid lock ordering
 324  * problems in zone_free.
 325  */
 326 static list_t zone_deathrow;
 327 static kmutex_t zone_deathrow_lock;
 328
 329 /* number of zones is limited by virtual interface limit in IP */
 330 uint_t maxzones = 8192;
 331
 332 /* Event channel used to send zone state change notifications */
 333 evchan_t *zone_event_chan;
 334
 335 /*
 336  * This table holds the mapping from kernel zone states to
 337  * states visible in the state notification API.
 338  * The idea is that we only expose "obvious" states and
 339  * do not expose states which are just implementation details.
 340  */
 341 const char  *zone_status_table[] = {
 342         ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 343         ZONE_EVENT_INITIALIZED,         /* initialized */
 344         ZONE_EVENT_READY,               /* ready */
 345         ZONE_EVENT_READY,               /* booting */
 346         ZONE_EVENT_RUNNING,             /* running */
 347         ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 348         ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 349         ZONE_EVENT_SHUTTING_DOWN,       /* down */
 350         ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 351         ZONE_EVENT_UNINITIALIZED,       /* dead */
 352 };
 353
 354 /*
 355  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 356  * (see sys/zone.h).
 357  */
 358 static char *zone_ref_subsys_names[] = {
 359         "NFS",          /* ZONE_REF_NFS */
 360         "NFSv4",        /* ZONE_REF_NFSV4 */
 361         "SMBFS",        /* ZONE_REF_SMBFS */
 362         "MNTFS",        /* ZONE_REF_MNTFS */
 363         "LOFI",         /* ZONE_REF_LOFI */
 364         "VFS",          /* ZONE_REF_VFS */
 365         "IPC"           /* ZONE_REF_IPC */
 366 };
 367
 368 /*
 369  * This isn't static so lint doesn't complain.
 370  */
 371 rctl_hndl_t rc_zone_cpu_shares;
 372 rctl_hndl_t rc_zone_locked_mem;
 373 rctl_hndl_t rc_zone_max_swap;
 374 rctl_hndl_t rc_zone_max_lofi;
 375 rctl_hndl_t rc_zone_cpu_cap;
 376 rctl_hndl_t rc_zone_nlwps;
 377 rctl_hndl_t rc_zone_nprocs;
 378 rctl_hndl_t rc_zone_shmmax;
 379 rctl_hndl_t rc_zone_shmmni;
 380 rctl_hndl_t rc_zone_semmni;
 381 rctl_hndl_t rc_zone_msgmni;
 382
 383 const char * const zone_default_initname = "/sbin/init";
 384 static char * const zone_prefix = "/zone/";
 385 static int zone_shutdown(zoneid_t zoneid);
 386 static int zone_add_datalink(zoneid_t, datalink_id_t);
 387 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 388 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 389 static int zone_set_network(zoneid_t, zone_net_data_t *);
 390 static int zone_get_network(zoneid_t, zone_net_data_t *);
 391
 392 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 393
 394 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 395 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 396 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 397 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 398     zone_key_t);
 399 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 400 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 401     kmutex_t *);
 402 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 403     kmutex_t *);
 404
 405 /*
 406  * Bump this number when you alter the zone syscall interfaces; this is
 407  * because we need to have support for previous API versions in libc
 408  * to support patching; libc calls into the kernel to determine this number.
 409  *
 410  * Version 1 of the API is the version originally shipped with Solaris 10
 411  * Version 2 alters the zone_create system call in order to support more
 412  *     arguments by moving the args into a structure; and to do better
 413  *     error reporting when zone_create() fails.
 414  * Version 3 alters the zone_create system call in order to support the
 415  *     import of ZFS datasets to zones.
 416  * Version 4 alters the zone_create system call in order to support
 417  *     Trusted Extensions.
 418  * Version 5 alters the zone_boot system call, and converts its old
 419  *     bootargs parameter to be set by the zone_setattr API instead.
 420  * Version 6 adds the flag argument to zone_create.
 421  */
 422 static const int ZONE_SYSCALL_API_VERSION = 6;
 423
 424 /*
 425  * Certain filesystems (such as NFS and autofs) need to know which zone
 426  * the mount is being placed in.  Because of this, we need to be able to
 427  * ensure that a zone isn't in the process of being created/destroyed such
 428  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 429  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 430  * mount list. Since a zone can't reside on an NFS file system, we don't
 431  * have to worry about the zonepath itself.
 432  *
 433  * The following functions: block_mounts()/resume_mounts() and
 434  * mount_in_progress()/mount_completed() are used by zones and the VFS
 435  * layer (respectively) to synchronize zone state transitions and new
 436  * mounts within a zone. This synchronization is on a per-zone basis, so
 437  * activity for one zone will not interfere with activity for another zone.
 438  *
 439  * The semantics are like a reader-reader lock such that there may
 440  * either be multiple mounts (or zone state transitions, if that weren't
 441  * serialized by zonehash_lock) in progress at the same time, but not
 442  * both.
 443  *
 444  * We use cv's so the user can ctrl-C out of the operation if it's
 445  * taking too long.
 446  *
 447  * The semantics are such that there is unfair bias towards the
 448  * "current" operation.  This means that zone halt may starve if
 449  * there is a rapid succession of new mounts coming in to the zone.
 450  */
 451 /*
 452  * Prevent new mounts from progressing to the point of calling
 453  * VFS_MOUNT().  If there are already mounts in this "region", wait for
 454  * them to complete.
 455  */
 456 static int
 457 block_mounts(zone_t *zp)
 458 {
 459         int retval = 0;
 460
 461         /*
 462          * Since it may block for a long time, block_mounts() shouldn't be
 463          * called with zonehash_lock held.
 464          */
 465         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 466         mutex_enter(&zp->zone_mount_lock);
 467         while (zp->zone_mounts_in_progress > 0) {
 468                 if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 469                         goto signaled;
 470         }
 471         /*
 472          * A negative value of mounts_in_progress indicates that mounts
 473          * have been blocked by (-mounts_in_progress) different callers
 474          * (remotely possible if two threads enter zone_shutdown at the same
 475          * time).
 476          */
 477         zp->zone_mounts_in_progress--;
 478         retval = 1;
 479 signaled:
 480         mutex_exit(&zp->zone_mount_lock);
 481         return (retval);
 482 }
 483
 484 /*
 485  * The VFS layer may progress with new mounts as far as we're concerned.
 486  * Allow them to progress if we were the last obstacle.
 487  */
 488 static void
 489 resume_mounts(zone_t *zp)
 490 {
 491         mutex_enter(&zp->zone_mount_lock);
 492         if (++zp->zone_mounts_in_progress == 0)
 493                 cv_broadcast(&zp->zone_mount_cv);
 494         mutex_exit(&zp->zone_mount_lock);
 495 }
 496
 497 /*
 498  * The VFS layer is busy with a mount; this zone should wait until all
 499  * of its mounts are completed to progress.
 500  */
 501 void
 502 mount_in_progress(zone_t *zp)
 503 {
 504         mutex_enter(&zp->zone_mount_lock);
 505         while (zp->zone_mounts_in_progress < 0)
 506                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 507         zp->zone_mounts_in_progress++;
 508         mutex_exit(&zp->zone_mount_lock);
 509 }
 510
 511 /*
 512  * VFS is done with one mount; wake up any waiting block_mounts()
 513  * callers if this is the last mount.
 514  */
 515 void
 516 mount_completed(zone_t *zp)
 517 {
 518         mutex_enter(&zp->zone_mount_lock);
 519         if (--zp->zone_mounts_in_progress == 0)
 520                 cv_broadcast(&zp->zone_mount_cv);
 521         mutex_exit(&zp->zone_mount_lock);
 522 }
 523
 524 /*
 525  * ZSD routines.
 526  *
 527  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 528  * defined by the pthread_key_create() and related interfaces.
 529  *
 530  * Kernel subsystems may register one or more data items and/or
 531  * callbacks to be executed when a zone is created, shutdown, or
 532  * destroyed.
 533  *
 534  * Unlike the thread counterpart, destructor callbacks will be executed
 535  * even if the data pointer is NULL and/or there are no constructor
 536  * callbacks, so it is the responsibility of such callbacks to check for
 537  * NULL data values if necessary.
 538  *
 539  * The locking strategy and overall picture is as follows:
 540  *
 541  * When someone calls zone_key_create(), a template ZSD entry is added to the
 542  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 543  * holding that lock all the existing zones are marked as
 544  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 545  * zone_zsd list (protected by zone_lock). The global list is updated first
 546  * (under zsd_key_lock) to make sure that newly created zones use the
 547  * most recent list of keys. Then under zonehash_lock we walk the zones
 548  * and mark them.  Similar locking is used in zone_key_delete().
 549  *
 550  * The actual create, shutdown, and destroy callbacks are done without
 551  * holding any lock. And zsd_flags are used to ensure that the operations
 552  * completed so that when zone_key_create (and zone_create) is done, as well as
 553  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 554  * are completed.
 555  *
 556  * When new zones are created constructor callbacks for all registered ZSD
 557  * entries will be called. That also uses the above two phases of marking
 558  * what needs to be done, and then running the callbacks without holding
 559  * any locks.
 560  *
 561  * The framework does not provide any locking around zone_getspecific() and
 562  * zone_setspecific() apart from that needed for internal consistency, so
 563  * callers interested in atomic "test-and-set" semantics will need to provide
 564  * their own locking.
 565  */
 566
 567 /*
 568  * Helper function to find the zsd_entry associated with the key in the
 569  * given list.
 570  */
 571 static struct zsd_entry *
 572 zsd_find(list_t *l, zone_key_t key)
 573 {
 574         struct zsd_entry *zsd;
 575
 576         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 577                 if (zsd->zsd_key == key) {
 578                         return (zsd);
 579                 }
 580         }
 581         return (NULL);
 582 }
 583
 584 /*
 585  * Helper function to find the zsd_entry associated with the key in the
 586  * given list. Move it to the front of the list.
 587  */
 588 static struct zsd_entry *
 589 zsd_find_mru(list_t *l, zone_key_t key)
 590 {
 591         struct zsd_entry *zsd;
 592
 593         for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 594                 if (zsd->zsd_key == key) {
 595                         /*
 596                          * Move to head of list to keep list in MRU order.
 597                          */
 598                         if (zsd != list_head(l)) {
 599                                 list_remove(l, zsd);
 600                                 list_insert_head(l, zsd);
 601                         }
 602                         return (zsd);
 603                 }
 604         }
 605         return (NULL);
 606 }
 607
 608 void
 609 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 610     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 611 {
 612         struct zsd_entry *zsdp;
 613         struct zsd_entry *t;
 614         struct zone *zone;
 615         zone_key_t  key;
 616
 617         zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 618         zsdp->zsd_data = NULL;
 619         zsdp->zsd_create = create;
 620         zsdp->zsd_shutdown = shutdown;
 621         zsdp->zsd_destroy = destroy;
 622
 623         /*
 624          * Insert in global list of callbacks. Makes future zone creations
 625          * see it.
 626          */
 627         mutex_enter(&zsd_key_lock);
 628         key = zsdp->zsd_key = ++zsd_keyval;
 629         ASSERT(zsd_keyval != 0);
 630         list_insert_tail(&zsd_registered_keys, zsdp);
 631         mutex_exit(&zsd_key_lock);
 632
 633         /*
 634          * Insert for all existing zones and mark them as needing
 635          * a create callback.
 636          */
 637         mutex_enter(&zonehash_lock);    /* stop the world */
 638         for (zone = list_head(&zone_active); zone != NULL;
 639             zone = list_next(&zone_active, zone)) {
 640                 zone_status_t status;
 641
 642                 mutex_enter(&zone->zone_lock);
 643
 644                 /* Skip zones that are on the way down or not yet up */
 645                 status = zone_status_get(zone);
 646                 if (status >= ZONE_IS_DOWN ||
 647                     status == ZONE_IS_UNINITIALIZED) {
 648                         mutex_exit(&zone->zone_lock);
 649                         continue;
 650                 }
 651
 652                 t = zsd_find_mru(&zone->zone_zsd, key);
 653                 if (t != NULL) {
 654                         /*
 655                          * A zsd_configure already inserted it after
 656                          * we dropped zsd_key_lock above.
 657                          */
 658                         mutex_exit(&zone->zone_lock);
 659                         continue;
 660                 }
 661                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 662                 t->zsd_key = key;
 663                 t->zsd_create = create;
 664                 t->zsd_shutdown = shutdown;
 665                 t->zsd_destroy = destroy;
 666                 if (create != NULL) {
 667                         t->zsd_flags = ZSD_CREATE_NEEDED;
 668                         DTRACE_PROBE2(zsd__create__needed,
 669                             zone_t *, zone, zone_key_t, key);
 670                 }
 671                 list_insert_tail(&zone->zone_zsd, t);
 672                 mutex_exit(&zone->zone_lock);
 673         }
 674         mutex_exit(&zonehash_lock);
 675
 676         if (create != NULL) {
 677                 /* Now call the create callback for this key */
 678                 zsd_apply_all_zones(zsd_apply_create, key);
 679         }
 680         /*
 681          * It is safe for consumers to use the key now, make it
 682          * globally visible. Specifically zone_getspecific() will
 683          * always successfully return the zone specific data associated
 684          * with the key.
 685          */
 686         *keyp = key;
 687
 688 }
 689
 690 /*
 691  * Function called when a module is being unloaded, or otherwise wishes
 692  * to unregister its ZSD key and callbacks.
 693  *
 694  * Remove from the global list and determine the functions that need to
 695  * be called under a global lock. Then call the functions without
 696  * holding any locks. Finally free up the zone_zsd entries. (The apply
 697  * functions need to access the zone_zsd entries to find zsd_data etc.)
 698  */
 699 int
 700 zone_key_delete(zone_key_t key)
 701 {
 702         struct zsd_entry *zsdp = NULL;
 703         zone_t *zone;
 704
 705         mutex_enter(&zsd_key_lock);
 706         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 707         if (zsdp == NULL) {
 708                 mutex_exit(&zsd_key_lock);
 709                 return (-1);            /* key is not registered */
 710         }
 711         list_remove(&zsd_registered_keys, zsdp);
 712         mutex_exit(&zsd_key_lock);
 713
 714         mutex_enter(&zonehash_lock);    /* pass 1: flag needed callbacks */
 715         for (zone = list_head(&zone_active); zone != NULL;
 716             zone = list_next(&zone_active, zone)) {
 717                 struct zsd_entry *del;
 718
 719                 mutex_enter(&zone->zone_lock);
 720                 del = zsd_find_mru(&zone->zone_zsd, key);
 721                 if (del == NULL) {
 722                         /*
 723                          * Somebody else got here first, e.g. the zone
 724                          * is going away.
 725                          */
 726                         mutex_exit(&zone->zone_lock);
 727                         continue;
 728                 }
 729                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 730                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 731                 if (del->zsd_shutdown != NULL &&
 732                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 733                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 734                         DTRACE_PROBE2(zsd__shutdown__needed,
 735                             zone_t *, zone, zone_key_t, key);
 736                 }
 737                 if (del->zsd_destroy != NULL &&
 738                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 739                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 740                         DTRACE_PROBE2(zsd__destroy__needed,
 741                             zone_t *, zone, zone_key_t, key);
 742                 }
 743                 mutex_exit(&zone->zone_lock);
 744         }
 745         mutex_exit(&zonehash_lock);
 746         kmem_free(zsdp, sizeof (*zsdp));        /* template entry is done */
 747
 748         /* Now run the shutdown and then destroy callbacks for this key */
 749         zsd_apply_all_zones(zsd_apply_shutdown, key);
 750         zsd_apply_all_zones(zsd_apply_destroy, key);
 751
 752         /* Now we can free this key's zsd_entry in each active zone */
 753         mutex_enter(&zonehash_lock);
 754         for (zone = list_head(&zone_active); zone != NULL;
 755             zone = list_next(&zone_active, zone)) {
 756                 struct zsd_entry *del;
 757
 758                 mutex_enter(&zone->zone_lock);
 759                 del = zsd_find(&zone->zone_zsd, key);
 760                 if (del != NULL) {
 761                         list_remove(&zone->zone_zsd, del);
 762                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 763                         kmem_free(del, sizeof (*del));
 764                 }
 765                 mutex_exit(&zone->zone_lock);
 766         }
 767         mutex_exit(&zonehash_lock);
 768
 769         return (0);             /* key and all per-zone entries removed */
 770 }
 771
 772 /*
 773  * ZSD analogue of pthread_setspecific(): store 'data' as this zone's
 774  * value for 'key'.
 775  *
 776  * Every registered key has an entry on the zone's zone_zsd list, even
 777  * when it has no create callback, so a failed lookup means the key
 778  * was not registered.
 779  * Returns 0 on success and -1 when the key is not found.
 780  */
 781 int
 782 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783 {
 784         struct zsd_entry *entry;
 785         int rv = -1;
 786
 787         mutex_enter(&zone->zone_lock);
 788         entry = zsd_find_mru(&zone->zone_zsd, key);
 789         if (entry != NULL) {
 790                 /* Overwrite whatever value was stored before. */
 791                 entry->zsd_data = (void *)data;
 792                 rv = 0;
 793         }
 794         mutex_exit(&zone->zone_lock);
 795
 796         return (rv);
 797 }
 798
 799 /*
 800  * ZSD analogue of pthread_getspecific(); NULL if 'key' has no entry.
 801  */
 802 void *
 803 zone_getspecific(zone_key_t key, zone_t *zone)
 804 {
 805         struct zsd_entry *entry;
 806         void *value = NULL;
 807
 808         mutex_enter(&zone->zone_lock);
 809         if ((entry = zsd_find_mru(&zone->zone_zsd, key)) != NULL)
 810                 value = entry->zsd_data;
 811         mutex_exit(&zone->zone_lock);
 812         return (value);
 813 }
 814
 815 /*
 816  * Populate a newly created zone's zone_zsd list with one entry per key
 817  * on the template list (zsd_registered_keys), copying each key and its
 818  * callbacks.  Entries with a create callback are only flagged
 819  * ZSD_CREATE_NEEDED here; the callback itself is executed later.
 820  */
 821 static void
 822 zone_zsd_configure(zone_t *zone)
 823 {
 824         struct zsd_entry *tmpl, *entry;
 825
 826         ASSERT(MUTEX_HELD(&zonehash_lock));
 827         ASSERT(list_head(&zone->zone_zsd) == NULL);
 828         mutex_enter(&zone->zone_lock);
 829         mutex_enter(&zsd_key_lock);
 830         tmpl = list_head(&zsd_registered_keys);
 831         while (tmpl != NULL) {
 832                 /*
 833                  * The zone is still ZONE_IS_UNCONFIGURED, so
 834                  * zone_key_create should not have populated it yet.
 835                  */
 836                 ASSERT(zsd_find(&zone->zone_zsd, tmpl->zsd_key) == NULL);
 837
 838                 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
 839                 entry->zsd_key = tmpl->zsd_key;
 840                 entry->zsd_create = tmpl->zsd_create;
 841                 entry->zsd_shutdown = tmpl->zsd_shutdown;
 842                 entry->zsd_destroy = tmpl->zsd_destroy;
 843                 if (entry->zsd_create != NULL) {
 844                         entry->zsd_flags = ZSD_CREATE_NEEDED;
 845                         DTRACE_PROBE2(zsd__create__needed,
 846                             zone_t *, zone, zone_key_t, entry->zsd_key);
 847                 }
 848                 list_insert_tail(&zone->zone_zsd, entry);
 849                 tmpl = list_next(&zsd_registered_keys, tmpl);
 850         }
 851         mutex_exit(&zsd_key_lock);
 852         mutex_exit(&zone->zone_lock);
 853 }
 854
 855 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 856
 857 /*
 858  * Mark the zone's ZSD entries as needing their shutdown or destroy
 859  * callback (selected by ct), then run whatever callbacks are pending.
 860  */
 861 static void
 862 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863 {
 864         struct zsd_entry *entry;
 865
 866         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869
 870         /*
 871          * Only the zone's own zone_zsd list is consulted.  The global
 872          * key list may change underneath us as keys are registered and
 873          * unregistered, and no new callbacks are registered for a zone
 874          * that is going away.
 875          */
 876         mutex_enter(&zone->zone_lock);
 877         entry = list_head(&zone->zone_zsd);
 878         while (entry != NULL) {
 879                 zone_key_t key = entry->zsd_key;
 880
 881                 if (ct != ZSD_SHUTDOWN) {
 882                         if (entry->zsd_destroy != NULL &&
 883                             !(entry->zsd_flags & ZSD_DESTROY_ALL)) {
 884                                 entry->zsd_flags |= ZSD_DESTROY_NEEDED;
 885                                 DTRACE_PROBE2(zsd__destroy__needed,
 886                                     zone_t *, zone, zone_key_t, key);
 887                         }
 888                 } else if (entry->zsd_shutdown != NULL &&
 889                     !(entry->zsd_flags & ZSD_SHUTDOWN_ALL)) {
 890                         entry->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 891                         DTRACE_PROBE2(zsd__shutdown__needed,
 892                             zone_t *, zone, zone_key_t, key);
 893                 }
 894                 entry = list_next(&zone->zone_zsd, entry);
 895         }
 896         mutex_exit(&zone->zone_lock);
 897
 898         /*
 899          * Run the pending shutdown callbacks and then the pending
 900          * destroy callbacks for every key on this zone.
 901          */
 902         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903         zsd_apply_all_keys(zsd_apply_destroy, zone);
 904 }
 905
 906 /*
 907  * Release all per-zone ZSD state when the zone is going away: every
 908  * zsd_entry is freed and the zone_zsd list itself is destroyed.
 909  */
 910 static void
 911 zone_free_zsd(zone_t *zone)
 912 {
 913         struct zsd_entry *entry;
 914
 915         mutex_enter(&zone->zone_lock);
 916
 917         /* Drain the list from the head until nothing is left. */
 918         while ((entry = list_head(&zone->zone_zsd)) != NULL) {
 919                 list_remove(&zone->zone_zsd, entry);
 920                 /* No callback may still be running for this entry. */
 921                 ASSERT(!(entry->zsd_flags & ZSD_ALL_INPROGRESS));
 922                 kmem_free(entry, sizeof (*entry));
 923         }
 924
 925         /* The list is now empty, so it can be torn down. */
 926         list_destroy(&zone->zone_zsd);
 927         mutex_exit(&zone->zone_lock);
 928 }
 929
 930 /*
 931  * Invoke applyfn on every active zone for a single key.
 932  *
 933  * zonehash_lock is held across the walk and handed to applyfn, which
 934  * must drop it around any real work and retake it before returning;
 935  * it returns true when it did so.  Once the lock has been dropped the
 936  * list of zones may have changed arbitrarily, so list_next is not
 937  * followed in that case even when it would happen to be safe.
 938  *
 939  * Instead the walk restarts from list_head.  This is safe because
 940  * applyfn updates zsd_flags as it works, so revisiting a zone that was
 941  * already handled has no effect, and the loop therefore terminates
 942  * in at worst O(N^2).
 943  */
 944 static void
 945 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 946 {
 947         zone_t *zp;
 948         boolean_t dropped;
 949
 950         mutex_enter(&zonehash_lock);
 951         zp = list_head(&zone_active);
 952         while (zp != NULL) {
 953                 dropped = (*applyfn)(&zonehash_lock, B_FALSE, zp, key);
 954                 /*
 955                  * A dropped lock invalidates our position in the
 956                  * list, so start over from the head in that case.
 957                  */
 958                 zp = dropped ? list_head(&zone_active) :
 959                     list_next(&zone_active, zp);
 960         }
 961         mutex_exit(&zonehash_lock);
 962 }
 963
 964 /*
 965  * Invoke applyfn on every ZSD key present on a single zone.
 966  *
 967  * The zone's zone_lock is held across the walk and applyfn is told so
 968  * (no global lock is passed).  applyfn drops the lock around any real
 969  * work, retakes it before returning, and returns true when it did so.
 970  * Once the lock has been dropped the list of zsd entries may have
 971  * changed arbitrarily, so list_next is not followed in that case.
 972  *
 973  * Instead the walk restarts from list_head.  This is safe because
 974  * applyfn updates zsd_flags as it works, so revisiting an entry that
 975  * was already handled has no effect, and the loop therefore
 976  * terminates in at worst O(N^2).
 977  */
 978 static void
 979 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 980 {
 981         struct zsd_entry *entry;
 982         boolean_t dropped;
 983
 984         mutex_enter(&zone->zone_lock);
 985         entry = list_head(&zone->zone_zsd);
 986         while (entry != NULL) {
 987                 dropped = (*applyfn)(NULL, B_TRUE, zone, entry->zsd_key);
 988                 /*
 989                  * A dropped lock invalidates our position in the
 990                  * list, so start over from the head in that case.
 991                  */
 992                 entry = dropped ? list_head(&zone->zone_zsd) :
 993                     list_next(&zone->zone_zsd, entry);
 994         }
 995         mutex_exit(&zone->zone_lock);
 996 }
 997
 998 /*
 999  * Call the create function for the zone and key if CREATE_NEEDED
1000  * is set.
1001  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1002  * we wait for that thread to complete so that we can ensure that
1003  * all the callbacks are done when we've looped over all zones/keys.
1004  *
1005  * When we call the create function, we drop the global lock held by
1006  * the caller (lockp, if not NULL) as well as zone_lock, and return true
1007  * to tell the caller it needs to re-evaluate the state.
1008  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1009  * remains held on exit.
1010  */
1011 static boolean_t
1012 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1013     zone_t *zone, zone_key_t key)
1014 {
1015         void *result;
1016         struct zsd_entry *t;
1017         boolean_t dropped;
1018
1019         if (lockp != NULL) {
1020                 ASSERT(MUTEX_HELD(lockp));
1021         }
1022         if (zone_lock_held) {
1023                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1024         } else {
1025                 mutex_enter(&zone->zone_lock);
1026         }
1027
1028         t = zsd_find(&zone->zone_zsd, key);
1029         if (t == NULL) {
1030                 /*
1031                  * Somebody else got here first, e.g. the zone is
1032                  * going away.
1033                  */
1034                 if (!zone_lock_held)
1035                         mutex_exit(&zone->zone_lock);
1036                 return (B_FALSE);
1037         }
1038         dropped = B_FALSE;
1039         if (zsd_wait_for_inprogress(zone, t, lockp))
1040                 dropped = B_TRUE;
1041
1042         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1043                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1044                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1045                 DTRACE_PROBE2(zsd__create__inprogress,
1046                     zone_t *, zone, zone_key_t, key);
1047                 mutex_exit(&zone->zone_lock);
1048                 if (lockp != NULL)
1049                         mutex_exit(lockp);
1050                 /* Both locks stay released while the callback runs. */
1051                 dropped = B_TRUE;
1052                 ASSERT(t->zsd_create != NULL);
1053                 DTRACE_PROBE2(zsd__create__start,
1054                     zone_t *, zone, zone_key_t, key);
1055
1056                 result = (*t->zsd_create)(zone->zone_id);
1057
1058                 DTRACE_PROBE2(zsd__create__end,
1059                     zone_t *, zone, void *, result);
1060
1061                 ASSERT(result != NULL);
1062                 if (lockp != NULL)
1063                         mutex_enter(lockp);
1064                 mutex_enter(&zone->zone_lock);
1065                 t->zsd_data = result;
1066                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1067                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1068                 cv_broadcast(&t->zsd_cv);       /* wake any waiters */
1069                 DTRACE_PROBE2(zsd__create__completed,
1070                     zone_t *, zone, zone_key_t, key);
1071         }
1072         if (!zone_lock_held)
1073                 mutex_exit(&zone->zone_lock);
1074         return (dropped);
1075 }
1076
1077 /*
1078  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1079  * is set.
1080  * If some other thread gets here first and sets *_INPROGRESS, then
1081  * we wait for that thread to complete so that we can ensure that
1082  * all the callbacks are done when we've looped over all zones/keys.
1083  *
1084  * When we call the shutdown function, we drop the global lock held by
1085  * the caller (lockp, if not NULL) as well as zone_lock, and return true
1086  * to tell the caller it needs to re-evaluate the state.
1087  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1088  * remains held on exit.
1089  */
1090 static boolean_t
1091 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1092     zone_t *zone, zone_key_t key)
1093 {
1094         struct zsd_entry *t;
1095         void *data;
1096         boolean_t dropped;
1097
1098         if (lockp != NULL) {
1099                 ASSERT(MUTEX_HELD(lockp));
1100         }
1101         if (zone_lock_held) {
1102                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1103         } else {
1104                 mutex_enter(&zone->zone_lock);
1105         }
1106
1107         t = zsd_find(&zone->zone_zsd, key);
1108         if (t == NULL) {
1109                 /*
1110                  * Somebody else got here first, e.g. the zone is
1111                  * going away.
1112                  */
1113                 if (!zone_lock_held)
1114                         mutex_exit(&zone->zone_lock);
1115                 return (B_FALSE);
1116         }
1117         dropped = B_FALSE;
1118         if (zsd_wait_for_creator(zone, t, lockp))
1119                 dropped = B_TRUE;
1120
1121         if (zsd_wait_for_inprogress(zone, t, lockp))
1122                 dropped = B_TRUE;
1123
1124         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1125                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1126                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1127                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1128                     zone_t *, zone, zone_key_t, key);
1129                 mutex_exit(&zone->zone_lock);
1130                 if (lockp != NULL)
1131                         mutex_exit(lockp);
1132                 dropped = B_TRUE;
1133                 /* Both locks stay released while the callback runs. */
1134                 ASSERT(t->zsd_shutdown != NULL);
1135                 data = t->zsd_data;
1136
1137                 DTRACE_PROBE2(zsd__shutdown__start,
1138                     zone_t *, zone, zone_key_t, key);
1139
1140                 (t->zsd_shutdown)(zone->zone_id, data);
1141                 DTRACE_PROBE2(zsd__shutdown__end,
1142                     zone_t *, zone, zone_key_t, key);
1143
1144                 if (lockp != NULL)
1145                         mutex_enter(lockp);
1146                 mutex_enter(&zone->zone_lock);
1147                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1148                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1149                 cv_broadcast(&t->zsd_cv);       /* wake any waiters */
1150                 DTRACE_PROBE2(zsd__shutdown__completed,
1151                     zone_t *, zone, zone_key_t, key);
1152         }
1153         if (!zone_lock_held)
1154                 mutex_exit(&zone->zone_lock);
1155         return (dropped);
1156 }
1157
1158 /*
1159  * Call the destroy function for the zone and key if DESTROY_NEEDED
1160  * is set.
1161  * If some other thread gets here first and sets *_INPROGRESS, then
1162  * we wait for that thread to complete so that we can ensure that
1163  * all the callbacks are done when we've looped over all zones/keys.
1164  *
1165  * When we call the destroy function, we drop the global lock held by
1166  * the caller (lockp, if not NULL) as well as zone_lock, and return true
1167  * to tell the caller it needs to re-evaluate the state.
1168  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1169  * remains held on exit.
1170  */
1171 static boolean_t
1172 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1173     zone_t *zone, zone_key_t key)
1174 {
1175         struct zsd_entry *t;
1176         void *data;
1177         boolean_t dropped;
1178
1179         if (lockp != NULL) {
1180                 ASSERT(MUTEX_HELD(lockp));
1181         }
1182         if (zone_lock_held) {
1183                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1184         } else {
1185                 mutex_enter(&zone->zone_lock);
1186         }
1187
1188         t = zsd_find(&zone->zone_zsd, key);
1189         if (t == NULL) {
1190                 /*
1191                  * Somebody else got here first, e.g. the zone is
1192                  * going away.
1193                  */
1194                 if (!zone_lock_held)
1195                         mutex_exit(&zone->zone_lock);
1196                 return (B_FALSE);
1197         }
1198         dropped = B_FALSE;
1199         if (zsd_wait_for_creator(zone, t, lockp))
1200                 dropped = B_TRUE;
1201
1202         if (zsd_wait_for_inprogress(zone, t, lockp))
1203                 dropped = B_TRUE;
1204
1205         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1206                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1207                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1208                 DTRACE_PROBE2(zsd__destroy__inprogress,
1209                     zone_t *, zone, zone_key_t, key);
1210                 mutex_exit(&zone->zone_lock);
1211                 if (lockp != NULL)
1212                         mutex_exit(lockp);
1213                 dropped = B_TRUE;
1214                 /* Both locks stay released while the callback runs. */
1215                 ASSERT(t->zsd_destroy != NULL);
1216                 data = t->zsd_data;
1217                 DTRACE_PROBE2(zsd__destroy__start,
1218                     zone_t *, zone, zone_key_t, key);
1219
1220                 (t->zsd_destroy)(zone->zone_id, data);
1221                 DTRACE_PROBE2(zsd__destroy__end,
1222                     zone_t *, zone, zone_key_t, key);
1223
1224                 if (lockp != NULL)
1225                         mutex_enter(lockp);
1226                 mutex_enter(&zone->zone_lock);
1227                 t->zsd_data = NULL;             /* value has been destroyed */
1228                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1229                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1230                 cv_broadcast(&t->zsd_cv);       /* wake any waiters */
1231                 DTRACE_PROBE2(zsd__destroy__completed,
1232                     zone_t *, zone, zone_key_t, key);
1233         }
1234         if (!zone_lock_held)
1235                 mutex_exit(&zone->zone_lock);
1236         return (dropped);
1237 }
1238
1239 /*
1240  * Block until the entry's ZSD_CREATE_NEEDED flag has been cleared.
1241  * Returns true if lockp was released (and retaken) while waiting.
1242  */
1243 static boolean_t
1244 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1245 {
1246         boolean_t released = B_FALSE;
1247
1248         while ((t->zsd_flags & ZSD_CREATE_NEEDED) != 0) {
1249                 DTRACE_PROBE2(zsd__wait__for__creator,
1250                     zone_t *, zone, struct zsd_entry *, t);
1251                 if (lockp == NULL) {
1252                         cv_wait(&t->zsd_cv, &zone->zone_lock);
1253                         continue;
1254                 }
1255                 released = B_TRUE;
1256                 mutex_exit(lockp);
1257                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1258                 /* Give up zone_lock so lockp is retaken ahead of it. */
1259                 mutex_exit(&zone->zone_lock);
1260                 mutex_enter(lockp);
1261                 mutex_enter(&zone->zone_lock);
1262         }
1263         return (released);
1264 }
1265
1266 /*
1267  * Block until none of the entry's ZSD_ALL_INPROGRESS flags is set.
1268  * Returns true if lockp was released (and retaken) while waiting.
1269  */
1270 static boolean_t
1271 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1272 {
1273         boolean_t released = B_FALSE;
1274
1275         while ((t->zsd_flags & ZSD_ALL_INPROGRESS) != 0) {
1276                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1277                     zone_t *, zone, struct zsd_entry *, t);
1278                 if (lockp == NULL) {
1279                         cv_wait(&t->zsd_cv, &zone->zone_lock);
1280                         continue;
1281                 }
1282                 released = B_TRUE;
1283                 mutex_exit(lockp);
1284                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1285                 /* Give up zone_lock so lockp is retaken ahead of it. */
1286                 mutex_exit(&zone->zone_lock);
1287                 mutex_enter(lockp);
1288                 mutex_enter(&zone->zone_lock);
1289         }
1290         return (released);
1291 }
1292
1293 /*
1294  * Free every entry on the zone's dataset list, including each entry's
1295  * name string, and then destroy the list.
1296  */
1297 static void
1298 zone_free_datasets(zone_t *zone)
1299 {
1300         zone_dataset_t *zd;
1301
1302         while ((zd = list_head(&zone->zone_datasets)) != NULL) {
1303                 list_remove(&zone->zone_datasets, zd);
1304                 kmem_free(zd->zd_dataset, strlen(zd->zd_dataset) + 1);
1305                 kmem_free(zd, sizeof (*zd));
1306         }
1307         list_destroy(&zone->zone_datasets);
1308 }
1309
1310 /* zone.cpu-shares resource control: usage is the zone's zone_shares. */
1311 /*ARGSUSED*/
1312 static rctl_qty_t
1313 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1314 {
1315         zone_t *zp = p->p_zone;
1316
1317         ASSERT(MUTEX_HELD(&p->p_lock));
1318         return (zp->zone_shares);
1319 }
1320
1321 /* Set callback: store nv in zone_shares unless there is no zone. */
1322 /*ARGSUSED*/
1323 static int
1324 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325     rctl_qty_t nv)
1326 {
1327         ASSERT(MUTEX_HELD(&p->p_lock));
1328         ASSERT(e->rcep_t == RCENTITY_ZONE);
1329
1330         if (e->rcep_p.zone != NULL)
1331                 e->rcep_p.zone->zone_shares = nv;
1332         return (0);
1333 }
1334
1335 static rctl_ops_t zone_cpu_shares_ops = {       /* zone.cpu-shares */
1336         rcop_no_action,                 /* action: none */
1337         zone_cpu_shares_usage,          /* usage */
1338         zone_cpu_shares_set,            /* set */
1339         rcop_no_test                    /* test: none */
1340 };
1341
1342 /* zone.cpu-cap resource control: usage comes from cpucaps_zone_get(). */
1343 /*ARGSUSED*/
1344 static rctl_qty_t
1345 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1346 {
1347         zone_t *zp = p->p_zone;
1348
1349         ASSERT(MUTEX_HELD(&p->p_lock));
1350         return (cpucaps_zone_get(zp));
1351 }
1352
1353 /*
1354  * Set callback for the zone CPU cap: hand the new value to
1355  * cpucaps_zone_set() and return its result.  When the entity carries
1356  * no zone there is nothing to set and 0 is returned.
1357  */
1358 /*ARGSUSED*/
1359 static int
1360 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1361     rctl_qty_t nv)
1362 {
1363         zone_t *zp;
1364
1365         ASSERT(MUTEX_HELD(&p->p_lock));
1366         ASSERT(e->rcep_t == RCENTITY_ZONE);
1367
1368         zp = e->rcep_p.zone;
1369         return (zp == NULL ? 0 : cpucaps_zone_set(zp, nv));
1370 }
1371
1372 static rctl_ops_t zone_cpu_cap_ops = {          /* zone.cpu-cap */
1373         rcop_no_action,                 /* action: none */
1374         zone_cpu_cap_get,               /* usage */
1375         zone_cpu_cap_set,               /* set */
1376         rcop_no_test                    /* test: none */
1377 };
1378
1379 /* Usage callback: snapshot zone_nlwps under zone_nlwps_lock. */
1380 /*ARGSUSED*/
1381 static rctl_qty_t
1382 zone_lwps_usage(rctl_t *r, proc_t *p)
1383 {
1384         zone_t *zp = p->p_zone;
1385         kmutex_t *lock = &zp->zone_nlwps_lock;
1386         rctl_qty_t count;
1387
1388         ASSERT(MUTEX_HELD(&p->p_lock));
1389         mutex_enter(lock);
1390         count = zp->zone_nlwps;
1391         mutex_exit(lock);
1392         return (count);
1393 }
1394
1395 /*
1396  * Test: 1 if zone_nlwps + incr exceeds rcntl->rcv_value, 0 otherwise.
1397  */
1398 /*ARGSUSED*/
1399 static int
1400 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1401     rctl_qty_t incr, uint_t flags)
1402 {
1403         zone_t *zp;
1404         rctl_qty_t total;
1405
1406         ASSERT(MUTEX_HELD(&p->p_lock));
1407         ASSERT(e->rcep_t == RCENTITY_ZONE);
1408         if ((zp = e->rcep_p.zone) == NULL)
1409                 return (0);     /* no zone: nothing to test against */
1410         ASSERT(MUTEX_HELD(&zp->zone_nlwps_lock));
1411         total = zp->zone_nlwps + incr;
1412         return ((total > rcntl->rcv_value) ? 1 : 0);
1413 }
1414
1415 /* Set callback: store nv in zone_nlwps_ctl unless there is no zone. */
1416 /*ARGSUSED*/
1417 static int
1418 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 {
1420         ASSERT(MUTEX_HELD(&p->p_lock));
1421         ASSERT(e->rcep_t == RCENTITY_ZONE);
1422         if (e->rcep_p.zone != NULL)
1423                 e->rcep_p.zone->zone_nlwps_ctl = nv;
1424         return (0);
1425 }
1426
1427 static rctl_ops_t zone_lwps_ops = {     /* rctl ops: zone LWP count */
1428         rcop_no_action,                 /* action: none */
1429         zone_lwps_usage,                /* usage */
1430         zone_lwps_set,                  /* set */
1431         zone_lwps_test,                 /* test */
1432 };
1433
1434 /* Usage callback: snapshot zone_nprocs under zone_nlwps_lock. */
1435 /*ARGSUSED*/
1436 static rctl_qty_t
1437 zone_procs_usage(rctl_t *r, proc_t *p)
1438 {
1439         zone_t *zp = p->p_zone;
1440         kmutex_t *lock = &zp->zone_nlwps_lock;
1441         rctl_qty_t count;
1442
1443         ASSERT(MUTEX_HELD(&p->p_lock));
1444         mutex_enter(lock);
1445         count = zp->zone_nprocs;
1446         mutex_exit(lock);
1447         return (count);
1448 }
1449
1450 /*
1451  * Test: 1 if zone_nprocs + incr exceeds rcntl->rcv_value, 0 otherwise.
1452  */
1453 /*ARGSUSED*/
1454 static int
1455 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1456     rctl_qty_t incr, uint_t flags)
1457 {
1458         zone_t *zp;
1459         rctl_qty_t total;
1460
1461         ASSERT(MUTEX_HELD(&p->p_lock));
1462         ASSERT(e->rcep_t == RCENTITY_ZONE);
1463         if ((zp = e->rcep_p.zone) == NULL)
1464                 return (0);     /* no zone: nothing to test against */
1465         ASSERT(MUTEX_HELD(&zp->zone_nlwps_lock));
1466         total = zp->zone_nprocs + incr;
1467         return ((total > rcntl->rcv_value) ? 1 : 0);
1468 }
1469
1470 /* Set callback: store nv in zone_nprocs_ctl unless there is no zone. */
1471 /*ARGSUSED*/
1472 static int
1473 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 {
1475         ASSERT(MUTEX_HELD(&p->p_lock));
1476         ASSERT(e->rcep_t == RCENTITY_ZONE);
1477         if (e->rcep_p.zone != NULL)
1478                 e->rcep_p.zone->zone_nprocs_ctl = nv;
1479         return (0);
1480 }
1481
1482 static rctl_ops_t zone_procs_ops = {    /* rctl ops: zone process count */
1483         rcop_no_action,                 /* action: none */
1484         zone_procs_usage,               /* usage */
1485         zone_procs_set,                 /* set */
1486         zone_procs_test,                /* test */
1487 };
1488
1489 /*ARGSUSED*/    /* Usage callback: report zone_shmmax of p's zone. */
1490 static rctl_qty_t
1491 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1492 {
1493         ASSERT(MUTEX_HELD(&p->p_lock));         /* caller holds p_lock */
1494         return (p->p_zone->zone_shmmax);
1495 }
1496
1497 /* Test callback: 1 if zone_shmmax + incr exceeds rval->rcv_value. */
1498 /*ARGSUSED*/
1499 static int
1500 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501     rctl_qty_t incr, uint_t flags)
1502 {
1503         rctl_qty_t total;
1504
1505         ASSERT(MUTEX_HELD(&p->p_lock));
1506         ASSERT(e->rcep_t == RCENTITY_ZONE);
1507         total = e->rcep_p.zone->zone_shmmax + incr;
1508         return ((total > rval->rcv_value) ? 1 : 0);
1509 }
1510
1511 static rctl_ops_t zone_shmmax_ops = {   /* rctl ops: zone_shmmax */
1512         rcop_no_action,                 /* action: none */
1513         zone_shmmax_usage,              /* usage */
1514         rcop_no_set,                    /* set: none */
1515         zone_shmmax_test                /* test */
1516 };
1517
1518 /*ARGSUSED*/    /* Usage callback: report zone_ipc.ipcq_shmmni. */
1519 static rctl_qty_t
1520 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1521 {
1522         ASSERT(MUTEX_HELD(&p->p_lock));         /* caller holds p_lock */
1523         return (p->p_zone->zone_ipc.ipcq_shmmni);
1524 }
1525
1526 /* Test callback: 1 if ipcq_shmmni + incr exceeds rval->rcv_value. */
1527 /*ARGSUSED*/
1528 static int
1529 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t total;
1533
1534         ASSERT(MUTEX_HELD(&p->p_lock));
1535         ASSERT(e->rcep_t == RCENTITY_ZONE);
1536         total = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1537         return ((total > rval->rcv_value) ? 1 : 0);
1538 }
1539
1540 static rctl_ops_t zone_shmmni_ops = {   /* rctl ops: ipcq_shmmni */
1541         rcop_no_action,                 /* action: none */
1542         zone_shmmni_usage,              /* usage */
1543         rcop_no_set,                    /* set: none */
1544         zone_shmmni_test                /* test */
1545 };
1546
1547 /*ARGSUSED*/    /* Usage callback: report zone_ipc.ipcq_semmni. */
1548 static rctl_qty_t
1549 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1550 {
1551         ASSERT(MUTEX_HELD(&p->p_lock));         /* caller holds p_lock */
1552         return (p->p_zone->zone_ipc.ipcq_semmni);
1553 }
1554
1555 /* Test callback: 1 if ipcq_semmni + incr exceeds rval->rcv_value. */
1556 /*ARGSUSED*/
1557 static int
1558 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559     rctl_qty_t incr, uint_t flags)
1560 {
1561         rctl_qty_t total;
1562
1563         ASSERT(MUTEX_HELD(&p->p_lock));
1564         ASSERT(e->rcep_t == RCENTITY_ZONE);
1565         total = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1566         return ((total > rval->rcv_value) ? 1 : 0);
1567 }
1568
1569 static rctl_ops_t zone_semmni_ops = {   /* rctl ops: ipcq_semmni */
1570         rcop_no_action,                 /* action: none */
1571         zone_semmni_usage,              /* usage */
1572         rcop_no_set,                    /* set: none */
1573         zone_semmni_test                /* test */
1574 };
1575
1576 /*ARGSUSED*/    /* Usage callback: report zone_ipc.ipcq_msgmni. */
1577 static rctl_qty_t
1578 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1579 {
1580         ASSERT(MUTEX_HELD(&p->p_lock));         /* caller holds p_lock */
1581         return (p->p_zone->zone_ipc.ipcq_msgmni);
1582 }
1583
1584 /* Test callback: 1 if ipcq_msgmni + incr exceeds rval->rcv_value. */
1585 /*ARGSUSED*/
1586 static int
1587 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588     rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t total;
1591
1592         ASSERT(MUTEX_HELD(&p->p_lock));
1593         ASSERT(e->rcep_t == RCENTITY_ZONE);
1594         total = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1595         return ((total > rval->rcv_value) ? 1 : 0);
1596 }
1597
1598 static rctl_ops_t zone_msgmni_ops = {   /* rctl ops: ipcq_msgmni */
1599         rcop_no_action,                 /* action: none */
1600         zone_msgmni_usage,              /* usage */
1601         rcop_no_set,                    /* set: none */
1602         zone_msgmni_test                /* test */
1603 };
1604
1605 /*ARGSUSED*/    /* Usage callback: report the zone's zone_locked_mem. */
1606 static rctl_qty_t
1607 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1608 {
1609         rctl_qty_t q;
1610         ASSERT(MUTEX_HELD(&p->p_lock));
1611         mutex_enter(&p->p_zone->zone_mem_lock); /* read under zone_mem_lock */
1612         q = p->p_zone->zone_locked_mem;
1613         mutex_exit(&p->p_zone->zone_mem_lock);
1614         return (q);
1615 }
1616
1617 /*
1618  * Test callback: 1 if zone_locked_mem + incr exceeds rcntl->rcv_value.
1619  */
1620 /*ARGSUSED*/
1621 static int
1622 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1623     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1624 {
1625         zone_t *zp = e->rcep_p.zone;
1626         rctl_qty_t locked;
1627
1628         ASSERT(MUTEX_HELD(&p->p_lock));
1629         ASSERT(MUTEX_HELD(&zp->zone_mem_lock));
1630         locked = zp->zone_locked_mem;
1631         return ((locked + incr > rcntl->rcv_value) ? 1 : 0);
1632 }
1633
1634 /* Set callback: store nv in zone_locked_mem_ctl unless there is no zone. */
1635 /*ARGSUSED*/
1636 static int
1637 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638     rctl_qty_t nv)
1639 {
1640         ASSERT(MUTEX_HELD(&p->p_lock));
1641         ASSERT(e->rcep_t == RCENTITY_ZONE);
1642         if (e->rcep_p.zone != NULL)
1643                 e->rcep_p.zone->zone_locked_mem_ctl = nv;
1644         return (0);
1645 }
1646
1647 static rctl_ops_t zone_locked_mem_ops = {
1648         rcop_no_action,
1649         zone_locked_mem_usage,
1650         zone_locked_mem_set,
1651         zone_locked_mem_test
1652 };
1653
1654 /*ARGSUSED*/
1655 static rctl_qty_t
1656 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1657 {
1658         rctl_qty_t q;
1659         zone_t *z = p->p_zone;
1660
1661         ASSERT(MUTEX_HELD(&p->p_lock));
1662         mutex_enter(&z->zone_mem_lock);
1663         q = z->zone_max_swap;
1664         mutex_exit(&z->zone_mem_lock);
1665         return (q);
1666 }
1667
1668 /*ARGSUSED*/
1669 static int
1670 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1671     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1672 {
1673         rctl_qty_t q;
1674         zone_t *z;
1675
1676         z = e->rcep_p.zone;
1677         ASSERT(MUTEX_HELD(&p->p_lock));
1678         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1679         q = z->zone_max_swap;
1680         if (q + incr > rcntl->rcv_value)
1681                 return (1);
1682         return (0);
1683 }
1684
1685 /*ARGSUSED*/
1686 static int
1687 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1688     rctl_qty_t nv)
1689 {
1690         ASSERT(MUTEX_HELD(&p->p_lock));
1691         ASSERT(e->rcep_t == RCENTITY_ZONE);
1692         if (e->rcep_p.zone == NULL)
1693                 return (0);
1694         e->rcep_p.zone->zone_max_swap_ctl = nv;
1695         return (0);
1696 }
1697
1698 static rctl_ops_t zone_max_swap_ops = {
1699         rcop_no_action,
1700         zone_max_swap_usage,
1701         zone_max_swap_set,
1702         zone_max_swap_test
1703 };
1704
1705 /*ARGSUSED*/
1706 static rctl_qty_t
1707 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1708 {
1709         rctl_qty_t q;
1710         zone_t *z = p->p_zone;
1711
1712         ASSERT(MUTEX_HELD(&p->p_lock));
1713         mutex_enter(&z->zone_rctl_lock);
1714         q = z->zone_max_lofi;
1715         mutex_exit(&z->zone_rctl_lock);
1716         return (q);
1717 }
1718
1719 /*ARGSUSED*/
1720 static int
1721 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1722     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1723 {
1724         rctl_qty_t q;
1725         zone_t *z;
1726
1727         z = e->rcep_p.zone;
1728         ASSERT(MUTEX_HELD(&p->p_lock));
1729         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1730         q = z->zone_max_lofi;
1731         if (q + incr > rcntl->rcv_value)
1732                 return (1);
1733         return (0);
1734 }
1735
1736 /*ARGSUSED*/
1737 static int
1738 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1739     rctl_qty_t nv)
1740 {
1741         ASSERT(MUTEX_HELD(&p->p_lock));
1742         ASSERT(e->rcep_t == RCENTITY_ZONE);
1743         if (e->rcep_p.zone == NULL)
1744                 return (0);
1745         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1746         return (0);
1747 }
1748
1749 static rctl_ops_t zone_max_lofi_ops = {
1750         rcop_no_action,
1751         zone_max_lofi_usage,
1752         zone_max_lofi_set,
1753         zone_max_lofi_test
1754 };
1755
1756 /*
1757  * Helper function to brand the zone with a unique ID.
1758  */
1759 static void
1760 zone_uniqid(zone_t *zone)
1761 {
1762         static uint64_t uniqid = 0;
1763
1764         ASSERT(MUTEX_HELD(&zonehash_lock));
1765         zone->zone_uniqid = uniqid++;
1766 }
1767
1768 /*
1769  * Returns a held pointer to the "kcred" for the specified zone.
1770  */
1771 struct cred *
1772 zone_get_kcred(zoneid_t zoneid)
1773 {
1774         zone_t *zone;
1775         cred_t *cr;
1776
1777         if ((zone = zone_find_by_id(zoneid)) == NULL)
1778                 return (NULL);
1779         cr = zone->zone_kcred;
1780         crhold(cr);
1781         zone_rele(zone);
1782         return (cr);
1783 }
1784
1785 static int
1786 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1787 {
1788         zone_t *zone = ksp->ks_private;
1789         zone_kstat_t *zk = ksp->ks_data;
1790
1791         if (rw == KSTAT_WRITE)
1792                 return (EACCES);
1793
1794         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1795         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1796         return (0);
1797 }
1798
1799 static int
1800 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1801 {
1802         zone_t *zone = ksp->ks_private;
1803         zone_kstat_t *zk = ksp->ks_data;
1804
1805         if (rw == KSTAT_WRITE)
1806                 return (EACCES);
1807
1808         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1809         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1810         return (0);
1811 }
1812
1813 static int
1814 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1815 {
1816         zone_t *zone = ksp->ks_private;
1817         zone_kstat_t *zk = ksp->ks_data;
1818
1819         if (rw == KSTAT_WRITE)
1820                 return (EACCES);
1821
1822         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1823         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1824         return (0);
1825 }
1826
1827 static kstat_t *
1828 zone_kstat_create_common(zone_t *zone, char *name,
1829     int (*updatefunc) (kstat_t *, int))
1830 {
1831         kstat_t *ksp;
1832         zone_kstat_t *zk;
1833
1834         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1835             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1836             KSTAT_FLAG_VIRTUAL);
1837
1838         if (ksp == NULL)
1839                 return (NULL);
1840
1841         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1842         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1843         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1844         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1845         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1846         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1847         ksp->ks_update = updatefunc;
1848         ksp->ks_private = zone;
1849         kstat_install(ksp);
1850         return (ksp);
1851 }
1852
1853
1854 static int
1855 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1856 {
1857         zone_t *zone = ksp->ks_private;
1858         zone_mcap_kstat_t *zmp = ksp->ks_data;
1859
1860         if (rw == KSTAT_WRITE)
1861                 return (EACCES);
1862
1863         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1864         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1865         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1866         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1867         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1868
1869         return (0);
1870 }
1871
1872 static kstat_t *
1873 zone_mcap_kstat_create(zone_t *zone)
1874 {
1875         kstat_t *ksp;
1876         zone_mcap_kstat_t *zmp;
1877
1878         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1879             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1880             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1881             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1882                 return (NULL);
1883
1884         if (zone->zone_id != GLOBAL_ZONEID)
1885                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1886
1887         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1888         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1889         ksp->ks_lock = &zone->zone_mcap_lock;
1890         zone->zone_mcap_stats = zmp;
1891
1892         /* The kstat "name" field is not large enough for a full zonename */
1893         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1894         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1895         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1896         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1900             KSTAT_DATA_UINT64);
1901
1902         ksp->ks_update = zone_mcap_kstat_update;
1903         ksp->ks_private = zone;
1904
1905         kstat_install(ksp);
1906         return (ksp);
1907 }
1908
1909 static int
1910 zone_misc_kstat_update(kstat_t *ksp, int rw)
1911 {
1912         zone_t *zone = ksp->ks_private;
1913         zone_misc_kstat_t *zmp = ksp->ks_data;
1914         hrtime_t tmp;
1915
1916         if (rw == KSTAT_WRITE)
1917                 return (EACCES);
1918
1919         tmp = zone->zone_utime;
1920         scalehrtime(&tmp);
1921         zmp->zm_utime.value.ui64 = tmp;
1922         tmp = zone->zone_stime;
1923         scalehrtime(&tmp);
1924         zmp->zm_stime.value.ui64 = tmp;
1925         tmp = zone->zone_wtime;
1926         scalehrtime(&tmp);
1927         zmp->zm_wtime.value.ui64 = tmp;
1928
1929         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1930         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1931         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1932
1933         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1934         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1935         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1936         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1937
1938         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1939
1940         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1941         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1942
1943         return (0);
1944 }
1945
1946 static kstat_t *
1947 zone_misc_kstat_create(zone_t *zone)
1948 {
1949         kstat_t *ksp;
1950         zone_misc_kstat_t *zmp;
1951
1952         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1953             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1954             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1955             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1956                 return (NULL);
1957
1958         if (zone->zone_id != GLOBAL_ZONEID)
1959                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1960
1961         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1962         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1963         ksp->ks_lock = &zone->zone_misc_lock;
1964         zone->zone_misc_stats = zmp;
1965
1966         /* The kstat "name" field is not large enough for a full zonename */
1967         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1968         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1969         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1970         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1971         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1972         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1973         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1974         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1975             KSTAT_DATA_UINT32);
1976         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1977         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1978             KSTAT_DATA_UINT32);
1979         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1982             KSTAT_DATA_UINT32);
1983         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1985
1986         ksp->ks_update = zone_misc_kstat_update;
1987         ksp->ks_private = zone;
1988
1989         kstat_install(ksp);
1990         return (ksp);
1991 }
1992
1993 static void
1994 zone_kstat_create(zone_t *zone)
1995 {
1996         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1997             "lockedmem", zone_lockedmem_kstat_update);
1998         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1999             "swapresv", zone_swapresv_kstat_update);
2000         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2001             "nprocs", zone_nprocs_kstat_update);
2002
2003         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2004                 zone->zone_mcap_stats = kmem_zalloc(
2005                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2006         }
2007
2008         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2009                 zone->zone_misc_stats = kmem_zalloc(
2010                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2011         }
2012 }
2013
2014 static void
2015 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2016 {
2017         void *data;
2018
2019         if (*pkstat != NULL) {
2020                 data = (*pkstat)->ks_data;
2021                 kstat_delete(*pkstat);
2022                 kmem_free(data, datasz);
2023                 *pkstat = NULL;
2024         }
2025 }
2026
2027 static void
2028 zone_kstat_delete(zone_t *zone)
2029 {
2030         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2031             sizeof (zone_kstat_t));
2032         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2033             sizeof (zone_kstat_t));
2034         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2035             sizeof (zone_kstat_t));
2036         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2037             sizeof (zone_mcap_kstat_t));
2038         zone_kstat_delete_common(&zone->zone_misc_ksp,
2039             sizeof (zone_misc_kstat_t));
2040 }
2041
2042 /*
2043  * Called very early on in boot to initialize the ZSD list so that
2044  * zone_key_create() can be called before zone_init().  It also initializes
2045  * portions of zone0 which may be used before zone_init() is called.  The
2046  * variable "global_zone" will be set when zone0 is fully initialized by
2047  * zone_init().
2048  */
2049 void
2050 zone_zsd_init(void)
2051 {
2052         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2053         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2054         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2055             offsetof(struct zsd_entry, zsd_linkage));
2056         list_create(&zone_active, sizeof (zone_t),
2057             offsetof(zone_t, zone_linkage));
2058         list_create(&zone_deathrow, sizeof (zone_t),
2059             offsetof(zone_t, zone_linkage));
2060
2061         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2062         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2063         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2064         zone0.zone_shares = 1;
2065         zone0.zone_nlwps = 0;
2066         zone0.zone_nlwps_ctl = INT_MAX;
2067         zone0.zone_nprocs = 0;
2068         zone0.zone_nprocs_ctl = INT_MAX;
2069         zone0.zone_locked_mem = 0;
2070         zone0.zone_locked_mem_ctl = UINT64_MAX;
2071         ASSERT(zone0.zone_max_swap == 0);
2072         zone0.zone_max_swap_ctl = UINT64_MAX;
2073         zone0.zone_max_lofi = 0;
2074         zone0.zone_max_lofi_ctl = UINT64_MAX;
2075         zone0.zone_shmmax = 0;
2076         zone0.zone_ipc.ipcq_shmmni = 0;
2077         zone0.zone_ipc.ipcq_semmni = 0;
2078         zone0.zone_ipc.ipcq_msgmni = 0;
2079         zone0.zone_name = GLOBAL_ZONENAME;
2080         zone0.zone_nodename = utsname.nodename;
2081         zone0.zone_domain = srpc_domain;
2082         zone0.zone_hostid = HW_INVALID_HOSTID;
2083         zone0.zone_fs_allowed = NULL;
2084         psecflags_default(&zone0.zone_secflags);
2085         zone0.zone_ref = 1;
2086         zone0.zone_id = GLOBAL_ZONEID;
2087         zone0.zone_status = ZONE_IS_RUNNING;
2088         zone0.zone_rootpath = "/";
2089         zone0.zone_rootpathlen = 2;
2090         zone0.zone_psetid = ZONE_PS_INVAL;
2091         zone0.zone_ncpus = 0;
2092         zone0.zone_ncpus_online = 0;
2093         zone0.zone_proc_initpid = 1;
2094         zone0.zone_initname = initname;
2095         zone0.zone_lockedmem_kstat = NULL;
2096         zone0.zone_swapresv_kstat = NULL;
2097         zone0.zone_nprocs_kstat = NULL;
2098
2099         zone0.zone_stime = 0;
2100         zone0.zone_utime = 0;
2101         zone0.zone_wtime = 0;
2102
2103         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2104             offsetof(zone_ref_t, zref_linkage));
2105         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2106             offsetof(struct zsd_entry, zsd_linkage));
2107         list_insert_head(&zone_active, &zone0);
2108
2109         /*
2110          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2111          * to anything meaningful.  It is assigned to be 'rootdir' in
2112          * vfs_mountroot().
2113          */
2114         zone0.zone_rootvp = NULL;
2115         zone0.zone_vfslist = NULL;
2116         zone0.zone_bootargs = initargs;
2117         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2118         /*
2119          * The global zone has all privileges
2120          */
2121         priv_fillset(zone0.zone_privset);
2122         /*
2123          * Add p0 to the global zone
2124          */
2125         zone0.zone_zsched = &p0;
2126         p0.p_zone = &zone0;
2127 }
2128
2129 /*
2130  * Compute a hash value based on the contents of the label and the DOI.  The
2131  * hash algorithm is somewhat arbitrary, but is based on the observation that
2132  * humans will likely pick labels that differ by amounts that work out to be
2133  * multiples of the number of hash chains, and thus stirring in some primes
2134  * should help.
2135  */
2136 static uint_t
2137 hash_bylabel(void *hdata, mod_hash_key_t key)
2138 {
2139         const ts_label_t *lab = (ts_label_t *)key;
2140         const uint32_t *up, *ue;
2141         uint_t hash;
2142         int i;
2143
2144         _NOTE(ARGUNUSED(hdata));
2145
2146         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2147         /* we depend on alignment of label, but not representation */
2148         up = (const uint32_t *)&lab->tsl_label;
2149         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2150         i = 1;
2151         while (up < ue) {
2152                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2153                 hash += *up + (*up << ((i % 16) + 1));
2154                 up++;
2155                 i++;
2156         }
2157         return (hash);
2158 }
2159
2160 /*
2161  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2162  * equal).  This may need to be changed if less than / greater than is ever
2163  * needed.
2164  */
2165 static int
2166 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2167 {
2168         ts_label_t *lab1 = (ts_label_t *)key1;
2169         ts_label_t *lab2 = (ts_label_t *)key2;
2170
2171         return (label_equal(lab1, lab2) ? 0 : 1);
2172 }
2173
2174 /*
2175  * Called by main() to initialize the zones framework.
2176  */
2177 void
2178 zone_init(void)
2179 {
2180         rctl_dict_entry_t *rde;
2181         rctl_val_t *dval;
2182         rctl_set_t *set;
2183         rctl_alloc_gp_t *gp;
2184         rctl_entity_p_t e;
2185         int res;
2186
2187         ASSERT(curproc == &p0);
2188
2189         /*
2190          * Create ID space for zone IDs.  ID 0 is reserved for the
2191          * global zone.
2192          */
2193         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2194
2195         /*
2196          * Initialize generic zone resource controls, if any.
2197          */
2198         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2199             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2200             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2201             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2202
2203         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2204             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2205             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2206             RCTL_GLOBAL_INFINITE,
2207             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2208
2209         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2210             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2211             INT_MAX, INT_MAX, &zone_lwps_ops);
2212
2213         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2214             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2215             INT_MAX, INT_MAX, &zone_procs_ops);
2216
2217         /*
2218          * System V IPC resource controls
2219          */
2220         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2221             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2222             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2223
2224         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2225             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2226             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2227
2228         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2229             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2230             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2231
2232         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2233             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2234             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2235
2236         /*
2237          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2238          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2239          */
2240         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2241         bzero(dval, sizeof (rctl_val_t));
2242         dval->rcv_value = 1;
2243         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2244         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2245         dval->rcv_action_recip_pid = -1;
2246
2247         rde = rctl_dict_lookup("zone.cpu-shares");
2248         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2249
2250         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2251             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2252             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2253             &zone_locked_mem_ops);
2254
2255         rc_zone_max_swap = rctl_register("zone.max-swap",
2256             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2257             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2258             &zone_max_swap_ops);
2259
2260         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2261             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2262             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2263             &zone_max_lofi_ops);
2264
2265         /*
2266          * Initialize the ``global zone''.
2267          */
2268         set = rctl_set_create();
2269         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2270         mutex_enter(&p0.p_lock);
2271         e.rcep_p.zone = &zone0;
2272         e.rcep_t = RCENTITY_ZONE;
2273         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2274             gp);
2275
2276         zone0.zone_nlwps = p0.p_lwpcnt;
2277         zone0.zone_nprocs = 1;
2278         zone0.zone_ntasks = 1;
2279         mutex_exit(&p0.p_lock);
2280         zone0.zone_restart_init = B_TRUE;
2281         zone0.zone_brand = &native_brand;
2282         rctl_prealloc_destroy(gp);
2283         /*
2284          * pool_default hasn't been initialized yet, so we let pool_init()
2285          * take care of making sure the global zone is in the default pool.
2286          */
2287
2288         /*
2289          * Initialize global zone kstats
2290          */
2291         zone_kstat_create(&zone0);
2292
2293         /*
2294          * Initialize zone label.
2295          * mlp are initialized when tnzonecfg is loaded.
2296          */
2297         zone0.zone_slabel = l_admin_low;
2298         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2299         label_hold(l_admin_low);
2300
2301         /*
2302          * Initialise the lock for the database structure used by mntfs.
2303          */
2304         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2305
2306         mutex_enter(&zonehash_lock);
2307         zone_uniqid(&zone0);
2308         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2309
2310         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2311             mod_hash_null_valdtor);
2312         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2313             zone_hash_size, mod_hash_null_valdtor);
2314         /*
2315          * maintain zonehashbylabel only for labeled systems
2316          */
2317         if (is_system_labeled())
2318                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2319                     zone_hash_size, mod_hash_null_keydtor,
2320                     mod_hash_null_valdtor, hash_bylabel, NULL,
2321                     hash_labelkey_cmp, KM_SLEEP);
2322         zonecount = 1;
2323
2324         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2325             (mod_hash_val_t)&zone0);
2326         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2327             (mod_hash_val_t)&zone0);
2328         if (is_system_labeled()) {
2329                 zone0.zone_flags |= ZF_HASHED_LABEL;
2330                 (void) mod_hash_insert(zonehashbylabel,
2331                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2332         }
2333         mutex_exit(&zonehash_lock);
2334
2335         /*
2336          * We avoid setting zone_kcred until now, since kcred is initialized
2337          * sometime after zone_zsd_init() and before zone_init().
2338          */
2339         zone0.zone_kcred = kcred;
2340         /*
2341          * The global zone is fully initialized (except for zone_rootvp which
2342          * will be set when the root filesystem is mounted).
2343          */
2344         global_zone = &zone0;
2345
2346         /*
2347          * Setup an event channel to send zone status change notifications on
2348          */
2349         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2350             EVCH_CREAT);
2351
2352         if (res)
2353                 panic("Sysevent_evc_bind failed during zone setup.\n");
2354
2355 }
2356
2357 static void
2358 zone_free(zone_t *zone)
2359 {
2360         ASSERT(zone != global_zone);
2361         ASSERT(zone->zone_ntasks == 0);
2362         ASSERT(zone->zone_nlwps == 0);
2363         ASSERT(zone->zone_nprocs == 0);
2364         ASSERT(zone->zone_cred_ref == 0);
2365         ASSERT(zone->zone_kcred == NULL);
2366         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2367             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2368         ASSERT(list_is_empty(&zone->zone_ref_list));
2369
2370         /*
2371          * Remove any zone caps.
2372          */
2373         cpucaps_zone_remove(zone);
2374
2375         ASSERT(zone->zone_cpucap == NULL);
2376
2377         /* remove from deathrow list */
2378         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2379                 ASSERT(zone->zone_ref == 0);
2380                 mutex_enter(&zone_deathrow_lock);
2381                 list_remove(&zone_deathrow, zone);
2382                 mutex_exit(&zone_deathrow_lock);
2383         }
2384
2385         list_destroy(&zone->zone_ref_list);
2386         zone_free_zsd(zone);
2387         zone_free_datasets(zone);
2388         list_destroy(&zone->zone_dl_list);
2389
2390         if (zone->zone_rootvp != NULL)
2391                 VN_RELE(zone->zone_rootvp);
2392         if (zone->zone_rootpath)
2393                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2394         if (zone->zone_name != NULL)
2395                 kmem_free(zone->zone_name, ZONENAME_MAX);
2396         if (zone->zone_slabel != NULL)
2397                 label_rele(zone->zone_slabel);
2398         if (zone->zone_nodename != NULL)
2399                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2400         if (zone->zone_domain != NULL)
2401                 kmem_free(zone->zone_domain, _SYS_NMLN);
2402         if (zone->zone_privset != NULL)
2403                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2404         if (zone->zone_rctls != NULL)
2405                 rctl_set_free(zone->zone_rctls);
2406         if (zone->zone_bootargs != NULL)
2407                 strfree(zone->zone_bootargs);
2408         if (zone->zone_initname != NULL)
2409                 strfree(zone->zone_initname);
2410         if (zone->zone_fs_allowed != NULL)
2411                 strfree(zone->zone_fs_allowed);
2412         if (zone->zone_pfexecd != NULL)
2413                 klpd_freelist(&zone->zone_pfexecd);
2414         id_free(zoneid_space, zone->zone_id);
2415         mutex_destroy(&zone->zone_lock);
2416         cv_destroy(&zone->zone_cv);
2417         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2418         rw_destroy(&zone->zone_mntfs_db_lock);
2419         kmem_free(zone, sizeof (zone_t));
2420 }
2421
2422 /*
2423  * See block comment at the top of this file for information about zone
2424  * status values.
2425  */
2426 /*
2427  * Convenience function for setting zone status.
2428  */
2429 static void
2430 zone_status_set(zone_t *zone, zone_status_t status)
2431 {
2432
2433         nvlist_t *nvl = NULL;
2434         ASSERT(MUTEX_HELD(&zone_status_lock));
2435         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2436             status >= zone_status_get(zone));
2437
2438         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2439             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2440             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2441             zone_status_table[status]) ||
2442             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2443             zone_status_table[zone->zone_status]) ||
2444             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2445             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2446             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2447             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2448 #ifdef DEBUG
2449                 (void) printf(
2450                     "Failed to allocate and send zone state change event.\n");
2451 #endif
2452         }
2453         nvlist_free(nvl);
2454
2455         zone->zone_status = status;
2456
2457         cv_broadcast(&zone->zone_cv);
2458 }
2459
2460 /*
2461  * Public function to retrieve the zone status.  The zone status may
2462  * change after it is retrieved.
2463  */
2464 zone_status_t
2465 zone_status_get(zone_t *zone)
2466 {
2467         return (zone->zone_status);
2468 }
2469
2470 static int
2471 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2472 {
2473         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2474         int err = 0;
2475
2476         ASSERT(zone != global_zone);
2477         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2478                 goto done;      /* EFAULT or ENAMETOOLONG */
2479
2480         if (zone->zone_bootargs != NULL)
2481                 strfree(zone->zone_bootargs);
2482
2483         zone->zone_bootargs = strdup(buf);
2484
2485 done:
2486         kmem_free(buf, BOOTARGS_MAX);
2487         return (err);
2488 }
2489
2490 static int
2491 zone_set_brand(zone_t *zone, const char *brand)
2492 {
2493         struct brand_attr *attrp;
2494         brand_t *bp;
2495
2496         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2497         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2498                 kmem_free(attrp, sizeof (struct brand_attr));
2499                 return (EFAULT);
2500         }
2501
2502         bp = brand_register_zone(attrp);
2503         kmem_free(attrp, sizeof (struct brand_attr));
2504         if (bp == NULL)
2505                 return (EINVAL);
2506
2507         /*
2508          * This is the only place where a zone can change it's brand.
2509          * We already need to hold zone_status_lock to check the zone
2510          * status, so we'll just use that lock to serialize zone
2511          * branding requests as well.
2512          */
2513         mutex_enter(&zone_status_lock);
2514
2515         /* Re-Branding is not allowed and the zone can't be booted yet */
2516         if ((ZONE_IS_BRANDED(zone)) ||
2517             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2518                 mutex_exit(&zone_status_lock);
2519                 brand_unregister_zone(bp);
2520                 return (EINVAL);
2521         }
2522
2523         /* set up the brand specific data */
2524         zone->zone_brand = bp;
2525         ZBROP(zone)->b_init_brand_data(zone);
2526
2527         mutex_exit(&zone_status_lock);
2528         return (0);
2529 }
2530
2531 static int
2532 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2533 {
2534         int err = 0;
2535         psecflags_t psf;
2536
2537         ASSERT(zone != global_zone);
2538
2539         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2540                 return (err);
2541
2542         if (zone_status_get(zone) > ZONE_IS_READY)
2543                 return (EINVAL);
2544
2545         if (!psecflags_validate(&psf))
2546                 return (EINVAL);
2547
2548         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2549
2550         /* Set security flags on the zone's zsched */
2551         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2552             sizeof (zone->zone_zsched->p_secflags));
2553
2554         return (0);
2555 }
2556
2557 static int
2558 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2559 {
2560         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2561         int err = 0;
2562
2563         ASSERT(zone != global_zone);
2564         if ((err = copyinstr(zone_fs_allowed, buf,
2565             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2566                 goto done;
2567
2568         if (zone->zone_fs_allowed != NULL)
2569                 strfree(zone->zone_fs_allowed);
2570
2571         zone->zone_fs_allowed = strdup(buf);
2572
2573 done:
2574         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2575         return (err);
2576 }
2577
2578 static int
2579 zone_set_initname(zone_t *zone, const char *zone_initname)
2580 {
2581         char initname[INITNAME_SZ];
2582         size_t len;
2583         int err = 0;
2584
2585         ASSERT(zone != global_zone);
2586         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2587                 return (err);   /* EFAULT or ENAMETOOLONG */
2588
2589         if (zone->zone_initname != NULL)
2590                 strfree(zone->zone_initname);
2591
2592         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2593         (void) strcpy(zone->zone_initname, initname);
2594         return (0);
2595 }
2596
2597 static int
2598 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2599 {
2600         uint64_t mcap;
2601         int err = 0;
2602
2603         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2604                 zone->zone_phys_mcap = mcap;
2605
2606         return (err);
2607 }
2608
2609 static int
2610 zone_set_sched_class(zone_t *zone, const char *new_class)
2611 {
2612         char sched_class[PC_CLNMSZ];
2613         id_t classid;
2614         int err;
2615
2616         ASSERT(zone != global_zone);
2617         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2618                 return (err);   /* EFAULT or ENAMETOOLONG */
2619
2620         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2621                 return (set_errno(EINVAL));
2622         zone->zone_defaultcid = classid;
2623         ASSERT(zone->zone_defaultcid > 0 &&
2624             zone->zone_defaultcid < loaded_classes);
2625
2626         return (0);
2627 }
2628
2629 /*
2630  * Block indefinitely waiting for (zone_status >= status)
2631  */
2632 void
2633 zone_status_wait(zone_t *zone, zone_status_t status)
2634 {
2635         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2636
2637         mutex_enter(&zone_status_lock);
2638         while (zone->zone_status < status) {
2639                 cv_wait(&zone->zone_cv, &zone_status_lock);
2640         }
2641         mutex_exit(&zone_status_lock);
2642 }
2643
2644 /*
2645  * Private CPR-safe version of zone_status_wait().
2646  */
2647 static void
2648 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2649 {
2650         callb_cpr_t cprinfo;
2651
2652         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2653
2654         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2655             str);
2656         mutex_enter(&zone_status_lock);
2657         while (zone->zone_status < status) {
2658                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2659                 cv_wait(&zone->zone_cv, &zone_status_lock);
2660                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2661         }
2662         /*
2663          * zone_status_lock is implicitly released by the following.
2664          */
2665         CALLB_CPR_EXIT(&cprinfo);
2666 }
2667
2668 /*
2669  * Block until zone enters requested state or signal is received.  Return (0)
2670  * if signaled, non-zero otherwise.
2671  */
2672 int
2673 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2674 {
2675         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2676
2677         mutex_enter(&zone_status_lock);
2678         while (zone->zone_status < status) {
2679                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2680                         mutex_exit(&zone_status_lock);
2681                         return (0);
2682                 }
2683         }
2684         mutex_exit(&zone_status_lock);
2685         return (1);
2686 }
2687
2688 /*
2689  * Block until the zone enters the requested state or the timeout expires,
2690  * whichever happens first.  Return (-1) if operation timed out, time remaining
2691  * otherwise.
2692  */
2693 clock_t
2694 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2695 {
2696         clock_t timeleft = 0;
2697
2698         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2699
2700         mutex_enter(&zone_status_lock);
2701         while (zone->zone_status < status && timeleft != -1) {
2702                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2703         }
2704         mutex_exit(&zone_status_lock);
2705         return (timeleft);
2706 }
2707
2708 /*
2709  * Block until the zone enters the requested state, the current process is
2710  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2711  * operation timed out, 0 if signaled, time remaining otherwise.
2712  */
2713 clock_t
2714 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2715 {
2716         clock_t timeleft = tim - ddi_get_lbolt();
2717
2718         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2719
2720         mutex_enter(&zone_status_lock);
2721         while (zone->zone_status < status) {
2722                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2723                     tim);
2724                 if (timeleft <= 0)
2725                         break;
2726         }
2727         mutex_exit(&zone_status_lock);
2728         return (timeleft);
2729 }
2730
2731 /*
2732  * Zones have two reference counts: one for references from credential
2733  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2734  * This is so we can allow a zone to be rebooted while there are still
2735  * outstanding cred references, since certain drivers cache dblks (which
2736  * implicitly results in cached creds).  We wait for zone_ref to drop to
2737  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2738  * later freed when the zone_cred_ref drops to 0, though nothing other
2739  * than the zone id and privilege set should be accessed once the zone
2740  * is "dead".
2741  *
2742  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2743  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2744  * to 0.  This can be useful to flush out other sources of cached creds
2745  * that may be less innocuous than the driver case.
2746  *
2747  * Zones also provide a tracked reference counting mechanism in which zone
2748  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2749  * debuggers determine the sources of leaked zone references.  See
2750  * zone_hold_ref() and zone_rele_ref() below for more information.
2751  */
2752
/*
 * Debugging flag: when non-zero, ZONE_IS_UNREF() additionally requires
 * zone_cred_ref to be 0 before a zone counts as unreferenced.
 */
int zone_wait_for_cred = 0;
2754
2755 static void
2756 zone_hold_locked(zone_t *z)
2757 {
2758         ASSERT(MUTEX_HELD(&z->zone_lock));
2759         z->zone_ref++;
2760         ASSERT(z->zone_ref != 0);
2761 }
2762
2763 /*
2764  * Increment the specified zone's reference count.  The zone's zone_t structure
2765  * will not be freed as long as the zone's reference count is nonzero.
2766  * Decrement the zone's reference count via zone_rele().
2767  *
2768  * NOTE: This function should only be used to hold zones for short periods of
2769  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2770  */
2771 void
2772 zone_hold(zone_t *z)
2773 {
2774         mutex_enter(&z->zone_lock);
2775         zone_hold_locked(z);
2776         mutex_exit(&z->zone_lock);
2777 }
2778
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1, unless we are waiting for cred references
 * (zone_wait_for_cred) and some of those are still outstanding.
 */
#define	ZONE_IS_UNREF(zone)	\
	((zone)->zone_ref == 1 && \
	!(zone_wait_for_cred && (zone)->zone_cred_ref != 0))
2786
2787 /*
2788  * Common zone reference release function invoked by zone_rele() and
2789  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2790  * zone's subsystem-specific reference counters are not affected by the
2791  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2792  * removed from the specified zone's reference list.  ref must be non-NULL iff
2793  * subsys is not ZONE_REF_NUM_SUBSYS.
2794  */
2795 static void
2796 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2797 {
2798         boolean_t wakeup;
2799
2800         mutex_enter(&z->zone_lock);
2801         ASSERT(z->zone_ref != 0);
2802         z->zone_ref--;
2803         if (subsys != ZONE_REF_NUM_SUBSYS) {
2804                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2805                 z->zone_subsys_ref[subsys]--;
2806                 list_remove(&z->zone_ref_list, ref);
2807         }
2808         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2809                 /* no more refs, free the structure */
2810                 mutex_exit(&z->zone_lock);
2811                 zone_free(z);
2812                 return;
2813         }
2814         /* signal zone_destroy so the zone can finish halting */
2815         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2816         mutex_exit(&z->zone_lock);
2817
2818         if (wakeup) {
2819                 /*
2820                  * Grabbing zonehash_lock here effectively synchronizes with
2821                  * zone_destroy() to avoid missed signals.
2822                  */
2823                 mutex_enter(&zonehash_lock);
2824                 cv_broadcast(&zone_destroy_cv);
2825                 mutex_exit(&zonehash_lock);
2826         }
2827 }
2828
2829 /*
2830  * Decrement the specified zone's reference count.  The specified zone will
2831  * cease to exist after this function returns if the reference count drops to
2832  * zero.  This function should be paired with zone_hold().
2833  */
2834 void
2835 zone_rele(zone_t *z)
2836 {
2837         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2838 }
2839
2840 /*
2841  * Initialize a zone reference structure.  This function must be invoked for
2842  * a reference structure before the structure is passed to zone_hold_ref().
2843  */
2844 void
2845 zone_init_ref(zone_ref_t *ref)
2846 {
2847         ref->zref_zone = NULL;
2848         list_link_init(&ref->zref_linkage);
2849 }
2850
2851 /*
2852  * Acquire a reference to zone z.  The caller must specify the
2853  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2854  * zone_ref_t structure will represent a reference to the specified zone.  Use
2855  * zone_rele_ref() to release the reference.
2856  *
2857  * The referenced zone_t structure will not be freed as long as the zone_t's
2858  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2859  * references.
2860  *
2861  * NOTE: The zone_ref_t structure must be initialized before it is used.
2862  * See zone_init_ref() above.
2863  */
2864 void
2865 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2866 {
2867         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2868
2869         /*
2870          * Prevent consumers from reusing a reference structure before
2871          * releasing it.
2872          */
2873         VERIFY(ref->zref_zone == NULL);
2874
2875         ref->zref_zone = z;
2876         mutex_enter(&z->zone_lock);
2877         zone_hold_locked(z);
2878         z->zone_subsys_ref[subsys]++;
2879         ASSERT(z->zone_subsys_ref[subsys] != 0);
2880         list_insert_head(&z->zone_ref_list, ref);
2881         mutex_exit(&z->zone_lock);
2882 }
2883
2884 /*
2885  * Release the zone reference represented by the specified zone_ref_t.
2886  * The reference is invalid after it's released; however, the zone_ref_t
2887  * structure can be reused without having to invoke zone_init_ref().
2888  * subsys should be the same value that was passed to zone_hold_ref()
2889  * when the reference was acquired.
2890  */
2891 void
2892 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2893 {
2894         zone_rele_common(ref->zref_zone, ref, subsys);
2895
2896         /*
2897          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2898          * when consumers dereference the reference.  This helps us catch
2899          * consumers who use released references.  Furthermore, this lets
2900          * consumers reuse the zone_ref_t structure without having to
2901          * invoke zone_init_ref().
2902          */
2903         ref->zref_zone = NULL;
2904 }
2905
2906 void
2907 zone_cred_hold(zone_t *z)
2908 {
2909         mutex_enter(&z->zone_lock);
2910         z->zone_cred_ref++;
2911         ASSERT(z->zone_cred_ref != 0);
2912         mutex_exit(&z->zone_lock);
2913 }
2914
2915 void
2916 zone_cred_rele(zone_t *z)
2917 {
2918         boolean_t wakeup;
2919
2920         mutex_enter(&z->zone_lock);
2921         ASSERT(z->zone_cred_ref != 0);
2922         z->zone_cred_ref--;
2923         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2924                 /* no more refs, free the structure */
2925                 mutex_exit(&z->zone_lock);
2926                 zone_free(z);
2927                 return;
2928         }
2929         /*
2930          * If zone_destroy is waiting for the cred references to drain
2931          * out, and they have, signal it.
2932          */
2933         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2934             zone_status_get(z) >= ZONE_IS_DEAD);
2935         mutex_exit(&z->zone_lock);
2936
2937         if (wakeup) {
2938                 /*
2939                  * Grabbing zonehash_lock here effectively synchronizes with
2940                  * zone_destroy() to avoid missed signals.
2941                  */
2942                 mutex_enter(&zonehash_lock);
2943                 cv_broadcast(&zone_destroy_cv);
2944                 mutex_exit(&zonehash_lock);
2945         }
2946 }
2947
2948 void
2949 zone_task_hold(zone_t *z)
2950 {
2951         mutex_enter(&z->zone_lock);
2952         z->zone_ntasks++;
2953         ASSERT(z->zone_ntasks != 0);
2954         mutex_exit(&z->zone_lock);
2955 }
2956
2957 void
2958 zone_task_rele(zone_t *zone)
2959 {
2960         uint_t refcnt;
2961
2962         mutex_enter(&zone->zone_lock);
2963         ASSERT(zone->zone_ntasks != 0);
2964         refcnt = --zone->zone_ntasks;
2965         if (refcnt > 1) {       /* Common case */
2966                 mutex_exit(&zone->zone_lock);
2967                 return;
2968         }
2969         zone_hold_locked(zone); /* so we can use the zone_t later */
2970         mutex_exit(&zone->zone_lock);
2971         if (refcnt == 1) {
2972                 /*
2973                  * See if the zone is shutting down.
2974                  */
2975                 mutex_enter(&zone_status_lock);
2976                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2977                         goto out;
2978                 }
2979
2980                 /*
2981                  * Make sure the ntasks didn't change since we
2982                  * dropped zone_lock.
2983                  */
2984                 mutex_enter(&zone->zone_lock);
2985                 if (refcnt != zone->zone_ntasks) {
2986                         mutex_exit(&zone->zone_lock);
2987                         goto out;
2988                 }
2989                 mutex_exit(&zone->zone_lock);
2990
2991                 /*
2992                  * No more user processes in the zone.  The zone is empty.
2993                  */
2994                 zone_status_set(zone, ZONE_IS_EMPTY);
2995                 goto out;
2996         }
2997
2998         ASSERT(refcnt == 0);
2999         /*
3000          * zsched has exited; the zone is dead.
3001          */
3002         zone->zone_zsched = NULL;               /* paranoia */
3003         mutex_enter(&zone_status_lock);
3004         zone_status_set(zone, ZONE_IS_DEAD);
3005 out:
3006         mutex_exit(&zone_status_lock);
3007         zone_rele(zone);
3008 }
3009
3010 zoneid_t
3011 getzoneid(void)
3012 {
3013         return (curproc->p_zone->zone_id);
3014 }
3015
3016 /*
3017  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3018  * check the validity of a zone's state.
3019  */
3020 static zone_t *
3021 zone_find_all_by_id(zoneid_t zoneid)
3022 {
3023         mod_hash_val_t hv;
3024         zone_t *zone = NULL;
3025
3026         ASSERT(MUTEX_HELD(&zonehash_lock));
3027
3028         if (mod_hash_find(zonehashbyid,
3029             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3030                 zone = (zone_t *)hv;
3031         return (zone);
3032 }
3033
3034 static zone_t *
3035 zone_find_all_by_label(const ts_label_t *label)
3036 {
3037         mod_hash_val_t hv;
3038         zone_t *zone = NULL;
3039
3040         ASSERT(MUTEX_HELD(&zonehash_lock));
3041
3042         /*
3043          * zonehashbylabel is not maintained for unlabeled systems
3044          */
3045         if (!is_system_labeled())
3046                 return (NULL);
3047         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3048                 zone = (zone_t *)hv;
3049         return (zone);
3050 }
3051
3052 static zone_t *
3053 zone_find_all_by_name(char *name)
3054 {
3055         mod_hash_val_t hv;
3056         zone_t *zone = NULL;
3057
3058         ASSERT(MUTEX_HELD(&zonehash_lock));
3059
3060         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3061                 zone = (zone_t *)hv;
3062         return (zone);
3063 }
3064
3065 /*
3066  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3067  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3068  * Caller must call zone_rele() once it is done with the zone.
3069  *
3070  * The zone may begin the zone_destroy() sequence immediately after this
3071  * function returns, but may be safely used until zone_rele() is called.
3072  */
3073 zone_t *
3074 zone_find_by_id(zoneid_t zoneid)
3075 {
3076         zone_t *zone;
3077         zone_status_t status;
3078
3079         mutex_enter(&zonehash_lock);
3080         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3081                 mutex_exit(&zonehash_lock);
3082                 return (NULL);
3083         }
3084         status = zone_status_get(zone);
3085         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3086                 /*
3087                  * For all practical purposes the zone doesn't exist.
3088                  */
3089                 mutex_exit(&zonehash_lock);
3090                 return (NULL);
3091         }
3092         zone_hold(zone);
3093         mutex_exit(&zonehash_lock);
3094         return (zone);
3095 }
3096
3097 /*
3098  * Similar to zone_find_by_id, but using zone label as the key.
3099  */
3100 zone_t *
3101 zone_find_by_label(const ts_label_t *label)
3102 {
3103         zone_t *zone;
3104         zone_status_t status;
3105
3106         mutex_enter(&zonehash_lock);
3107         if ((zone = zone_find_all_by_label(label)) == NULL) {
3108                 mutex_exit(&zonehash_lock);
3109                 return (NULL);
3110         }
3111
3112         status = zone_status_get(zone);
3113         if (status > ZONE_IS_DOWN) {
3114                 /*
3115                  * For all practical purposes the zone doesn't exist.
3116                  */
3117                 mutex_exit(&zonehash_lock);
3118                 return (NULL);
3119         }
3120         zone_hold(zone);
3121         mutex_exit(&zonehash_lock);
3122         return (zone);
3123 }
3124
3125 /*
3126  * Similar to zone_find_by_id, but using zone name as the key.
3127  */
3128 zone_t *
3129 zone_find_by_name(char *name)
3130 {
3131         zone_t *zone;
3132         zone_status_t status;
3133
3134         mutex_enter(&zonehash_lock);
3135         if ((zone = zone_find_all_by_name(name)) == NULL) {
3136                 mutex_exit(&zonehash_lock);
3137                 return (NULL);
3138         }
3139         status = zone_status_get(zone);
3140         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3141                 /*
3142                  * For all practical purposes the zone doesn't exist.
3143                  */
3144                 mutex_exit(&zonehash_lock);
3145                 return (NULL);
3146         }
3147         zone_hold(zone);
3148         mutex_exit(&zonehash_lock);
3149         return (zone);
3150 }
3151
3152 /*
3153  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3154  * if there is a zone "foo" rooted at /foo/root, and the path argument
3155  * is "/foo/root/proc", it will return the held zone_t corresponding to
3156  * zone "foo".
3157  *
3158  * zone_find_by_path() always returns a non-NULL value, since at the
3159  * very least every path will be contained in the global zone.
3160  *
3161  * As with the other zone_find_by_*() functions, the caller is
3162  * responsible for zone_rele()ing the return value of this function.
3163  */
3164 zone_t *
3165 zone_find_by_path(const char *path)
3166 {
3167         zone_t *zone;
3168         zone_t *zret = NULL;
3169         zone_status_t status;
3170
3171         if (path == NULL) {
3172                 /*
3173                  * Call from rootconf().
3174                  */
3175                 zone_hold(global_zone);
3176                 return (global_zone);
3177         }
3178         ASSERT(*path == '/');
3179         mutex_enter(&zonehash_lock);
3180         for (zone = list_head(&zone_active); zone != NULL;
3181             zone = list_next(&zone_active, zone)) {
3182                 if (ZONE_PATH_VISIBLE(path, zone))
3183                         zret = zone;
3184         }
3185         ASSERT(zret != NULL);
3186         status = zone_status_get(zret);
3187         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3188                 /*
3189                  * Zone practically doesn't exist.
3190                  */
3191                 zret = global_zone;
3192         }
3193         zone_hold(zret);
3194         mutex_exit(&zonehash_lock);
3195         return (zret);
3196 }
3197
3198 /*
3199  * Public interface for updating per-zone load averages.  Called once per
3200  * second.
3201  *
3202  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3203  */
3204 void
3205 zone_loadavg_update()
3206 {
3207         zone_t *zp;
3208         zone_status_t status;
3209         struct loadavg_s *lavg;
3210         hrtime_t zone_total;
3211         int i;
3212         hrtime_t hr_avg;
3213         int nrun;
3214         static int64_t f[3] = { 135, 27, 9 };
3215         int64_t q, r;
3216
3217         mutex_enter(&zonehash_lock);
3218         for (zp = list_head(&zone_active); zp != NULL;
3219             zp = list_next(&zone_active, zp)) {
3220                 mutex_enter(&zp->zone_lock);
3221
3222                 /* Skip zones that are on the way down or not yet up */
3223                 status = zone_status_get(zp);
3224                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3225                         /* For all practical purposes the zone doesn't exist. */
3226                         mutex_exit(&zp->zone_lock);
3227                         continue;
3228                 }
3229
3230                 /*
3231                  * Update the 10 second moving average data in zone_loadavg.
3232                  */
3233                 lavg = &zp->zone_loadavg;
3234
3235                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3236                 scalehrtime(&zone_total);
3237
3238                 /* The zone_total should always be increasing. */
3239                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3240                     zone_total - lavg->lg_total : 0;
3241                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3242                 /* lg_total holds the prev. 1 sec. total */
3243                 lavg->lg_total = zone_total;
3244
3245                 /*
3246                  * To simplify the calculation, we don't calculate the load avg.
3247                  * until the zone has been up for at least 10 seconds and our
3248                  * moving average is thus full.
3249                  */
3250                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3251                         lavg->lg_len++;
3252                         mutex_exit(&zp->zone_lock);
3253                         continue;
3254                 }
3255
3256                 /* Now calculate the 1min, 5min, 15 min load avg. */
3257                 hr_avg = 0;
3258                 for (i = 0; i < S_LOADAVG_SZ; i++)
3259                         hr_avg += lavg->lg_loads[i];
3260                 hr_avg = hr_avg / S_LOADAVG_SZ;
3261                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3262
3263                 /* Compute load avg. See comment in calcloadavg() */
3264                 for (i = 0; i < 3; i++) {
3265                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3266                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3267                         zp->zone_hp_avenrun[i] +=
3268                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3269
3270                         /* avenrun[] can only hold 31 bits of load avg. */
3271                         if (zp->zone_hp_avenrun[i] <
3272                             ((uint64_t)1<<(31+16-FSHIFT)))
3273                                 zp->zone_avenrun[i] = (int32_t)
3274                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3275                         else
3276                                 zp->zone_avenrun[i] = 0x7fffffff;
3277                 }
3278
3279                 mutex_exit(&zp->zone_lock);
3280         }
3281         mutex_exit(&zonehash_lock);
3282 }
3283
3284 /*
3285  * Get the number of cpus visible to this zone.  The system-wide global
3286  * 'ncpus' is returned if pools are disabled, the caller is in the
3287  * global zone, or a NULL zone argument is passed in.
3288  */
3289 int
3290 zone_ncpus_get(zone_t *zone)
3291 {
3292         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3293
3294         return (myncpus != 0 ? myncpus : ncpus);
3295 }
3296
3297 /*
3298  * Get the number of online cpus visible to this zone.  The system-wide
3299  * global 'ncpus_online' is returned if pools are disabled, the caller
3300  * is in the global zone, or a NULL zone argument is passed in.
3301  */
3302 int
3303 zone_ncpus_online_get(zone_t *zone)
3304 {
3305         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3306
3307         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3308 }
3309
3310 /*
3311  * Return the pool to which the zone is currently bound.
3312  */
3313 pool_t *
3314 zone_pool_get(zone_t *zone)
3315 {
3316         ASSERT(pool_lock_held());
3317
3318         return (zone->zone_pool);
3319 }
3320
3321 /*
3322  * Set the zone's pool pointer and update the zone's visibility to match
3323  * the resources in the new pool.
3324  */
3325 void
3326 zone_pool_set(zone_t *zone, pool_t *pool)
3327 {
3328         ASSERT(pool_lock_held());
3329         ASSERT(MUTEX_HELD(&cpu_lock));
3330
3331         zone->zone_pool = pool;
3332         zone_pset_set(zone, pool->pool_pset->pset_id);
3333 }
3334
3335 /*
3336  * Return the cached value of the id of the processor set to which the
3337  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3338  * facility is disabled.
3339  */
3340 psetid_t
3341 zone_pset_get(zone_t *zone)
3342 {
3343         ASSERT(MUTEX_HELD(&cpu_lock));
3344
3345         return (zone->zone_psetid);
3346 }
3347
3348 /*
3349  * Set the cached value of the id of the processor set to which the zone
3350  * is currently bound.  Also update the zone's visibility to match the
3351  * resources in the new processor set.
3352  */
3353 void
3354 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3355 {
3356         psetid_t oldpsetid;
3357
3358         ASSERT(MUTEX_HELD(&cpu_lock));
3359         oldpsetid = zone_pset_get(zone);
3360
3361         if (oldpsetid == newpsetid)
3362                 return;
3363         /*
3364          * Global zone sees all.
3365          */
3366         if (zone != global_zone) {
3367                 zone->zone_psetid = newpsetid;
3368                 if (newpsetid != ZONE_PS_INVAL)
3369                         pool_pset_visibility_add(newpsetid, zone);
3370                 if (oldpsetid != ZONE_PS_INVAL)
3371                         pool_pset_visibility_remove(oldpsetid, zone);
3372         }
3373         /*
3374          * Disabling pools, so we should start using the global values
3375          * for ncpus and ncpus_online.
3376          */
3377         if (newpsetid == ZONE_PS_INVAL) {
3378                 zone->zone_ncpus = 0;
3379                 zone->zone_ncpus_online = 0;
3380         }
3381 }
3382
3383 /*
3384  * Call 'cb' on each externally visible zone on the active list, handing
3385  * 'data' through to it.  The walk stops at the first callback that returns
3386  * non-zero and that value is returned; otherwise the result is 0.
3387  *
3388  * zonehash_lock is held across the walk, so the caller must not be holding
3389  * any locks that may be acquired under zonehash_lock.  See the comment at
3390  * the beginning of the file for a list of common locks and their
3391  * interactions with zones.
3392  */
3393 int
3394 zone_walk(int (*cb)(zone_t *, void *), void *data)
3395 {
3396         zone_t *zp;
3397         int rv = 0;
3398
3399         mutex_enter(&zonehash_lock);
3400         zp = list_head(&zone_active);
3401         while (zp != NULL) {
3402                 zone_status_t st = zone_status_get(zp);
3403
3404                 /*
3405                  * Only zones between READY and DOWN (inclusive) are
3406                  * externally visible; a non-zero callback return ends
3407                  * the walk immediately.
3408                  */
3409                 if (st >= ZONE_IS_READY && st <= ZONE_IS_DOWN &&
3410                     (rv = (*cb)(zp, data)) != 0)
3411                         break;
3412                 zp = list_next(&zone_active, zp);
3413         }
3414         mutex_exit(&zonehash_lock);
3415
3416         return (rv);
3417 }
3418
3419 /*
3420  * Resolve the user-supplied path 'upath' and make the result the zone's
3421  * root: a held vnode is stored in zone_rootvp, and a kmem-allocated copy of
3422  * the resolved path, with a trailing '/', in zone_rootpath (its size in
3423  * zone_rootpathlen).  Returns 0, or the error from pn_get(), lookuppn(),
3424  * VOP_ACCESS() or traverse(); ESTALE once the retries are used up.
3425  */
3426 static int
3427 zone_set_root(zone_t *zone, const char *upath)
3428 {
3429         struct pathname upn, pn;
3430         vnode_t *vp;
3431         char *path;
3432         size_t pathlen;
3433         int tries = 10;         /* prevent infinite loop */
3434         int error;
3435
3436         error = pn_get((char *)upath, UIO_USERSPACE, &upn);
3437         if (error != 0)
3438                 return (error);
3439
3440         pn_alloc(&pn);
3441
3442         for (;;) {
3443                 if (--tries <= 0) {
3444                         error = ESTALE;
3445                         break;
3446                 }
3447                 error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp);
3448                 if (error == 0) {
3449                         /*
3450                          * VOP_ACCESS() may cover 'vp' with a new
3451                          * filesystem, if 'vp' is an autoFS vnode.
3452                          * Get the new 'vp' if so.
3453                          */
3454                         error = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL);
3455                         if (error == 0 && vn_ismntpt(vp))
3456                                 error = traverse(&vp);
3457                         if (error == 0)
3458                                 break;          /* success, vp is held */
3459                         VN_RELE(vp);
3460                 }
3461                 if (error != ESTALE)
3462                         break;
3463         }
3464
3465         if (error == 0) {
3466                 /* Copy the resolved path, adding a '/' and the NUL. */
3467                 pathlen = pn.pn_pathlen + 2;
3468                 path = kmem_alloc(pathlen, KM_SLEEP);
3469                 (void) strncpy(path, pn.pn_path, pn.pn_pathlen + 1);
3470                 path[pathlen - 2] = '/';
3471                 path[pathlen - 1] = '\0';
3472
3473                 zone->zone_rootvp = vp; /* we hold a reference to vp */
3474                 zone->zone_rootpath = path;
3475                 zone->zone_rootpathlen = pathlen;
3476                 if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3477                         zone->zone_flags |= ZF_IS_SCRATCH;
3478         }
3479         pn_free(&pn);
3480         pn_free(&upn);
3481         return (error);
3482 }
3483
3484 /* ASCII-only alphanumeric test; note that 'c' is evaluated repeatedly. */
3485 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3486                         ((c) >= 'a' && (c) <= 'z') || \
3487                         ((c) >= 'A' && (c) <= 'Z'))
3488
3489 /*
3490  * Copy the zone name in from the user string 'uname', validate it and store
3491  * the kernel copy in zone->zone_name.  Returns 0 or an errno (see below).
3492  */
3493 static int
3494 zone_set_name(zone_t *zone, const char *uname)
3495 {
3496         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3497         size_t len, idx;
3498         int err;
3499
3500         err = copyinstr(uname, kname, ZONENAME_MAX, &len);
3501         if (err != 0)
3502                 goto fail;      /* EFAULT or ENAMETOOLONG */
3503
3504         /*
3505          * Every failure from here on is EINVAL.  The name must be less
3506          * than ZONENAME_MAX, must start with an alphanumeric, and must
3507          * contain only alphanumerics, '-', '_' and '.'.
3508          */
3509         err = EINVAL;
3510         if ((len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') ||
3511             !isalnum(kname[0]))
3512                 goto fail;
3513         for (idx = 1; idx < len - 1; idx++) {
3514                 if (!isalnum(kname[idx]) && kname[idx] != '-' &&
3515                     kname[idx] != '_' && kname[idx] != '.')
3516                         goto fail;
3517         }
3518         zone->zone_name = kname;
3519         return (0);
3520
3521 fail:
3522         kmem_free(kname, ZONENAME_MAX);
3523         return (err);
3524 }
3525
3526 /*
3527  * Get the 32-bit hostid of the specified zone as an unsigned int.  When
3528  * 'zonep' is NULL or points to a zone with no hostid emulation, the
3529  * machine's hostid (i.e., the global zone's hostid), parsed from hw_serial,
3530  * is used instead; HW_INVALID_HOSTID is returned if that cannot be parsed.
3531  * Zero is returned if neither the zone nor the machine has a hostid.
3532  */
3533 uint32_t
3534 zone_get_hostid(zone_t *zonep)
3535 {
3536         unsigned long serial;
3537
3538         /* An emulated per-zone hostid takes precedence. */
3539         if (zonep != NULL && zonep->zone_hostid != HW_INVALID_HOSTID)
3540                 return (zonep->zone_hostid);
3541
3542         if (ddi_strtoul(hw_serial, NULL, 10, &serial) == 0)
3543                 return ((uint32_t)serial);
3544         return (HW_INVALID_HOSTID);
3545 }
3546
3547 /*
3548  * Like thread_create(), except that the new thread is made part of the
3549  * zsched process of the caller's zone (curproc->p_zone->zone_zsched) and is
3550  * put on that zone's zone_kthreads list before it is allowed to run.
3551  */
3552 /*ARGSUSED*/
3553 kthread_t *
3554 zthread_create(
3555     caddr_t stk,
3556     size_t stksize,
3557     void (*proc)(),
3558     void *arg,
3559     size_t len,
3560     pri_t pri)
3561 {
3562         zone_t *zone = curproc->p_zone;
3563         proc_t *zsp = zone->zone_zsched;
3564         kthread_t *thr, *head;
3565
3566         zone_hold(zone);        /* dropped again in zthread_exit() */
3567
3568         /*
3569          * Nobody should be creating threads once the zone is shutting down
3570          * and has no kernel threads left; see the comment in zthread_exit().
3571          */
3572         ASSERT(!(zone->zone_kthreads == NULL &&
3573             zone_status_get(zone) >= ZONE_IS_EMPTY));
3574
3575         /* Created stopped: it must not run until the setup below is done. */
3576         thr = thread_create(stk, stksize, proc, arg, len, zsp, TS_STOPPED,
3577             pri);
3578         ASSERT(thr->t_forw == NULL);
3579
3580         /* Link the thread in as the new head of the zone's circular list. */
3581         mutex_enter(&zone_status_lock);
3582         head = zone->zone_kthreads;
3583         if (head != NULL) {
3584                 kthread_t *tail = head->t_back;
3585
3586                 thr->t_forw = head;
3587                 thr->t_back = tail;
3588                 tail->t_forw = thr;
3589                 head->t_back = thr;
3590         } else {
3591                 thr->t_forw = thr->t_back = thr;
3592         }
3593         zone->zone_kthreads = thr;
3594         mutex_exit(&zone_status_lock);
3595
3596         /* Mark it as a zone thread and move it to zsched's project. */
3597         mutex_enter(&zsp->p_lock);
3598         thr->t_proc_flag |= TP_ZTHREAD;
3599         project_rele(thr->t_proj);
3600         thr->t_proj = project_hold(zsp->p_task->tk_proj);
3601
3602         /* Setup complete, let it run. */
3603         thread_lock(thr);
3604         thr->t_schedflag |= TS_ALLSTART;
3605         setrun_locked(thr);
3606         thread_unlock(thr);
3607         mutex_exit(&zsp->p_lock);
3608
3609         return (thr);
3610 }
3611
3612 /*
3613  * Similar to thread_exit().  Must be called by threads created via
3614  * zthread_create(); the zone hold taken there is released here, and the
3615  * thread is taken off the zone's zone_kthreads list.
3616  */
3617 void
3618 zthread_exit(void)
3619 {
3620         kthread_t *self = curthread;
3621         proc_t *zsp = curproc;
3622         zone_t *zone = zsp->p_zone;
3623
3624         mutex_enter(&zone_status_lock);
3625
3626         /*
3627          * Reparent to p0
3628          */
3629         kpreempt_disable();
3630         mutex_enter(&zsp->p_lock);
3631         self->t_proc_flag &= ~TP_ZTHREAD;
3632         self->t_procp = &p0;
3633         hat_thread_exit(self);
3634         mutex_exit(&zsp->p_lock);
3635         kpreempt_enable();
3636
3637         if (self->t_back != self) {
3638                 /* Other zone threads remain; just unlink this one. */
3639                 self->t_forw->t_back = self->t_back;
3640                 self->t_back->t_forw = self->t_forw;
3641                 if (zone->zone_kthreads == self)
3642                         zone->zone_kthreads = self->t_forw;
3643         } else {
3644                 ASSERT(self->t_forw == self);
3645                 /*
3646                  * This was the zone's last kernel thread.  If the zone is
3647                  * empty, no further kernel threads can be created once
3648                  * the count is zero: a creator that is a process in the
3649                  * zone must have exited before the state could be set to
3650                  * ZONE_IS_EMPTY, and a creator that is a kernel thread in
3651                  * the zone would make the thread count non-zero.
3652                  *
3653                  * This really means that non-zone kernel threads should
3654                  * not create zone kernel threads.
3655                  */
3656                 zone->zone_kthreads = NULL;
3657                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3658                         zone_status_set(zone, ZONE_IS_DOWN);
3659                         /* Remove any CPU caps on this zone. */
3660                         cpucaps_zone_remove(zone);
3661                 }
3662         }
3663
3664         mutex_exit(&zone_status_lock);
3665         zone_rele(zone);
3666         thread_exit();
3667         /* NOTREACHED */
3668 }
3669
3670 /*
3671  * Make the vnode slot '*vpp' refer to 'vp' (swapped under pp->p_lock): 'vp'
3672  * gains a hold and the vnode previously in the slot, if any, is released.
3673  */
3674 static void
3675 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3676 {
3677         vnode_t *prev;
3678
3679         VN_HOLD(vp);            /* the slot keeps this reference */
3680         if (AU_AUDITING())      /* update abs cwd/root path, c2/audit.c */
3681                 audit_chdirec(vp, vpp);
3682         mutex_enter(&pp->p_lock);
3683         prev = *vpp;
3684         *vpp = vp;
3685         mutex_exit(&pp->p_lock);
3686         if (prev != NULL)
3687                 VN_RELE(prev);
3688 }
3689
3690 /*
3691  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3692  * 'nvl' may hold only uint64 pairs named "privilege", "limit" and "action",
3693  * all three must be present, and the privilege and action must be
3694  * supported values; otherwise EINVAL is returned.  Returns 0 on success.
3695  */
3696 static int
3697 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3698 {
3699         enum { GOT_PRIV = 0x1, GOT_LIMIT = 0x2, GOT_ACTION = 0x4 };
3700         uint_t got = 0;
3701         nvpair_t *pair;
3702
3703         for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
3704             pair = nvlist_next_nvpair(nvl, pair)) {
3705                 const char *key = nvpair_name(pair);
3706                 uint64_t val;
3707
3708                 if (nvpair_type(pair) != DATA_TYPE_UINT64)
3709                         return (EINVAL);
3710                 (void) nvpair_value_uint64(pair, &val);
3711
3712                 if (strcmp(key, "privilege") == 0) {
3713                         /* Only privileged values are allowed for now. */
3714                         if (val != RCPRIV_PRIVILEGED)
3715                                 return (EINVAL);
3716                         rv->rcv_privilege = val;
3717                         got |= GOT_PRIV;
3718                 } else if (strcmp(key, "limit") == 0) {
3719                         rv->rcv_value = val;
3720                         got |= GOT_LIMIT;
3721                 } else if (strcmp(key, "action") == 0) {
3722                         if (val != RCTL_LOCAL_NOACTION &&
3723                             val != RCTL_LOCAL_DENY)
3724                                 return (EINVAL);
3725                         rv->rcv_flagaction = val;
3726                         got |= GOT_ACTION;
3727                 } else {
3728                         return (EINVAL);
3729                 }
3730         }
3731         if (got != (GOT_PRIV | GOT_LIMIT | GOT_ACTION))
3732                 return (EINVAL);
3733
3734         /* The remaining fields always get the same fixed values. */
3735         rv->rcv_action_signal = 0;
3736         rv->rcv_action_recipient = NULL;
3737         rv->rcv_action_recip_pid = -1;
3738         rv->rcv_firing_time = 0;
3739         return (0);
3740 }
3741
3742 /*
3743  * Non-global zone version of start_init.  The result of
3744  * start_init_common() is recorded in zone_boot_err, and a zone that is
3745  * still BOOTING is then moved on to RUNNING, or to SHUTTING_DOWN if init
3746  * could not be started or the global zone is shutting down.
3747  */
3748 void
3749 zone_start_init(void)
3750 {
3751         proc_t *p = ttoproc(curthread);
3752         zone_t *z = p->p_zone;
3753         boolean_t giveup;
3754
3755         ASSERT(!INGLOBALZONE(curproc));
3756
3757         /*
3758          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3759          * storing just the pid of init is sufficient.
3760          */
3761         z->zone_proc_initpid = p->p_pid;
3762
3763         /* zone_boot_err reports the cause of a failure to zone_boot(). */
3764         z->zone_boot_err = start_init_common();
3765
3766         /*
3767          * Booting zones are prevented from becoming running zones if the
3768          * global zone is shutting down.  Our own state is checked as well
3769          * since we could have raced and already be shutting down, or even
3770          * further along.
3771          */
3772         mutex_enter(&zone_status_lock);
3773         giveup = (z->zone_boot_err != 0 ||
3774             zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN);
3775         if (zone_status_get(z) == ZONE_IS_BOOTING) {
3776                 zone_status_set(z,
3777                     giveup ? ZONE_IS_SHUTTING_DOWN : ZONE_IS_RUNNING);
3778         }
3779         mutex_exit(&zone_status_lock);
3780
3781         if (!giveup) {
3782                 /* cause the process to return to userland. */
3783                 lwp_rtt();
3784                 return;
3785         }
3786
3787         /* It's gone bad, dispose of the process */
3788         if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3789                 mutex_enter(&p->p_lock);
3790                 ASSERT(p->p_flag & SEXITLWPS);
3791                 lwp_exit();
3792         }
3793 }
3794
3795 struct zsched_arg {             /* argument passed to zsched() */
3796         zone_t *zone;           /* the zone zsched is started for */
3797         nvlist_t *nvlist;       /* rctls to apply to that zone */
3798 };
3799
3800 /*
3801  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3802  * anything to do with scheduling, but rather with the fact that per-zone
3803  * kernel threads are parented to zsched, just like regular kernel threads
3804  * are parented to sched (p0).  zsched is also responsible for launching
3805  * init for the zone.  'arg' is a struct zsched_arg carrying the zone and
3806  * the nvlist of rctls to apply to it.
3807  */
3808 static void
3809 zsched(void *arg)
3810 {
3811         struct zsched_arg *za = arg;
3812         proc_t *pp = curproc;
3813         proc_t *initp = proc_init;
3814         zone_t *zone = za->zone;
3815         cred_t *cr, *oldcred;
3816         rctl_set_t *set;
3817         rctl_alloc_gp_t *gp;
3818         contract_t *ct = NULL;
3819         task_t *tk, *oldtk;
3820         rctl_entity_p_t e;
3821         kproject_t *pj;
3822
3823         nvlist_t *nvl = za->nvlist;
3824         nvpair_t *nvp = NULL;
3825
3826         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3827         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3828         PTOU(pp)->u_argc = 0;
3829         PTOU(pp)->u_argv = NULL;
3830         PTOU(pp)->u_envp = NULL;
3831         closeall(P_FINFO(pp));
3832
3833         /*
3834          * We are this zone's "zsched" process.  As the zone isn't generally
3835          * visible yet we don't need to grab any locks before initializing its
3836          * zone_zsched pointer.
3837          */
3838         zone_hold(zone);  /* this hold is released by zone_destroy() */
3839         zone->zone_zsched = pp;
3840         mutex_enter(&pp->p_lock);
3841         pp->p_zone = zone;
3842         mutex_exit(&pp->p_lock);
3843
3844         /*
3845          * Disassociate process from its 'parent'; parent ourselves to init
3846          * (pid 1) and change other values as needed.
3847          */
3848         sess_create();
3849
3850         mutex_enter(&pidlock);
3851         proc_detach(pp);
3852         pp->p_ppid = 1;
3853         pp->p_flag |= SZONETOP;
3854         pp->p_ancpid = 1;
3855         pp->p_parent = initp;
3856         pp->p_psibling = NULL;
3857         if (initp->p_child)
3858                 initp->p_child->p_psibling = pp;
3859         pp->p_sibling = initp->p_child;
3860         initp->p_child = pp;
3861
3862         /* Decrement what newproc() incremented. */
3863         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3864         /*
3865          * Our credentials are about to become kcred-like, so we don't care
3866          * about the caller's ruid.
3867          */
3868         upcount_inc(crgetruid(kcred), zone->zone_id);
3869         mutex_exit(&pidlock);
3870
3871         /*
3872          * getting out of global zone, so decrement lwp and process counts
3873          */
3874         pj = pp->p_task->tk_proj;
3875         mutex_enter(&global_zone->zone_nlwps_lock);
3876         pj->kpj_nlwps -= pp->p_lwpcnt;
3877         global_zone->zone_nlwps -= pp->p_lwpcnt;
3878         pj->kpj_nprocs--;
3879         global_zone->zone_nprocs--;
3880         mutex_exit(&global_zone->zone_nlwps_lock);
3881
3882         /*
3883          * Decrement locked memory counts on old zone and project.
3884          */
3885         mutex_enter(&global_zone->zone_mem_lock);
3886         global_zone->zone_locked_mem -= pp->p_locked_mem;
3887         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3888         mutex_exit(&global_zone->zone_mem_lock);
3889
3890         /*
3891          * Create and join a new task in project '0' of this zone.
3892          *
3893          * We don't need to call holdlwps() since we know we're the only lwp in
3894          * this process.
3895          *
3896          * task_join() returns with p_lock held.
3897          */
3898         tk = task_create(0, zone);
3899         mutex_enter(&cpu_lock);
3900         oldtk = task_join(tk, 0);
3901
3902         pj = pp->p_task->tk_proj;
3903
3904         mutex_enter(&zone->zone_mem_lock);
3905         zone->zone_locked_mem += pp->p_locked_mem;
3906         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3907         mutex_exit(&zone->zone_mem_lock);
3908
3909         /*
3910          * add lwp and process counts to zsched's zone, and increment
3911          * project's task and process count due to the task created in
3912          * the above task_create.
3913          */
3914         mutex_enter(&zone->zone_nlwps_lock);
3915         pj->kpj_nlwps += pp->p_lwpcnt;
3916         pj->kpj_ntasks += 1;
3917         zone->zone_nlwps += pp->p_lwpcnt;
3918         pj->kpj_nprocs++;
3919         zone->zone_nprocs++;
3920         mutex_exit(&zone->zone_nlwps_lock);
3921
3922         mutex_exit(&curproc->p_lock);
3923         mutex_exit(&cpu_lock);
3924         task_rele(oldtk);
3925
3926         /*
3927          * The process was created by a process in the global zone, hence the
3928          * credentials are wrong.  We might as well have kcred-ish credentials.
3929          */
3930         cr = zone->zone_kcred;
3931         crhold(cr);
3932         mutex_enter(&pp->p_crlock);
3933         oldcred = pp->p_cred;
3934         pp->p_cred = cr;
3935         mutex_exit(&pp->p_crlock);
3936         crfree(oldcred);
3937
3938         /*
3939          * Hold credentials again (for thread)
3940          */
3941         crhold(cr);
3942
3943         /*
3944          * p_lwpcnt can't change since this is a kernel process.
3945          */
3946         crset(pp, cr);
3947
3948         /*
3949          * Chroot
3950          */
3951         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3952         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3953
3954         /*
3955          * Initialize zone's rctl set.
3956          */
3957         set = rctl_set_create();
3958         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3959         mutex_enter(&pp->p_lock);
3960         e.rcep_p.zone = zone;
3961         e.rcep_t = RCENTITY_ZONE;
3962         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3963         mutex_exit(&pp->p_lock);
3964         rctl_prealloc_destroy(gp);
3965
3966         /*
3967          * Apply the rctls passed in to zone_create().  This is basically a list
3968          * assignment: all of the old values are removed and the new ones
3969          * inserted.  That is, if an empty list is passed in, all values are
3970          * removed.
3971          */
3972         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3973                 rctl_dict_entry_t *rde;
3974                 rctl_hndl_t hndl;
3975                 char *name;
3976                 nvlist_t **nvlarray;
3977                 uint_t i, nelem;
3978                 int error;      /* For ASSERT()s */
3979
3980                 name = nvpair_name(nvp);
3981                 hndl = rctl_hndl_lookup(name);
3982                 ASSERT(hndl != -1);
3983                 rde = rctl_dict_lookup_hndl(hndl);
3984                 ASSERT(rde != NULL);
3985                 /* Delete old values until the RCPRIV_SYSTEM one is first. */
3986                 for (; /* ever */; ) {
3987                         rctl_val_t oval;
3988
3989                         mutex_enter(&pp->p_lock);
3990                         error = rctl_local_get(hndl, NULL, &oval, pp);
3991                         mutex_exit(&pp->p_lock);
3992                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3993                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3994                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
3995                                 break;
3996                         mutex_enter(&pp->p_lock);
3997                         error = rctl_local_delete(hndl, &oval, pp);
3998                         mutex_exit(&pp->p_lock);
3999                         ASSERT(error == 0);
4000                 }
4001                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4002                 ASSERT(error == 0);
4003                 for (i = 0; i < nelem; i++) {
4004                         rctl_val_t *nvalp;
4005
4006                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4007                         error = nvlist2rctlval(nvlarray[i], nvalp);
4008                         ASSERT(error == 0);
4009                         /*
4010                          * rctl_local_insert can fail if the value being
4011                          * inserted is a duplicate; this is OK.
4012                          */
4013                         mutex_enter(&pp->p_lock);
4014                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4015                                 kmem_cache_free(rctl_val_cache, nvalp);
4016                         mutex_exit(&pp->p_lock);
4017                 }
4018         }
4019
4020         /*
4021          * Tell the world that we're done setting up.
4022          *
4023          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4024          * and atomically set the zone's processor set visibility.  Once
4025          * we drop pool_lock() this zone will automatically get updated
4026          * to reflect any future changes to the pools configuration.
4027          *
4028          * Note that after we drop the locks below (zonehash_lock in
4029          * particular) other operations such as a zone_getattr call can
4030          * now proceed and observe the zone. That is the reason for doing a
4031          * state transition to the INITIALIZED state.
4032          */
4033         pool_lock();
4034         mutex_enter(&cpu_lock);
4035         mutex_enter(&zonehash_lock);
4036         zone_uniqid(zone);
4037         zone_zsd_configure(zone);
4038         if (pool_state == POOL_ENABLED)
4039                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4040         mutex_enter(&zone_status_lock);
4041         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4042         zone_status_set(zone, ZONE_IS_INITIALIZED);
4043         mutex_exit(&zone_status_lock);
4044         mutex_exit(&zonehash_lock);
4045         mutex_exit(&cpu_lock);
4046         pool_unlock();
4047
4048         /* Now call the create callbacks for all keys */
4049         zsd_apply_all_keys(zsd_apply_create, zone);
4050
4051         /* The callbacks are complete. Mark ZONE_IS_READY */
4052         mutex_enter(&zone_status_lock);
4053         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4054         zone_status_set(zone, ZONE_IS_READY);
4055         mutex_exit(&zone_status_lock);
4056
4057         /*
4058          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4059          * we launch init, and set the state to running.
4060          */
4061         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4062
4063         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4064                 id_t cid;
4065
4066                 /*
4067                  * Ok, this is a little complicated.  We need to grab the
4068                  * zone's pool's scheduling class ID; note that by now, we
4069                  * are already bound to a pool if we need to be (zoneadmd
4070                  * will have done that to us while we're in the READY
4071                  * state).  *But* the scheduling class for the zone's 'init'
4072                  * must be explicitly passed to newproc, which doesn't
4073                  * respect pool bindings.
4074                  *
4075                  * We hold the pool_lock across the call to newproc() to
4076                  * close the obvious race: the pool's scheduling class
4077                  * could change before we manage to create the LWP with
4078                  * classid 'cid'.
4079                  */
4080                 pool_lock();
4081                 if (zone->zone_defaultcid > 0)
4082                         cid = zone->zone_defaultcid;
4083                 else
4084                         cid = pool_get_class(zone->zone_pool);
4085                 if (cid == -1)          /* -1 came from pool_get_class() */
4086                         cid = defaultcid;
4087
4088                 /*
4089                  * If this fails, zone_boot will ultimately fail.  The
4090                  * state of the zone will be set to SHUTTING_DOWN-- userland
4091                  * will have to tear down the zone, and fail, or try again.
4092                  */
4093                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4094                     minclsyspri - 1, &ct, 0)) != 0) {
4095                         mutex_enter(&zone_status_lock);
4096                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4097                         mutex_exit(&zone_status_lock);
4098                 } else {
4099                         zone->zone_boot_time = gethrestime_sec();
4100                 }
4101
4102                 pool_unlock();
4103         }
4104
4105         /*
4106          * Wait for zone_destroy() to be called.  This is what we spend
4107          * most of our life doing.
4108          */
4109         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4110
4111         if (ct)
4112                 /*
4113                  * At this point the process contract should be empty.
4114                  * (Though if it isn't, it's not the end of the world.)
4115                  */
4116                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4117
4118         /*
4119          * Allow kcred to be freed when all referring processes
4120          * (including this one) go away.  We can't just do this in
4121          * zone_free because we need to wait for the zone_cred_ref to
4122          * drop to 0 before calling zone_free, and the existence of
4123          * zone_kcred will prevent that.  Thus, we call crfree here to
4124          * balance the crdup in zone_create.  The crhold calls earlier
4125          * in zsched will be dropped when the thread and process exit.
4126          */
4127         crfree(zone->zone_kcred);
4128         zone->zone_kcred = NULL;
4129
4130         exit(CLD_EXITED, 0);
4131 }
4132
4133 /*
4134  * Helper function to count the mounted filesystems whose mount point lies
4135  * below 'rootpath'.  Used to make sure the zone doesn't "inherit" any
4136  * mounts from before it is created.  'rootpath' must end with a '/'; as a
4137  * result a mount on the root path itself is intentionally not counted.
4138  */
4139 static uint_t
4140 zone_mount_count(const char *rootpath)
4141 {
4142         size_t prefixlen = strlen(rootpath);
4143         uint_t nmounts = 0;
4144         vfs_t *vfsp;
4145
4146         /*
4147          * Holding zonehash_lock prevents race conditions with
4148          * vfs_list_add()/vfs_list_remove() since we serialize with
4149          * zone_find_by_path().
4150          */
4151         ASSERT(MUTEX_HELD(&zonehash_lock));
4152
4153         /* The rootpath must end with a '/' */
4154         ASSERT(rootpath[prefixlen - 1] == '/');
4155
4156         /*
4157          * The vfs list is circular, so walk it from rootvfs until we are
4158          * back at rootvfs again.
4159          */
4160         vfs_list_read_lock();
4161         vfsp = rootvfs;
4162         do {
4163                 const char *mntpt = refstr_value(vfsp->vfs_mntpt);
4164
4165                 if (strncmp(rootpath, mntpt, prefixlen) == 0)
4166                         nmounts++;
4167         } while ((vfsp = vfsp->vfs_next) != rootvfs);
4168         vfs_list_unlock();
4169         return (nmounts);
4170 }
4171
4172 /*
4173  * Return B_TRUE if a zone created on 'rootpath' would contain, or be
4174  * contained in, another non-global zone's rootpath (or if it is "//").
4175  */
4176 static boolean_t
4177 zone_is_nested(const char *rootpath)
4178 {
4179         size_t plen = strlen(rootpath);
4180         zone_t *zp;
4181
4182         ASSERT(MUTEX_HELD(&zonehash_lock));
4183
4184         /*
4185          * zone_set_root() appended '/' and '\0' at the end of rootpath, so
4186          * "//" here stands for a zone rooted at "/".
4187          */
4188         if (strcmp(rootpath, "//") == 0)
4189                 return (B_TRUE);
4190
4191         for (zp = list_head(&zone_active); zp != NULL;
4192             zp = list_next(&zone_active, zp)) {
4193                 size_t zlen;
4194
4195                 if (zp == global_zone)
4196                         continue;
4197                 zlen = strlen(zp->zone_rootpath);
4198                 if (strncmp(rootpath, zp->zone_rootpath, MIN(plen, zlen)) == 0)
4199                         return (B_TRUE);
4200         }
4201         return (B_FALSE);
4202 }
4203
4204 /*
4205  * Copy a priv_set_t in from userland and install it as zone->zone_privset.
4206  */
4207 static int
4208 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4209     size_t zone_privssz)
4210 {
4211         priv_set_t *kprivs;
4212
4213         if (zone_privssz < sizeof (*kprivs))
4214                 return (ENOMEM);        /* user buffer is too small */
4215         kprivs = kmem_alloc(sizeof (*kprivs), KM_SLEEP);
4216         if (copyin(zone_privs, kprivs, sizeof (*kprivs)) == 0) {
4217                 zone->zone_privset = kprivs;
4218                 return (0);
4219         }
4220         kmem_free(kprivs, sizeof (*kprivs));
4221         return (EFAULT);
4222 }
4223
4224 /*
4225  * We make creative use of nvlists to pass in rctls from userland.  The list is
4226  * a list of the following structures:
4227  *
4228  * (name = rctl_name, value = nvpair_list_array)
4229  *
4230  * Where each element of the nvpair_list_array is of the form:
4231  *
4232  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4233  *      (name = "limit", value = uint64_t),
4234  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4235  */
4236 static int
4237 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4238 {
4239         nvpair_t *nvp = NULL;
4240         nvlist_t *nvl = NULL;
4241         char *kbuf;
4242         int error;
4243         rctl_val_t rv;
4244
4245         *nvlp = NULL;
4246
4247         if (buflen == 0)
4248                 return (0);
4249
4250         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4251                 return (ENOMEM);
4252         if (copyin(ubuf, kbuf, buflen)) {
4253                 error = EFAULT;
4254                 goto out;
4255         }
4256         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4257                 /*
4258                  * nvl may have been allocated/free'd, but the value set to
4259                  * non-NULL, so we reset it here.
4260                  */
4261                 nvl = NULL;
4262                 error = EINVAL;
4263                 goto out;
4264         }
4265         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4266                 rctl_dict_entry_t *rde;
4267                 rctl_hndl_t hndl;
4268                 nvlist_t **nvlarray;
4269                 uint_t i, nelem;
4270                 char *name;
4271
4272                 error = EINVAL;
4273                 name = nvpair_name(nvp);
4274                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4275                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4276                         goto out;
4277                 }
4278                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4279                         goto out;
4280                 }
4281                 rde = rctl_dict_lookup_hndl(hndl);
4282                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4283                 ASSERT(error == 0);
4284                 for (i = 0; i < nelem; i++) {
4285                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4286                                 goto out;
4287                 }
4288                 if (rctl_invalid_value(rde, &rv)) {
4289                         error = EINVAL;
4290                         goto out;
4291                 }
4292         }
4293         error = 0;
4294         *nvlp = nvl;
4295 out:
4296         kmem_free(kbuf, buflen);
4297         if (error && nvl != NULL)
4298                 nvlist_free(nvl);
4299         return (error);
4300 }
4301
4302 int
4303 zone_create_error(int er_error, int er_ext, int *er_out)
4304 {
4305         if (er_out != NULL) {
4306                 if (copyout(&er_ext, er_out, sizeof (int))) {
4307                         return (set_errno(EFAULT));
4308                 }
4309         }
4310         return (set_errno(er_error));
4311 }
4312
4313 static int
4314 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4315 {
4316         ts_label_t *tsl;
4317         bslabel_t blab;
4318
4319         /* Get label from user */
4320         if (copyin(lab, &blab, sizeof (blab)) != 0)
4321                 return (EFAULT);
4322         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4323         if (tsl == NULL)
4324                 return (ENOMEM);
4325
4326         zone->zone_slabel = tsl;
4327         return (0);
4328 }
4329
4330 /*
4331  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4332  */
4333 static int
4334 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4335 {
4336         char *kbuf;
4337         char *dataset, *next;
4338         zone_dataset_t *zd;
4339         size_t len;
4340
4341         if (ubuf == NULL || buflen == 0)
4342                 return (0);
4343
4344         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4345                 return (ENOMEM);
4346
4347         if (copyin(ubuf, kbuf, buflen) != 0) {
4348                 kmem_free(kbuf, buflen);
4349                 return (EFAULT);
4350         }
4351
4352         dataset = next = kbuf;
4353         for (;;) {
4354                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4355
4356                 next = strchr(dataset, ',');
4357
4358                 if (next == NULL)
4359                         len = strlen(dataset);
4360                 else
4361                         len = next - dataset;
4362
4363                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4364                 bcopy(dataset, zd->zd_dataset, len);
4365                 zd->zd_dataset[len] = '\0';
4366
4367                 list_insert_head(&zone->zone_datasets, zd);
4368
4369                 if (next == NULL)
4370                         break;
4371
4372                 dataset = next + 1;
4373         }
4374
4375         kmem_free(kbuf, buflen);
4376         return (0);
4377 }
4378
4379 /*
4380  * System call to create/initialize a new zone named 'zone_name', rooted
4381  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4382  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4383  * with labeling set by 'match', 'doi', and 'label'.
4384  *
4385  * If extended error is non-null, we may use it to return more detailed
4386  * error information.
4387  */
4388 static zoneid_t
4389 zone_create(const char *zone_name, const char *zone_root,
4390     const priv_set_t *zone_privs, size_t zone_privssz,
4391     caddr_t rctlbuf, size_t rctlbufsz,
4392     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4393     int match, uint32_t doi, const bslabel_t *label,
4394     int flags)
4395 {
4396         struct zsched_arg zarg;
4397         nvlist_t *rctls = NULL;
4398         proc_t *pp = curproc;
4399         zone_t *zone, *ztmp;
4400         zoneid_t zoneid, start = GLOBAL_ZONEID;
4401         int error;
4402         int error2 = 0;
4403         char *str;
4404         cred_t *zkcr;
4405         boolean_t insert_label_hash;
4406
4407         if (secpolicy_zone_config(CRED()) != 0)
4408                 return (set_errno(EPERM));
4409
4410         /* can't boot zone from within chroot environment */
4411         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4412                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4413                     extended_error));
4414         /*
4415          * As the first step of zone creation, we want to allocate a zoneid.
4416          * This allocation is complicated by the fact that netstacks use the
4417          * zoneid to determine their stackid, but netstacks themselves are
4418          * freed asynchronously with respect to zone destruction.  This means
4419          * that a netstack reference leak (or in principle, an extraordinarily
4420          * long netstack reference hold) could result in a zoneid being
4421          * allocated that in fact corresponds to a stackid from an active
4422          * (referenced) netstack -- unleashing all sorts of havoc when that
4423          * netstack is actually (re)used.  (In the abstract, we might wish a
4424          * zoneid to not be deallocated until its last referencing netstack
4425          * has been released, but netstacks lack a backpointer into their
4426          * referencing zone -- and changing them to have such a pointer would
4427          * be substantial, to put it euphemistically.)  To avoid this, we
4428          * detect this condition on allocation: if we have allocated a zoneid
4429          * that corresponds to a netstack that's still in use, we warn about
4430          * it (as it is much more likely to be a reference leak than an actual
4431          * netstack reference), free it, and allocate another.  That these
4432          * identifers are allocated out of an ID space assures that we won't
4433          * see the identifier we just allocated.
4434          */
4435         for (;;) {
4436                 zoneid = id_alloc(zoneid_space);
4437
4438                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4439                         break;
4440
4441                 id_free(zoneid_space, zoneid);
4442
4443                 if (start == GLOBAL_ZONEID) {
4444                         start = zoneid;
4445                 } else if (zoneid == start) {
4446                         /*
4447                          * We have managed to iterate over the entire available
4448                          * zoneid space -- there are no identifiers available,
4449                          * presumably due to some number of leaked netstack
4450                          * references.  While it's in principle possible for us
4451                          * to continue to try, it seems wiser to give up at
4452                          * this point to warn and fail explicitly with a
4453                          * distinctive error.
4454                          */
4455                         cmn_err(CE_WARN, "zone_create() failed: all available "
4456                             "zone IDs have netstacks still in use");
4457                         return (set_errno(ENFILE));
4458                 }
4459
4460                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4461                     "netstack still in use", zoneid);
4462         }
4463
4464         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4465         zone->zone_id = zoneid;
4466         zone->zone_status = ZONE_IS_UNINITIALIZED;
4467         zone->zone_pool = pool_default;
4468         zone->zone_pool_mod = gethrtime();
4469         zone->zone_psetid = ZONE_PS_INVAL;
4470         zone->zone_ncpus = 0;
4471         zone->zone_ncpus_online = 0;
4472         zone->zone_restart_init = B_TRUE;
4473         zone->zone_brand = &native_brand;
4474         zone->zone_initname = NULL;
4475         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4476         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4477         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4478         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4479         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4480             offsetof(zone_ref_t, zref_linkage));
4481         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4482             offsetof(struct zsd_entry, zsd_linkage));
4483         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4484             offsetof(zone_dataset_t, zd_linkage));
4485         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4486             offsetof(zone_dl_t, zdl_linkage));
4487         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4488         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4489
4490         if (flags & ZCF_NET_EXCL) {
4491                 zone->zone_flags |= ZF_NET_EXCL;
4492         }
4493
4494         if ((error = zone_set_name(zone, zone_name)) != 0) {
4495                 zone_free(zone);
4496                 return (zone_create_error(error, 0, extended_error));
4497         }
4498
4499         if ((error = zone_set_root(zone, zone_root)) != 0) {
4500                 zone_free(zone);
4501                 return (zone_create_error(error, 0, extended_error));
4502         }
4503         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4504                 zone_free(zone);
4505                 return (zone_create_error(error, 0, extended_error));
4506         }
4507
4508         /* initialize node name to be the same as zone name */
4509         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4510         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4511         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4512
4513         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4514         zone->zone_domain[0] = '\0';
4515         zone->zone_hostid = HW_INVALID_HOSTID;
4516         zone->zone_shares = 1;
4517         zone->zone_shmmax = 0;
4518         zone->zone_ipc.ipcq_shmmni = 0;
4519         zone->zone_ipc.ipcq_semmni = 0;
4520         zone->zone_ipc.ipcq_msgmni = 0;
4521         zone->zone_bootargs = NULL;
4522         zone->zone_fs_allowed = NULL;
4523
4524         secflags_zero(&zone0.zone_secflags.psf_lower);
4525         secflags_zero(&zone0.zone_secflags.psf_effective);
4526         secflags_zero(&zone0.zone_secflags.psf_inherit);
4527         secflags_fullset(&zone0.zone_secflags.psf_upper);
4528
4529         zone->zone_initname =
4530             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4531         (void) strcpy(zone->zone_initname, zone_default_initname);
4532         zone->zone_nlwps = 0;
4533         zone->zone_nlwps_ctl = INT_MAX;
4534         zone->zone_nprocs = 0;
4535         zone->zone_nprocs_ctl = INT_MAX;
4536         zone->zone_locked_mem = 0;
4537         zone->zone_locked_mem_ctl = UINT64_MAX;
4538         zone->zone_max_swap = 0;
4539         zone->zone_max_swap_ctl = UINT64_MAX;
4540         zone->zone_max_lofi = 0;
4541         zone->zone_max_lofi_ctl = UINT64_MAX;
4542         zone0.zone_lockedmem_kstat = NULL;
4543         zone0.zone_swapresv_kstat = NULL;
4544
4545         /*
4546          * Zsched initializes the rctls.
4547          */
4548         zone->zone_rctls = NULL;
4549
4550         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4551                 zone_free(zone);
4552                 return (zone_create_error(error, 0, extended_error));
4553         }
4554
4555         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4556                 zone_free(zone);
4557                 return (set_errno(error));
4558         }
4559
4560         /*
4561          * Read in the trusted system parameters:
4562          * match flag and sensitivity label.
4563          */
4564         zone->zone_match = match;
4565         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4566                 /* Fail if requested to set doi to anything but system's doi */
4567                 if (doi != 0 && doi != default_doi) {
4568                         zone_free(zone);
4569                         return (set_errno(EINVAL));
4570                 }
4571                 /* Always apply system's doi to the zone */
4572                 error = zone_set_label(zone, label, default_doi);
4573                 if (error != 0) {
4574                         zone_free(zone);
4575                         return (set_errno(error));
4576                 }
4577                 insert_label_hash = B_TRUE;
4578         } else {
4579                 /* all zones get an admin_low label if system is not labeled */
4580                 zone->zone_slabel = l_admin_low;
4581                 label_hold(l_admin_low);
4582                 insert_label_hash = B_FALSE;
4583         }
4584
4585         /*
4586          * Stop all lwps since that's what normally happens as part of fork().
4587          * This needs to happen before we grab any locks to avoid deadlock
4588          * (another lwp in the process could be waiting for the held lock).
4589          */
4590         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4591                 zone_free(zone);
4592                 nvlist_free(rctls);
4593                 return (zone_create_error(error, 0, extended_error));
4594         }
4595
4596         if (block_mounts(zone) == 0) {
4597                 mutex_enter(&pp->p_lock);
4598                 if (curthread != pp->p_agenttp)
4599                         continuelwps(pp);
4600                 mutex_exit(&pp->p_lock);
4601                 zone_free(zone);
4602                 nvlist_free(rctls);
4603                 return (zone_create_error(error, 0, extended_error));
4604         }
4605
4606         /*
4607          * Set up credential for kernel access.  After this, any errors
4608          * should go through the dance in errout rather than calling
4609          * zone_free directly.
4610          */
4611         zone->zone_kcred = crdup(kcred);
4612         crsetzone(zone->zone_kcred, zone);
4613         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4614         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4615         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4616         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4617
4618         mutex_enter(&zonehash_lock);
4619         /*
4620          * Make sure zone doesn't already exist.
4621          *
4622          * If the system and zone are labeled,
4623          * make sure no other zone exists that has the same label.
4624          */
4625         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4626             (insert_label_hash &&
4627             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4628                 zone_status_t status;
4629
4630                 status = zone_status_get(ztmp);
4631                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4632                         error = EEXIST;
4633                 else
4634                         error = EBUSY;
4635
4636                 if (insert_label_hash)
4637                         error2 = ZE_LABELINUSE;
4638
4639                 goto errout;
4640         }
4641
4642         /*
4643          * Don't allow zone creations which would cause one zone's rootpath to
4644          * be accessible from that of another (non-global) zone.
4645          */
4646         if (zone_is_nested(zone->zone_rootpath)) {
4647                 error = EBUSY;
4648                 goto errout;
4649         }
4650
4651         ASSERT(zonecount != 0);         /* check for leaks */
4652         if (zonecount + 1 > maxzones) {
4653                 error = ENOMEM;
4654                 goto errout;
4655         }
4656
4657         if (zone_mount_count(zone->zone_rootpath) != 0) {
4658                 error = EBUSY;
4659                 error2 = ZE_AREMOUNTS;
4660                 goto errout;
4661         }
4662
4663         /*
4664          * Zone is still incomplete, but we need to drop all locks while
4665          * zsched() initializes this zone's kernel process.  We
4666          * optimistically add the zone to the hashtable and associated
4667          * lists so a parallel zone_create() doesn't try to create the
4668          * same zone.
4669          */
4670         zonecount++;
4671         (void) mod_hash_insert(zonehashbyid,
4672             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4673             (mod_hash_val_t)(uintptr_t)zone);
4674         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4675         (void) strcpy(str, zone->zone_name);
4676         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4677             (mod_hash_val_t)(uintptr_t)zone);
4678         if (insert_label_hash) {
4679                 (void) mod_hash_insert(zonehashbylabel,
4680                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4681                 zone->zone_flags |= ZF_HASHED_LABEL;
4682         }
4683
4684         /*
4685          * Insert into active list.  At this point there are no 'hold's
4686          * on the zone, but everyone else knows not to use it, so we can
4687          * continue to use it.  zsched() will do a zone_hold() if the
4688          * newproc() is successful.
4689          */
4690         list_insert_tail(&zone_active, zone);
4691         mutex_exit(&zonehash_lock);
4692
4693         zarg.zone = zone;
4694         zarg.nvlist = rctls;
4695         /*
4696          * The process, task, and project rctls are probably wrong;
4697          * we need an interface to get the default values of all rctls,
4698          * and initialize zsched appropriately.  I'm not sure that that
4699          * makes much of a difference, though.
4700          */
4701         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4702         if (error != 0) {
4703                 /*
4704                  * We need to undo all globally visible state.
4705                  */
4706                 mutex_enter(&zonehash_lock);
4707                 list_remove(&zone_active, zone);
4708                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4709                         ASSERT(zone->zone_slabel != NULL);
4710                         (void) mod_hash_destroy(zonehashbylabel,
4711                             (mod_hash_key_t)zone->zone_slabel);
4712                 }
4713                 (void) mod_hash_destroy(zonehashbyname,
4714                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4715                 (void) mod_hash_destroy(zonehashbyid,
4716                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4717                 ASSERT(zonecount > 1);
4718                 zonecount--;
4719                 goto errout;
4720         }
4721
4722         /*
4723          * Zone creation can't fail from now on.
4724          */
4725
4726         /*
4727          * Create zone kstats
4728          */
4729         zone_kstat_create(zone);
4730
4731         /*
4732          * Let the other lwps continue.
4733          */
4734         mutex_enter(&pp->p_lock);
4735         if (curthread != pp->p_agenttp)
4736                 continuelwps(pp);
4737         mutex_exit(&pp->p_lock);
4738
4739         /*
4740          * Wait for zsched to finish initializing the zone.
4741          */
4742         zone_status_wait(zone, ZONE_IS_READY);
4743         /*
4744          * The zone is fully visible, so we can let mounts progress.
4745          */
4746         resume_mounts(zone);
4747         nvlist_free(rctls);
4748
4749         return (zoneid);
4750
4751 errout:
4752         mutex_exit(&zonehash_lock);
4753         /*
4754          * Let the other lwps continue.
4755          */
4756         mutex_enter(&pp->p_lock);
4757         if (curthread != pp->p_agenttp)
4758                 continuelwps(pp);
4759         mutex_exit(&pp->p_lock);
4760
4761         resume_mounts(zone);
4762         nvlist_free(rctls);
4763         /*
4764          * There is currently one reference to the zone, a cred_ref from
4765          * zone_kcred.  To free the zone, we call crfree, which will call
4766          * zone_cred_rele, which will call zone_free.
4767          */
4768         ASSERT(zone->zone_cred_ref == 1);
4769         ASSERT(zone->zone_kcred->cr_ref == 1);
4770         ASSERT(zone->zone_ref == 0);
4771         zkcr = zone->zone_kcred;
4772         zone->zone_kcred = NULL;
4773         crfree(zkcr);                           /* triggers call to zone_free */
4774         return (zone_create_error(error, error2, extended_error));
4775 }
4776
4777 /*
4778  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4779  * the heavy lifting.  initname is the path to the program to launch
4780  * at the "top" of the zone; if this is NULL, we use the system default,
4781  * which is stored at zone_default_initname.
4782  */
4783 static int
4784 zone_boot(zoneid_t zoneid)
4785 {
4786         int err;
4787         zone_t *zone;
4788
4789         if (secpolicy_zone_config(CRED()) != 0)
4790                 return (set_errno(EPERM));
4791         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4792                 return (set_errno(EINVAL));
4793
4794         mutex_enter(&zonehash_lock);
4795         /*
4796          * Look for zone under hash lock to prevent races with calls to
4797          * zone_shutdown, zone_destroy, etc.
4798          */
4799         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4800                 mutex_exit(&zonehash_lock);
4801                 return (set_errno(EINVAL));
4802         }
4803
4804         mutex_enter(&zone_status_lock);
4805         if (zone_status_get(zone) != ZONE_IS_READY) {
4806                 mutex_exit(&zone_status_lock);
4807                 mutex_exit(&zonehash_lock);
4808                 return (set_errno(EINVAL));
4809         }
4810         zone_status_set(zone, ZONE_IS_BOOTING);
4811         mutex_exit(&zone_status_lock);
4812
4813         zone_hold(zone);        /* so we can use the zone_t later */
4814         mutex_exit(&zonehash_lock);
4815
4816         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4817                 zone_rele(zone);
4818                 return (set_errno(EINTR));
4819         }
4820
4821         /*
4822          * Boot (starting init) might have failed, in which case the zone
4823          * will go to the SHUTTING_DOWN state; an appropriate errno will
4824          * be placed in zone->zone_boot_err, and so we return that.
4825          */
4826         err = zone->zone_boot_err;
4827         zone_rele(zone);
4828         return (err ? set_errno(err) : 0);
4829 }
4830
4831 /*
4832  * Kills all user processes in the zone, waiting for them all to exit
4833  * before returning.
4834  */
4835 static int
4836 zone_empty(zone_t *zone)
4837 {
4838         int waitstatus;
4839
4840         /*
4841          * We need to drop zonehash_lock before killing all
4842          * processes, otherwise we'll deadlock with zone_find_*
4843          * which can be called from the exit path.
4844          */
4845         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4846         while ((waitstatus = zone_status_timedwait_sig(zone,
4847             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4848                 killall(zone->zone_id);
4849         }
4850         /*
4851          * return EINTR if we were signaled
4852          */
4853         if (waitstatus == 0)
4854                 return (EINTR);
4855         return (0);
4856 }
4857
4858 /*
4859  * This function implements the policy for zone visibility.
4860  *
4861  * In standard Solaris, a non-global zone can only see itself.
4862  *
4863  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4864  * it dominates. For this test, the label of the global zone is treated as
4865  * admin_high so it is special-cased instead of being checked for dominance.
4866  *
4867  * Returns true if zone attributes are viewable, false otherwise.
4868  */
4869 static boolean_t
4870 zone_list_access(zone_t *zone)
4871 {
4872
4873         if (curproc->p_zone == global_zone ||
4874             curproc->p_zone == zone) {
4875                 return (B_TRUE);
4876         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4877                 bslabel_t *curproc_label;
4878                 bslabel_t *zone_label;
4879
4880                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4881                 zone_label = label2bslabel(zone->zone_slabel);
4882
4883                 if (zone->zone_id != GLOBAL_ZONEID &&
4884                     bldominates(curproc_label, zone_label)) {
4885                         return (B_TRUE);
4886                 } else {
4887                         return (B_FALSE);
4888                 }
4889         } else {
4890                 return (B_FALSE);
4891         }
4892 }
4893
4894 /*
4895  * Systemcall to start the zone's halt sequence.  By the time this
4896  * function successfully returns, all user processes and kernel threads
4897  * executing in it will have exited, ZSD shutdown callbacks executed,
4898  * and the zone status set to ZONE_IS_DOWN.
4899  *
4900  * It is possible that the call will interrupt itself if the caller is the
4901  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4902  */
4903 static int
4904 zone_shutdown(zoneid_t zoneid)
4905 {
4906         int error;
4907         zone_t *zone;
4908         zone_status_t status;
4909
4910         if (secpolicy_zone_config(CRED()) != 0)
4911                 return (set_errno(EPERM));
4912         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4913                 return (set_errno(EINVAL));
4914
4915         mutex_enter(&zonehash_lock);
4916         /*
4917          * Look for zone under hash lock to prevent races with other
4918          * calls to zone_shutdown and zone_destroy.
4919          */
4920         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4921                 mutex_exit(&zonehash_lock);
4922                 return (set_errno(EINVAL));
4923         }
4924
4925         /*
4926          * We have to drop zonehash_lock before calling block_mounts.
4927          * Hold the zone so we can continue to use the zone_t.
4928          */
4929         zone_hold(zone);
4930         mutex_exit(&zonehash_lock);
4931
4932         /*
4933          * Block mounts so that VFS_MOUNT() can get an accurate view of
4934          * the zone's status with regards to ZONE_IS_SHUTTING down.
4935          *
4936          * e.g. NFS can fail the mount if it determines that the zone
4937          * has already begun the shutdown sequence.
4938          *
4939          */
4940         if (block_mounts(zone) == 0) {
4941                 zone_rele(zone);
4942                 return (set_errno(EINTR));
4943         }
4944
4945         mutex_enter(&zonehash_lock);
4946         mutex_enter(&zone_status_lock);
4947         status = zone_status_get(zone);
4948         /*
4949          * Fail if the zone isn't fully initialized yet.
4950          */
4951         if (status < ZONE_IS_READY) {
4952                 mutex_exit(&zone_status_lock);
4953                 mutex_exit(&zonehash_lock);
4954                 resume_mounts(zone);
4955                 zone_rele(zone);
4956                 return (set_errno(EINVAL));
4957         }
4958         /*
4959          * If conditions required for zone_shutdown() to return have been met,
4960          * return success.
4961          */
4962         if (status >= ZONE_IS_DOWN) {
4963                 mutex_exit(&zone_status_lock);
4964                 mutex_exit(&zonehash_lock);
4965                 resume_mounts(zone);
4966                 zone_rele(zone);
4967                 return (0);
4968         }
4969         /*
4970          * If zone_shutdown() hasn't been called before, go through the motions.
4971          * If it has, there's nothing to do but wait for the kernel threads to
4972          * drain.
4973          */
4974         if (status < ZONE_IS_EMPTY) {
4975                 uint_t ntasks;
4976
4977                 mutex_enter(&zone->zone_lock);
4978                 if ((ntasks = zone->zone_ntasks) != 1) {
4979                         /*
4980                          * There's still stuff running.
4981                          */
4982                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4983                 }
4984                 mutex_exit(&zone->zone_lock);
4985                 if (ntasks == 1) {
4986                         /*
4987                          * The only way to create another task is through
4988                          * zone_enter(), which will block until we drop
4989                          * zonehash_lock.  The zone is empty.
4990                          */
4991                         if (zone->zone_kthreads == NULL) {
4992                                 /*
4993                                  * Skip ahead to ZONE_IS_DOWN
4994                                  */
4995                                 zone_status_set(zone, ZONE_IS_DOWN);
4996                         } else {
4997                                 zone_status_set(zone, ZONE_IS_EMPTY);
4998                         }
4999                 }
5000         }
5001         mutex_exit(&zone_status_lock);
5002         mutex_exit(&zonehash_lock);
5003         resume_mounts(zone);
5004
5005         if (error = zone_empty(zone)) {
5006                 zone_rele(zone);
5007                 return (set_errno(error));
5008         }
5009         /*
5010          * After the zone status goes to ZONE_IS_DOWN this zone will no
5011          * longer be notified of changes to the pools configuration, so
5012          * in order to not end up with a stale pool pointer, we point
5013          * ourselves at the default pool and remove all resource
5014          * visibility.  This is especially important as the zone_t may
5015          * languish on the deathrow for a very long time waiting for
5016          * cred's to drain out.
5017          *
5018          * This rebinding of the zone can happen multiple times
5019          * (presumably due to interrupted or parallel systemcalls)
5020          * without any adverse effects.
5021          */
5022         if (pool_lock_intr() != 0) {
5023                 zone_rele(zone);
5024                 return (set_errno(EINTR));
5025         }
5026         if (pool_state == POOL_ENABLED) {
5027                 mutex_enter(&cpu_lock);
5028                 zone_pool_set(zone, pool_default);
5029                 /*
5030                  * The zone no longer needs to be able to see any cpus.
5031                  */
5032                 zone_pset_set(zone, ZONE_PS_INVAL);
5033                 mutex_exit(&cpu_lock);
5034         }
5035         pool_unlock();
5036
5037         /*
5038          * ZSD shutdown callbacks can be executed multiple times, hence
5039          * it is safe to not be holding any locks across this call.
5040          */
5041         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5042
5043         mutex_enter(&zone_status_lock);
5044         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5045                 zone_status_set(zone, ZONE_IS_DOWN);
5046         mutex_exit(&zone_status_lock);
5047
5048         /*
5049          * Wait for kernel threads to drain.
5050          */
5051         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5052                 zone_rele(zone);
5053                 return (set_errno(EINTR));
5054         }
5055
5056         /*
5057          * Zone can become down/destroyable even if the above wait
5058          * returns EINTR, so any code added here may never execute.
5059          * (i.e. don't add code here)
5060          */
5061
5062         zone_rele(zone);
5063         return (0);
5064 }
5065
5066 /*
5067  * Log the specified zone's reference counts.  The caller should not be
5068  * holding the zone's zone_lock.
5069  */
5070 static void
5071 zone_log_refcounts(zone_t *zone)
5072 {
5073         char *buffer;
5074         char *buffer_position;
5075         uint32_t buffer_size;
5076         uint32_t index;
5077         uint_t ref;
5078         uint_t cred_ref;
5079
5080         /*
5081          * Construct a string representing the subsystem-specific reference
5082          * counts.  The counts are printed in ascending order by index into the
5083          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5084          * square brackets [] and will only contain nonzero reference counts.
5085          *
5086          * The buffer will hold two square bracket characters plus ten digits,
5087          * one colon, one space, one comma, and some characters for a
5088          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5089          * bit integers have at most ten decimal digits.)  The last
5090          * reference count's comma is replaced by the closing square
5091          * bracket and a NULL character to terminate the string.
5092          *
5093          * NOTE: We have to grab the zone's zone_lock to create a consistent
5094          * snapshot of the zone's reference counters.
5095          *
5096          * First, figure out how much space the string buffer will need.
5097          * The buffer's size is stored in buffer_size.
5098          */
5099         buffer_size = 2;                        /* for the square brackets */
5100         mutex_enter(&zone->zone_lock);
5101         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5102         ref = zone->zone_ref;
5103         cred_ref = zone->zone_cred_ref;
5104         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5105                 if (zone->zone_subsys_ref[index] != 0)
5106                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5107                             13;
5108         if (buffer_size == 2) {
5109                 /*
5110                  * No subsystems had nonzero reference counts.  Don't bother
5111                  * with allocating a buffer; just log the general-purpose and
5112                  * credential reference counts.
5113                  */
5114                 mutex_exit(&zone->zone_lock);
5115                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5116                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5117                     "references and %u credential references are still extant",
5118                     zone->zone_name, zone->zone_id, ref, cred_ref);
5119                 return;
5120         }
5121
5122         /*
5123          * buffer_size contains the exact number of characters that the
5124          * buffer will need.  Allocate the buffer and fill it with nonzero
5125          * subsystem-specific reference counts.  Surround the results with
5126          * square brackets afterwards.
5127          */
5128         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5129         buffer_position = &buffer[1];
5130         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5131                 /*
5132                  * NOTE: The DDI's version of sprintf() returns a pointer to
5133                  * the modified buffer rather than the number of bytes written
5134                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5135                  * Therefore, we'll use snprintf() with INT_MAX to get the
5136                  * number of bytes written.  Using INT_MAX is safe because
5137                  * the buffer is perfectly sized for the data: we'll never
5138                  * overrun the buffer.
5139                  */
5140                 if (zone->zone_subsys_ref[index] != 0)
5141                         buffer_position += snprintf(buffer_position, INT_MAX,
5142                             "%s: %u,", zone_ref_subsys_names[index],
5143                             zone->zone_subsys_ref[index]);
5144         }
5145         mutex_exit(&zone->zone_lock);
5146         buffer[0] = '[';
5147         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5148         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5149         buffer_position[-1] = ']';
5150
5151         /*
5152          * Log the reference counts and free the message buffer.
5153          */
5154         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5155             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5156             "%u credential references are still extant %s", zone->zone_name,
5157             zone->zone_id, ref, cred_ref, buffer);
5158         kmem_free(buffer, buffer_size);
5159 }
5160
5161 /*
5162  * Systemcall entry point to finalize the zone halt process.  The caller
5163  * must have already successfully called zone_shutdown().
5164  *
5165  * Upon successful completion, the zone will have been fully destroyed:
5166  * zsched will have exited, destructor callbacks executed, and the zone
5167  * removed from the list of active zones.
5168  */
5169 static int
5170 zone_destroy(zoneid_t zoneid)
5171 {
5172         uint64_t uniqid;
5173         zone_t *zone;
5174         zone_status_t status;
5175         clock_t wait_time;
5176         boolean_t log_refcounts;
5177
5178         if (secpolicy_zone_config(CRED()) != 0)
5179                 return (set_errno(EPERM));
5180         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5181                 return (set_errno(EINVAL));
5182
5183         mutex_enter(&zonehash_lock);
5184         /*
5185          * Look for zone under hash lock to prevent races with other
5186          * calls to zone_destroy.
5187          */
5188         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5189                 mutex_exit(&zonehash_lock);
5190                 return (set_errno(EINVAL));
5191         }
5192
5193         if (zone_mount_count(zone->zone_rootpath) != 0) {
5194                 mutex_exit(&zonehash_lock);
5195                 return (set_errno(EBUSY));
5196         }
5197         mutex_enter(&zone_status_lock);
5198         status = zone_status_get(zone);
5199         if (status < ZONE_IS_DOWN) {
5200                 mutex_exit(&zone_status_lock);
5201                 mutex_exit(&zonehash_lock);
5202                 return (set_errno(EBUSY));
5203         } else if (status == ZONE_IS_DOWN) {
5204                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5205         }
5206         mutex_exit(&zone_status_lock);
5207         zone_hold(zone);
5208         mutex_exit(&zonehash_lock);
5209
5210         /*
5211          * wait for zsched to exit
5212          */
5213         zone_status_wait(zone, ZONE_IS_DEAD);
5214         zone_zsd_callbacks(zone, ZSD_DESTROY);
5215         zone->zone_netstack = NULL;
5216         uniqid = zone->zone_uniqid;
5217         zone_rele(zone);
5218         zone = NULL;    /* potentially free'd */
5219
5220         log_refcounts = B_FALSE;
5221         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5222         mutex_enter(&zonehash_lock);
5223         for (; /* ever */; ) {
5224                 boolean_t unref;
5225                 boolean_t refs_have_been_logged;
5226
5227                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5228                     zone->zone_uniqid != uniqid) {
5229                         /*
5230                          * The zone has gone away.  Necessary conditions
5231                          * are met, so we return success.
5232                          */
5233                         mutex_exit(&zonehash_lock);
5234                         return (0);
5235                 }
5236                 mutex_enter(&zone->zone_lock);
5237                 unref = ZONE_IS_UNREF(zone);
5238                 refs_have_been_logged = (zone->zone_flags &
5239                     ZF_REFCOUNTS_LOGGED);
5240                 mutex_exit(&zone->zone_lock);
5241                 if (unref) {
5242                         /*
5243                          * There is only one reference to the zone -- that
5244                          * added when the zone was added to the hashtables --
5245                          * and things will remain this way until we drop
5246                          * zonehash_lock... we can go ahead and cleanup the
5247                          * zone.
5248                          */
5249                         break;
5250                 }
5251
5252                 /*
5253                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5254                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5255                  * some zone's general-purpose reference count reaches one.
5256                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5257                  * on zone_destroy_cv, then log the zone's reference counts and
5258                  * continue to wait for zone_rele() and zone_cred_rele().
5259                  */
5260                 if (!refs_have_been_logged) {
5261                         if (!log_refcounts) {
5262                                 /*
5263                                  * This thread hasn't timed out waiting on
5264                                  * zone_destroy_cv yet.  Wait wait_time clock
5265                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5266                                  * seconds) for the zone's references to clear.
5267                                  */
5268                                 ASSERT(wait_time > 0);
5269                                 wait_time = cv_reltimedwait_sig(
5270                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5271                                     TR_SEC);
5272                                 if (wait_time > 0) {
5273                                         /*
5274                                          * A thread in zone_rele() or
5275                                          * zone_cred_rele() signaled
5276                                          * zone_destroy_cv before this thread's
5277                                          * wait timed out.  The zone might have
5278                                          * only one reference left; find out!
5279                                          */
5280                                         continue;
5281                                 } else if (wait_time == 0) {
5282                                         /* The thread's process was signaled. */
5283                                         mutex_exit(&zonehash_lock);
5284                                         return (set_errno(EINTR));
5285                                 }
5286
5287                                 /*
5288                                  * The thread timed out while waiting on
5289                                  * zone_destroy_cv.  Even though the thread
5290                                  * timed out, it has to check whether another
5291                                  * thread woke up from zone_destroy_cv and
5292                                  * destroyed the zone.
5293                                  *
5294                                  * If the zone still exists and has more than
5295                                  * one unreleased general-purpose reference,
5296                                  * then log the zone's reference counts.
5297                                  */
5298                                 log_refcounts = B_TRUE;
5299                                 continue;
5300                         }
5301
5302                         /*
5303                          * The thread already timed out on zone_destroy_cv while
5304                          * waiting for subsystems to release the zone's last
5305                          * general-purpose references.  Log the zone's reference
5306                          * counts and wait indefinitely on zone_destroy_cv.
5307                          */
5308                         zone_log_refcounts(zone);
5309                 }
5310                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5311                         /* The thread's process was signaled. */
5312                         mutex_exit(&zonehash_lock);
5313                         return (set_errno(EINTR));
5314                 }
5315         }
5316
5317         /*
5318          * Remove CPU cap for this zone now since we're not going to
5319          * fail below this point.
5320          */
5321         cpucaps_zone_remove(zone);
5322
5323         /* Get rid of the zone's kstats */
5324         zone_kstat_delete(zone);
5325
5326         /* remove the pfexecd doors */
5327         if (zone->zone_pfexecd != NULL) {
5328                 klpd_freelist(&zone->zone_pfexecd);
5329                 zone->zone_pfexecd = NULL;
5330         }
5331
5332         /* free brand specific data */
5333         if (ZONE_IS_BRANDED(zone))
5334                 ZBROP(zone)->b_free_brand_data(zone);
5335
5336         /* Say goodbye to brand framework. */
5337         brand_unregister_zone(zone->zone_brand);
5338
5339         /*
5340          * It is now safe to let the zone be recreated; remove it from the
5341          * lists.  The memory will not be freed until the last cred
5342          * reference goes away.
5343          */
5344         ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5345         zonecount--;
5346         /* remove from active list and hash tables */
5347         list_remove(&zone_active, zone);
5348         (void) mod_hash_destroy(zonehashbyname,
5349             (mod_hash_key_t)zone->zone_name);
5350         (void) mod_hash_destroy(zonehashbyid,
5351             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5352         if (zone->zone_flags & ZF_HASHED_LABEL)
5353                 (void) mod_hash_destroy(zonehashbylabel,
5354                     (mod_hash_key_t)zone->zone_slabel);
5355         mutex_exit(&zonehash_lock);
5356
5357         /*
5358          * Release the root vnode; we're not using it anymore.  Nor should any
5359          * other thread that might access it exist.
5360          */
5361         if (zone->zone_rootvp != NULL) {
5362                 VN_RELE(zone->zone_rootvp);
5363                 zone->zone_rootvp = NULL;
5364         }
5365
5366         /* add to deathrow list */
5367         mutex_enter(&zone_deathrow_lock);
5368         list_insert_tail(&zone_deathrow, zone);
5369         mutex_exit(&zone_deathrow_lock);
5370
5371         /*
5372          * Drop last reference (which was added by zsched()), this will
5373          * free the zone unless there are outstanding cred references.
5374          */
5375         zone_rele(zone);
5376         return (0);
5377 }
5378
5379 /*
5380  * Systemcall entry point for zone_getattr(2).
5381  */
5382 static ssize_t
5383 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5384 {
5385         size_t size;
5386         int error = 0, err;
5387         zone_t *zone;
5388         char *zonepath;
5389         char *outstr;
5390         zone_status_t zone_status;
5391         pid_t initpid;
5392         boolean_t global = (curzone == global_zone);
5393         boolean_t inzone = (curzone->zone_id == zoneid);
5394         ushort_t flags;
5395         zone_net_data_t *zbuf;
5396
5397         mutex_enter(&zonehash_lock);
5398         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5399                 mutex_exit(&zonehash_lock);
5400                 return (set_errno(EINVAL));
5401         }
5402         zone_status = zone_status_get(zone);
5403         if (zone_status < ZONE_IS_INITIALIZED) {
5404                 mutex_exit(&zonehash_lock);
5405                 return (set_errno(EINVAL));
5406         }
5407         zone_hold(zone);
5408         mutex_exit(&zonehash_lock);
5409
5410         /*
5411          * If not in the global zone, don't show information about other zones,
5412          * unless the system is labeled and the local zone's label dominates
5413          * the other zone.
5414          */
5415         if (!zone_list_access(zone)) {
5416                 zone_rele(zone);
5417                 return (set_errno(EINVAL));
5418         }
5419
5420         switch (attr) {
5421         case ZONE_ATTR_ROOT:
5422                 if (global) {
5423                         /*
5424                          * Copy the path to trim the trailing "/" (except for
5425                          * the global zone).
5426                          */
5427                         if (zone != global_zone)
5428                                 size = zone->zone_rootpathlen - 1;
5429                         else
5430                                 size = zone->zone_rootpathlen;
5431                         zonepath = kmem_alloc(size, KM_SLEEP);
5432                         bcopy(zone->zone_rootpath, zonepath, size);
5433                         zonepath[size - 1] = '\0';
5434                 } else {
5435                         if (inzone || !is_system_labeled()) {
5436                                 /*
5437                                  * Caller is not in the global zone.
5438                                  * if the query is on the current zone
5439                                  * or the system is not labeled,
5440                                  * just return faked-up path for current zone.
5441                                  */
5442                                 zonepath = "/";
5443                                 size = 2;
5444                         } else {
5445                                 /*
5446                                  * Return related path for current zone.
5447                                  */
5448                                 int prefix_len = strlen(zone_prefix);
5449                                 int zname_len = strlen(zone->zone_name);
5450
5451                                 size = prefix_len + zname_len + 1;
5452                                 zonepath = kmem_alloc(size, KM_SLEEP);
5453                                 bcopy(zone_prefix, zonepath, prefix_len);
5454                                 bcopy(zone->zone_name, zonepath +
5455                                     prefix_len, zname_len);
5456                                 zonepath[size - 1] = '\0';
5457                         }
5458                 }
5459                 if (bufsize > size)
5460                         bufsize = size;
5461                 if (buf != NULL) {
5462                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5463                         if (err != 0 && err != ENAMETOOLONG)
5464                                 error = EFAULT;
5465                 }
5466                 if (global || (is_system_labeled() && !inzone))
5467                         kmem_free(zonepath, size);
5468                 break;
5469
5470         case ZONE_ATTR_NAME:
5471                 size = strlen(zone->zone_name) + 1;
5472                 if (bufsize > size)
5473                         bufsize = size;
5474                 if (buf != NULL) {
5475                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5476                         if (err != 0 && err != ENAMETOOLONG)
5477                                 error = EFAULT;
5478                 }
5479                 break;
5480
5481         case ZONE_ATTR_STATUS:
5482                 /*
5483                  * Since we're not holding zonehash_lock, the zone status
5484                  * may be anything; leave it up to userland to sort it out.
5485                  */
5486                 size = sizeof (zone_status);
5487                 if (bufsize > size)
5488                         bufsize = size;
5489                 zone_status = zone_status_get(zone);
5490                 if (buf != NULL &&
5491                     copyout(&zone_status, buf, bufsize) != 0)
5492                         error = EFAULT;
5493                 break;
5494         case ZONE_ATTR_FLAGS:
5495                 size = sizeof (zone->zone_flags);
5496                 if (bufsize > size)
5497                         bufsize = size;
5498                 flags = zone->zone_flags;
5499                 if (buf != NULL &&
5500                     copyout(&flags, buf, bufsize) != 0)
5501                         error = EFAULT;
5502                 break;
5503         case ZONE_ATTR_PRIVSET:
5504                 size = sizeof (priv_set_t);
5505                 if (bufsize > size)
5506                         bufsize = size;
5507                 if (buf != NULL &&
5508                     copyout(zone->zone_privset, buf, bufsize) != 0)
5509                         error = EFAULT;
5510                 break;
5511         case ZONE_ATTR_UNIQID:
5512                 size = sizeof (zone->zone_uniqid);
5513                 if (bufsize > size)
5514                         bufsize = size;
5515                 if (buf != NULL &&
5516                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5517                         error = EFAULT;
5518                 break;
5519         case ZONE_ATTR_POOLID:
5520                 {
5521                         pool_t *pool;
5522                         poolid_t poolid;
5523
5524                         if (pool_lock_intr() != 0) {
5525                                 error = EINTR;
5526                                 break;
5527                         }
5528                         pool = zone_pool_get(zone);
5529                         poolid = pool->pool_id;
5530                         pool_unlock();
5531                         size = sizeof (poolid);
5532                         if (bufsize > size)
5533                                 bufsize = size;
5534                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5535                                 error = EFAULT;
5536                 }
5537                 break;
5538         case ZONE_ATTR_SLBL:
5539                 size = sizeof (bslabel_t);
5540                 if (bufsize > size)
5541                         bufsize = size;
5542                 if (zone->zone_slabel == NULL)
5543                         error = EINVAL;
5544                 else if (buf != NULL &&
5545                     copyout(label2bslabel(zone->zone_slabel), buf,
5546                     bufsize) != 0)
5547                         error = EFAULT;
5548                 break;
5549         case ZONE_ATTR_INITPID:
5550                 size = sizeof (initpid);
5551                 if (bufsize > size)
5552                         bufsize = size;
5553                 initpid = zone->zone_proc_initpid;
5554                 if (initpid == -1) {
5555                         error = ESRCH;
5556                         break;
5557                 }
5558                 if (buf != NULL &&
5559                     copyout(&initpid, buf, bufsize) != 0)
5560                         error = EFAULT;
5561                 break;
5562         case ZONE_ATTR_BRAND:
5563                 size = strlen(zone->zone_brand->b_name) + 1;
5564
5565                 if (bufsize > size)
5566                         bufsize = size;
5567                 if (buf != NULL) {
5568                         err = copyoutstr(zone->zone_brand->b_name, buf,
5569                             bufsize, NULL);
5570                         if (err != 0 && err != ENAMETOOLONG)
5571                                 error = EFAULT;
5572                 }
5573                 break;
5574         case ZONE_ATTR_INITNAME:
5575                 size = strlen(zone->zone_initname) + 1;
5576                 if (bufsize > size)
5577                         bufsize = size;
5578                 if (buf != NULL) {
5579                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5580                             NULL);
5581                         if (err != 0 && err != ENAMETOOLONG)
5582                                 error = EFAULT;
5583                 }
5584                 break;
5585         case ZONE_ATTR_BOOTARGS:
5586                 if (zone->zone_bootargs == NULL)
5587                         outstr = "";
5588                 else
5589                         outstr = zone->zone_bootargs;
5590                 size = strlen(outstr) + 1;
5591                 if (bufsize > size)
5592                         bufsize = size;
5593                 if (buf != NULL) {
5594                         err = copyoutstr(outstr, buf, bufsize, NULL);
5595                         if (err != 0 && err != ENAMETOOLONG)
5596                                 error = EFAULT;
5597                 }
5598                 break;
5599         case ZONE_ATTR_PHYS_MCAP:
5600                 size = sizeof (zone->zone_phys_mcap);
5601                 if (bufsize > size)
5602                         bufsize = size;
5603                 if (buf != NULL &&
5604                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5605                         error = EFAULT;
5606                 break;
5607         case ZONE_ATTR_SCHED_CLASS:
5608                 mutex_enter(&class_lock);
5609
5610                 if (zone->zone_defaultcid >= loaded_classes)
5611                         outstr = "";
5612                 else
5613                         outstr = sclass[zone->zone_defaultcid].cl_name;
5614                 size = strlen(outstr) + 1;
5615                 if (bufsize > size)
5616                         bufsize = size;
5617                 if (buf != NULL) {
5618                         err = copyoutstr(outstr, buf, bufsize, NULL);
5619                         if (err != 0 && err != ENAMETOOLONG)
5620                                 error = EFAULT;
5621                 }
5622
5623                 mutex_exit(&class_lock);
5624                 break;
5625         case ZONE_ATTR_HOSTID:
5626                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5627                     bufsize == sizeof (zone->zone_hostid)) {
5628                         size = sizeof (zone->zone_hostid);
5629                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5630                             bufsize) != 0)
5631                                 error = EFAULT;
5632                 } else {
5633                         error = EINVAL;
5634                 }
5635                 break;
5636         case ZONE_ATTR_FS_ALLOWED:
5637                 if (zone->zone_fs_allowed == NULL)
5638                         outstr = "";
5639                 else
5640                         outstr = zone->zone_fs_allowed;
5641                 size = strlen(outstr) + 1;
5642                 if (bufsize > size)
5643                         bufsize = size;
5644                 if (buf != NULL) {
5645                         err = copyoutstr(outstr, buf, bufsize, NULL);
5646                         if (err != 0 && err != ENAMETOOLONG)
5647                                 error = EFAULT;
5648                 }
5649                 break;
5650         case ZONE_ATTR_SECFLAGS:
5651                 size = sizeof (zone->zone_secflags);
5652                 if (bufsize > size)
5653                         bufsize = size;
5654                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5655                         error = EFAULT;
5656                 break;
5657         case ZONE_ATTR_NETWORK:
5658                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5659                 if (copyin(buf, zbuf, bufsize) != 0) {
5660                         error = EFAULT;
5661                 } else {
5662                         error = zone_get_network(zoneid, zbuf);
5663                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5664                                 error = EFAULT;
5665                 }
5666                 kmem_free(zbuf, bufsize);
5667                 break;
5668         default:
5669                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5670                         size = bufsize;
5671                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5672                 } else {
5673                         error = EINVAL;
5674                 }
5675         }
5676         zone_rele(zone);
5677
5678         if (error)
5679                 return (set_errno(error));
5680         return ((ssize_t)size);
5681 }
5682
5683 /*
5684  * Systemcall entry point for zone_setattr(2).
5685  */
5686 /*ARGSUSED*/
5687 static int
5688 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5689 {
5690         zone_t *zone;
5691         zone_status_t zone_status;
5692         int err = -1;
5693         zone_net_data_t *zbuf;
5694
5695         if (secpolicy_zone_config(CRED()) != 0)
5696                 return (set_errno(EPERM));
5697
5698         /*
5699          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5700          * global zone.
5701          */
5702         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5703                 return (set_errno(EINVAL));
5704         }
5705
5706         mutex_enter(&zonehash_lock);
5707         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5708                 mutex_exit(&zonehash_lock);
5709                 return (set_errno(EINVAL));
5710         }
5711         zone_hold(zone);
5712         mutex_exit(&zonehash_lock);
5713
5714         /*
5715          * At present most attributes can only be set on non-running,
5716          * non-global zones.
5717          */
5718         zone_status = zone_status_get(zone);
5719         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5720                 err = EINVAL;
5721                 goto done;
5722         }
5723
5724         switch (attr) {
5725         case ZONE_ATTR_INITNAME:
5726                 err = zone_set_initname(zone, (const char *)buf);
5727                 break;
5728         case ZONE_ATTR_INITNORESTART:
5729                 zone->zone_restart_init = B_FALSE;
5730                 err = 0;
5731                 break;
5732         case ZONE_ATTR_BOOTARGS:
5733                 err = zone_set_bootargs(zone, (const char *)buf);
5734                 break;
5735         case ZONE_ATTR_BRAND:
5736                 err = zone_set_brand(zone, (const char *)buf);
5737                 break;
5738         case ZONE_ATTR_FS_ALLOWED:
5739                 err = zone_set_fs_allowed(zone, (const char *)buf);
5740                 break;
5741         case ZONE_ATTR_SECFLAGS:
5742                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5743                 break;
5744         case ZONE_ATTR_PHYS_MCAP:
5745                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5746                 break;
5747         case ZONE_ATTR_SCHED_CLASS:
5748                 err = zone_set_sched_class(zone, (const char *)buf);
5749                 break;
5750         case ZONE_ATTR_HOSTID:
5751                 if (bufsize == sizeof (zone->zone_hostid)) {
5752                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5753                                 err = 0;
5754                         else
5755                                 err = EFAULT;
5756                 } else {
5757                         err = EINVAL;
5758                 }
5759                 break;
5760         case ZONE_ATTR_NETWORK:
5761                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5762                         err = EINVAL;
5763                         break;
5764                 }
5765                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5766                 if (copyin(buf, zbuf, bufsize) != 0) {
5767                         kmem_free(zbuf, bufsize);
5768                         err = EFAULT;
5769                         break;
5770                 }
5771                 err = zone_set_network(zoneid, zbuf);
5772                 kmem_free(zbuf, bufsize);
5773                 break;
5774         default:
5775                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5776                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5777                 else
5778                         err = EINVAL;
5779         }
5780
5781 done:
5782         zone_rele(zone);
5783         ASSERT(err != -1);
5784         return (err != 0 ? set_errno(err) : 0);
5785 }
5786
5787 /*
5788  * Return zero if the process has at least one vnode mapped in to its
5789  * address space which shouldn't be allowed to change zones.
5790  *
5791  * Also return zero if the process has any shared mappings which reserve
5792  * swap.  This is because the counting for zone.max-swap does not allow swap
5793  * reservation to be shared between zones.  zone swap reservation is counted
5794  * on zone->zone_max_swap.
5795  */
5796 static int
5797 as_can_change_zones(void)
5798 {
5799         proc_t *pp = curproc;
5800         struct seg *seg;
5801         struct as *as = pp->p_as;
5802         vnode_t *vp;
5803         int allow = 1;
5804
5805         ASSERT(pp->p_as != &kas);
5806         AS_LOCK_ENTER(as, RW_READER);
5807         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5808
5809                 /*
5810                  * Cannot enter zone with shared anon memory which
5811                  * reserves swap.  See comment above.
5812                  */
5813                 if (seg_can_change_zones(seg) == B_FALSE) {
5814                         allow = 0;
5815                         break;
5816                 }
5817                 /*
5818                  * if we can't get a backing vnode for this segment then skip
5819                  * it.
5820                  */
5821                 vp = NULL;
5822                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5823                         continue;
5824                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5825                         allow = 0;
5826                         break;
5827                 }
5828         }
5829         AS_LOCK_EXIT(as);
5830         return (allow);
5831 }
5832
5833 /*
5834  * Count swap reserved by curproc's address space
5835  */
5836 static size_t
5837 as_swresv(void)
5838 {
5839         proc_t *pp = curproc;
5840         struct seg *seg;
5841         struct as *as = pp->p_as;
5842         size_t swap = 0;
5843
5844         ASSERT(pp->p_as != &kas);
5845         ASSERT(AS_WRITE_HELD(as));
5846         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5847                 swap += seg_swresv(seg);
5848
5849         return (swap);
5850 }
5851
5852 /*
5853  * Systemcall entry point for zone_enter().
5854  *
5855  * The current process is injected into said zone.  In the process
5856  * it will change its project membership, privileges, rootdir/cwd,
5857  * zone-wide rctls, and pool association to match those of the zone.
5858  *
5859  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5860  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5861  * enter a zone that is "ready" or "running".
5862  */
5863 static int
5864 zone_enter(zoneid_t zoneid)
5865 {
5866         zone_t *zone;
5867         vnode_t *vp;
5868         proc_t *pp = curproc;
5869         contract_t *ct;
5870         cont_process_t *ctp;
5871         task_t *tk, *oldtk;
5872         kproject_t *zone_proj0;
5873         cred_t *cr, *newcr;
5874         pool_t *oldpool, *newpool;
5875         sess_t *sp;
5876         uid_t uid;
5877         zone_status_t status;
5878         int err = 0;
5879         rctl_entity_p_t e;
5880         size_t swap;
5881         kthread_id_t t;
5882
5883         if (secpolicy_zone_config(CRED()) != 0)
5884                 return (set_errno(EPERM));
5885         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5886                 return (set_errno(EINVAL));
5887
5888         /*
5889          * Stop all lwps so we don't need to hold a lock to look at
5890          * curproc->p_zone.  This needs to happen before we grab any
5891          * locks to avoid deadlock (another lwp in the process could
5892          * be waiting for the held lock).
5893          */
5894         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5895                 return (set_errno(EINTR));
5896
5897         /*
5898          * Make sure we're not changing zones with files open or mapped in
5899          * to our address space which shouldn't be changing zones.
5900          */
5901         if (!files_can_change_zones()) {
5902                 err = EBADF;
5903                 goto out;
5904         }
5905         if (!as_can_change_zones()) {
5906                 err = EFAULT;
5907                 goto out;
5908         }
5909
5910         mutex_enter(&zonehash_lock);
5911         if (pp->p_zone != global_zone) {
5912                 mutex_exit(&zonehash_lock);
5913                 err = EINVAL;
5914                 goto out;
5915         }
5916
5917         zone = zone_find_all_by_id(zoneid);
5918         if (zone == NULL) {
5919                 mutex_exit(&zonehash_lock);
5920                 err = EINVAL;
5921                 goto out;
5922         }
5923
5924         /*
5925          * To prevent processes in a zone from holding contracts on
5926          * extrazonal resources, and to avoid process contract
5927          * memberships which span zones, contract holders and processes
5928          * which aren't the sole members of their encapsulating process
5929          * contracts are not allowed to zone_enter.
5930          */
5931         ctp = pp->p_ct_process;
5932         ct = &ctp->conp_contract;
5933         mutex_enter(&ct->ct_lock);
5934         mutex_enter(&pp->p_lock);
5935         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5936                 mutex_exit(&pp->p_lock);
5937                 mutex_exit(&ct->ct_lock);
5938                 mutex_exit(&zonehash_lock);
5939                 err = EINVAL;
5940                 goto out;
5941         }
5942
5943         /*
5944          * Moreover, we don't allow processes whose encapsulating
5945          * process contracts have inherited extrazonal contracts.
5946          * While it would be easier to eliminate all process contracts
5947          * with inherited contracts, we need to be able to give a
5948          * restarted init (or other zone-penetrating process) its
5949          * predecessor's contracts.
5950          */
5951         if (ctp->conp_ninherited != 0) {
5952                 contract_t *next;
5953                 for (next = list_head(&ctp->conp_inherited); next;
5954                     next = list_next(&ctp->conp_inherited, next)) {
5955                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5956                                 mutex_exit(&pp->p_lock);
5957                                 mutex_exit(&ct->ct_lock);
5958                                 mutex_exit(&zonehash_lock);
5959                                 err = EINVAL;
5960                                 goto out;
5961                         }
5962                 }
5963         }
5964
5965         mutex_exit(&pp->p_lock);
5966         mutex_exit(&ct->ct_lock);
5967
5968         status = zone_status_get(zone);
5969         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5970                 /*
5971                  * Can't join
5972                  */
5973                 mutex_exit(&zonehash_lock);
5974                 err = EINVAL;
5975                 goto out;
5976         }
5977
5978         /*
5979          * Make sure new priv set is within the permitted set for caller
5980          */
5981         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5982                 mutex_exit(&zonehash_lock);
5983                 err = EPERM;
5984                 goto out;
5985         }
5986         /*
5987          * We want to momentarily drop zonehash_lock while we optimistically
5988          * bind curproc to the pool it should be running in.  This is safe
5989          * since the zone can't disappear (we have a hold on it).
5990          */
5991         zone_hold(zone);
5992         mutex_exit(&zonehash_lock);
5993
5994         /*
5995          * Grab pool_lock to keep the pools configuration from changing
5996          * and to stop ourselves from getting rebound to another pool
5997          * until we join the zone.
5998          */
5999         if (pool_lock_intr() != 0) {
6000                 zone_rele(zone);
6001                 err = EINTR;
6002                 goto out;
6003         }
6004         ASSERT(secpolicy_pool(CRED()) == 0);
6005         /*
6006          * Bind ourselves to the pool currently associated with the zone.
6007          */
6008         oldpool = curproc->p_pool;
6009         newpool = zone_pool_get(zone);
6010         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6011             (err = pool_do_bind(newpool, P_PID, P_MYID,
6012             POOL_BIND_ALL)) != 0) {
6013                 pool_unlock();
6014                 zone_rele(zone);
6015                 goto out;
6016         }
6017
6018         /*
6019          * Grab cpu_lock now; we'll need it later when we call
6020          * task_join().
6021          */
6022         mutex_enter(&cpu_lock);
6023         mutex_enter(&zonehash_lock);
6024         /*
6025          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6026          */
6027         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6028                 /*
6029                  * Can't join anymore.
6030                  */
6031                 mutex_exit(&zonehash_lock);
6032                 mutex_exit(&cpu_lock);
6033                 if (pool_state == POOL_ENABLED &&
6034                     newpool != oldpool)
6035                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6036                             POOL_BIND_ALL);
6037                 pool_unlock();
6038                 zone_rele(zone);
6039                 err = EINVAL;
6040                 goto out;
6041         }
6042
6043         /*
6044          * a_lock must be held while transfering locked memory and swap
6045          * reservation from the global zone to the non global zone because
6046          * asynchronous faults on the processes' address space can lock
6047          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6048          * segments respectively.
6049          */
6050         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6051         swap = as_swresv();
6052         mutex_enter(&pp->p_lock);
6053         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6054         /* verify that we do not exceed and task or lwp limits */
6055         mutex_enter(&zone->zone_nlwps_lock);
6056         /* add new lwps to zone and zone's proj0 */
6057         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6058         zone->zone_nlwps += pp->p_lwpcnt;
6059         /* add 1 task to zone's proj0 */
6060         zone_proj0->kpj_ntasks += 1;
6061
6062         zone_proj0->kpj_nprocs++;
6063         zone->zone_nprocs++;
6064         mutex_exit(&zone->zone_nlwps_lock);
6065
6066         mutex_enter(&zone->zone_mem_lock);
6067         zone->zone_locked_mem += pp->p_locked_mem;
6068         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6069         zone->zone_max_swap += swap;
6070         mutex_exit(&zone->zone_mem_lock);
6071
6072         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6073         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6074         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6075
6076         /* remove lwps and process from proc's old zone and old project */
6077         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6078         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6079         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6080         pp->p_task->tk_proj->kpj_nprocs--;
6081         pp->p_zone->zone_nprocs--;
6082         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6083
6084         mutex_enter(&pp->p_zone->zone_mem_lock);
6085         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6086         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6087         pp->p_zone->zone_max_swap -= swap;
6088         mutex_exit(&pp->p_zone->zone_mem_lock);
6089
6090         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6091         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6092         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6093
6094         pp->p_flag |= SZONETOP;
6095         pp->p_zone = zone;
6096         mutex_exit(&pp->p_lock);
6097         AS_LOCK_EXIT(pp->p_as);
6098
6099         /*
6100          * Joining the zone cannot fail from now on.
6101          *
6102          * This means that a lot of the following code can be commonized and
6103          * shared with zsched().
6104          */
6105
6106         /*
6107          * If the process contract fmri was inherited, we need to
6108          * flag this so that any contract status will not leak
6109          * extra zone information, svc_fmri in this case
6110          */
6111         if (ctp->conp_svc_ctid != ct->ct_id) {
6112                 mutex_enter(&ct->ct_lock);
6113                 ctp->conp_svc_zone_enter = ct->ct_id;
6114                 mutex_exit(&ct->ct_lock);
6115         }
6116
6117         /*
6118          * Reset the encapsulating process contract's zone.
6119          */
6120         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6121         contract_setzuniqid(ct, zone->zone_uniqid);
6122
6123         /*
6124          * Create a new task and associate the process with the project keyed
6125          * by (projid,zoneid).
6126          *
6127          * We might as well be in project 0; the global zone's projid doesn't
6128          * make much sense in a zone anyhow.
6129          *
6130          * This also increments zone_ntasks, and returns with p_lock held.
6131          */
6132         tk = task_create(0, zone);
6133         oldtk = task_join(tk, 0);
6134         mutex_exit(&cpu_lock);
6135
6136         /*
6137          * call RCTLOP_SET functions on this proc
6138          */
6139         e.rcep_p.zone = zone;
6140         e.rcep_t = RCENTITY_ZONE;
6141         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6142             RCD_CALLBACK);
6143         mutex_exit(&pp->p_lock);
6144
6145         /*
6146          * We don't need to hold any of zsched's locks here; not only do we know
6147          * the process and zone aren't going away, we know its session isn't
6148          * changing either.
6149          *
6150          * By joining zsched's session here, we mimic the behavior in the
6151          * global zone of init's sid being the pid of sched.  We extend this
6152          * to all zlogin-like zone_enter()'ing processes as well.
6153          */
6154         mutex_enter(&pidlock);
6155         sp = zone->zone_zsched->p_sessp;
6156         sess_hold(zone->zone_zsched);
6157         mutex_enter(&pp->p_lock);
6158         pgexit(pp);
6159         sess_rele(pp->p_sessp, B_TRUE);
6160         pp->p_sessp = sp;
6161         pgjoin(pp, zone->zone_zsched->p_pidp);
6162
6163         /*
6164          * If any threads are scheduled to be placed on zone wait queue they
6165          * should abandon the idea since the wait queue is changing.
6166          * We need to be holding pidlock & p_lock to do this.
6167          */
6168         if ((t = pp->p_tlist) != NULL) {
6169                 do {
6170                         thread_lock(t);
6171                         /*
6172                          * Kick this thread so that it doesn't sit
6173                          * on a wrong wait queue.
6174                          */
6175                         if (ISWAITING(t))
6176                                 setrun_locked(t);
6177
6178                         if (t->t_schedflag & TS_ANYWAITQ)
6179                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6180
6181                         thread_unlock(t);
6182                 } while ((t = t->t_forw) != pp->p_tlist);
6183         }
6184
6185         /*
6186          * If there is a default scheduling class for the zone and it is not
6187          * the class we are currently in, change all of the threads in the
6188          * process to the new class.  We need to be holding pidlock & p_lock
6189          * when we call parmsset so this is a good place to do it.
6190          */
6191         if (zone->zone_defaultcid > 0 &&
6192             zone->zone_defaultcid != curthread->t_cid) {
6193                 pcparms_t pcparms;
6194
6195                 pcparms.pc_cid = zone->zone_defaultcid;
6196                 pcparms.pc_clparms[0] = 0;
6197
6198                 /*
6199                  * If setting the class fails, we still want to enter the zone.
6200                  */
6201                 if ((t = pp->p_tlist) != NULL) {
6202                         do {
6203                                 (void) parmsset(&pcparms, t);
6204                         } while ((t = t->t_forw) != pp->p_tlist);
6205                 }
6206         }
6207
6208         mutex_exit(&pp->p_lock);
6209         mutex_exit(&pidlock);
6210
6211         mutex_exit(&zonehash_lock);
6212         /*
6213          * We're firmly in the zone; let pools progress.
6214          */
6215         pool_unlock();
6216         task_rele(oldtk);
6217         /*
6218          * We don't need to retain a hold on the zone since we already
6219          * incremented zone_ntasks, so the zone isn't going anywhere.
6220          */
6221         zone_rele(zone);
6222
6223         /*
6224          * Chroot
6225          */
6226         vp = zone->zone_rootvp;
6227         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6228         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6229
6230         /*
6231          * Change process security flags.  Note that the _effective_ flags
6232          * cannot change
6233          */
6234         secflags_copy(&pp->p_secflags.psf_lower,
6235             &zone->zone_secflags.psf_lower);
6236         secflags_copy(&pp->p_secflags.psf_upper,
6237             &zone->zone_secflags.psf_upper);
6238         secflags_copy(&pp->p_secflags.psf_inherit,
6239             &zone->zone_secflags.psf_inherit);
6240
6241         /*
6242          * Change process credentials
6243          */
6244         newcr = cralloc();
6245         mutex_enter(&pp->p_crlock);
6246         cr = pp->p_cred;
6247         crcopy_to(cr, newcr);
6248         crsetzone(newcr, zone);
6249         pp->p_cred = newcr;
6250
6251         /*
6252          * Restrict all process privilege sets to zone limit
6253          */
6254         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6255         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6256         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6257         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6258         mutex_exit(&pp->p_crlock);
6259         crset(pp, newcr);
6260
6261         /*
6262          * Adjust upcount to reflect zone entry.
6263          */
6264         uid = crgetruid(newcr);
6265         mutex_enter(&pidlock);
6266         upcount_dec(uid, GLOBAL_ZONEID);
6267         upcount_inc(uid, zoneid);
6268         mutex_exit(&pidlock);
6269
6270         /*
6271          * Set up core file path and content.
6272          */
6273         set_core_defaults();
6274
6275 out:
6276         /*
6277          * Let the other lwps continue.
6278          */
6279         mutex_enter(&pp->p_lock);
6280         if (curthread != pp->p_agenttp)
6281                 continuelwps(pp);
6282         mutex_exit(&pp->p_lock);
6283
6284         return (err != 0 ? set_errno(err) : 0);
6285 }
6286
6287 /*
6288  * Systemcall entry point for zone_list(2).
6289  *
6290  * Processes running in a (non-global) zone only see themselves.
6291  * On labeled systems, they see all zones whose label they dominate.
6292  */
6293 static int
6294 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6295 {
6296         zoneid_t *zoneids;
6297         zone_t *zone, *myzone;
6298         uint_t user_nzones, real_nzones;
6299         uint_t domi_nzones;
6300         int error;
6301
6302         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6303                 return (set_errno(EFAULT));
6304
6305         myzone = curproc->p_zone;
6306         if (myzone != global_zone) {
6307                 bslabel_t *mybslab;
6308
6309                 if (!is_system_labeled()) {
6310                         /* just return current zone */
6311                         real_nzones = domi_nzones = 1;
6312                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6313                         zoneids[0] = myzone->zone_id;
6314                 } else {
6315                         /* return all zones that are dominated */
6316                         mutex_enter(&zonehash_lock);
6317                         real_nzones = zonecount;
6318                         domi_nzones = 0;
6319                         if (real_nzones > 0) {
6320                                 zoneids = kmem_alloc(real_nzones *
6321                                     sizeof (zoneid_t), KM_SLEEP);
6322                                 mybslab = label2bslabel(myzone->zone_slabel);
6323                                 for (zone = list_head(&zone_active);
6324                                     zone != NULL;
6325                                     zone = list_next(&zone_active, zone)) {
6326                                         if (zone->zone_id == GLOBAL_ZONEID)
6327                                                 continue;
6328                                         if (zone != myzone &&
6329                                             (zone->zone_flags & ZF_IS_SCRATCH))
6330                                                 continue;
6331                                         /*
6332                                          * Note that a label always dominates
6333                                          * itself, so myzone is always included
6334                                          * in the list.
6335                                          */
6336                                         if (bldominates(mybslab,
6337                                             label2bslabel(zone->zone_slabel))) {
6338                                                 zoneids[domi_nzones++] =
6339                                                     zone->zone_id;
6340                                         }
6341                                 }
6342                         }
6343                         mutex_exit(&zonehash_lock);
6344                 }
6345         } else {
6346                 mutex_enter(&zonehash_lock);
6347                 real_nzones = zonecount;
6348                 domi_nzones = 0;
6349                 if (real_nzones > 0) {
6350                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6351                             KM_SLEEP);
6352                         for (zone = list_head(&zone_active); zone != NULL;
6353                             zone = list_next(&zone_active, zone))
6354                                 zoneids[domi_nzones++] = zone->zone_id;
6355                         ASSERT(domi_nzones == real_nzones);
6356                 }
6357                 mutex_exit(&zonehash_lock);
6358         }
6359
6360         /*
6361          * If user has allocated space for fewer entries than we found, then
6362          * return only up to their limit.  Either way, tell them exactly how
6363          * many we found.
6364          */
6365         if (domi_nzones < user_nzones)
6366                 user_nzones = domi_nzones;
6367         error = 0;
6368         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6369                 error = EFAULT;
6370         } else if (zoneidlist != NULL && user_nzones != 0) {
6371                 if (copyout(zoneids, zoneidlist,
6372                     user_nzones * sizeof (zoneid_t)) != 0)
6373                         error = EFAULT;
6374         }
6375
6376         if (real_nzones > 0)
6377                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6378
6379         if (error != 0)
6380                 return (set_errno(error));
6381         else
6382                 return (0);
6383 }
6384
6385 /*
6386  * Systemcall entry point for zone_lookup(2).
6387  *
6388  * Non-global zones are only able to see themselves and (on labeled systems)
6389  * the zones they dominate.
6390  */
6391 static zoneid_t
6392 zone_lookup(const char *zone_name)
6393 {
6394         char *kname;
6395         zone_t *zone;
6396         zoneid_t zoneid;
6397         int err;
6398
6399         if (zone_name == NULL) {
6400                 /* return caller's zone id */
6401                 return (getzoneid());
6402         }
6403
6404         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6405         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6406                 kmem_free(kname, ZONENAME_MAX);
6407                 return (set_errno(err));
6408         }
6409
6410         mutex_enter(&zonehash_lock);
6411         zone = zone_find_all_by_name(kname);
6412         kmem_free(kname, ZONENAME_MAX);
6413         /*
6414          * In a non-global zone, can only lookup global and own name.
6415          * In Trusted Extensions zone label dominance rules apply.
6416          */
6417         if (zone == NULL ||
6418             zone_status_get(zone) < ZONE_IS_READY ||
6419             !zone_list_access(zone)) {
6420                 mutex_exit(&zonehash_lock);
6421                 return (set_errno(EINVAL));
6422         } else {
6423                 zoneid = zone->zone_id;
6424                 mutex_exit(&zonehash_lock);
6425                 return (zoneid);
6426         }
6427 }
6428
6429 static int
6430 zone_version(int *version_arg)
6431 {
6432         int version = ZONE_SYSCALL_API_VERSION;
6433
6434         if (copyout(&version, version_arg, sizeof (int)) != 0)
6435                 return (set_errno(EFAULT));
6436         return (0);
6437 }
6438
6439 /* ARGSUSED */
6440 long
6441 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6442 {
6443         zone_def zs;
6444         int err;
6445
6446         switch (cmd) {
6447         case ZONE_CREATE:
6448                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6449                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6450                                 return (set_errno(EFAULT));
6451                         }
6452                 } else {
6453 #ifdef _SYSCALL32_IMPL
6454                         zone_def32 zs32;
6455
6456                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6457                                 return (set_errno(EFAULT));
6458                         }
6459                         zs.zone_name =
6460                             (const char *)(unsigned long)zs32.zone_name;
6461                         zs.zone_root =
6462                             (const char *)(unsigned long)zs32.zone_root;
6463                         zs.zone_privs =
6464                             (const struct priv_set *)
6465                             (unsigned long)zs32.zone_privs;
6466                         zs.zone_privssz = zs32.zone_privssz;
6467                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6468                         zs.rctlbufsz = zs32.rctlbufsz;
6469                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6470                         zs.zfsbufsz = zs32.zfsbufsz;
6471                         zs.extended_error =
6472                             (int *)(unsigned long)zs32.extended_error;
6473                         zs.match = zs32.match;
6474                         zs.doi = zs32.doi;
6475                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6476                         zs.flags = zs32.flags;
6477 #else
6478                         panic("get_udatamodel() returned bogus result\n");
6479 #endif
6480                 }
6481
6482                 return (zone_create(zs.zone_name, zs.zone_root,
6483                     zs.zone_privs, zs.zone_privssz,
6484                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6485                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6486                     zs.extended_error, zs.match, zs.doi,
6487                     zs.label, zs.flags));
6488         case ZONE_BOOT:
6489                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6490         case ZONE_DESTROY:
6491                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6492         case ZONE_GETATTR:
6493                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6494                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6495         case ZONE_SETATTR:
6496                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6497                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6498         case ZONE_ENTER:
6499                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6500         case ZONE_LIST:
6501                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6502         case ZONE_SHUTDOWN:
6503                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6504         case ZONE_LOOKUP:
6505                 return (zone_lookup((const char *)arg1));
6506         case ZONE_VERSION:
6507                 return (zone_version((int *)arg1));
6508         case ZONE_ADD_DATALINK:
6509                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6510                     (datalink_id_t)(uintptr_t)arg2));
6511         case ZONE_DEL_DATALINK:
6512                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6513                     (datalink_id_t)(uintptr_t)arg2));
6514         case ZONE_CHECK_DATALINK: {
6515                 zoneid_t        zoneid;
6516                 boolean_t       need_copyout;
6517
6518                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6519                         return (EFAULT);
6520                 need_copyout = (zoneid == ALL_ZONES);
6521                 err = zone_check_datalink(&zoneid,
6522                     (datalink_id_t)(uintptr_t)arg2);
6523                 if (err == 0 && need_copyout) {
6524                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6525                                 err = EFAULT;
6526                 }
6527                 return (err == 0 ? 0 : set_errno(err));
6528         }
6529         case ZONE_LIST_DATALINK:
6530                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6531                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6532         default:
6533                 return (set_errno(EINVAL));
6534         }
6535 }
6536
6537 struct zarg {
6538         zone_t *zone;
6539         zone_cmd_arg_t arg;
6540 };
6541
6542 static int
6543 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6544 {
6545         char *buf;
6546         size_t buflen;
6547         int error;
6548
6549         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6550         buf = kmem_alloc(buflen, KM_SLEEP);
6551         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6552         error = door_ki_open(buf, doorp);
6553         kmem_free(buf, buflen);
6554         return (error);
6555 }
6556
6557 static void
6558 zone_release_door(door_handle_t *doorp)
6559 {
6560         door_ki_rele(*doorp);
6561         *doorp = NULL;
6562 }
6563
6564 static void
6565 zone_ki_call_zoneadmd(struct zarg *zargp)
6566 {
6567         door_handle_t door = NULL;
6568         door_arg_t darg, save_arg;
6569         char *zone_name;
6570         size_t zone_namelen;
6571         zoneid_t zoneid;
6572         zone_t *zone;
6573         zone_cmd_arg_t arg;
6574         uint64_t uniqid;
6575         size_t size;
6576         int error;
6577         int retry;
6578
6579         zone = zargp->zone;
6580         arg = zargp->arg;
6581         kmem_free(zargp, sizeof (*zargp));
6582
6583         zone_namelen = strlen(zone->zone_name) + 1;
6584         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6585         bcopy(zone->zone_name, zone_name, zone_namelen);
6586         zoneid = zone->zone_id;
6587         uniqid = zone->zone_uniqid;
6588         /*
6589          * zoneadmd may be down, but at least we can empty out the zone.
6590          * We can ignore the return value of zone_empty() since we're called
6591          * from a kernel thread and know we won't be delivered any signals.
6592          */
6593         ASSERT(curproc == &p0);
6594         (void) zone_empty(zone);
6595         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6596         zone_rele(zone);
6597
6598         size = sizeof (arg);
6599         darg.rbuf = (char *)&arg;
6600         darg.data_ptr = (char *)&arg;
6601         darg.rsize = size;
6602         darg.data_size = size;
6603         darg.desc_ptr = NULL;
6604         darg.desc_num = 0;
6605
6606         save_arg = darg;
6607         /*
6608          * Since we're not holding a reference to the zone, any number of
6609          * things can go wrong, including the zone disappearing before we get a
6610          * chance to talk to zoneadmd.
6611          */
6612         for (retry = 0; /* forever */; retry++) {
6613                 if (door == NULL &&
6614                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6615                         goto next;
6616                 }
6617                 ASSERT(door != NULL);
6618
6619                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6620                     SIZE_MAX, 0)) == 0) {
6621                         break;
6622                 }
6623                 switch (error) {
6624                 case EINTR:
6625                         /* FALLTHROUGH */
6626                 case EAGAIN:    /* process may be forking */
6627                         /*
6628                          * Back off for a bit
6629                          */
6630                         break;
6631                 case EBADF:
6632                         zone_release_door(&door);
6633                         if (zone_lookup_door(zone_name, &door) != 0) {
6634                                 /*
6635                                  * zoneadmd may be dead, but it may come back to
6636                                  * life later.
6637                                  */
6638                                 break;
6639                         }
6640                         break;
6641                 default:
6642                         cmn_err(CE_WARN,
6643                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6644                             error);
6645                         goto out;
6646                 }
6647 next:
6648                 /*
6649                  * If this isn't the same zone_t that we originally had in mind,
6650                  * then this is the same as if two kadmin requests come in at
6651                  * the same time: the first one wins.  This means we lose, so we
6652                  * bail.
6653                  */
6654                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6655                         /*
6656                          * Problem is solved.
6657                          */
6658                         break;
6659                 }
6660                 if (zone->zone_uniqid != uniqid) {
6661                         /*
6662                          * zoneid recycled
6663                          */
6664                         zone_rele(zone);
6665                         break;
6666                 }
6667                 /*
6668                  * We could zone_status_timedwait(), but there doesn't seem to
6669                  * be much point in doing that (plus, it would mean that
6670                  * zone_free() isn't called until this thread exits).
6671                  */
6672                 zone_rele(zone);
6673                 delay(hz);
6674                 darg = save_arg;
6675         }
6676 out:
6677         if (door != NULL) {
6678                 zone_release_door(&door);
6679         }
6680         kmem_free(zone_name, zone_namelen);
6681         thread_exit();
6682 }
6683
6684 /*
6685  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6686  * kadmin().  The caller is a process in the zone.
6687  *
6688  * In order to shutdown the zone, we will hand off control to zoneadmd
6689  * (running in the global zone) via a door.  We do a half-hearted job at
6690  * killing all processes in the zone, create a kernel thread to contact
6691  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6692  * a form of generation number used to let zoneadmd (as well as
6693  * zone_destroy()) know exactly which zone they're re talking about.
6694  */
6695 int
6696 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6697 {
6698         struct zarg *zargp;
6699         zone_cmd_t zcmd;
6700         zone_t *zone;
6701
6702         zone = curproc->p_zone;
6703         ASSERT(getzoneid() != GLOBAL_ZONEID);
6704
6705         switch (cmd) {
6706         case A_SHUTDOWN:
6707                 switch (fcn) {
6708                 case AD_HALT:
6709                 case AD_POWEROFF:
6710                         zcmd = Z_HALT;
6711                         break;
6712                 case AD_BOOT:
6713                         zcmd = Z_REBOOT;
6714                         break;
6715                 case AD_IBOOT:
6716                 case AD_SBOOT:
6717                 case AD_SIBOOT:
6718                 case AD_NOSYNC:
6719                         return (ENOTSUP);
6720                 default:
6721                         return (EINVAL);
6722                 }
6723                 break;
6724         case A_REBOOT:
6725                 zcmd = Z_REBOOT;
6726                 break;
6727         case A_FTRACE:
6728         case A_REMOUNT:
6729         case A_FREEZE:
6730         case A_DUMP:
6731         case A_CONFIG:
6732                 return (ENOTSUP);
6733         default:
6734                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6735                 return (EINVAL);
6736         }
6737
6738         if (secpolicy_zone_admin(credp, B_FALSE))
6739                 return (EPERM);
6740         mutex_enter(&zone_status_lock);
6741
6742         /*
6743          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6744          * is in the zone.
6745          */
6746         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6747         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6748                 /*
6749                  * This zone is already on its way down.
6750                  */
6751                 mutex_exit(&zone_status_lock);
6752                 return (0);
6753         }
6754         /*
6755          * Prevent future zone_enter()s
6756          */
6757         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6758         mutex_exit(&zone_status_lock);
6759
6760         /*
6761          * Kill everyone now and call zoneadmd later.
6762          * zone_ki_call_zoneadmd() will do a more thorough job of this
6763          * later.
6764          */
6765         killall(zone->zone_id);
6766         /*
6767          * Now, create the thread to contact zoneadmd and do the rest of the
6768          * work.  This thread can't be created in our zone otherwise
6769          * zone_destroy() would deadlock.
6770          */
6771         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6772         zargp->arg.cmd = zcmd;
6773         zargp->arg.uniqid = zone->zone_uniqid;
6774         zargp->zone = zone;
6775         (void) strcpy(zargp->arg.locale, "C");
6776         /* mdep was already copied in for us by uadmin */
6777         if (mdep != NULL)
6778                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6779                     sizeof (zargp->arg.bootbuf));
6780         zone_hold(zone);
6781
6782         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6783             TS_RUN, minclsyspri);
6784         exit(CLD_EXITED, 0);
6785
6786         return (EINVAL);
6787 }
6788
6789 /*
6790  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6791  * status to ZONE_IS_SHUTTING_DOWN.
6792  *
6793  * This function also shuts down all running zones to ensure that they won't
6794  * fork new processes.
6795  */
6796 void
6797 zone_shutdown_global(void)
6798 {
6799         zone_t *current_zonep;
6800
6801         ASSERT(INGLOBALZONE(curproc));
6802         mutex_enter(&zonehash_lock);
6803         mutex_enter(&zone_status_lock);
6804
6805         /* Modify the global zone's status first. */
6806         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6807         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6808
6809         /*
6810          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6811          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6812          * could cause assertions to fail (e.g., assertions about a zone's
6813          * state during initialization, readying, or booting) or produce races.
6814          * We'll let threads continue to initialize and ready new zones: they'll
6815          * fail to boot the new zones when they see that the global zone is
6816          * shutting down.
6817          */
6818         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6819             current_zonep = list_next(&zone_active, current_zonep)) {
6820                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6821                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6822         }
6823         mutex_exit(&zone_status_lock);
6824         mutex_exit(&zonehash_lock);
6825 }
6826
6827 /*
6828  * Returns true if the named dataset is visible in the current zone.
6829  * The 'write' parameter is set to 1 if the dataset is also writable.
6830  */
6831 int
6832 zone_dataset_visible(const char *dataset, int *write)
6833 {
6834         static int zfstype = -1;
6835         zone_dataset_t *zd;
6836         size_t len;
6837         zone_t *zone = curproc->p_zone;
6838         const char *name = NULL;
6839         vfs_t *vfsp = NULL;
6840
6841         if (dataset[0] == '\0')
6842                 return (0);
6843
6844         /*
6845          * Walk the list once, looking for datasets which match exactly, or
6846          * specify a dataset underneath an exported dataset.  If found, return
6847          * true and note that it is writable.
6848          */
6849         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6850             zd = list_next(&zone->zone_datasets, zd)) {
6851
6852                 len = strlen(zd->zd_dataset);
6853                 if (strlen(dataset) >= len &&
6854                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6855                     (dataset[len] == '\0' || dataset[len] == '/' ||
6856                     dataset[len] == '@')) {
6857                         if (write)
6858                                 *write = 1;
6859                         return (1);
6860                 }
6861         }
6862
6863         /*
6864          * Walk the list a second time, searching for datasets which are parents
6865          * of exported datasets.  These should be visible, but read-only.
6866          *
6867          * Note that we also have to support forms such as 'pool/dataset/', with
6868          * a trailing slash.
6869          */
6870         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6871             zd = list_next(&zone->zone_datasets, zd)) {
6872
6873                 len = strlen(dataset);
6874                 if (dataset[len - 1] == '/')
6875                         len--;  /* Ignore trailing slash */
6876                 if (len < strlen(zd->zd_dataset) &&
6877                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6878                     zd->zd_dataset[len] == '/') {
6879                         if (write)
6880                                 *write = 0;
6881                         return (1);
6882                 }
6883         }
6884
6885         /*
6886          * We reach here if the given dataset is not found in the zone_dataset
6887          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6888          * instead of delegation. For this we search for the dataset in the
6889          * zone_vfslist of this zone. If found, return true and note that it is
6890          * not writable.
6891          */
6892
6893         /*
6894          * Initialize zfstype if it is not initialized yet.
6895          */
6896         if (zfstype == -1) {
6897                 struct vfssw *vswp = vfs_getvfssw("zfs");
6898                 zfstype = vswp - vfssw;
6899                 vfs_unrefvfssw(vswp);
6900         }
6901
6902         vfs_list_read_lock();
6903         vfsp = zone->zone_vfslist;
6904         do {
6905                 ASSERT(vfsp);
6906                 if (vfsp->vfs_fstype == zfstype) {
6907                         name = refstr_value(vfsp->vfs_resource);
6908
6909                         /*
6910                          * Check if we have an exact match.
6911                          */
6912                         if (strcmp(dataset, name) == 0) {
6913                                 vfs_list_unlock();
6914                                 if (write)
6915                                         *write = 0;
6916                                 return (1);
6917                         }
6918                         /*
6919                          * We need to check if we are looking for parents of
6920                          * a dataset. These should be visible, but read-only.
6921                          */
6922                         len = strlen(dataset);
6923                         if (dataset[len - 1] == '/')
6924                                 len--;
6925
6926                         if (len < strlen(name) &&
6927                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6928                                 vfs_list_unlock();
6929                                 if (write)
6930                                         *write = 0;
6931                                 return (1);
6932                         }
6933                 }
6934                 vfsp = vfsp->vfs_zone_next;
6935         } while (vfsp != zone->zone_vfslist);
6936
6937         vfs_list_unlock();
6938         return (0);
6939 }
6940
6941 /*
6942  * zone_find_by_any_path() -
6943  *
6944  * kernel-private routine similar to zone_find_by_path(), but which
6945  * effectively compares against zone paths rather than zonerootpath
6946  * (i.e., the last component of zonerootpaths, which should be "root/",
6947  * are not compared.)  This is done in order to accurately identify all
6948  * paths, whether zone-visible or not, including those which are parallel
6949  * to /root/, such as /dev/, /home/, etc...
6950  *
6951  * If the specified path does not fall under any zone path then global
6952  * zone is returned.
6953  *
6954  * The treat_abs parameter indicates whether the path should be treated as
6955  * an absolute path although it does not begin with "/".  (This supports
6956  * nfs mount syntax such as host:any/path.)
6957  *
6958  * The caller is responsible for zone_rele of the returned zone.
6959  */
6960 zone_t *
6961 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6962 {
6963         zone_t *zone;
6964         int path_offset = 0;
6965
6966         if (path == NULL) {
6967                 zone_hold(global_zone);
6968                 return (global_zone);
6969         }
6970
6971         if (*path != '/') {
6972                 ASSERT(treat_abs);
6973                 path_offset = 1;
6974         }
6975
6976         mutex_enter(&zonehash_lock);
6977         for (zone = list_head(&zone_active); zone != NULL;
6978             zone = list_next(&zone_active, zone)) {
6979                 char    *c;
6980                 size_t  pathlen;
6981                 char *rootpath_start;
6982
6983                 if (zone == global_zone)        /* skip global zone */
6984                         continue;
6985
6986                 /* scan backwards to find start of last component */
6987                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6988                 do {
6989                         c--;
6990                 } while (*c != '/');
6991
6992                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6993                 rootpath_start = (zone->zone_rootpath + path_offset);
6994                 if (strncmp(path, rootpath_start, pathlen) == 0)
6995                         break;
6996         }
6997         if (zone == NULL)
6998                 zone = global_zone;
6999         zone_hold(zone);
7000         mutex_exit(&zonehash_lock);
7001         return (zone);
7002 }
7003
7004 /*
7005  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7006  * zone_dl_t pointer if found, and NULL otherwise.
7007  */
7008 static zone_dl_t *
7009 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7010 {
7011         zone_dl_t *zdl;
7012
7013         ASSERT(mutex_owned(&zone->zone_lock));
7014         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7015             zdl = list_next(&zone->zone_dl_list, zdl)) {
7016                 if (zdl->zdl_id == linkid)
7017                         break;
7018         }
7019         return (zdl);
7020 }
7021
7022 static boolean_t
7023 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7024 {
7025         boolean_t exists;
7026
7027         mutex_enter(&zone->zone_lock);
7028         exists = (zone_find_dl(zone, linkid) != NULL);
7029         mutex_exit(&zone->zone_lock);
7030         return (exists);
7031 }
7032
7033 /*
7034  * Add an data link name for the zone.
7035  */
7036 static int
7037 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7038 {
7039         zone_dl_t *zdl;
7040         zone_t *zone;
7041         zone_t *thiszone;
7042
7043         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7044                 return (set_errno(ENXIO));
7045
7046         /* Verify that the datalink ID doesn't already belong to a zone. */
7047         mutex_enter(&zonehash_lock);
7048         for (zone = list_head(&zone_active); zone != NULL;
7049             zone = list_next(&zone_active, zone)) {
7050                 if (zone_dl_exists(zone, linkid)) {
7051                         mutex_exit(&zonehash_lock);
7052                         zone_rele(thiszone);
7053                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7054                 }
7055         }
7056
7057         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7058         zdl->zdl_id = linkid;
7059         zdl->zdl_net = NULL;
7060         mutex_enter(&thiszone->zone_lock);
7061         list_insert_head(&thiszone->zone_dl_list, zdl);
7062         mutex_exit(&thiszone->zone_lock);
7063         mutex_exit(&zonehash_lock);
7064         zone_rele(thiszone);
7065         return (0);
7066 }
7067
7068 static int
7069 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7070 {
7071         zone_dl_t *zdl;
7072         zone_t *zone;
7073         int err = 0;
7074
7075         if ((zone = zone_find_by_id(zoneid)) == NULL)
7076                 return (set_errno(EINVAL));
7077
7078         mutex_enter(&zone->zone_lock);
7079         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7080                 err = ENXIO;
7081         } else {
7082                 list_remove(&zone->zone_dl_list, zdl);
7083                 nvlist_free(zdl->zdl_net);
7084                 kmem_free(zdl, sizeof (zone_dl_t));
7085         }
7086         mutex_exit(&zone->zone_lock);
7087         zone_rele(zone);
7088         return (err == 0 ? 0 : set_errno(err));
7089 }
7090
7091 /*
7092  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7093  * the linkid.  Otherwise we just check if the specified zoneidp has been
7094  * assigned the supplied linkid.
7095  */
7096 int
7097 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7098 {
7099         zone_t *zone;
7100         int err = ENXIO;
7101
7102         if (*zoneidp != ALL_ZONES) {
7103                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7104                         if (zone_dl_exists(zone, linkid))
7105                                 err = 0;
7106                         zone_rele(zone);
7107                 }
7108                 return (err);
7109         }
7110
7111         mutex_enter(&zonehash_lock);
7112         for (zone = list_head(&zone_active); zone != NULL;
7113             zone = list_next(&zone_active, zone)) {
7114                 if (zone_dl_exists(zone, linkid)) {
7115                         *zoneidp = zone->zone_id;
7116                         err = 0;
7117                         break;
7118                 }
7119         }
7120         mutex_exit(&zonehash_lock);
7121         return (err);
7122 }
7123
7124 /*
7125  * Get the list of datalink IDs assigned to a zone.
7126  *
7127  * On input, *nump is the number of datalink IDs that can fit in the supplied
7128  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7129  * that were placed in the array if the array was large enough, or to the
7130  * number of datalink IDs that the function needs to place in the array if the
7131  * array is too small.
7132  */
7133 static int
7134 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7135 {
7136         uint_t num, dlcount;
7137         zone_t *zone;
7138         zone_dl_t *zdl;
7139         datalink_id_t *idptr = idarray;
7140
7141         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7142                 return (set_errno(EFAULT));
7143         if ((zone = zone_find_by_id(zoneid)) == NULL)
7144                 return (set_errno(ENXIO));
7145
7146         num = 0;
7147         mutex_enter(&zone->zone_lock);
7148         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7149             zdl = list_next(&zone->zone_dl_list, zdl)) {
7150                 /*
7151                  * If the list is bigger than what the caller supplied, just
7152                  * count, don't do copyout.
7153                  */
7154                 if (++num > dlcount)
7155                         continue;
7156                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7157                         mutex_exit(&zone->zone_lock);
7158                         zone_rele(zone);
7159                         return (set_errno(EFAULT));
7160                 }
7161                 idptr++;
7162         }
7163         mutex_exit(&zone->zone_lock);
7164         zone_rele(zone);
7165
7166         /* Increased or decreased, caller should be notified. */
7167         if (num != dlcount) {
7168                 if (copyout(&num, nump, sizeof (num)) != 0)
7169                         return (set_errno(EFAULT));
7170         }
7171         return (0);
7172 }
7173
7174 /*
7175  * Public interface for looking up a zone by zoneid. It's a customized version
7176  * for netstack_zone_create(). It can only be called from the zsd create
7177  * callbacks, since it doesn't have reference on the zone structure hence if
7178  * it is called elsewhere the zone could disappear after the zonehash_lock
7179  * is dropped.
7180  *
7181  * Furthermore it
7182  * 1. Doesn't check the status of the zone.
7183  * 2. It will be called even before zone_init is called, in that case the
7184  *    address of zone0 is returned directly, and netstack_zone_create()
7185  *    will only assign a value to zone0.zone_netstack, won't break anything.
7186  * 3. Returns without the zone being held.
7187  */
7188 zone_t *
7189 zone_find_by_id_nolock(zoneid_t zoneid)
7190 {
7191         zone_t *zone;
7192
7193         mutex_enter(&zonehash_lock);
7194         if (zonehashbyid == NULL)
7195                 zone = &zone0;
7196         else
7197                 zone = zone_find_all_by_id(zoneid);
7198         mutex_exit(&zonehash_lock);
7199         return (zone);
7200 }
7201
7202 /*
7203  * Walk the datalinks for a given zone
7204  */
7205 int
7206 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7207     void *data)
7208 {
7209         zone_t          *zone;
7210         zone_dl_t       *zdl;
7211         datalink_id_t   *idarray;
7212         uint_t          idcount = 0;
7213         int             i, ret = 0;
7214
7215         if ((zone = zone_find_by_id(zoneid)) == NULL)
7216                 return (ENOENT);
7217
7218         /*
7219          * We first build an array of linkid's so that we can walk these and
7220          * execute the callback with the zone_lock dropped.
7221          */
7222         mutex_enter(&zone->zone_lock);
7223         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7224             zdl = list_next(&zone->zone_dl_list, zdl)) {
7225                 idcount++;
7226         }
7227
7228         if (idcount == 0) {
7229                 mutex_exit(&zone->zone_lock);
7230                 zone_rele(zone);
7231                 return (0);
7232         }
7233
7234         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7235         if (idarray == NULL) {
7236                 mutex_exit(&zone->zone_lock);
7237                 zone_rele(zone);
7238                 return (ENOMEM);
7239         }
7240
7241         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7242             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7243                 idarray[i] = zdl->zdl_id;
7244         }
7245
7246         mutex_exit(&zone->zone_lock);
7247
7248         for (i = 0; i < idcount && ret == 0; i++) {
7249                 if ((ret = (*cb)(idarray[i], data)) != 0)
7250                         break;
7251         }
7252
7253         zone_rele(zone);
7254         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7255         return (ret);
7256 }
7257
7258 static char *
7259 zone_net_type2name(int type)
7260 {
7261         switch (type) {
7262         case ZONE_NETWORK_ADDRESS:
7263                 return (ZONE_NET_ADDRNAME);
7264         case ZONE_NETWORK_DEFROUTER:
7265                 return (ZONE_NET_RTRNAME);
7266         default:
7267                 return (NULL);
7268         }
7269 }
7270
7271 static int
7272 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7273 {
7274         zone_t *zone;
7275         zone_dl_t *zdl;
7276         nvlist_t *nvl;
7277         int err = 0;
7278         uint8_t *new = NULL;
7279         char *nvname;
7280         int bufsize;
7281         datalink_id_t linkid = znbuf->zn_linkid;
7282
7283         if (secpolicy_zone_config(CRED()) != 0)
7284                 return (set_errno(EPERM));
7285
7286         if (zoneid == GLOBAL_ZONEID)
7287                 return (set_errno(EINVAL));
7288
7289         nvname = zone_net_type2name(znbuf->zn_type);
7290         bufsize = znbuf->zn_len;
7291         new = znbuf->zn_val;
7292         if (nvname == NULL)
7293                 return (set_errno(EINVAL));
7294
7295         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7296                 return (set_errno(EINVAL));
7297         }
7298
7299         mutex_enter(&zone->zone_lock);
7300         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7301                 err = ENXIO;
7302                 goto done;
7303         }
7304         if ((nvl = zdl->zdl_net) == NULL) {
7305                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7306                         err = ENOMEM;
7307                         goto done;
7308                 } else {
7309                         zdl->zdl_net = nvl;
7310                 }
7311         }
7312         if (nvlist_exists(nvl, nvname)) {
7313                 err = EINVAL;
7314                 goto done;
7315         }
7316         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7317         ASSERT(err == 0);
7318 done:
7319         mutex_exit(&zone->zone_lock);
7320         zone_rele(zone);
7321         if (err != 0)
7322                 return (set_errno(err));
7323         else
7324                 return (0);
7325 }
7326
7327 static int
7328 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7329 {
7330         zone_t *zone;
7331         zone_dl_t *zdl;
7332         nvlist_t *nvl;
7333         uint8_t *ptr;
7334         uint_t psize;
7335         int err = 0;
7336         char *nvname;
7337         int bufsize;
7338         void *buf;
7339         datalink_id_t linkid = znbuf->zn_linkid;
7340
7341         if (zoneid == GLOBAL_ZONEID)
7342                 return (set_errno(EINVAL));
7343
7344         nvname = zone_net_type2name(znbuf->zn_type);
7345         bufsize = znbuf->zn_len;
7346         buf = znbuf->zn_val;
7347
7348         if (nvname == NULL)
7349                 return (set_errno(EINVAL));
7350         if ((zone = zone_find_by_id(zoneid)) == NULL)
7351                 return (set_errno(EINVAL));
7352
7353         mutex_enter(&zone->zone_lock);
7354         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7355                 err = ENXIO;
7356                 goto done;
7357         }
7358         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7359                 err = ENOENT;
7360                 goto done;
7361         }
7362         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7363         ASSERT(err == 0);
7364
7365         if (psize > bufsize) {
7366                 err = ENOBUFS;
7367                 goto done;
7368         }
7369         znbuf->zn_len = psize;
7370         bcopy(ptr, buf, psize);
7371 done:
7372         mutex_exit(&zone->zone_lock);
7373         zone_rele(zone);
7374         if (err != 0)
7375                 return (set_errno(err));
7376         else
7377                 return (0);
7378 }